165 files changed, 80123 insertions, 0 deletions
diff --git a/src/3rdparty/eigen/Eigen/src/Core/ArithmeticSequence.h b/src/3rdparty/eigen/Eigen/src/Core/ArithmeticSequence.h
new file mode 100644
index 000000000..b6200fac1
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/ArithmeticSequence.h
@@ -0,0 +1,413 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARITHMETIC_SEQUENCE_H
+#define EIGEN_ARITHMETIC_SEQUENCE_H
+
+namespace Eigen {
+
+namespace internal {
+
+#if (!EIGEN_HAS_CXX11) || !((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48)
+template<typename T> struct aseq_negate {};
+
+template<> struct aseq_negate<Index> {
+  typedef Index type;
+};
+
+template<int N> struct aseq_negate<FixedInt<N> > {
+  typedef FixedInt<-N> type;
+};
+
+// Compilation error in the following case:
+template<> struct aseq_negate<FixedInt<DynamicIndex> > {};
+
+template<typename FirstType,typename SizeType,typename IncrType,
+         bool FirstIsSymbolic=symbolic::is_symbolic<FirstType>::value,
+         bool SizeIsSymbolic =symbolic::is_symbolic<SizeType>::value>
+struct aseq_reverse_first_type {
+  typedef Index type;
+};
+
+template<typename FirstType,typename SizeType,typename IncrType>
+struct aseq_reverse_first_type<FirstType,SizeType,IncrType,true,true> {
+  typedef symbolic::AddExpr<FirstType,
+                            symbolic::ProductExpr<symbolic::AddExpr<SizeType,symbolic::ValueExpr<FixedInt<-1> > >,
+                                                  symbolic::ValueExpr<IncrType> >
+                           > type;
+};
+
+template<typename SizeType,typename IncrType,typename EnableIf = void>
+struct aseq_reverse_first_type_aux {
+  typedef Index type;
+};
+
+template<typename SizeType,typename IncrType>
+struct aseq_reverse_first_type_aux<SizeType,IncrType,typename internal::enable_if<bool((SizeType::value+IncrType::value)|0x1)>::type> {
+  typedef FixedInt<(SizeType::value-1)*IncrType::value> type;
+};
+
+template<typename FirstType,typename SizeType,typename IncrType>
+struct aseq_reverse_first_type<FirstType,SizeType,IncrType,true,false> {
+  typedef typename aseq_reverse_first_type_aux<SizeType,IncrType>::type Aux;
+  typedef symbolic::AddExpr<FirstType,symbolic::ValueExpr<Aux> > type;
+};
+
+template<typename FirstType,typename SizeType,typename IncrType>
+struct aseq_reverse_first_type<FirstType,SizeType,IncrType,false,true> {
+  typedef symbolic::AddExpr<symbolic::ProductExpr<symbolic::AddExpr<SizeType,symbolic::ValueExpr<FixedInt<-1> > >,
+                                                  symbolic::ValueExpr<IncrType> >,
+                            symbolic::ValueExpr<> > type;
+};
+#endif
+
+// Helper to cleanup the type of the increment:
+template<typename T> struct cleanup_seq_incr {
+  typedef typename cleanup_index_type<T,DynamicIndex>::type type;
+};
+
+}
+
+//--------------------------------------------------------------------------------
+// seq(first,last,incr) and seqN(first,size,incr)
+//--------------------------------------------------------------------------------
+
+template<typename FirstType=Index,typename SizeType=Index,typename IncrType=internal::FixedInt<1> >
+class ArithmeticSequence;
+
+template<typename FirstType,typename SizeType,typename IncrType>
+ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
+                   typename internal::cleanup_index_type<SizeType>::type,
+                   typename internal::cleanup_seq_incr<IncrType>::type >
+seqN(FirstType first, SizeType size, IncrType incr);
+
+/** \class ArithmeticSequence
+  * \ingroup Core_Module
+  *
+  * This class represents an arithmetic progression \f$ a_0, a_1, a_2, ..., a_{n-1}\f$ defined by
+  * its \em first value \f$ a_0 \f$, its \em size (aka length) \em n, and the \em increment (aka stride)
+  * that is equal to \f$ a_{i+1}-a_{i}\f$ for any \em i.
+  *
+  * It is internally used as the return type of the Eigen::seq and Eigen::seqN functions, and as the input arguments
+  * of DenseBase::operator()(const RowIndices&, const ColIndices&), and most of the time this is the
+  * only way it is used.
+  *
+  * \tparam FirstType type of the first element, usually an Index,
+  *                   but internally it can be a symbolic expression
+  * \tparam SizeType type representing the size of the sequence, usually an Index
+  *                  or a compile time integral constant. Internally, it can also be a symbolic expression
+  * \tparam IncrType type of the increment, can be a runtime Index, or a compile time integral constant (default is compile-time 1)
+  *
+  * \sa Eigen::seq, Eigen::seqN, DenseBase::operator()(const RowIndices&, const ColIndices&), class IndexedView
+  */
+template<typename FirstType,typename SizeType,typename IncrType>
+class ArithmeticSequence
+{
+public:
+  ArithmeticSequence(FirstType first, SizeType size) : m_first(first), m_size(size) {}
+  ArithmeticSequence(FirstType first, SizeType size, IncrType incr) : m_first(first), m_size(size), m_incr(incr) {}
+
+  enum {
+    SizeAtCompileTime = internal::get_fixed_value<SizeType>::value,
+    IncrAtCompileTime = internal::get_fixed_value<IncrType,DynamicIndex>::value
+  };
+
+  /** \returns the size, i.e., number of elements, of the sequence */
+  Index size()  const { return m_size; }
+
+  /** \returns the first element \f$ a_0 \f$ in the sequence */
+  Index first()  const { return m_first; }
+
+  /** \returns the value \f$ a_i \f$ at index \a i in the sequence. */
+  Index operator[](Index i) const { return m_first + i * m_incr; }
+
+  const FirstType& firstObject() const { return m_first; }
+  const SizeType&  sizeObject()  const { return m_size; }
+  const IncrType&  incrObject()  const { return m_incr; }
+
+protected:
+  FirstType m_first;
+  SizeType  m_size;
+  IncrType  m_incr;
+
+public:
+
+#if EIGEN_HAS_CXX11 && ((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48)
+  auto reverse() const -> decltype(Eigen::seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr)) {
+    return seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr);
+  }
+#else
+protected:
+  typedef typename internal::aseq_negate<IncrType>::type ReverseIncrType;
+  typedef typename internal::aseq_reverse_first_type<FirstType,SizeType,IncrType>::type ReverseFirstType;
+public:
+  ArithmeticSequence<ReverseFirstType,SizeType,ReverseIncrType>
+  reverse() const {
+    return seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr);
+  }
+#endif
+};
+
+/** \returns an ArithmeticSequence starting at \a first, of length \a size, and increment \a incr
+  *
+  * \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */
+template<typename FirstType,typename SizeType,typename IncrType>
+ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,typename internal::cleanup_index_type<SizeType>::type,typename internal::cleanup_seq_incr<IncrType>::type >
+seqN(FirstType first, SizeType size, IncrType incr)  {
+  return ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,typename internal::cleanup_index_type<SizeType>::type,typename internal::cleanup_seq_incr<IncrType>::type>(first,size,incr);
+}
+
+/** \returns an ArithmeticSequence starting at \a first, of length \a size, and unit increment
+  *
+  * \sa seqN(FirstType,SizeType,IncrType), seq(FirstType,LastType) */
+template<typename FirstType,typename SizeType>
+ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,typename internal::cleanup_index_type<SizeType>::type >
+seqN(FirstType first, SizeType size)  {
+  return ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,typename internal::cleanup_index_type<SizeType>::type>(first,size);
+}
+
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+
+/** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and with positive (or negative) increment \a incr
+  *
+  * It is essentially an alias to:
+  * \code
+  * seqN(f, (l-f+incr)/incr, incr);
+  * \endcode
+  *
+  * \sa seqN(FirstType,SizeType,IncrType), seq(FirstType,LastType)
+  */
+template<typename FirstType,typename LastType, typename IncrType>
+auto seq(FirstType f, LastType l, IncrType incr);
+
+/** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and unit increment
+  *
+  * It is essentially an alias to:
+  * \code
+  * seqN(f,l-f+1);
+  * \endcode
+  *
+  * \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType)
+  */
+template<typename FirstType,typename LastType>
+auto seq(FirstType f, LastType l);
+
+#else // EIGEN_PARSED_BY_DOXYGEN
+
+#if EIGEN_HAS_CXX11
+template<typename FirstType,typename LastType>
+auto seq(FirstType f, LastType l) -> decltype(seqN(typename internal::cleanup_index_type<FirstType>::type(f),
+                                                   (  typename internal::cleanup_index_type<LastType>::type(l)
+                                                    - typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>())))
+{
+  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
+              (typename internal::cleanup_index_type<LastType>::type(l)
+               -typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>()));
+}
+
+template<typename FirstType,typename LastType, typename IncrType>
+auto seq(FirstType f, LastType l, IncrType incr)
+  -> decltype(seqN(typename internal::cleanup_index_type<FirstType>::type(f),
+                   (   typename internal::cleanup_index_type<LastType>::type(l)
+                     - typename internal::cleanup_index_type<FirstType>::type(f)+typename internal::cleanup_seq_incr<IncrType>::type(incr)
+                   ) / typename internal::cleanup_seq_incr<IncrType>::type(incr),
+                   typename internal::cleanup_seq_incr<IncrType>::type(incr)))
+{
+  typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
+  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
+              ( typename internal::cleanup_index_type<LastType>::type(l)
+               -typename internal::cleanup_index_type<FirstType>::type(f)+CleanedIncrType(incr)) / CleanedIncrType(incr),
+              CleanedIncrType(incr));
+}
+
+#else // EIGEN_HAS_CXX11
+
+template<typename FirstType,typename LastType>
+typename internal::enable_if<!(symbolic::is_symbolic<FirstType>::value || symbolic::is_symbolic<LastType>::value),
+                             ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,Index> >::type
+seq(FirstType f, LastType l)
+{
+  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
+              Index((typename internal::cleanup_index_type<LastType>::type(l)-typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>())));
+}
+
+template<typename FirstTypeDerived,typename LastType>
+typename internal::enable_if<!symbolic::is_symbolic<LastType>::value,
+    ArithmeticSequence<FirstTypeDerived, symbolic::AddExpr<symbolic::AddExpr<symbolic::NegateExpr<FirstTypeDerived>,symbolic::ValueExpr<> >,
+                                                            symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
+seq(const symbolic::BaseExpr<FirstTypeDerived> &f, LastType l)
+{
+  return seqN(f.derived(),(typename internal::cleanup_index_type<LastType>::type(l)-f.derived()+fix<1>()));
+}
+
+template<typename FirstType,typename LastTypeDerived>
+typename internal::enable_if<!symbolic::is_symbolic<FirstType>::value,
+    ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
+                        symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::ValueExpr<> >,
+                                          symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
+seq(FirstType f, const symbolic::BaseExpr<LastTypeDerived> &l)
+{
+  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),(l.derived()-typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>()));
+}
+
+template<typename FirstTypeDerived,typename LastTypeDerived>
+ArithmeticSequence<FirstTypeDerived,
+                    symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::NegateExpr<FirstTypeDerived> >,symbolic::ValueExpr<internal::FixedInt<1> > > >
+seq(const symbolic::BaseExpr<FirstTypeDerived> &f, const symbolic::BaseExpr<LastTypeDerived> &l)
+{
+  return seqN(f.derived(),(l.derived()-f.derived()+fix<1>()));
+}
+
+
+template<typename FirstType,typename LastType, typename IncrType>
+typename internal::enable_if<!(symbolic::is_symbolic<FirstType>::value || symbolic::is_symbolic<LastType>::value),
+    ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,Index,typename internal::cleanup_seq_incr<IncrType>::type> >::type
+seq(FirstType f, LastType l, IncrType incr)
+{
+  typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
+  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
+              Index((typename internal::cleanup_index_type<LastType>::type(l)-typename internal::cleanup_index_type<FirstType>::type(f)+CleanedIncrType(incr))/CleanedIncrType(incr)), incr);
+}
+
+template<typename FirstTypeDerived,typename LastType, typename IncrType>
+typename internal::enable_if<!symbolic::is_symbolic<LastType>::value,
+    ArithmeticSequence<FirstTypeDerived,
+                        symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<symbolic::NegateExpr<FirstTypeDerived>,
+                                                                                   symbolic::ValueExpr<> >,
+                                                                 symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+                                              symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+                        typename internal::cleanup_seq_incr<IncrType>::type> >::type
+seq(const symbolic::BaseExpr<FirstTypeDerived> &f, LastType l, IncrType incr)
+{
+  typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
+  return seqN(f.derived(),(typename internal::cleanup_index_type<LastType>::type(l)-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
+}
+
+template<typename FirstType,typename LastTypeDerived, typename IncrType>
+typename internal::enable_if<!symbolic::is_symbolic<FirstType>::value,
+    ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
+                        symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::ValueExpr<> >,
+                                                                 symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+                                               symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+                        typename internal::cleanup_seq_incr<IncrType>::type> >::type
+seq(FirstType f, const symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
+{
+  typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
+  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
+              (l.derived()-typename internal::cleanup_index_type<FirstType>::type(f)+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
+}
+
+template<typename FirstTypeDerived,typename LastTypeDerived, typename IncrType>
+ArithmeticSequence<FirstTypeDerived,
+                    symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,
+                                                                               symbolic::NegateExpr<FirstTypeDerived> >,
+                                                             symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+                                          symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+                    typename internal::cleanup_seq_incr<IncrType>::type>
+seq(const symbolic::BaseExpr<FirstTypeDerived> &f, const symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
+{
+  typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
+  return seqN(f.derived(),(l.derived()-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
+}
+#endif // EIGEN_HAS_CXX11
+
+#endif // EIGEN_PARSED_BY_DOXYGEN
+
+
+#if EIGEN_HAS_CXX11 || defined(EIGEN_PARSED_BY_DOXYGEN)
+/** \cpp11
+  * \returns a symbolic ArithmeticSequence representing the last \a size elements with increment \a incr.
+  *
+  * It is a shortcut for: \code seqN(last-(size-fix<1>)*incr, size, incr) \endcode
+  * 
+  * \sa lastN(SizeType), seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */
+template<typename SizeType,typename IncrType>
+auto lastN(SizeType size, IncrType incr)
+-> decltype(seqN(Eigen::last-(size-fix<1>())*incr, size, incr))
+{
+  return seqN(Eigen::last-(size-fix<1>())*incr, size, incr);
+}
+
+/** \cpp11
+  * \returns a symbolic ArithmeticSequence representing the last \a size elements with a unit increment.
+  *
+  *  It is a shortcut for: \code seq(last+fix<1>-size, last) \endcode
+  * 
+  * \sa lastN(SizeType,IncrType, seqN(FirstType,SizeType), seq(FirstType,LastType) */
+template<typename SizeType>
+auto lastN(SizeType size)
+-> decltype(seqN(Eigen::last+fix<1>()-size, size))
+{
+  return seqN(Eigen::last+fix<1>()-size, size);
+}
+#endif
+
+namespace internal {
+
+// Convert a symbolic span into a usable one (i.e., remove last/end "keywords")
+template<typename T>
+struct make_size_type {
+  typedef typename internal::conditional<symbolic::is_symbolic<T>::value, Index, T>::type type;
+};
+
+template<typename FirstType,typename SizeType,typename IncrType,int XprSize>
+struct IndexedViewCompatibleType<ArithmeticSequence<FirstType,SizeType,IncrType>, XprSize> {
+  typedef ArithmeticSequence<Index,typename make_size_type<SizeType>::type,IncrType> type;
+};
+
+template<typename FirstType,typename SizeType,typename IncrType>
+ArithmeticSequence<Index,typename make_size_type<SizeType>::type,IncrType>
+makeIndexedViewCompatible(const ArithmeticSequence<FirstType,SizeType,IncrType>& ids, Index size,SpecializedType) {
+  return ArithmeticSequence<Index,typename make_size_type<SizeType>::type,IncrType>(
+            eval_expr_given_size(ids.firstObject(),size),eval_expr_given_size(ids.sizeObject(),size),ids.incrObject());
+}
+
+template<typename FirstType,typename SizeType,typename IncrType>
+struct get_compile_time_incr<ArithmeticSequence<FirstType,SizeType,IncrType> > {
+  enum { value = get_fixed_value<IncrType,DynamicIndex>::value };
+};
+
+} // end namespace internal
+
+/** \namespace Eigen::indexing
+  * \ingroup Core_Module
+  * 
+  * The sole purpose of this namespace is to be able to import all functions
+  * and symbols that are expected to be used within operator() for indexing
+  * and slicing. If you already imported the whole Eigen namespace:
+  * \code using namespace Eigen; \endcode
+  * then you are already all set. Otherwise, if you don't want/cannot import
+  * the whole Eigen namespace, the following line:
+  * \code using namespace Eigen::indexing; \endcode
+  * is equivalent to:
+  * \code
+  using Eigen::all;
+  using Eigen::seq;
+  using Eigen::seqN;
+  using Eigen::lastN; // c++11 only
+  using Eigen::last;
+  using Eigen::lastp1;
+  using Eigen::fix;
+  \endcode
+  */
+namespace indexing {
+  using Eigen::all;
+  using Eigen::seq;
+  using Eigen::seqN;
+  #if EIGEN_HAS_CXX11
+  using Eigen::lastN;
+  #endif
+  using Eigen::last;
+  using Eigen::lastp1;
+  using Eigen::fix;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_ARITHMETIC_SEQUENCE_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/Array.h b/src/3rdparty/eigen/Eigen/src/Core/Array.h
new file mode 100644
index 000000000..20c789b10
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/Array.h
@@ -0,0 +1,417 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARRAY_H
+#define EIGEN_ARRAY_H
+
+namespace Eigen {
+
+namespace internal {
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct traits<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > : traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+{
+  typedef ArrayXpr XprKind;
+  typedef ArrayBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > XprBase;
+};
+}
+
+/** \class Array
+  * \ingroup Core_Module
+  *
+  * \brief General-purpose arrays with easy API for coefficient-wise operations
+  *
+  * The %Array class is very similar to the Matrix class. It provides
+  * general-purpose one- and two-dimensional arrays. The difference between the
+  * %Array and the %Matrix class is primarily in the API: the API for the
+  * %Array class provides easy access to coefficient-wise operations, while the
+  * API for the %Matrix class provides easy access to linear-algebra
+  * operations.
+  *
+  * See documentation of class Matrix for detailed information on the template parameters
+  * storage layout.
+  *
+  * This class can be extended with the help of the plugin mechanism described on the page
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_ARRAY_PLUGIN.
+  *
+  * \sa \blank \ref TutorialArrayClass, \ref TopicClassHierarchy
+  */
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+class Array
+  : public PlainObjectBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+{
+  public:
+
+    typedef PlainObjectBase<Array> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Array)
+
+    enum { Options = _Options };
+    typedef typename Base::PlainObject PlainObject;
+
+  protected:
+    template <typename Derived, typename OtherDerived, bool IsVector>
+    friend struct internal::conservative_resize_like_impl;
+
+    using Base::m_storage;
+
+  public:
+
+    using Base::base;
+    using Base::coeff;
+    using Base::coeffRef;
+
+    /**
+      * The usage of
+      *   using Base::operator=;
+      * fails on MSVC. Since the code below is working with GCC and MSVC, we skipped
+      * the usage of 'using'. This should be done only for operator=.
+      */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Array& operator=(const EigenBase<OtherDerived> &other)
+    {
+      return Base::operator=(other);
+    }
+
+    /** Set all the entries to \a value.
+      * \sa DenseBase::setConstant(), DenseBase::fill()
+      */
+    /* This overload is needed because the usage of
+      *   using Base::operator=;
+      * fails on MSVC. Since the code below is working with GCC and MSVC, we skipped
+      * the usage of 'using'. This should be done only for operator=.
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Array& operator=(const Scalar &value)
+    {
+      Base::setConstant(value);
+      return *this;
+    }
+
+    /** Copies the value of the expression \a other into \c *this with automatic resizing.
+      *
+      * *this might be resized to match the dimensions of \a other. If *this was a null matrix (not already initialized),
+      * it will be initialized.
+      *
+      * Note that copying a row-vector into a vector (and conversely) is allowed.
+      * The resizing, if any, is then done in the appropriate way so that row-vectors
+      * remain row-vectors and vectors remain vectors.
+      */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Array& operator=(const DenseBase<OtherDerived>& other)
+    {
+      return Base::_set(other);
+    }
+
+    /** This is a special case of the templated operator=. Its purpose is to
+      * prevent a default operator= from hiding the templated operator=.
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Array& operator=(const Array& other)
+    {
+      return Base::_set(other);
+    }
+
+    /** Default constructor.
+      *
+      * For fixed-size matrices, does nothing.
+      *
+      * For dynamic-size matrices, creates an empty matrix of size 0. Does not allocate any array. Such a matrix
+      * is called a null matrix. This constructor is the unique way to create null matrices: resizing
+      * a matrix to 0 is not supported.
+      *
+      * \sa resize(Index,Index)
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Array() : Base()
+    {
+      Base::_check_template_params();
+      EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+    }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    // FIXME is it still needed ??
+    /** \internal */
+    EIGEN_DEVICE_FUNC
+    Array(internal::constructor_without_unaligned_array_assert)
+      : Base(internal::constructor_without_unaligned_array_assert())
+    {
+      Base::_check_template_params();
+      EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+    }
+#endif
+
+#if EIGEN_HAS_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
+    Array(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
+      : Base(std::move(other))
+    {
+      Base::_check_template_params();
+    }
+    EIGEN_DEVICE_FUNC
+    Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
+    {
+      Base::operator=(std::move(other));
+      return *this;
+    }
+#endif
+
+    #if EIGEN_HAS_CXX11
+    /** \copydoc PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+     *
+     * Example: \include Array_variadic_ctor_cxx11.cpp
+     * Output: \verbinclude Array_variadic_ctor_cxx11.out
+     *
+     * \sa Array(const std::initializer_list<std::initializer_list<Scalar>>&)
+     * \sa Array(const Scalar&), Array(const Scalar&,const Scalar&)
+     */
+    template <typename... ArgTypes>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+      : Base(a0, a1, a2, a3, args...) {}
+
+    /** \brief Constructs an array and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11
+      *
+      * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
+      *
+      * Example: \include Array_initializer_list_23_cxx11.cpp
+      * Output: \verbinclude Array_initializer_list_23_cxx11.out
+      *
+      * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered.
+      *
+      * In the case of a compile-time column 1D array, implicit transposition from a single row is allowed.
+      * Therefore <code> Array<int,Dynamic,1>{{1,2,3,4,5}}</code> is legal and the more verbose syntax
+      * <code>Array<int,Dynamic,1>{{1},{2},{3},{4},{5}}</code> can be avoided:
+      *
+      * Example: \include Array_initializer_list_vector_cxx11.cpp
+      * Output: \verbinclude Array_initializer_list_vector_cxx11.out
+      *
+      * In the case of fixed-sized arrays, the initializer list sizes must exactly match the array sizes,
+      * and implicit transposition is allowed for compile-time 1D arrays only.
+      *
+      * \sa  Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Array(const std::initializer_list<std::initializer_list<Scalar>>& list) : Base(list) {}
+    #endif // end EIGEN_HAS_CXX11
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE explicit Array(const T& x)
+    {
+      Base::_check_template_params();
+      Base::template _init1<T>(x);
+    }
+
+    template<typename T0, typename T1>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Array(const T0& val0, const T1& val1)
+    {
+      Base::_check_template_params();
+      this->template _init2<T0,T1>(val0, val1);
+    }
+
+    #else
+    /** \brief Constructs a fixed-sized array initialized with coefficients starting at \a data */
+    EIGEN_DEVICE_FUNC explicit Array(const Scalar *data);
+    /** Constructs a vector or row-vector with given dimension. \only_for_vectors
+      *
+      * Note that this is only useful for dynamic-size vectors. For fixed-size vectors,
+      * it is redundant to pass the dimension here, so it makes more sense to use the default
+      * constructor Array() instead.
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE explicit Array(Index dim);
+    /** constructs an initialized 1x1 Array with the given coefficient
+      * \sa const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args */
+    Array(const Scalar& value);
+    /** constructs an uninitialized array with \a rows rows and \a cols columns.
+      *
+      * This is useful for dynamic-size arrays. For fixed-size arrays,
+      * it is redundant to pass these parameters, so one should use the default constructor
+      * Array() instead. */
+    Array(Index rows, Index cols);
+    /** constructs an initialized 2D vector with given coefficients
+      * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) */
+    Array(const Scalar& val0, const Scalar& val1);
+    #endif  // end EIGEN_PARSED_BY_DOXYGEN
+
+    /** constructs an initialized 3D vector with given coefficients
+      * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2)
+    {
+      Base::_check_template_params();
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Array, 3)
+      m_storage.data()[0] = val0;
+      m_storage.data()[1] = val1;
+      m_storage.data()[2] = val2;
+    }
+    /** constructs an initialized 4D vector with given coefficients
+      * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2, const Scalar& val3)
+    {
+      Base::_check_template_params();
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Array, 4)
+      m_storage.data()[0] = val0;
+      m_storage.data()[1] = val1;
+      m_storage.data()[2] = val2;
+      m_storage.data()[3] = val3;
+    }
+
+    /** Copy constructor */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Array(const Array& other)
+            : Base(other)
+    { }
+
+  private:
+    struct PrivateType {};
+  public:
+
+    /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other,
+                              typename internal::enable_if<internal::is_convertible<typename OtherDerived::Scalar,Scalar>::value,
+                                                           PrivateType>::type = PrivateType())
+      : Base(other.derived())
+    { }
+
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT{ return 1; }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return this->innerSize(); }
+
+    #ifdef EIGEN_ARRAY_PLUGIN
+    #include EIGEN_ARRAY_PLUGIN
+    #endif
+
+  private:
+
+    template<typename MatrixType, typename OtherDerived, bool SwapPointers>
+    friend struct internal::matrix_swap_impl;
+};
+
+/** \defgroup arraytypedefs Global array typedefs
+  * \ingroup Core_Module
+  *
+  * %Eigen defines several typedef shortcuts for most common 1D and 2D array types.
+  *
+  * The general patterns are the following:
+  *
+  * \c ArrayRowsColsType where \c Rows and \c Cols can be \c 2,\c 3,\c 4 for fixed size square matrices or \c X for dynamic size,
+  * and where \c Type can be \c i for integer, \c f for float, \c d for double, \c cf for complex float, \c cd
+  * for complex double.
+  *
+  * For example, \c Array33d is a fixed-size 3x3 array type of doubles, and \c ArrayXXf is a dynamic-size matrix of floats.
+  *
+  * There are also \c ArraySizeType which are self-explanatory. For example, \c Array4cf is
+  * a fixed-size 1D array of 4 complex floats.
+  *
+  * With \cpp11, template alias are also defined for common sizes.
+  * They follow the same pattern as above except that the scalar type suffix is replaced by a
+  * template parameter, i.e.:
+  *   - `ArrayRowsCols<Type>` where `Rows` and `Cols` can be \c 2,\c 3,\c 4, or \c X for fixed or dynamic size.
+  *   - `ArraySize<Type>` where `Size` can be \c 2,\c 3,\c 4 or \c X for fixed or dynamic size 1D arrays.
+  *
+  * \sa class Array
+  */
+
+#define EIGEN_MAKE_ARRAY_TYPEDEFS(Type, TypeSuffix, Size, SizeSuffix)   \
+/** \ingroup arraytypedefs */                                    \
+typedef Array<Type, Size, Size> Array##SizeSuffix##SizeSuffix##TypeSuffix;  \
+/** \ingroup arraytypedefs */                                    \
+typedef Array<Type, Size, 1>    Array##SizeSuffix##TypeSuffix;
+
+#define EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Type, TypeSuffix, Size)         \
+/** \ingroup arraytypedefs */                                    \
+typedef Array<Type, Size, Dynamic> Array##Size##X##TypeSuffix;  \
+/** \ingroup arraytypedefs */                                    \
+typedef Array<Type, Dynamic, Size> Array##X##Size##TypeSuffix;
+
+#define EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(Type, TypeSuffix) \
+EIGEN_MAKE_ARRAY_TYPEDEFS(Type, TypeSuffix, 2, 2) \
+EIGEN_MAKE_ARRAY_TYPEDEFS(Type, TypeSuffix, 3, 3) \
+EIGEN_MAKE_ARRAY_TYPEDEFS(Type, TypeSuffix, 4, 4) \
+EIGEN_MAKE_ARRAY_TYPEDEFS(Type, TypeSuffix, Dynamic, X) \
+EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Type, TypeSuffix, 2) \
+EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Type, TypeSuffix, 3) \
+EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Type, TypeSuffix, 4)
+
+EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(int,                  i)
+EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(float,                f)
+EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(double,               d)
+EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(std::complex<float>,  cf)
+EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(std::complex<double>, cd)
+
+#undef EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES
+#undef EIGEN_MAKE_ARRAY_TYPEDEFS
+#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS
+
+#if EIGEN_HAS_CXX11
+
+#define EIGEN_MAKE_ARRAY_TYPEDEFS(Size, SizeSuffix)               \
+/** \ingroup arraytypedefs */                                     \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Array##SizeSuffix##SizeSuffix = Array<Type, Size, Size>;    \
+/** \ingroup arraytypedefs */                                     \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Array##SizeSuffix = Array<Type, Size, 1>;
+
+#define EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Size)                     \
+/** \ingroup arraytypedefs */                                     \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Array##Size##X = Array<Type, Size, Dynamic>;                \
+/** \ingroup arraytypedefs */                                     \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Array##X##Size = Array<Type, Dynamic, Size>;
+
+EIGEN_MAKE_ARRAY_TYPEDEFS(2, 2)
+EIGEN_MAKE_ARRAY_TYPEDEFS(3, 3)
+EIGEN_MAKE_ARRAY_TYPEDEFS(4, 4)
+EIGEN_MAKE_ARRAY_TYPEDEFS(Dynamic, X)
+EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(2)
+EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(3)
+EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(4)
+
+#undef EIGEN_MAKE_ARRAY_TYPEDEFS
+#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS
+
+#endif // EIGEN_HAS_CXX11
+
+#define EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, SizeSuffix) \
+using Eigen::Matrix##SizeSuffix##TypeSuffix; \
+using Eigen::Vector##SizeSuffix##TypeSuffix; \
+using Eigen::RowVector##SizeSuffix##TypeSuffix;
+
+#define EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(TypeSuffix) \
+EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 2) \
+EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 3) \
+EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 4) \
+EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, X) \
+
+#define EIGEN_USING_ARRAY_TYPEDEFS \
+EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(i) \
+EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(f) \
+EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(d) \
+EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(cf) \
+EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(cd)
+
+} // end namespace Eigen
+
+#endif // EIGEN_ARRAY_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/ArrayBase.h b/src/3rdparty/eigen/Eigen/src/Core/ArrayBase.h
new file mode 100644
index 000000000..ea3dd1c3b
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/ArrayBase.h
@@ -0,0 +1,226 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARRAYBASE_H
+#define EIGEN_ARRAYBASE_H
+
+namespace Eigen { 
+
+template<typename ExpressionType> class MatrixWrapper;
+
+/** \class ArrayBase
+  * \ingroup Core_Module
+  *
+  * \brief Base class for all 1D and 2D array, and related expressions
+  *
+  * An array is similar to a dense vector or matrix. While matrices are mathematical
+  * objects with well defined linear algebra operators, an array is just a collection
+  * of scalar values arranged in a one or two dimensionnal fashion. As the main consequence,
+  * all operations applied to an array are performed coefficient wise. Furthermore,
+  * arrays support scalar math functions of the c++ standard library (e.g., std::sin(x)), and convenient
+  * constructors allowing to easily write generic code working for both scalar values
+  * and arrays.
+  *
+  * This class is the base that is inherited by all array expression types.
+  *
+  * \tparam Derived is the derived type, e.g., an array or an expression type.
+  *
+  * This class can be extended with the help of the plugin mechanism described on the page
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_ARRAYBASE_PLUGIN.
+  *
+  * \sa class MatrixBase, \ref TopicClassHierarchy
+  */
+template<typename Derived> class ArrayBase
+  : public DenseBase<Derived>
+{
+  public:
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** The base class for a given storage type. */
+    typedef ArrayBase StorageBaseType;
+
+    typedef ArrayBase Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl;
+
+    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    typedef DenseBase<Derived> Base;
+    using Base::RowsAtCompileTime;
+    using Base::ColsAtCompileTime;
+    using Base::SizeAtCompileTime;
+    using Base::MaxRowsAtCompileTime;
+    using Base::MaxColsAtCompileTime;
+    using Base::MaxSizeAtCompileTime;
+    using Base::IsVectorAtCompileTime;
+    using Base::Flags;
+    
+    using Base::derived;
+    using Base::const_cast_derived;
+    using Base::rows;
+    using Base::cols;
+    using Base::size;
+    using Base::coeff;
+    using Base::coeffRef;
+    using Base::lazyAssign;
+    using Base::operator-;
+    using Base::operator=;
+    using Base::operator+=;
+    using Base::operator-=;
+    using Base::operator*=;
+    using Base::operator/=;
+
+    typedef typename Base::CoeffReturnType CoeffReturnType;
+
+#endif // not EIGEN_PARSED_BY_DOXYGEN
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    typedef typename Base::PlainObject PlainObject;
+
+    /** \internal Represents a matrix with all coefficients equal to one another*/
+    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,PlainObject> ConstantReturnType;
+#endif // not EIGEN_PARSED_BY_DOXYGEN
+
+#define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::ArrayBase
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
+#   include "../plugins/MatrixCwiseUnaryOps.h"
+#   include "../plugins/ArrayCwiseUnaryOps.h"
+#   include "../plugins/CommonCwiseBinaryOps.h"
+#   include "../plugins/MatrixCwiseBinaryOps.h"
+#   include "../plugins/ArrayCwiseBinaryOps.h"
+#   ifdef EIGEN_ARRAYBASE_PLUGIN
+#     include EIGEN_ARRAYBASE_PLUGIN
+#   endif
+#undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
+
+    /** Special case of the template operator=, in order to prevent the compiler
+      * from generating a default operator= (issue hit with g++ 4.1)
+      */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator=(const ArrayBase& other)
+    {
+      internal::call_assignment(derived(), other.derived());
+      return derived();
+    }
+    
+    /** Set all the entries to \a value.
+      * \sa DenseBase::setConstant(), DenseBase::fill() */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator=(const Scalar &value)
+    { Base::setConstant(value); return derived(); }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator+=(const Scalar& scalar);
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator-=(const Scalar& scalar);
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator+=(const ArrayBase<OtherDerived>& other);
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator-=(const ArrayBase<OtherDerived>& other);
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator*=(const ArrayBase<OtherDerived>& other);
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator/=(const ArrayBase<OtherDerived>& other);
+
+  public:
+    EIGEN_DEVICE_FUNC
+    ArrayBase<Derived>& array() { return *this; }
+    EIGEN_DEVICE_FUNC
+    const ArrayBase<Derived>& array() const { return *this; }
+
+    /** \returns an \link Eigen::MatrixBase Matrix \endlink expression of this array
+      * \sa MatrixBase::array() */
+    EIGEN_DEVICE_FUNC
+    MatrixWrapper<Derived> matrix() { return MatrixWrapper<Derived>(derived()); }
+    EIGEN_DEVICE_FUNC
+    const MatrixWrapper<const Derived> matrix() const { return MatrixWrapper<const Derived>(derived()); }
+
+//     template<typename Dest>
+//     inline void evalTo(Dest& dst) const { dst = matrix(); }
+
+  protected:
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(ArrayBase)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(ArrayBase)
+
+  private:
+    explicit ArrayBase(Index);
+    ArrayBase(Index,Index);
+    template<typename OtherDerived> explicit ArrayBase(const ArrayBase<OtherDerived>&);
+  protected:
+    // mixing arrays and matrices is not legal
+    template<typename OtherDerived> Derived& operator+=(const MatrixBase<OtherDerived>& )
+    {EIGEN_STATIC_ASSERT(std::ptrdiff_t(sizeof(typename OtherDerived::Scalar))==-1,YOU_CANNOT_MIX_ARRAYS_AND_MATRICES); return *this;}
+    // mixing arrays and matrices is not legal
+    template<typename OtherDerived> Derived& operator-=(const MatrixBase<OtherDerived>& )
+    {EIGEN_STATIC_ASSERT(std::ptrdiff_t(sizeof(typename OtherDerived::Scalar))==-1,YOU_CANNOT_MIX_ARRAYS_AND_MATRICES); return *this;}
+};
+
+/** replaces \c *this by \c *this - \a other.
+  *
+  * \returns a reference to \c *this
+  */
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
+ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
+{
+  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
+}
+
+/** replaces \c *this by \c *this + \a other.
+  *
+  * \returns a reference to \c *this
+  */
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
+ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
+{
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
+}
+
+/** replaces \c *this by \c *this * \a other coefficient wise.
+  *
+  * \returns a reference to \c *this
+  */
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
+ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
+{
+  call_assignment(derived(), other.derived(), internal::mul_assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
+}
+
+/** replaces \c *this by \c *this / \a other coefficient wise.
+  *
+  * \returns a reference to \c *this
+  */
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
+ArrayBase<Derived>::operator/=(const ArrayBase<OtherDerived>& other)
+{
+  call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_ARRAYBASE_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/ArrayWrapper.h b/src/3rdparty/eigen/Eigen/src/Core/ArrayWrapper.h
new file mode 100644
index 000000000..2e9555b53
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/ArrayWrapper.h
@@ -0,0 +1,209 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARRAYWRAPPER_H
+#define EIGEN_ARRAYWRAPPER_H
+
+namespace Eigen {
+
+/** \class ArrayWrapper
+  * \ingroup Core_Module
+  *
+  * \brief Expression of a mathematical vector or matrix as an array object
+  *
+  * This class is the return type of MatrixBase::array(), and most of the time
+  * this is the only way it is use.
+  *
+  * \sa MatrixBase::array(), class MatrixWrapper
+  */
+
+namespace internal {
+template<typename ExpressionType>
+struct traits<ArrayWrapper<ExpressionType> >
+  : public traits<typename remove_all<typename ExpressionType::Nested>::type >
+{
+  typedef ArrayXpr XprKind;
+  // Let's remove NestByRefBit
+  enum {
+    Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
+    LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
+    Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
+  };
+};
+}
+
+template<typename ExpressionType>
+class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
+{
+  public:
+    typedef ArrayBase<ArrayWrapper> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(ArrayWrapper)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ArrayWrapper)
+    typedef typename internal::remove_all<ExpressionType>::type NestedExpression;
+
+    typedef typename internal::conditional<
+                       internal::is_lvalue<ExpressionType>::value,
+                       Scalar,
+                       const Scalar
+                     >::type ScalarWithConstIfNotLvalue;
+
+    typedef typename internal::ref_selector<ExpressionType>::non_const_type NestedExpressionType;
+
+    using Base::coeffRef;
+
+    EIGEN_DEVICE_FUNC
+    explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
+
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT { return m_expression.innerStride(); }
+
+    EIGEN_DEVICE_FUNC
+    inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
+    EIGEN_DEVICE_FUNC
+    inline const Scalar* data() const { return m_expression.data(); }
+
+    EIGEN_DEVICE_FUNC
+    inline const Scalar& coeffRef(Index rowId, Index colId) const
+    {
+      return m_expression.coeffRef(rowId, colId);
+    }
+
+    EIGEN_DEVICE_FUNC
+    inline const Scalar& coeffRef(Index index) const
+    {
+      return m_expression.coeffRef(index);
+    }
+
+    template<typename Dest>
+    EIGEN_DEVICE_FUNC
+    inline void evalTo(Dest& dst) const { dst = m_expression; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<NestedExpressionType>::type&
+    nestedExpression() const
+    {
+      return m_expression;
+    }
+
+    /** Forwards the resizing request to the nested expression
+      * \sa DenseBase::resize(Index)  */
+    EIGEN_DEVICE_FUNC
+    void resize(Index newSize) { m_expression.resize(newSize); }
+    /** Forwards the resizing request to the nested expression
+      * \sa DenseBase::resize(Index,Index)*/
+    EIGEN_DEVICE_FUNC
+    void resize(Index rows, Index cols) { m_expression.resize(rows,cols); }
+
+  protected:
+    NestedExpressionType m_expression;
+};
+
+/** \class MatrixWrapper
+  * \ingroup Core_Module
+  *
+  * \brief Expression of an array as a mathematical vector or matrix
+  *
+  * This class is the return type of ArrayBase::matrix(), and most of the time
+  * this is the only way it is use.
+  *
+  * \sa MatrixBase::matrix(), class ArrayWrapper
+  */
+
+namespace internal {
+template<typename ExpressionType>
+struct traits<MatrixWrapper<ExpressionType> >
+ : public traits<typename remove_all<typename ExpressionType::Nested>::type >
+{
+  typedef MatrixXpr XprKind;
+  // Let's remove NestByRefBit
+  enum {
+    Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
+    LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
+    Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
+  };
+};
+}
+
+template<typename ExpressionType>
+class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
+{
+  public:
+    typedef MatrixBase<MatrixWrapper<ExpressionType> > Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(MatrixWrapper)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(MatrixWrapper)
+    typedef typename internal::remove_all<ExpressionType>::type NestedExpression;
+
+    typedef typename internal::conditional<
+                       internal::is_lvalue<ExpressionType>::value,
+                       Scalar,
+                       const Scalar
+                     >::type ScalarWithConstIfNotLvalue;
+
+    typedef typename internal::ref_selector<ExpressionType>::non_const_type NestedExpressionType;
+
+    using Base::coeffRef;
+
+    EIGEN_DEVICE_FUNC
+    explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {}
+
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT { return m_expression.innerStride(); }
+
+    EIGEN_DEVICE_FUNC
+    inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
+    EIGEN_DEVICE_FUNC
+    inline const Scalar* data() const { return m_expression.data(); }
+
+    EIGEN_DEVICE_FUNC
+    inline const Scalar& coeffRef(Index rowId, Index colId) const
+    {
+      return m_expression.derived().coeffRef(rowId, colId);
+    }
+
+    EIGEN_DEVICE_FUNC
+    inline const Scalar& coeffRef(Index index) const
+    {
+      return m_expression.coeffRef(index);
+    }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<NestedExpressionType>::type&
+    nestedExpression() const
+    {
+      return m_expression;
+    }
+
+    /** Forwards the resizing request to the nested expression
+      * \sa DenseBase::resize(Index)  */
+    EIGEN_DEVICE_FUNC
+    void resize(Index newSize) { m_expression.resize(newSize); }
+    /** Forwards the resizing request to the nested expression
+      * \sa DenseBase::resize(Index,Index)*/
+    EIGEN_DEVICE_FUNC
+    void resize(Index rows, Index cols) { m_expression.resize(rows,cols); }
+
+  protected:
+    NestedExpressionType m_expression;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_ARRAYWRAPPER_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/Assign.h b/src/3rdparty/eigen/Eigen/src/Core/Assign.h
new file mode 100644
index 000000000..655412efd
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/Assign.h
@@ -0,0 +1,90 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Michael Olbrich <michael.olbrich@gmx.net>
+// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ASSIGN_H
+#define EIGEN_ASSIGN_H
+
+namespace Eigen {
+
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
+  ::lazyAssign(const DenseBase<OtherDerived>& other)
+{
+  enum{
+    SameType = internal::is_same<typename Derived::Scalar,typename OtherDerived::Scalar>::value
+  };
+
+  EIGEN_STATIC_ASSERT_LVALUE(Derived)
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived,OtherDerived)
+  EIGEN_STATIC_ASSERT(SameType,YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+
+  eigen_assert(rows() == other.rows() && cols() == other.cols());
+  internal::call_assignment_no_alias(derived(),other.derived());
+  
+  return derived();
+}
+
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(const DenseBase<OtherDerived>& other)
+{
+  internal::call_assignment(derived(), other.derived());
+  return derived();
+}
+
+template<typename Derived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(const DenseBase& other)
+{
+  internal::call_assignment(derived(), other.derived());
+  return derived();
+}
+
+template<typename Derived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const MatrixBase& other)
+{
+  internal::call_assignment(derived(), other.derived());
+  return derived();
+}
+
+template<typename Derived>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const DenseBase<OtherDerived>& other)
+{
+  internal::call_assignment(derived(), other.derived());
+  return derived();
+}
+
+template<typename Derived>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const EigenBase<OtherDerived>& other)
+{
+  internal::call_assignment(derived(), other.derived());
+  return derived();
+}
+
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
+{
+  other.derived().evalTo(derived());
+  return derived();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_ASSIGN_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/AssignEvaluator.h b/src/3rdparty/eigen/Eigen/src/Core/AssignEvaluator.h
new file mode 100644
index 000000000..7d76f0c25
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/AssignEvaluator.h
@@ -0,0 +1,1010 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2012 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ASSIGN_EVALUATOR_H
+#define EIGEN_ASSIGN_EVALUATOR_H
+
+namespace Eigen {
+
+// This implementation is based on Assign.h
+
+namespace internal {
+
+/***************************************************************************
+* Part 1 : the logic deciding a strategy for traversal and unrolling       *
+***************************************************************************/
+
+// copy_using_evaluator_traits is based on assign_traits
+
+template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc, int MaxPacketSize = -1>
+struct copy_using_evaluator_traits
+{
+  typedef typename DstEvaluator::XprType Dst;
+  typedef typename Dst::Scalar DstScalar;
+
+  enum {
+    DstFlags = DstEvaluator::Flags,
+    SrcFlags = SrcEvaluator::Flags
+  };
+
+public:
+  enum {
+    DstAlignment = DstEvaluator::Alignment,
+    SrcAlignment = SrcEvaluator::Alignment,
+    DstHasDirectAccess = (DstFlags & DirectAccessBit) == DirectAccessBit,
+    JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment)
+  };
+
+private:
+  enum {
+    InnerSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::SizeAtCompileTime)
+              : int(DstFlags)&RowMajorBit ? int(Dst::ColsAtCompileTime)
+              : int(Dst::RowsAtCompileTime),
+    InnerMaxSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime)
+              : int(DstFlags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime)
+              : int(Dst::MaxRowsAtCompileTime),
+    RestrictedInnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(InnerSize,MaxPacketSize),
+    RestrictedLinearSize = EIGEN_SIZE_MIN_PREFER_FIXED(Dst::SizeAtCompileTime,MaxPacketSize),
+    OuterStride = int(outer_stride_at_compile_time<Dst>::ret),
+    MaxSizeAtCompileTime = Dst::SizeAtCompileTime
+  };
+
+  // TODO distinguish between linear traversal and inner-traversals
+  typedef typename find_best_packet<DstScalar,RestrictedLinearSize>::type LinearPacketType;
+  typedef typename find_best_packet<DstScalar,RestrictedInnerSize>::type InnerPacketType;
+
+  enum {
+    LinearPacketSize = unpacket_traits<LinearPacketType>::size,
+    InnerPacketSize = unpacket_traits<InnerPacketType>::size
+  };
+
+public:
+  enum {
+    LinearRequiredAlignment = unpacket_traits<LinearPacketType>::alignment,
+    InnerRequiredAlignment = unpacket_traits<InnerPacketType>::alignment
+  };
+
+private:
+  enum {
+    DstIsRowMajor = DstFlags&RowMajorBit,
+    SrcIsRowMajor = SrcFlags&RowMajorBit,
+    StorageOrdersAgree = (int(DstIsRowMajor) == int(SrcIsRowMajor)),
+    MightVectorize = bool(StorageOrdersAgree)
+                  && (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit)
+                  && bool(functor_traits<AssignFunc>::PacketAccess),
+    MayInnerVectorize  = MightVectorize
+                       && int(InnerSize)!=Dynamic && int(InnerSize)%int(InnerPacketSize)==0
+                       && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0
+                       && (EIGEN_UNALIGNED_VECTORIZE  || int(JointAlignment)>=int(InnerRequiredAlignment)),
+    MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
+    MayLinearVectorize = bool(MightVectorize) && bool(MayLinearize) && bool(DstHasDirectAccess)
+                       && (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
+      /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
+         so it's only good for large enough sizes. */
+    MaySliceVectorize  = bool(MightVectorize) && bool(DstHasDirectAccess)
+                       && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=(EIGEN_UNALIGNED_VECTORIZE?InnerPacketSize:(3*InnerPacketSize)))
+      /* slice vectorization can be slow, so we only want it if the slices are big, which is
+         indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
+         in a fixed-size matrix
+         However, with EIGEN_UNALIGNED_VECTORIZE and unrolling, slice vectorization is still worth it */
+  };
+
+public:
+  enum {
+    Traversal =  int(Dst::SizeAtCompileTime) == 0 ? int(AllAtOnceTraversal) // If compile-size is zero, traversing will fail at compile-time.
+              : (int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize)) ? int(LinearVectorizedTraversal)
+              : int(MayInnerVectorize)   ? int(InnerVectorizedTraversal)
+              : int(MayLinearVectorize)  ? int(LinearVectorizedTraversal)
+              : int(MaySliceVectorize)   ? int(SliceVectorizedTraversal)
+              : int(MayLinearize)        ? int(LinearTraversal)
+                                         : int(DefaultTraversal),
+    Vectorized = int(Traversal) == InnerVectorizedTraversal
+              || int(Traversal) == LinearVectorizedTraversal
+              || int(Traversal) == SliceVectorizedTraversal
+  };
+
+  typedef typename conditional<int(Traversal)==LinearVectorizedTraversal, LinearPacketType, InnerPacketType>::type PacketType;
+
+private:
+  enum {
+    ActualPacketSize    = int(Traversal)==LinearVectorizedTraversal ? LinearPacketSize
+                        : Vectorized ? InnerPacketSize
+                        : 1,
+    UnrollingLimit      = EIGEN_UNROLLING_LIMIT * ActualPacketSize,
+    MayUnrollCompletely = int(Dst::SizeAtCompileTime) != Dynamic
+                       && int(Dst::SizeAtCompileTime) * (int(DstEvaluator::CoeffReadCost)+int(SrcEvaluator::CoeffReadCost)) <= int(UnrollingLimit),
+    MayUnrollInner      = int(InnerSize) != Dynamic
+                       && int(InnerSize) * (int(DstEvaluator::CoeffReadCost)+int(SrcEvaluator::CoeffReadCost)) <= int(UnrollingLimit)
+  };
+
+public:
+  enum {
+    Unrolling = (int(Traversal) == int(InnerVectorizedTraversal) || int(Traversal) == int(DefaultTraversal))
+                ? (
+                    int(MayUnrollCompletely) ? int(CompleteUnrolling)
+                  : int(MayUnrollInner)      ? int(InnerUnrolling)
+                                             : int(NoUnrolling)
+                  )
+              : int(Traversal) == int(LinearVectorizedTraversal)
+                ? ( bool(MayUnrollCompletely) && ( EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)))
+                          ? int(CompleteUnrolling)
+                          : int(NoUnrolling) )
+              : int(Traversal) == int(LinearTraversal)
+                ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling)
+                                              : int(NoUnrolling) )
+#if EIGEN_UNALIGNED_VECTORIZE
+              : int(Traversal) == int(SliceVectorizedTraversal)
+                ? ( bool(MayUnrollInner) ? int(InnerUnrolling)
+                                         : int(NoUnrolling) )
+#endif
+              : int(NoUnrolling)
+  };
+
+#ifdef EIGEN_DEBUG_ASSIGN
+  static void debug()
+  {
+    std::cerr << "DstXpr: " << typeid(typename DstEvaluator::XprType).name() << std::endl;
+    std::cerr << "SrcXpr: " << typeid(typename SrcEvaluator::XprType).name() << std::endl;
+    std::cerr.setf(std::ios::hex, std::ios::basefield);
+    std::cerr << "DstFlags" << " = " << DstFlags << " (" << demangle_flags(DstFlags) << " )" << std::endl;
+    std::cerr << "SrcFlags" << " = " << SrcFlags << " (" << demangle_flags(SrcFlags) << " )" << std::endl;
+    std::cerr.unsetf(std::ios::hex);
+    EIGEN_DEBUG_VAR(DstAlignment)
+    EIGEN_DEBUG_VAR(SrcAlignment)
+    EIGEN_DEBUG_VAR(LinearRequiredAlignment)
+    EIGEN_DEBUG_VAR(InnerRequiredAlignment)
+    EIGEN_DEBUG_VAR(JointAlignment)
+    EIGEN_DEBUG_VAR(InnerSize)
+    EIGEN_DEBUG_VAR(InnerMaxSize)
+    EIGEN_DEBUG_VAR(LinearPacketSize)
+    EIGEN_DEBUG_VAR(InnerPacketSize)
+    EIGEN_DEBUG_VAR(ActualPacketSize)
+    EIGEN_DEBUG_VAR(StorageOrdersAgree)
+    EIGEN_DEBUG_VAR(MightVectorize)
+    EIGEN_DEBUG_VAR(MayLinearize)
+    EIGEN_DEBUG_VAR(MayInnerVectorize)
+    EIGEN_DEBUG_VAR(MayLinearVectorize)
+    EIGEN_DEBUG_VAR(MaySliceVectorize)
+    std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl;
+    EIGEN_DEBUG_VAR(SrcEvaluator::CoeffReadCost)
+    EIGEN_DEBUG_VAR(DstEvaluator::CoeffReadCost)
+    EIGEN_DEBUG_VAR(Dst::SizeAtCompileTime)
+    EIGEN_DEBUG_VAR(UnrollingLimit)
+    EIGEN_DEBUG_VAR(MayUnrollCompletely)
+    EIGEN_DEBUG_VAR(MayUnrollInner)
+    std::cerr << "Unrolling" << " = " << Unrolling << " (" << demangle_unrolling(Unrolling) << ")" << std::endl;
+    std::cerr << std::endl;
+  }
+#endif
+};
+
+/***************************************************************************
+* Part 2 : meta-unrollers
+***************************************************************************/
+
+/************************
+*** Default traversal ***
+************************/
+
+template<typename Kernel, int Index, int Stop>
+struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling
+{
+  // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
+  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
+  typedef typename DstEvaluatorType::XprType DstXprType;
+
+  enum {
+    outer = Index / DstXprType::InnerSizeAtCompileTime,
+    inner = Index % DstXprType::InnerSizeAtCompileTime
+  };
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    kernel.assignCoeffByOuterInner(outer, inner);
+    copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Index+1, Stop>::run(kernel);
+  }
+};
+
+template<typename Kernel, int Stop>
+struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Stop, Stop>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
+};
+
+template<typename Kernel, int Index_, int Stop>
+struct copy_using_evaluator_DefaultTraversal_InnerUnrolling
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer)
+  {
+    kernel.assignCoeffByOuterInner(outer, Index_);
+    copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Index_+1, Stop>::run(kernel, outer);
+  }
+};
+
+template<typename Kernel, int Stop>
+struct copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Stop, Stop>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, Index) { }
+};
+
+/***********************
+*** Linear traversal ***
+***********************/
+
+template<typename Kernel, int Index, int Stop>
+struct copy_using_evaluator_LinearTraversal_CompleteUnrolling
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel)
+  {
+    kernel.assignCoeff(Index);
+    copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Index+1, Stop>::run(kernel);
+  }
+};
+
+template<typename Kernel, int Stop>
+struct copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Stop, Stop>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
+};
+
+/**************************
+*** Inner vectorization ***
+**************************/
+
+template<typename Kernel, int Index, int Stop>
+struct copy_using_evaluator_innervec_CompleteUnrolling
+{
+  // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
+  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
+  typedef typename DstEvaluatorType::XprType DstXprType;
+  typedef typename Kernel::PacketType PacketType;
+
+  enum {
+    outer = Index / DstXprType::InnerSizeAtCompileTime,
+    inner = Index % DstXprType::InnerSizeAtCompileTime,
+    SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
+    DstAlignment = Kernel::AssignmentTraits::DstAlignment
+  };
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner);
+    enum { NextIndex = Index + unpacket_traits<PacketType>::size };
+    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, NextIndex, Stop>::run(kernel);
+  }
+};
+
+template<typename Kernel, int Stop>
+struct copy_using_evaluator_innervec_CompleteUnrolling<Kernel, Stop, Stop>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
+};
+
+template<typename Kernel, int Index_, int Stop, int SrcAlignment, int DstAlignment>
+struct copy_using_evaluator_innervec_InnerUnrolling
+{
+  typedef typename Kernel::PacketType PacketType;
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer)
+  {
+    kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, Index_);
+    enum { NextIndex = Index_ + unpacket_traits<PacketType>::size };
+    copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop, SrcAlignment, DstAlignment>::run(kernel, outer);
+  }
+};
+
+template<typename Kernel, int Stop, int SrcAlignment, int DstAlignment>
+struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop, SrcAlignment, DstAlignment>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &, Index) { }
+};
+
+/***************************************************************************
+* Part 3 : implementation of all cases
+***************************************************************************/
+
+// dense_assignment_loop is based on assign_impl
+
+template<typename Kernel,
+         int Traversal = Kernel::AssignmentTraits::Traversal,
+         int Unrolling = Kernel::AssignmentTraits::Unrolling>
+struct dense_assignment_loop;
+
+/************************
+***** Special Cases *****
+************************/
+
+// Zero-sized assignment is a no-op.
+template<typename Kernel, int Unrolling>
+struct dense_assignment_loop<Kernel, AllAtOnceTraversal, Unrolling>
+{
+  EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE run(Kernel& /*kernel*/)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    EIGEN_STATIC_ASSERT(int(DstXprType::SizeAtCompileTime) == 0,
+      EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT)
+  }
+};
+
+/************************
+*** Default traversal ***
+************************/
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling>
+{
+  EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE run(Kernel &kernel)
+  {
+    for(Index outer = 0; outer < kernel.outerSize(); ++outer) {
+      for(Index inner = 0; inner < kernel.innerSize(); ++inner) {
+        kernel.assignCoeffByOuterInner(outer, inner);
+      }
+    }
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, DefaultTraversal, CompleteUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, DefaultTraversal, InnerUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+
+    const Index outerSize = kernel.outerSize();
+    for(Index outer = 0; outer < outerSize; ++outer)
+      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime>::run(kernel, outer);
+  }
+};
+
+/***************************
+*** Linear vectorization ***
+***************************/
+
+
+// The goal of unaligned_dense_assignment_loop is simply to factorize the handling
+// of the non vectorizable beginning and ending parts
+
+template <bool IsAligned = false>
+struct unaligned_dense_assignment_loop
+{
+  // if IsAligned = true, then do nothing
+  template <typename Kernel>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, Index, Index) {}
+};
+
+template <>
+struct unaligned_dense_assignment_loop<false>
+{
+  // MSVC must not inline this functions. If it does, it fails to optimize the
+  // packet access path.
+  // FIXME check which version exhibits this issue
+#if EIGEN_COMP_MSVC
+  template <typename Kernel>
+  static EIGEN_DONT_INLINE void run(Kernel &kernel,
+                                    Index start,
+                                    Index end)
+#else
+  template <typename Kernel>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel,
+                                      Index start,
+                                      Index end)
+#endif
+  {
+    for (Index index = start; index < end; ++index)
+      kernel.assignCoeff(index);
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    const Index size = kernel.size();
+    typedef typename Kernel::Scalar Scalar;
+    typedef typename Kernel::PacketType PacketType;
+    enum {
+      requestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment,
+      packetSize = unpacket_traits<PacketType>::size,
+      dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
+      dstAlignment = packet_traits<Scalar>::AlignedOnScalar ? int(requestedAlignment)
+                                                            : int(Kernel::AssignmentTraits::DstAlignment),
+      srcAlignment = Kernel::AssignmentTraits::JointAlignment
+    };
+    const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned<requestedAlignment>(kernel.dstDataPtr(), size);
+    const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
+
+    unaligned_dense_assignment_loop<dstIsAligned!=0>::run(kernel, 0, alignedStart);
+
+    for(Index index = alignedStart; index < alignedEnd; index += packetSize)
+      kernel.template assignPacket<dstAlignment, srcAlignment, PacketType>(index);
+
+    unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    typedef typename Kernel::PacketType PacketType;
+
+    enum { size = DstXprType::SizeAtCompileTime,
+           packetSize =unpacket_traits<PacketType>::size,
+           alignedSize = (int(size)/packetSize)*packetSize };
+
+    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, alignedSize>::run(kernel);
+    copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, alignedSize, size>::run(kernel);
+  }
+};
+
+/**************************
+*** Inner vectorization ***
+**************************/
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling>
+{
+  typedef typename Kernel::PacketType PacketType;
+  enum {
+    SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
+    DstAlignment = Kernel::AssignmentTraits::DstAlignment
+  };
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    const Index innerSize = kernel.innerSize();
+    const Index outerSize = kernel.outerSize();
+    const Index packetSize = unpacket_traits<PacketType>::size;
+    for(Index outer = 0; outer < outerSize; ++outer)
+      for(Index inner = 0; inner < innerSize; inner+=packetSize)
+        kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner);
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, CompleteUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, InnerUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    typedef typename Kernel::AssignmentTraits Traits;
+    const Index outerSize = kernel.outerSize();
+    for(Index outer = 0; outer < outerSize; ++outer)
+      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime,
+                                                   Traits::SrcAlignment, Traits::DstAlignment>::run(kernel, outer);
+  }
+};
+
+/***********************
+*** Linear traversal ***
+***********************/
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, LinearTraversal, NoUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    const Index size = kernel.size();
+    for(Index i = 0; i < size; ++i)
+      kernel.assignCoeff(i);
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
+  }
+};
+
+/**************************
+*** Slice vectorization ***
+***************************/
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    typedef typename Kernel::Scalar Scalar;
+    typedef typename Kernel::PacketType PacketType;
+    enum {
+      packetSize = unpacket_traits<PacketType>::size,
+      requestedAlignment = int(Kernel::AssignmentTraits::InnerRequiredAlignment),
+      alignable = packet_traits<Scalar>::AlignedOnScalar || int(Kernel::AssignmentTraits::DstAlignment)>=sizeof(Scalar),
+      dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
+      dstAlignment = alignable ? int(requestedAlignment)
+                               : int(Kernel::AssignmentTraits::DstAlignment)
+    };
+    const Scalar *dst_ptr = kernel.dstDataPtr();
+    if((!bool(dstIsAligned)) && (UIntPtr(dst_ptr) % sizeof(Scalar))>0)
+    {
+      // the pointer is not aligned-on scalar, so alignment is not possible
+      return dense_assignment_loop<Kernel,DefaultTraversal,NoUnrolling>::run(kernel);
+    }
+    const Index packetAlignedMask = packetSize - 1;
+    const Index innerSize = kernel.innerSize();
+    const Index outerSize = kernel.outerSize();
+    const Index alignedStep = alignable ? (packetSize - kernel.outerStride() % packetSize) & packetAlignedMask : 0;
+    Index alignedStart = ((!alignable) || bool(dstIsAligned)) ? 0 : internal::first_aligned<requestedAlignment>(dst_ptr, innerSize);
+
+    for(Index outer = 0; outer < outerSize; ++outer)
+    {
+      const Index alignedEnd = alignedStart + ((innerSize-alignedStart) & ~packetAlignedMask);
+      // do the non-vectorizable part of the assignment
+      for(Index inner = 0; inner<alignedStart ; ++inner)
+        kernel.assignCoeffByOuterInner(outer, inner);
+
+      // do the vectorizable part of the assignment
+      for(Index inner = alignedStart; inner<alignedEnd; inner+=packetSize)
+        kernel.template assignPacketByOuterInner<dstAlignment, Unaligned, PacketType>(outer, inner);
+
+      // do the non-vectorizable part of the assignment
+      for(Index inner = alignedEnd; inner<innerSize ; ++inner)
+        kernel.assignCoeffByOuterInner(outer, inner);
+
+      alignedStart = numext::mini((alignedStart+alignedStep)%packetSize, innerSize);
+    }
+  }
+};
+
+#if EIGEN_UNALIGNED_VECTORIZE
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, InnerUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    typedef typename Kernel::PacketType PacketType;
+
+    enum { innerSize = DstXprType::InnerSizeAtCompileTime,
+           packetSize =unpacket_traits<PacketType>::size,
+           vectorizableSize = (int(innerSize) / int(packetSize)) * int(packetSize),
+           size = DstXprType::SizeAtCompileTime };
+
+    for(Index outer = 0; outer < kernel.outerSize(); ++outer)
+    {
+      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, vectorizableSize, 0, 0>::run(kernel, outer);
+      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, vectorizableSize, innerSize>::run(kernel, outer);
+    }
+  }
+};
+#endif
+
+
+/***************************************************************************
+* Part 4 : Generic dense assignment kernel
+***************************************************************************/
+
+// This class generalize the assignment of a coefficient (or packet) from one dense evaluator
+// to another dense writable evaluator.
+// It is parametrized by the two evaluators, and the actual assignment functor.
+// This abstraction level permits to keep the evaluation loops as simple and as generic as possible.
+// One can customize the assignment using this generic dense_assignment_kernel with different
+// functors, or by completely overloading it, by-passing a functor.
+template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version = Specialized>
+class generic_dense_assignment_kernel
+{
+protected:
+  typedef typename DstEvaluatorTypeT::XprType DstXprType;
+  typedef typename SrcEvaluatorTypeT::XprType SrcXprType;
+public:
+
+  typedef DstEvaluatorTypeT DstEvaluatorType;
+  typedef SrcEvaluatorTypeT SrcEvaluatorType;
+  typedef typename DstEvaluatorType::Scalar Scalar;
+  typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor> AssignmentTraits;
+  typedef typename AssignmentTraits::PacketType PacketType;
+
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
+    : m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr)
+  {
+    #ifdef EIGEN_DEBUG_ASSIGN
+    AssignmentTraits::debug();
+    #endif
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_dstExpr.size(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index innerSize() const EIGEN_NOEXCEPT { return m_dstExpr.innerSize(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerSize() const EIGEN_NOEXCEPT { return m_dstExpr.outerSize(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dstExpr.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_dstExpr.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerStride() const EIGEN_NOEXCEPT { return m_dstExpr.outerStride(); }
+
+  EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() EIGEN_NOEXCEPT { return m_dst; }
+  EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const EIGEN_NOEXCEPT { return m_src; }
+
+  /// Assign src(row,col) to dst(row,col) through the assignment functor.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index row, Index col)
+  {
+    m_functor.assignCoeff(m_dst.coeffRef(row,col), m_src.coeff(row,col));
+  }
+
+  /// \sa assignCoeff(Index,Index)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index index)
+  {
+    m_functor.assignCoeff(m_dst.coeffRef(index), m_src.coeff(index));
+  }
+
+  /// \sa assignCoeff(Index,Index)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeffByOuterInner(Index outer, Index inner)
+  {
+    Index row = rowIndexByOuterInner(outer, inner);
+    Index col = colIndexByOuterInner(outer, inner);
+    assignCoeff(row, col);
+  }
+
+
+  template<int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index row, Index col)
+  {
+    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(row,col), m_src.template packet<LoadMode,PacketType>(row,col));
+  }
+
+  template<int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index index)
+  {
+    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(index), m_src.template packet<LoadMode,PacketType>(index));
+  }
+
+  template<int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner)
+  {
+    Index row = rowIndexByOuterInner(outer, inner);
+    Index col = colIndexByOuterInner(outer, inner);
+    assignPacket<StoreMode,LoadMode,PacketType>(row, col);
+  }
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner)
+  {
+    typedef typename DstEvaluatorType::ExpressionTraits Traits;
+    return int(Traits::RowsAtCompileTime) == 1 ? 0
+      : int(Traits::ColsAtCompileTime) == 1 ? inner
+      : int(DstEvaluatorType::Flags)&RowMajorBit ? outer
+      : inner;
+  }
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner)
+  {
+    typedef typename DstEvaluatorType::ExpressionTraits Traits;
+    return int(Traits::ColsAtCompileTime) == 1 ? 0
+      : int(Traits::RowsAtCompileTime) == 1 ? inner
+      : int(DstEvaluatorType::Flags)&RowMajorBit ? inner
+      : outer;
+  }
+
+  EIGEN_DEVICE_FUNC const Scalar* dstDataPtr() const
+  {
+    return m_dstExpr.data();
+  }
+
+protected:
+  DstEvaluatorType& m_dst;
+  const SrcEvaluatorType& m_src;
+  const Functor &m_functor;
+  // TODO find a way to avoid the needs of the original expression
+  DstXprType& m_dstExpr;
+};
+
+// Special kernel used when computing small products whose operands have dynamic dimensions.  It ensures that the
+// PacketSize used is no larger than 4, thereby increasing the chance that vectorized instructions will be used
+// when computing the product.
+
+template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor>
+class restricted_packet_dense_assignment_kernel : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, BuiltIn>
+{
+protected:
+  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, BuiltIn> Base;
+ public:
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::DstXprType DstXprType;
+    typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, 4> AssignmentTraits;
+    typedef typename AssignmentTraits::PacketType PacketType;
+
+    EIGEN_DEVICE_FUNC restricted_packet_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
+    : Base(dst, src, func, dstExpr)
+  {
+  }
+ };
+
+/***************************************************************************
+* Part 5 : Entry point for dense rectangular assignment
+***************************************************************************/
+
+template<typename DstXprType,typename SrcXprType, typename Functor>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const Functor &/*func*/)
+{
+  EIGEN_ONLY_USED_FOR_DEBUG(dst);
+  EIGEN_ONLY_USED_FOR_DEBUG(src);
+  eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+}
+
+template<typename DstXprType,typename SrcXprType, typename T1, typename T2>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const internal::assign_op<T1,T2> &/*func*/)
+{
+  Index dstRows = src.rows();
+  Index dstCols = src.cols();
+  if(((dst.rows()!=dstRows) || (dst.cols()!=dstCols)))
+    dst.resize(dstRows, dstCols);
+  eigen_assert(dst.rows() == dstRows && dst.cols() == dstCols);
+}
+
+template<typename DstXprType, typename SrcXprType, typename Functor>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src, const Functor &func)
+{
+  typedef evaluator<DstXprType> DstEvaluatorType;
+  typedef evaluator<SrcXprType> SrcEvaluatorType;
+
+  SrcEvaluatorType srcEvaluator(src);
+
+  // NOTE To properly handle A = (A*A.transpose())/s with A rectangular,
+  // we need to resize the destination after the source evaluator has been created.
+  resize_if_allowed(dst, src, func);
+
+  DstEvaluatorType dstEvaluator(dst);
+
+  typedef generic_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
+  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
+
+  dense_assignment_loop<Kernel>::run(kernel);
+}
+
+// Specialization for filling the destination with a constant value.
+#ifndef EIGEN_GPU_COMPILE_PHASE
+template<typename DstXprType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const Eigen::CwiseNullaryOp<Eigen::internal::scalar_constant_op<typename DstXprType::Scalar>, DstXprType>& src, const internal::assign_op<typename DstXprType::Scalar,typename DstXprType::Scalar>& func)
+{
+  resize_if_allowed(dst, src, func);
+  std::fill_n(dst.data(), dst.size(), src.functor()());
+}
+#endif
+
+template<typename DstXprType, typename SrcXprType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src)
+{
+  call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
+}
+
+/***************************************************************************
+* Part 6 : Generic assignment
+***************************************************************************/
+
+// Based on the respective shapes of the destination and source,
+// the class AssignmentKind determine the kind of assignment mechanism.
+// AssignmentKind must define a Kind typedef.
+template<typename DstShape, typename SrcShape> struct AssignmentKind;
+
+// Assignment kind defined in this file:
+struct Dense2Dense {};
+struct EigenBase2EigenBase {};
+
+template<typename,typename> struct AssignmentKind { typedef EigenBase2EigenBase Kind; };
+template<> struct AssignmentKind<DenseShape,DenseShape> { typedef Dense2Dense Kind; };
+
+// This is the main assignment class
+template< typename DstXprType, typename SrcXprType, typename Functor,
+          typename Kind = typename AssignmentKind< typename evaluator_traits<DstXprType>::Shape , typename evaluator_traits<SrcXprType>::Shape >::Kind,
+          typename EnableIf = void>
+struct Assignment;
+
+
+// The only purpose of this call_assignment() function is to deal with noalias() / "assume-aliasing" and automatic transposition.
+// Indeed, I (Gael) think that this concept of "assume-aliasing" was a mistake, and it makes thing quite complicated.
+// So this intermediate function removes everything related to "assume-aliasing" such that Assignment
+// does not has to bother about these annoying details.
+
+template<typename Dst, typename Src>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment(Dst& dst, const Src& src)
+{
+  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
+}
+template<typename Dst, typename Src>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment(const Dst& dst, const Src& src)
+{
+  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
+}
+
+// Deal with "assume-aliasing"
+template<typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if< evaluator_assume_aliasing<Src>::value, void*>::type = 0)
+{
+  typename plain_matrix_type<Src>::type tmp(src);
+  call_assignment_no_alias(dst, tmp, func);
+}
+
+template<typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<!evaluator_assume_aliasing<Src>::value, void*>::type = 0)
+{
+  call_assignment_no_alias(dst, src, func);
+}
+
+// by-pass "assume-aliasing"
+// When there is no aliasing, we require that 'dst' has been properly resized
+template<typename Dst, template <typename> class StorageBase, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment(NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
+{
+  call_assignment_no_alias(dst.expression(), src, func);
+}
+
+
+template<typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
+{
+  enum {
+    NeedToTranspose = (    (int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1)
+                        || (int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1)
+                      ) && int(Dst::SizeAtCompileTime) != 1
+  };
+
+  typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst>::type ActualDstTypeCleaned;
+  typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst&>::type ActualDstType;
+  ActualDstType actualDst(dst);
+
+  // TODO check whether this is the right place to perform these checks:
+  EIGEN_STATIC_ASSERT_LVALUE(Dst)
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned,Src)
+  EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename ActualDstTypeCleaned::Scalar,typename Src::Scalar);
+
+  Assignment<ActualDstTypeCleaned,Src,Func>::run(actualDst, src, func);
+}
+
+template<typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_restricted_packet_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
+{
+    typedef evaluator<Dst> DstEvaluatorType;
+    typedef evaluator<Src> SrcEvaluatorType;
+    typedef restricted_packet_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Func> Kernel;
+
+    EIGEN_STATIC_ASSERT_LVALUE(Dst)
+    EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename Dst::Scalar,typename Src::Scalar);
+
+    SrcEvaluatorType srcEvaluator(src);
+    resize_if_allowed(dst, src, func);
+
+    DstEvaluatorType dstEvaluator(dst);
+    Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
+
+    dense_assignment_loop<Kernel>::run(kernel);
+}
+
+template<typename Dst, typename Src>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment_no_alias(Dst& dst, const Src& src)
+{
+  call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
+}
+
+template<typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& func)
+{
+  // TODO check whether this is the right place to perform these checks:
+  EIGEN_STATIC_ASSERT_LVALUE(Dst)
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst,Src)
+  EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename Dst::Scalar,typename Src::Scalar);
+
+  Assignment<Dst,Src,Func>::run(dst, src, func);
+}
+template<typename Dst, typename Src>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src)
+{
+  call_assignment_no_alias_no_transpose(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
+}
+
+// forward declaration
+template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, const Src &src);
+
+// Generic Dense to Dense assignment
+// Note that the last template argument "Weak" is needed to make it possible to perform
+// both partial specialization+SFINAE without ambiguous specialization
+template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
+struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Weak>
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  {
+#ifndef EIGEN_NO_DEBUG
+    internal::check_for_aliasing(dst, src);
+#endif
+
+    call_dense_assignment_loop(dst, src, func);
+  }
+};
+
+// Generic assignment through evalTo.
+// TODO: not sure we have to keep that one, but it helps porting current code to new evaluator mechanism.
+// Note that the last template argument "Weak" is needed to make it possible to perform
+// both partial specialization+SFINAE without ambiguous specialization
+template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
+struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak>
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    src.evalTo(dst);
+  }
+
+  // NOTE The following two functions are templated to avoid their instantiation if not needed
+  //      This is needed because some expressions supports evalTo only and/or have 'void' as scalar type.
+  template<typename SrcScalarType>
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,SrcScalarType> &/*func*/)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    src.addTo(dst);
+  }
+
+  template<typename SrcScalarType>
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,SrcScalarType> &/*func*/)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    src.subTo(dst);
+  }
+};
+
+} // namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_ASSIGN_EVALUATOR_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/Assign_MKL.h b/src/3rdparty/eigen/Eigen/src/Core/Assign_MKL.h
new file mode 100644
index 000000000..c6140d185
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/Assign_MKL.h
@@ -0,0 +1,178 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+ Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+ 
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to Intel(R) MKL
+ *   MKL VML support for coefficient-wise unary Eigen expressions like a=b.sin()
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_ASSIGN_VML_H
+#define EIGEN_ASSIGN_VML_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename Dst, typename Src>
+class vml_assign_traits
+{
+  private:
+    enum {
+      DstHasDirectAccess = Dst::Flags & DirectAccessBit,
+      SrcHasDirectAccess = Src::Flags & DirectAccessBit,
+      StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Src::IsRowMajor)),
+      InnerSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::SizeAtCompileTime)
+                : int(Dst::Flags)&RowMajorBit ? int(Dst::ColsAtCompileTime)
+                : int(Dst::RowsAtCompileTime),
+      InnerMaxSize  = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime)
+                    : int(Dst::Flags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime)
+                    : int(Dst::MaxRowsAtCompileTime),
+      MaxSizeAtCompileTime = Dst::SizeAtCompileTime,
+
+      MightEnableVml = StorageOrdersAgree && DstHasDirectAccess && SrcHasDirectAccess && Src::InnerStrideAtCompileTime==1 && Dst::InnerStrideAtCompileTime==1,
+      MightLinearize = MightEnableVml && (int(Dst::Flags) & int(Src::Flags) & LinearAccessBit),
+      VmlSize = MightLinearize ? MaxSizeAtCompileTime : InnerMaxSize,
+      LargeEnough = VmlSize==Dynamic || VmlSize>=EIGEN_MKL_VML_THRESHOLD
+    };
+  public:
+    enum {
+      EnableVml = MightEnableVml && LargeEnough,
+      Traversal = MightLinearize ? LinearTraversal : DefaultTraversal
+    };
+};
+
+#define EIGEN_PP_EXPAND(ARG) ARG
+#if !defined (EIGEN_FAST_MATH) || (EIGEN_FAST_MATH != 1)
+#define EIGEN_VMLMODE_EXPAND_xLA , VML_HA
+#else
+#define EIGEN_VMLMODE_EXPAND_xLA , VML_LA
+#endif
+
+#define EIGEN_VMLMODE_EXPAND_x_
+
+#define EIGEN_VMLMODE_PREFIX_xLA vm
+#define EIGEN_VMLMODE_PREFIX_x_  v
+#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_x,VMLMODE)
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE)                                           \
+  template< typename DstXprType, typename SrcXprNested>                                                                         \
+  struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE,EIGENTYPE>,   \
+                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> {              \
+    typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType;                                            \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) {                       \
+      resize_if_allowed(dst, src, func);                                                                                        \
+      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                                       \
+      if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) {                                              \
+        VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(),                                                        \
+              (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE) );                                           \
+      } else {                                                                                                                  \
+        const Index outerSize = dst.outerSize();                                                                                \
+        for(Index outer = 0; outer < outerSize; ++outer) {                                                                      \
+          const EIGENTYPE *src_ptr = src.IsRowMajor ? &(src.nestedExpression().coeffRef(outer,0)) :                             \
+                                                      &(src.nestedExpression().coeffRef(0, outer));                             \
+          EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));                           \
+          VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr,                                                                      \
+                (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE));                                             \
+        }                                                                                                                       \
+      }                                                                                                                         \
+    }                                                                                                                           \
+  };                                                                                                                            \
+
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP, VMLMODE)                                                         \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),s##VMLOP), float, float, VMLMODE)           \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),d##VMLOP), double, double, VMLMODE)
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_CPLX(EIGENOP, VMLOP, VMLMODE)                                                         \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),c##VMLOP), scomplex, MKL_Complex8, VMLMODE) \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),z##VMLOP), dcomplex, MKL_Complex16, VMLMODE)
+  
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS(EIGENOP, VMLOP, VMLMODE)                                                              \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP, VMLMODE)                                                               \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_CPLX(EIGENOP, VMLOP, VMLMODE)
+
+  
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sin,   Sin,   LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(asin,  Asin,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sinh,  Sinh,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(cos,   Cos,   LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(acos,  Acos,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(cosh,  Cosh,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(tan,   Tan,   LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(atan,  Atan,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(tanh,  Tanh,  LA)
+// EIGEN_MKL_VML_DECLARE_UNARY_CALLS(abs,   Abs,    _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(exp,   Exp,   LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(log,   Ln,    LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(log10, Log10, LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sqrt,  Sqrt,  _)
+
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(square, Sqr,   _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_CPLX(arg, Arg,      _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(round, Round,  _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(floor, Floor,  _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil,  Ceil,   _)
+
+#define EIGEN_MKL_VML_DECLARE_POW_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE)                                           \
+  template< typename DstXprType, typename SrcXprNested, typename Plain>                                                       \
+  struct Assignment<DstXprType, CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE,EIGENTYPE>, SrcXprNested,                       \
+                    const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>,Plain> >, assign_op<EIGENTYPE,EIGENTYPE>,    \
+                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> {            \
+    typedef CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE,EIGENTYPE>, SrcXprNested,                                           \
+                    const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>,Plain> > SrcXprType;                         \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) {                     \
+      resize_if_allowed(dst, src, func);                                                                                      \
+      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                                     \
+      VMLTYPE exponent = reinterpret_cast<const VMLTYPE&>(src.rhs().functor().m_other);                                       \
+      if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal)                                              \
+      {                                                                                                                       \
+        VMLOP( dst.size(), (const VMLTYPE*)src.lhs().data(), exponent,                                                        \
+              (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE) );                                         \
+      } else {                                                                                                                \
+        const Index outerSize = dst.outerSize();                                                                              \
+        for(Index outer = 0; outer < outerSize; ++outer) {                                                                    \
+          const EIGENTYPE *src_ptr = src.IsRowMajor ? &(src.lhs().coeffRef(outer,0)) :                                        \
+                                                      &(src.lhs().coeffRef(0, outer));                                        \
+          EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));                         \
+          VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, exponent,                                                          \
+                 (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE));                                          \
+        }                                                                                                                     \
+      }                                                                                                                       \
+    }                                                                                                                         \
+  };
+  
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmsPowx, float,    float,         LA)
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmdPowx, double,   double,        LA)
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmcPowx, scomplex, MKL_Complex8,  LA)
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmzPowx, dcomplex, MKL_Complex16, LA)
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_ASSIGN_VML_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/BandMatrix.h b/src/3rdparty/eigen/Eigen/src/Core/BandMatrix.h
new file mode 100644
index 000000000..878c0240a
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/BandMatrix.h
@@ -0,0 +1,353 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BANDMATRIX_H
+#define EIGEN_BANDMATRIX_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename Derived>
+class BandMatrixBase : public EigenBase<Derived>
+{
+  public:
+
+    enum {
+      Flags = internal::traits<Derived>::Flags,
+      CoeffReadCost = internal::traits<Derived>::CoeffReadCost,
+      RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
+      ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+      MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
+      Supers = internal::traits<Derived>::Supers,
+      Subs   = internal::traits<Derived>::Subs,
+      Options = internal::traits<Derived>::Options
+    };
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef Matrix<Scalar,RowsAtCompileTime,ColsAtCompileTime> DenseMatrixType;
+    typedef typename DenseMatrixType::StorageIndex StorageIndex;
+    typedef typename internal::traits<Derived>::CoefficientsType CoefficientsType;
+    typedef EigenBase<Derived> Base;
+
+  protected:
+    enum {
+      DataRowsAtCompileTime = ((Supers!=Dynamic) && (Subs!=Dynamic))
+                            ? 1 + Supers + Subs
+                            : Dynamic,
+      SizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_DYNAMIC(RowsAtCompileTime,ColsAtCompileTime)
+    };
+
+  public:
+
+    using Base::derived;
+    using Base::rows;
+    using Base::cols;
+
+    /** \returns the number of super diagonals */
+    inline Index supers() const { return derived().supers(); }
+
+    /** \returns the number of sub diagonals */
+    inline Index subs() const { return derived().subs(); }
+
+    /** \returns an expression of the underlying coefficient matrix */
+    inline const CoefficientsType& coeffs() const { return derived().coeffs(); }
+
+    /** \returns an expression of the underlying coefficient matrix */
+    inline CoefficientsType& coeffs() { return derived().coeffs(); }
+
+    /** \returns a vector expression of the \a i -th column,
+      * only the meaningful part is returned.
+      * \warning the internal storage must be column major. */
+    inline Block<CoefficientsType,Dynamic,1> col(Index i)
+    {
+      EIGEN_STATIC_ASSERT((int(Options) & int(RowMajor)) == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+      Index start = 0;
+      Index len = coeffs().rows();
+      if (i<=supers())
+      {
+        start = supers()-i;
+        len = (std::min)(rows(),std::max<Index>(0,coeffs().rows() - (supers()-i)));
+      }
+      else if (i>=rows()-subs())
+        len = std::max<Index>(0,coeffs().rows() - (i + 1 - rows() + subs()));
+      return Block<CoefficientsType,Dynamic,1>(coeffs(), start, i, len, 1);
+    }
+
+    /** \returns a vector expression of the main diagonal */
+    inline Block<CoefficientsType,1,SizeAtCompileTime> diagonal()
+    { return Block<CoefficientsType,1,SizeAtCompileTime>(coeffs(),supers(),0,1,(std::min)(rows(),cols())); }
+
+    /** \returns a vector expression of the main diagonal (const version) */
+    inline const Block<const CoefficientsType,1,SizeAtCompileTime> diagonal() const
+    { return Block<const CoefficientsType,1,SizeAtCompileTime>(coeffs(),supers(),0,1,(std::min)(rows(),cols())); }
+
+    template<int Index> struct DiagonalIntReturnType {
+      enum {
+        ReturnOpposite = (int(Options) & int(SelfAdjoint)) && (((Index) > 0 && Supers == 0) || ((Index) < 0 && Subs == 0)),
+        Conjugate = ReturnOpposite && NumTraits<Scalar>::IsComplex,
+        ActualIndex = ReturnOpposite ? -Index : Index,
+        DiagonalSize = (RowsAtCompileTime==Dynamic || ColsAtCompileTime==Dynamic)
+                     ? Dynamic
+                     : (ActualIndex<0
+                     ? EIGEN_SIZE_MIN_PREFER_DYNAMIC(ColsAtCompileTime, RowsAtCompileTime + ActualIndex)
+                     : EIGEN_SIZE_MIN_PREFER_DYNAMIC(RowsAtCompileTime, ColsAtCompileTime - ActualIndex))
+      };
+      typedef Block<CoefficientsType,1, DiagonalSize> BuildType;
+      typedef typename internal::conditional<Conjugate,
+                 CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>,BuildType >,
+                 BuildType>::type Type;
+    };
+
+    /** \returns a vector expression of the \a N -th sub or super diagonal */
+    template<int N> inline typename DiagonalIntReturnType<N>::Type diagonal()
+    {
+      return typename DiagonalIntReturnType<N>::BuildType(coeffs(), supers()-N, (std::max)(0,N), 1, diagonalLength(N));
+    }
+
+    /** \returns a vector expression of the \a N -th sub or super diagonal */
+    template<int N> inline const typename DiagonalIntReturnType<N>::Type diagonal() const
+    {
+      return typename DiagonalIntReturnType<N>::BuildType(coeffs(), supers()-N, (std::max)(0,N), 1, diagonalLength(N));
+    }
+
+    /** \returns a vector expression of the \a i -th sub or super diagonal */
+    inline Block<CoefficientsType,1,Dynamic> diagonal(Index i)
+    {
+      eigen_assert((i<0 && -i<=subs()) || (i>=0 && i<=supers()));
+      return Block<CoefficientsType,1,Dynamic>(coeffs(), supers()-i, std::max<Index>(0,i), 1, diagonalLength(i));
+    }
+
+    /** \returns a vector expression of the \a i -th sub or super diagonal */
+    inline const Block<const CoefficientsType,1,Dynamic> diagonal(Index i) const
+    {
+      eigen_assert((i<0 && -i<=subs()) || (i>=0 && i<=supers()));
+      return Block<const CoefficientsType,1,Dynamic>(coeffs(), supers()-i, std::max<Index>(0,i), 1, diagonalLength(i));
+    }
+
+    template<typename Dest> inline void evalTo(Dest& dst) const
+    {
+      dst.resize(rows(),cols());
+      dst.setZero();
+      dst.diagonal() = diagonal();
+      for (Index i=1; i<=supers();++i)
+        dst.diagonal(i) = diagonal(i);
+      for (Index i=1; i<=subs();++i)
+        dst.diagonal(-i) = diagonal(-i);
+    }
+
+    DenseMatrixType toDenseMatrix() const
+    {
+      DenseMatrixType res(rows(),cols());
+      evalTo(res);
+      return res;
+    }
+
+  protected:
+
+    inline Index diagonalLength(Index i) const
+    { return i<0 ? (std::min)(cols(),rows()+i) : (std::min)(rows(),cols()-i); }
+};
+
+/**
+  * \class BandMatrix
+  * \ingroup Core_Module
+  *
+  * \brief Represents a rectangular matrix with a banded storage
+  *
+  * \tparam _Scalar Numeric type, i.e. float, double, int
+  * \tparam _Rows Number of rows, or \b Dynamic
+  * \tparam _Cols Number of columns, or \b Dynamic
+  * \tparam _Supers Number of super diagonal
+  * \tparam _Subs Number of sub diagonal
+  * \tparam _Options A combination of either \b #RowMajor or \b #ColMajor, and of \b #SelfAdjoint
+  *                  The former controls \ref TopicStorageOrders "storage order", and defaults to
+  *                  column-major. The latter controls whether the matrix represents a selfadjoint
+  *                  matrix in which case either Supers of Subs have to be null.
+  *
+  * \sa class TridiagonalMatrix
+  */
+
+template<typename _Scalar, int _Rows, int _Cols, int _Supers, int _Subs, int _Options>
+struct traits<BandMatrix<_Scalar,_Rows,_Cols,_Supers,_Subs,_Options> >
+{
+  typedef _Scalar Scalar;
+  typedef Dense StorageKind;
+  typedef Eigen::Index StorageIndex;
+  enum {
+    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    RowsAtCompileTime = _Rows,
+    ColsAtCompileTime = _Cols,
+    MaxRowsAtCompileTime = _Rows,
+    MaxColsAtCompileTime = _Cols,
+    Flags = LvalueBit,
+    Supers = _Supers,
+    Subs = _Subs,
+    Options = _Options,
+    DataRowsAtCompileTime = ((Supers!=Dynamic) && (Subs!=Dynamic)) ? 1 + Supers + Subs : Dynamic
+  };
+  typedef Matrix<Scalar, DataRowsAtCompileTime, ColsAtCompileTime, int(Options) & int(RowMajor) ? RowMajor : ColMajor> CoefficientsType;
+};
+
+template<typename _Scalar, int Rows, int Cols, int Supers, int Subs, int Options>
+class BandMatrix : public BandMatrixBase<BandMatrix<_Scalar,Rows,Cols,Supers,Subs,Options> >
+{
+  public:
+
+    typedef typename internal::traits<BandMatrix>::Scalar Scalar;
+    typedef typename internal::traits<BandMatrix>::StorageIndex StorageIndex;
+    typedef typename internal::traits<BandMatrix>::CoefficientsType CoefficientsType;
+
+    explicit inline BandMatrix(Index rows=Rows, Index cols=Cols, Index supers=Supers, Index subs=Subs)
+      : m_coeffs(1+supers+subs,cols),
+        m_rows(rows), m_supers(supers), m_subs(subs)
+    {
+    }
+
+    /** \returns the number of columns */
+    inline EIGEN_CONSTEXPR Index rows() const { return m_rows.value(); }
+
+    /** \returns the number of rows */
+    inline EIGEN_CONSTEXPR Index cols() const { return m_coeffs.cols(); }
+
+    /** \returns the number of super diagonals */
+    inline EIGEN_CONSTEXPR Index supers() const { return m_supers.value(); }
+
+    /** \returns the number of sub diagonals */
+    inline EIGEN_CONSTEXPR Index subs() const { return m_subs.value(); }
+
+    inline const CoefficientsType& coeffs() const { return m_coeffs; }
+    inline CoefficientsType& coeffs() { return m_coeffs; }
+
+  protected:
+
+    CoefficientsType m_coeffs;
+    internal::variable_if_dynamic<Index, Rows>   m_rows;
+    internal::variable_if_dynamic<Index, Supers> m_supers;
+    internal::variable_if_dynamic<Index, Subs>   m_subs;
+};
+
+template<typename _CoefficientsType,int _Rows, int _Cols, int _Supers, int _Subs,int _Options>
+class BandMatrixWrapper;
+
+template<typename _CoefficientsType,int _Rows, int _Cols, int _Supers, int _Subs,int _Options>
+struct traits<BandMatrixWrapper<_CoefficientsType,_Rows,_Cols,_Supers,_Subs,_Options> >
+{
+  typedef typename _CoefficientsType::Scalar Scalar;
+  typedef typename _CoefficientsType::StorageKind StorageKind;
+  typedef typename _CoefficientsType::StorageIndex StorageIndex;
+  enum {
+    CoeffReadCost = internal::traits<_CoefficientsType>::CoeffReadCost,
+    RowsAtCompileTime = _Rows,
+    ColsAtCompileTime = _Cols,
+    MaxRowsAtCompileTime = _Rows,
+    MaxColsAtCompileTime = _Cols,
+    Flags = LvalueBit,
+    Supers = _Supers,
+    Subs = _Subs,
+    Options = _Options,
+    DataRowsAtCompileTime = ((Supers!=Dynamic) && (Subs!=Dynamic)) ? 1 + Supers + Subs : Dynamic
+  };
+  typedef _CoefficientsType CoefficientsType;
+};
+
+template<typename _CoefficientsType,int _Rows, int _Cols, int _Supers, int _Subs,int _Options>
+class BandMatrixWrapper : public BandMatrixBase<BandMatrixWrapper<_CoefficientsType,_Rows,_Cols,_Supers,_Subs,_Options> >
+{
+  public:
+
+    typedef typename internal::traits<BandMatrixWrapper>::Scalar Scalar;
+    typedef typename internal::traits<BandMatrixWrapper>::CoefficientsType CoefficientsType;
+    typedef typename internal::traits<BandMatrixWrapper>::StorageIndex StorageIndex;
+
+    explicit inline BandMatrixWrapper(const CoefficientsType& coeffs, Index rows=_Rows, Index cols=_Cols, Index supers=_Supers, Index subs=_Subs)
+      : m_coeffs(coeffs),
+        m_rows(rows), m_supers(supers), m_subs(subs)
+    {
+      EIGEN_UNUSED_VARIABLE(cols);
+      //internal::assert(coeffs.cols()==cols() && (supers()+subs()+1)==coeffs.rows());
+    }
+
+    /** \returns the number of columns */
+    inline EIGEN_CONSTEXPR Index rows() const { return m_rows.value(); }
+
+    /** \returns the number of rows */
+    inline EIGEN_CONSTEXPR Index cols() const { return m_coeffs.cols(); }
+
+    /** \returns the number of super diagonals */
+    inline EIGEN_CONSTEXPR Index supers() const { return m_supers.value(); }
+
+    /** \returns the number of sub diagonals */
+    inline EIGEN_CONSTEXPR Index subs() const { return m_subs.value(); }
+
+    inline const CoefficientsType& coeffs() const { return m_coeffs; }
+
+  protected:
+
+    const CoefficientsType& m_coeffs;
+    internal::variable_if_dynamic<Index, _Rows>   m_rows;
+    internal::variable_if_dynamic<Index, _Supers> m_supers;
+    internal::variable_if_dynamic<Index, _Subs>   m_subs;
+};
+
+/**
+  * \class TridiagonalMatrix
+  * \ingroup Core_Module
+  *
+  * \brief Represents a tridiagonal matrix with a compact banded storage
+  *
+  * \tparam Scalar Numeric type, i.e. float, double, int
+  * \tparam Size Number of rows and cols, or \b Dynamic
+  * \tparam Options Can be 0 or \b SelfAdjoint
+  *
+  * \sa class BandMatrix
+  */
+template<typename Scalar, int Size, int Options>
+class TridiagonalMatrix : public BandMatrix<Scalar,Size,Size,Options&SelfAdjoint?0:1,1,Options|RowMajor>
+{
+    typedef BandMatrix<Scalar,Size,Size,Options&SelfAdjoint?0:1,1,Options|RowMajor> Base;
+    typedef typename Base::StorageIndex StorageIndex;
+  public:
+    explicit TridiagonalMatrix(Index size = Size) : Base(size,size,Options&SelfAdjoint?0:1,1) {}
+
+    inline typename Base::template DiagonalIntReturnType<1>::Type super()
+    { return Base::template diagonal<1>(); }
+    inline const typename Base::template DiagonalIntReturnType<1>::Type super() const
+    { return Base::template diagonal<1>(); }
+    inline typename Base::template DiagonalIntReturnType<-1>::Type sub()
+    { return Base::template diagonal<-1>(); }
+    inline const typename Base::template DiagonalIntReturnType<-1>::Type sub() const
+    { return Base::template diagonal<-1>(); }
+  protected:
+};
+
+
+struct BandShape {};
+
+template<typename _Scalar, int _Rows, int _Cols, int _Supers, int _Subs, int _Options>
+struct evaluator_traits<BandMatrix<_Scalar,_Rows,_Cols,_Supers,_Subs,_Options> >
+  : public evaluator_traits_base<BandMatrix<_Scalar,_Rows,_Cols,_Supers,_Subs,_Options> >
+{
+  typedef BandShape Shape;
+};
+
+template<typename _CoefficientsType,int _Rows, int _Cols, int _Supers, int _Subs,int _Options>
+struct evaluator_traits<BandMatrixWrapper<_CoefficientsType,_Rows,_Cols,_Supers,_Subs,_Options> >
+  : public evaluator_traits_base<BandMatrixWrapper<_CoefficientsType,_Rows,_Cols,_Supers,_Subs,_Options> >
+{
+  typedef BandShape Shape;
+};
+
+template<> struct AssignmentKind<DenseShape,BandShape> { typedef EigenBase2EigenBase Kind; };
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_BANDMATRIX_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/Block.h b/src/3rdparty/eigen/Eigen/src/Core/Block.h
new file mode 100644
index 000000000..3206d6633
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/Block.h
@@ -0,0 +1,448 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BLOCK_H
+#define EIGEN_BLOCK_H
+
+namespace Eigen {
+
+namespace internal {
+template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
+struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprType>
+{
+  typedef typename traits<XprType>::Scalar Scalar;
+  typedef typename traits<XprType>::StorageKind StorageKind;
+  typedef typename traits<XprType>::XprKind XprKind;
+  typedef typename ref_selector<XprType>::type XprTypeNested;
+  typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
+  enum{
+    MatrixRows = traits<XprType>::RowsAtCompileTime,
+    MatrixCols = traits<XprType>::ColsAtCompileTime,
+    RowsAtCompileTime = MatrixRows == 0 ? 0 : BlockRows,
+    ColsAtCompileTime = MatrixCols == 0 ? 0 : BlockCols,
+    MaxRowsAtCompileTime = BlockRows==0 ? 0
+                         : RowsAtCompileTime != Dynamic ? int(RowsAtCompileTime)
+                         : int(traits<XprType>::MaxRowsAtCompileTime),
+    MaxColsAtCompileTime = BlockCols==0 ? 0
+                         : ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime)
+                         : int(traits<XprType>::MaxColsAtCompileTime),
+
+    XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0,
+    IsRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
+               : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
+               : XprTypeIsRowMajor,
+    HasSameStorageOrderAsXprType = (IsRowMajor == XprTypeIsRowMajor),
+    InnerSize = IsRowMajor ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
+    InnerStrideAtCompileTime = HasSameStorageOrderAsXprType
+                             ? int(inner_stride_at_compile_time<XprType>::ret)
+                             : int(outer_stride_at_compile_time<XprType>::ret),
+    OuterStrideAtCompileTime = HasSameStorageOrderAsXprType
+                             ? int(outer_stride_at_compile_time<XprType>::ret)
+                             : int(inner_stride_at_compile_time<XprType>::ret),
+
+    // FIXME, this traits is rather specialized for dense object and it needs to be cleaned further
+    FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
+    FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
+    Flags = (traits<XprType>::Flags & (DirectAccessBit | (InnerPanel?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit,
+    // FIXME DirectAccessBit should not be handled by expressions
+    //
+    // Alignment is needed by MapBase's assertions
+    // We can sefely set it to false here. Internal alignment errors will be detected by an eigen_internal_assert in the respective evaluator
+    Alignment = 0
+  };
+};
+
+template<typename XprType, int BlockRows=Dynamic, int BlockCols=Dynamic, bool InnerPanel = false,
+         bool HasDirectAccess = internal::has_direct_access<XprType>::ret> class BlockImpl_dense;
+
+} // end namespace internal
+
+template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, typename StorageKind> class BlockImpl;
+
+/** \class Block
+  * \ingroup Core_Module
+  *
+  * \brief Expression of a fixed-size or dynamic-size block
+  *
+  * \tparam XprType the type of the expression in which we are taking a block
+  * \tparam BlockRows the number of rows of the block we are taking at compile time (optional)
+  * \tparam BlockCols the number of columns of the block we are taking at compile time (optional)
+  * \tparam InnerPanel is true, if the block maps to a set of rows of a row major matrix or
+  *         to set of columns of a column major matrix (optional). The parameter allows to determine
+  *         at compile time whether aligned access is possible on the block expression.
+  *
+  * This class represents an expression of either a fixed-size or dynamic-size block. It is the return
+  * type of DenseBase::block(Index,Index,Index,Index) and DenseBase::block<int,int>(Index,Index) and
+  * most of the time this is the only way it is used.
+  *
+  * However, if you want to directly maniputate block expressions,
+  * for instance if you want to write a function returning such an expression, you
+  * will need to use this class.
+  *
+  * Here is an example illustrating the dynamic case:
+  * \include class_Block.cpp
+  * Output: \verbinclude class_Block.out
+  *
+  * \note Even though this expression has dynamic size, in the case where \a XprType
+  * has fixed size, this expression inherits a fixed maximal size which means that evaluating
+  * it does not cause a dynamic memory allocation.
+  *
+  * Here is an example illustrating the fixed-size case:
+  * \include class_FixedBlock.cpp
+  * Output: \verbinclude class_FixedBlock.out
+  *
+  * \sa DenseBase::block(Index,Index,Index,Index), DenseBase::block(Index,Index), class VectorBlock
+  */
+template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class Block
+  : public BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, typename internal::traits<XprType>::StorageKind>
+{
+    typedef BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, typename internal::traits<XprType>::StorageKind> Impl;
+  public:
+    //typedef typename Impl::Base Base;
+    typedef Impl Base;
+    EIGEN_GENERIC_PUBLIC_INTERFACE(Block)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block)
+
+    typedef typename internal::remove_all<XprType>::type NestedExpression;
+
+    /** Column or Row constructor
+      */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Block(XprType& xpr, Index i) : Impl(xpr,i)
+    {
+      eigen_assert( (i>=0) && (
+          ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && i<xpr.rows())
+        ||((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && i<xpr.cols())));
+    }
+
+    /** Fixed-size constructor
+      */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Block(XprType& xpr, Index startRow, Index startCol)
+      : Impl(xpr, startRow, startCol)
+    {
+      EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE)
+      eigen_assert(startRow >= 0 && BlockRows >= 0 && startRow + BlockRows <= xpr.rows()
+             && startCol >= 0 && BlockCols >= 0 && startCol + BlockCols <= xpr.cols());
+    }
+
+    /** Dynamic-size constructor
+      */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Block(XprType& xpr,
+          Index startRow, Index startCol,
+          Index blockRows, Index blockCols)
+      : Impl(xpr, startRow, startCol, blockRows, blockCols)
+    {
+      eigen_assert((RowsAtCompileTime==Dynamic || RowsAtCompileTime==blockRows)
+          && (ColsAtCompileTime==Dynamic || ColsAtCompileTime==blockCols));
+      eigen_assert(startRow >= 0 && blockRows >= 0 && startRow  <= xpr.rows() - blockRows
+          && startCol >= 0 && blockCols >= 0 && startCol <= xpr.cols() - blockCols);
+    }
+};
+
+// The generic default implementation for dense block simplu forward to the internal::BlockImpl_dense
+// that must be specialized for direct and non-direct access...
+template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
+class BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, Dense>
+  : public internal::BlockImpl_dense<XprType, BlockRows, BlockCols, InnerPanel>
+{
+    typedef internal::BlockImpl_dense<XprType, BlockRows, BlockCols, InnerPanel> Impl;
+    typedef typename XprType::StorageIndex StorageIndex;
+  public:
+    typedef Impl Base;
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {}
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+      : Impl(xpr, startRow, startCol, blockRows, blockCols) {}
+};
+
+namespace internal {
+
+/** \internal Internal implementation of dense Blocks in the general case. */
+template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool HasDirectAccess> class BlockImpl_dense
+  : public internal::dense_xpr_base<Block<XprType, BlockRows, BlockCols, InnerPanel> >::type
+{
+    typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
+    typedef typename internal::ref_selector<XprType>::non_const_type XprTypeNested;
+  public:
+
+    typedef typename internal::dense_xpr_base<BlockType>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(BlockType)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl_dense)
+
+    // class InnerIterator; // FIXME apparently never used
+
+    /** Column or Row constructor
+      */
+    EIGEN_DEVICE_FUNC
+    inline BlockImpl_dense(XprType& xpr, Index i)
+      : m_xpr(xpr),
+        // It is a row if and only if BlockRows==1 and BlockCols==XprType::ColsAtCompileTime,
+        // and it is a column if and only if BlockRows==XprType::RowsAtCompileTime and BlockCols==1,
+        // all other cases are invalid.
+        // The case a 1x1 matrix seems ambiguous, but the result is the same anyway.
+        m_startRow( (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0),
+        m_startCol( (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? i : 0),
+        m_blockRows(BlockRows==1 ? 1 : xpr.rows()),
+        m_blockCols(BlockCols==1 ? 1 : xpr.cols())
+    {}
+
+    /** Fixed-size constructor
+      */
+    EIGEN_DEVICE_FUNC
+    inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
+      : m_xpr(xpr), m_startRow(startRow), m_startCol(startCol),
+                    m_blockRows(BlockRows), m_blockCols(BlockCols)
+    {}
+
+    /** Dynamic-size constructor
+      */
+    EIGEN_DEVICE_FUNC
+    inline BlockImpl_dense(XprType& xpr,
+          Index startRow, Index startCol,
+          Index blockRows, Index blockCols)
+      : m_xpr(xpr), m_startRow(startRow), m_startCol(startCol),
+                    m_blockRows(blockRows), m_blockCols(blockCols)
+    {}
+
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_blockRows.value(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_blockCols.value(); }
+
+    EIGEN_DEVICE_FUNC
+    inline Scalar& coeffRef(Index rowId, Index colId)
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(XprType)
+      return m_xpr.coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
+    }
+
+    EIGEN_DEVICE_FUNC
+    inline const Scalar& coeffRef(Index rowId, Index colId) const
+    {
+      return m_xpr.derived().coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index rowId, Index colId) const
+    {
+      return m_xpr.coeff(rowId + m_startRow.value(), colId + m_startCol.value());
+    }
+
+    EIGEN_DEVICE_FUNC
+    inline Scalar& coeffRef(Index index)
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(XprType)
+      return m_xpr.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                            m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+    }
+
+    EIGEN_DEVICE_FUNC
+    inline const Scalar& coeffRef(Index index) const
+    {
+      return m_xpr.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                            m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+    }
+
+    EIGEN_DEVICE_FUNC
+    inline const CoeffReturnType coeff(Index index) const
+    {
+      return m_xpr.coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                         m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+    }
+
+    template<int LoadMode>
+    inline PacketScalar packet(Index rowId, Index colId) const
+    {
+      return m_xpr.template packet<Unaligned>(rowId + m_startRow.value(), colId + m_startCol.value());
+    }
+
+    template<int LoadMode>
+    inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
+    {
+      m_xpr.template writePacket<Unaligned>(rowId + m_startRow.value(), colId + m_startCol.value(), val);
+    }
+
+    template<int LoadMode>
+    inline PacketScalar packet(Index index) const
+    {
+      return m_xpr.template packet<Unaligned>
+              (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+               m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+    }
+
+    template<int LoadMode>
+    inline void writePacket(Index index, const PacketScalar& val)
+    {
+      m_xpr.template writePacket<Unaligned>
+         (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+          m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0), val);
+    }
+
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
+    /** \sa MapBase::data() */
+    EIGEN_DEVICE_FUNC inline const Scalar* data() const;
+    EIGEN_DEVICE_FUNC inline Index innerStride() const;
+    EIGEN_DEVICE_FUNC inline Index outerStride() const;
+    #endif
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const
+    {
+      return m_xpr;
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    XprType& nestedExpression() { return m_xpr; }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    StorageIndex startRow() const EIGEN_NOEXCEPT
+    {
+      return m_startRow.value();
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    StorageIndex startCol() const EIGEN_NOEXCEPT
+    {
+      return m_startCol.value();
+    }
+
+  protected:
+
+    XprTypeNested m_xpr;
+    const internal::variable_if_dynamic<StorageIndex, (XprType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
+    const internal::variable_if_dynamic<StorageIndex, (XprType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
+    const internal::variable_if_dynamic<StorageIndex, RowsAtCompileTime> m_blockRows;
+    const internal::variable_if_dynamic<StorageIndex, ColsAtCompileTime> m_blockCols;
+};
+
+/** \internal Internal implementation of dense Blocks in the direct access case.*/
+template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
+class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
+  : public MapBase<Block<XprType, BlockRows, BlockCols, InnerPanel> >
+{
+    typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
+    typedef typename internal::ref_selector<XprType>::non_const_type XprTypeNested;
+    enum {
+      XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0
+    };
+  public:
+
+    typedef MapBase<BlockType> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(BlockType)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl_dense)
+
+    /** Column or Row constructor
+      */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    BlockImpl_dense(XprType& xpr, Index i)
+      : Base(xpr.data() + i * (    ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor))
+                                || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()),
+             BlockRows==1 ? 1 : xpr.rows(),
+             BlockCols==1 ? 1 : xpr.cols()),
+        m_xpr(xpr),
+        m_startRow( (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0),
+        m_startCol( (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? i : 0)
+    {
+      init();
+    }
+
+    /** Fixed-size constructor
+      */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
+      : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)),
+        m_xpr(xpr), m_startRow(startRow), m_startCol(startCol)
+    {
+      init();
+    }
+
+    /** Dynamic-size constructor
+      */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    BlockImpl_dense(XprType& xpr,
+          Index startRow, Index startCol,
+          Index blockRows, Index blockCols)
+      : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols),
+        m_xpr(xpr), m_startRow(startRow), m_startCol(startCol)
+    {
+      init();
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const EIGEN_NOEXCEPT
+    {
+      return m_xpr;
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    XprType& nestedExpression() { return m_xpr; }
+
+    /** \sa MapBase::innerStride() */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index innerStride() const EIGEN_NOEXCEPT
+    {
+      return internal::traits<BlockType>::HasSameStorageOrderAsXprType
+             ? m_xpr.innerStride()
+             : m_xpr.outerStride();
+    }
+
+    /** \sa MapBase::outerStride() */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index outerStride() const EIGEN_NOEXCEPT
+    {
+      return internal::traits<BlockType>::HasSameStorageOrderAsXprType
+                    ? m_xpr.outerStride()
+                    : m_xpr.innerStride();
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    StorageIndex startRow() const EIGEN_NOEXCEPT { return m_startRow.value(); }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    StorageIndex startCol() const EIGEN_NOEXCEPT { return m_startCol.value(); }
+
+  #ifndef __SUNPRO_CC
+  // FIXME sunstudio is not friendly with the above friend...
+  // META-FIXME there is no 'friend' keyword around here. Is this obsolete?
+  protected:
+  #endif
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** \internal used by allowAligned() */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols)
+      : Base(data, blockRows, blockCols), m_xpr(xpr)
+    {
+      init();
+    }
+    #endif
+
+  protected:
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    void init()
+    {
+      m_outerStride = internal::traits<BlockType>::HasSameStorageOrderAsXprType
+                    ? m_xpr.outerStride()
+                    : m_xpr.innerStride();
+    }
+
+    XprTypeNested m_xpr;
+    const internal::variable_if_dynamic<StorageIndex, (XprType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
+    const internal::variable_if_dynamic<StorageIndex, (XprType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
+    Index m_outerStride;
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_BLOCK_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/BooleanRedux.h b/src/3rdparty/eigen/Eigen/src/Core/BooleanRedux.h
new file mode 100644
index 000000000..852de8b90
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/BooleanRedux.h
@@ -0,0 +1,162 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ALLANDANY_H
+#define EIGEN_ALLANDANY_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename Derived, int UnrollCount, int Rows>
+struct all_unroller
+{
+  enum {
+    col = (UnrollCount-1) / Rows,
+    row = (UnrollCount-1) % Rows
+  };
+
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat)
+  {
+    return all_unroller<Derived, UnrollCount-1, Rows>::run(mat) && mat.coeff(row, col);
+  }
+};
+
+template<typename Derived, int Rows>
+struct all_unroller<Derived, 0, Rows>
+{
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &/*mat*/) { return true; }
+};
+
+template<typename Derived, int Rows>
+struct all_unroller<Derived, Dynamic, Rows>
+{
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; }
+};
+
+template<typename Derived, int UnrollCount, int Rows>
+struct any_unroller
+{
+  enum {
+    col = (UnrollCount-1) / Rows,
+    row = (UnrollCount-1) % Rows
+  };
+  
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat)
+  {
+    return any_unroller<Derived, UnrollCount-1, Rows>::run(mat) || mat.coeff(row, col);
+  }
+};
+
+template<typename Derived, int Rows>
+struct any_unroller<Derived, 0, Rows>
+{
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived & /*mat*/) { return false; }
+};
+
+template<typename Derived, int Rows>
+struct any_unroller<Derived, Dynamic, Rows>
+{
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; }
+};
+
+} // end namespace internal
+
+/** \returns true if all coefficients are true
+  *
+  * Example: \include MatrixBase_all.cpp
+  * Output: \verbinclude MatrixBase_all.out
+  *
+  * \sa any(), Cwise::operator<()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::all() const
+{
+  typedef internal::evaluator<Derived> Evaluator;
+  enum {
+    unroll = SizeAtCompileTime != Dynamic
+          && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits<Scalar>::AddCost)) <= EIGEN_UNROLLING_LIMIT
+  };
+  Evaluator evaluator(derived());
+  if(unroll)
+    return internal::all_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic, internal::traits<Derived>::RowsAtCompileTime>::run(evaluator);
+  else
+  {
+    for(Index j = 0; j < cols(); ++j)
+      for(Index i = 0; i < rows(); ++i)
+        if (!evaluator.coeff(i, j)) return false;
+    return true;
+  }
+}
+
+/** \returns true if at least one coefficient is true
+  *
+  * \sa all()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::any() const
+{
+  typedef internal::evaluator<Derived> Evaluator;
+  enum {
+    unroll = SizeAtCompileTime != Dynamic
+          && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits<Scalar>::AddCost)) <= EIGEN_UNROLLING_LIMIT
+  };
+  Evaluator evaluator(derived());
+  if(unroll)
+    return internal::any_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic, internal::traits<Derived>::RowsAtCompileTime>::run(evaluator);
+  else
+  {
+    for(Index j = 0; j < cols(); ++j)
+      for(Index i = 0; i < rows(); ++i)
+        if (evaluator.coeff(i, j)) return true;
+    return false;
+  }
+}
+
+/** \returns the number of coefficients which evaluate to true
+  *
+  * \sa all(), any()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC inline Eigen::Index DenseBase<Derived>::count() const
+{
+  return derived().template cast<bool>().template cast<Index>().sum();
+}
+
+/** \returns true is \c *this contains at least one Not A Number (NaN).
+  *
+  * \sa allFinite()
+  */
+template<typename Derived>
+inline bool DenseBase<Derived>::hasNaN() const
+{
+#if EIGEN_COMP_MSVC || (defined __FAST_MATH__)
+  return derived().array().isNaN().any();
+#else
+  return !((derived().array()==derived().array()).all());
+#endif
+}
+
+/** \returns true if \c *this contains only finite numbers, i.e., no NaN and no +/-INF values.
+  *
+  * \sa hasNaN()
+  */
+template<typename Derived>
+inline bool DenseBase<Derived>::allFinite() const
+{
+#if EIGEN_COMP_MSVC || (defined __FAST_MATH__)
+  return derived().array().isFinite().all();
+#else
+  return !((derived()-derived()).hasNaN());
+#endif
+}
+    
+} // end namespace Eigen
+
+#endif // EIGEN_ALLANDANY_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/CommaInitializer.h b/src/3rdparty/eigen/Eigen/src/Core/CommaInitializer.h
new file mode 100644
index 000000000..c0e29c75c
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/CommaInitializer.h
@@ -0,0 +1,164 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMMAINITIALIZER_H
+#define EIGEN_COMMAINITIALIZER_H
+
+namespace Eigen { 
+
+/** \class CommaInitializer
+  * \ingroup Core_Module
+  *
+  * \brief Helper class used by the comma initializer operator
+  *
+  * This class is internally used to implement the comma initializer feature. It is
+  * the return type of MatrixBase::operator<<, and most of the time this is the only
+  * way it is used.
+  *
+  * \sa \blank \ref MatrixBaseCommaInitRef "MatrixBase::operator<<", CommaInitializer::finished()
+  */
+template<typename XprType>
+struct CommaInitializer
+{
+  typedef typename XprType::Scalar Scalar;
+
+  EIGEN_DEVICE_FUNC
+  inline CommaInitializer(XprType& xpr, const Scalar& s)
+    : m_xpr(xpr), m_row(0), m_col(1), m_currentBlockRows(1)
+  {
+    eigen_assert(m_xpr.rows() > 0 && m_xpr.cols() > 0
+      && "Cannot comma-initialize a 0x0 matrix (operator<<)");
+    m_xpr.coeffRef(0,0) = s;
+  }
+
+  template<typename OtherDerived>
+  EIGEN_DEVICE_FUNC
+  inline CommaInitializer(XprType& xpr, const DenseBase<OtherDerived>& other)
+    : m_xpr(xpr), m_row(0), m_col(other.cols()), m_currentBlockRows(other.rows())
+  {
+    eigen_assert(m_xpr.rows() >= other.rows() && m_xpr.cols() >= other.cols()
+      && "Cannot comma-initialize a 0x0 matrix (operator<<)");
+    m_xpr.block(0, 0, other.rows(), other.cols()) = other;
+  }
+
+  /* Copy/Move constructor which transfers ownership. This is crucial in 
+   * absence of return value optimization to avoid assertions during destruction. */
+  // FIXME in C++11 mode this could be replaced by a proper RValue constructor
+  EIGEN_DEVICE_FUNC
+  inline CommaInitializer(const CommaInitializer& o)
+  : m_xpr(o.m_xpr), m_row(o.m_row), m_col(o.m_col), m_currentBlockRows(o.m_currentBlockRows) {
+    // Mark original object as finished. In absence of R-value references we need to const_cast:
+    const_cast<CommaInitializer&>(o).m_row = m_xpr.rows();
+    const_cast<CommaInitializer&>(o).m_col = m_xpr.cols();
+    const_cast<CommaInitializer&>(o).m_currentBlockRows = 0;
+  }
+
+  /* inserts a scalar value in the target matrix */
+  EIGEN_DEVICE_FUNC
+  CommaInitializer& operator,(const Scalar& s)
+  {
+    if (m_col==m_xpr.cols())
+    {
+      m_row+=m_currentBlockRows;
+      m_col = 0;
+      m_currentBlockRows = 1;
+      eigen_assert(m_row<m_xpr.rows()
+        && "Too many rows passed to comma initializer (operator<<)");
+    }
+    eigen_assert(m_col<m_xpr.cols()
+      && "Too many coefficients passed to comma initializer (operator<<)");
+    eigen_assert(m_currentBlockRows==1);
+    m_xpr.coeffRef(m_row, m_col++) = s;
+    return *this;
+  }
+
+  /* inserts a matrix expression in the target matrix */
+  template<typename OtherDerived>
+  EIGEN_DEVICE_FUNC
+  CommaInitializer& operator,(const DenseBase<OtherDerived>& other)
+  {
+    if (m_col==m_xpr.cols() && (other.cols()!=0 || other.rows()!=m_currentBlockRows))
+    {
+      m_row+=m_currentBlockRows;
+      m_col = 0;
+      m_currentBlockRows = other.rows();
+      eigen_assert(m_row+m_currentBlockRows<=m_xpr.rows()
+        && "Too many rows passed to comma initializer (operator<<)");
+    }
+    eigen_assert((m_col + other.cols() <= m_xpr.cols())
+      && "Too many coefficients passed to comma initializer (operator<<)");
+    eigen_assert(m_currentBlockRows==other.rows());
+    m_xpr.template block<OtherDerived::RowsAtCompileTime, OtherDerived::ColsAtCompileTime>
+                    (m_row, m_col, other.rows(), other.cols()) = other;
+    m_col += other.cols();
+    return *this;
+  }
+
+  EIGEN_DEVICE_FUNC
+  inline ~CommaInitializer()
+#if defined VERIFY_RAISES_ASSERT && (!defined EIGEN_NO_ASSERTION_CHECKING) && defined EIGEN_EXCEPTIONS
+  EIGEN_EXCEPTION_SPEC(Eigen::eigen_assert_exception)
+#endif
+  {
+    finished();
+  }
+
+  /** \returns the built matrix once all its coefficients have been set.
+    * Calling finished is 100% optional. Its purpose is to write expressions
+    * like this:
+    * \code
+    * quaternion.fromRotationMatrix((Matrix3f() << axis0, axis1, axis2).finished());
+    * \endcode
+    */
+  EIGEN_DEVICE_FUNC
+  inline XprType& finished() {
+      eigen_assert(((m_row+m_currentBlockRows) == m_xpr.rows() || m_xpr.cols() == 0)
+           && m_col == m_xpr.cols()
+           && "Too few coefficients passed to comma initializer (operator<<)");
+      return m_xpr;
+  }
+
+  XprType& m_xpr;           // target expression
+  Index m_row;              // current row id
+  Index m_col;              // current col id
+  Index m_currentBlockRows; // current block height
+};
+
+/** \anchor MatrixBaseCommaInitRef
+  * Convenient operator to set the coefficients of a matrix.
+  *
+  * The coefficients must be provided in a row major order and exactly match
+  * the size of the matrix. Otherwise an assertion is raised.
+  *
+  * Example: \include MatrixBase_set.cpp
+  * Output: \verbinclude MatrixBase_set.out
+  * 
+  * \note According the c++ standard, the argument expressions of this comma initializer are evaluated in arbitrary order.
+  *
+  * \sa CommaInitializer::finished(), class CommaInitializer
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
+{
+  return CommaInitializer<Derived>(*static_cast<Derived*>(this), s);
+}
+
+/** \sa operator<<(const Scalar&) */
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC inline CommaInitializer<Derived>
+DenseBase<Derived>::operator<<(const DenseBase<OtherDerived>& other)
+{
+  return CommaInitializer<Derived>(*static_cast<Derived *>(this), other);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMMAINITIALIZER_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/ConditionEstimator.h b/src/3rdparty/eigen/Eigen/src/Core/ConditionEstimator.h
new file mode 100644
index 000000000..51a2e5f1b
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/ConditionEstimator.h
@@ -0,0 +1,175 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Rasmus Munk Larsen (rmlarsen@google.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CONDITIONESTIMATOR_H
+#define EIGEN_CONDITIONESTIMATOR_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Vector, typename RealVector, bool IsComplex>
+struct rcond_compute_sign {
+  static inline Vector run(const Vector& v) {
+    const RealVector v_abs = v.cwiseAbs();
+    return (v_abs.array() == static_cast<typename Vector::RealScalar>(0))
+            .select(Vector::Ones(v.size()), v.cwiseQuotient(v_abs));
+  }
+};
+
+// Partial specialization to avoid elementwise division for real vectors.
+template <typename Vector>
+struct rcond_compute_sign<Vector, Vector, false> {
+  static inline Vector run(const Vector& v) {
+    return (v.array() < static_cast<typename Vector::RealScalar>(0))
+           .select(-Vector::Ones(v.size()), Vector::Ones(v.size()));
+  }
+};
+
+/**
+  * \returns an estimate of ||inv(matrix)||_1 given a decomposition of
+  * \a matrix that implements .solve() and .adjoint().solve() methods.
+  *
+  * This function implements Algorithms 4.1 and 5.1 from
+  *   http://www.maths.manchester.ac.uk/~higham/narep/narep135.pdf
+  * which also forms the basis for the condition number estimators in
+  * LAPACK. Since at most 10 calls to the solve method of dec are
+  * performed, the total cost is O(dims^2), as opposed to O(dims^3)
+  * needed to compute the inverse matrix explicitly.
+  *
+  * The most common usage is in estimating the condition number
+  * ||matrix||_1 * ||inv(matrix)||_1. The first term ||matrix||_1 can be
+  * computed directly in O(n^2) operations.
+  *
+  * Supports the following decompositions: FullPivLU, PartialPivLU, LDLT, and
+  * LLT.
+  *
+  * \sa FullPivLU, PartialPivLU, LDLT, LLT.
+  */
+template <typename Decomposition>
+typename Decomposition::RealScalar rcond_invmatrix_L1_norm_estimate(const Decomposition& dec)
+{
+  typedef typename Decomposition::MatrixType MatrixType;
+  typedef typename Decomposition::Scalar Scalar;
+  typedef typename Decomposition::RealScalar RealScalar;
+  typedef typename internal::plain_col_type<MatrixType>::type Vector;
+  typedef typename internal::plain_col_type<MatrixType, RealScalar>::type RealVector;
+  const bool is_complex = (NumTraits<Scalar>::IsComplex != 0);
+
+  eigen_assert(dec.rows() == dec.cols());
+  const Index n = dec.rows();
+  if (n == 0)
+    return 0;
+
+  // Disable Index to float conversion warning
+#ifdef __INTEL_COMPILER
+  #pragma warning push
+  #pragma warning ( disable : 2259 )
+#endif
+  Vector v = dec.solve(Vector::Ones(n) / Scalar(n));
+#ifdef __INTEL_COMPILER
+  #pragma warning pop
+#endif
+
+  // lower_bound is a lower bound on
+  //   ||inv(matrix)||_1  = sup_v ||inv(matrix) v||_1 / ||v||_1
+  // and is the objective maximized by the ("super-") gradient ascent
+  // algorithm below.
+  RealScalar lower_bound = v.template lpNorm<1>();
+  if (n == 1)
+    return lower_bound;
+
+  // Gradient ascent algorithm follows: We know that the optimum is achieved at
+  // one of the simplices v = e_i, so in each iteration we follow a
+  // super-gradient to move towards the optimal one.
+  RealScalar old_lower_bound = lower_bound;
+  Vector sign_vector(n);
+  Vector old_sign_vector;
+  Index v_max_abs_index = -1;
+  Index old_v_max_abs_index = v_max_abs_index;
+  for (int k = 0; k < 4; ++k)
+  {
+    sign_vector = internal::rcond_compute_sign<Vector, RealVector, is_complex>::run(v);
+    if (k > 0 && !is_complex && sign_vector == old_sign_vector) {
+      // Break if the solution stagnated.
+      break;
+    }
+    // v_max_abs_index = argmax |real( inv(matrix)^T * sign_vector )|
+    v = dec.adjoint().solve(sign_vector);
+    v.real().cwiseAbs().maxCoeff(&v_max_abs_index);
+    if (v_max_abs_index == old_v_max_abs_index) {
+      // Break if the solution stagnated.
+      break;
+    }
+    // Move to the new simplex e_j, where j = v_max_abs_index.
+    v = dec.solve(Vector::Unit(n, v_max_abs_index));  // v = inv(matrix) * e_j.
+    lower_bound = v.template lpNorm<1>();
+    if (lower_bound <= old_lower_bound) {
+      // Break if the gradient step did not increase the lower_bound.
+      break;
+    }
+    if (!is_complex) {
+      old_sign_vector = sign_vector;
+    }
+    old_v_max_abs_index = v_max_abs_index;
+    old_lower_bound = lower_bound;
+  }
+  // The following calculates an independent estimate of ||matrix||_1 by
+  // multiplying matrix by a vector with entries of slowly increasing
+  // magnitude and alternating sign:
+  //   v_i = (-1)^{i} (1 + (i / (dim-1))), i = 0,...,dim-1.
+  // This improvement to Hager's algorithm above is due to Higham. It was
+  // added to make the algorithm more robust in certain corner cases where
+  // large elements in the matrix might otherwise escape detection due to
+  // exact cancellation (especially when op and op_adjoint correspond to a
+  // sequence of backsubstitutions and permutations), which could cause
+  // Hager's algorithm to vastly underestimate ||matrix||_1.
+  Scalar alternating_sign(RealScalar(1));
+  for (Index i = 0; i < n; ++i) {
+    // The static_cast is needed when Scalar is a complex and RealScalar implements expression templates
+    v[i] = alternating_sign * static_cast<RealScalar>(RealScalar(1) + (RealScalar(i) / (RealScalar(n - 1))));
+    alternating_sign = -alternating_sign;
+  }
+  v = dec.solve(v);
+  const RealScalar alternate_lower_bound = (2 * v.template lpNorm<1>()) / (3 * RealScalar(n));
+  return numext::maxi(lower_bound, alternate_lower_bound);
+}
+
+/** \brief Reciprocal condition number estimator.
+  *
+  * Computing a decomposition of a dense matrix takes O(n^3) operations, while
+  * this method estimates the condition number quickly and reliably in O(n^2)
+  * operations.
+  *
+  * \returns an estimate of the reciprocal condition number
+  * (1 / (||matrix||_1 * ||inv(matrix)||_1)) of matrix, given ||matrix||_1 and
+  * its decomposition. Supports the following decompositions: FullPivLU,
+  * PartialPivLU, LDLT, and LLT.
+  *
+  * \sa FullPivLU, PartialPivLU, LDLT, LLT.
+  */
+template <typename Decomposition>
+typename Decomposition::RealScalar
+rcond_estimate_helper(typename Decomposition::RealScalar matrix_norm, const Decomposition& dec)
+{
+  typedef typename Decomposition::RealScalar RealScalar;
+  eigen_assert(dec.rows() == dec.cols());
+  if (dec.rows() == 0)              return NumTraits<RealScalar>::infinity();
+  if (matrix_norm == RealScalar(0)) return RealScalar(0);
+  if (dec.rows() == 1)              return RealScalar(1);
+  const RealScalar inverse_matrix_norm = rcond_invmatrix_L1_norm_estimate(dec);
+  return (inverse_matrix_norm == RealScalar(0) ? RealScalar(0)
+                                               : (RealScalar(1) / inverse_matrix_norm) / matrix_norm);
+}
+
+}  // namespace internal
+
+}  // namespace Eigen
+
+#endif
diff --git a/src/3rdparty/eigen/Eigen/src/Core/CoreEvaluators.h b/src/3rdparty/eigen/Eigen/src/Core/CoreEvaluators.h
new file mode 100644
index 000000000..0ff8c8deb
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/CoreEvaluators.h
@@ -0,0 +1,1741 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2012 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_COREEVALUATORS_H
+#define EIGEN_COREEVALUATORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+// This class returns the evaluator kind from the expression storage kind.
+// Default assumes index based accessors
+template<typename StorageKind>
+struct storage_kind_to_evaluator_kind {
+  typedef IndexBased Kind;
+};
+
+// This class returns the evaluator shape from the expression storage kind.
+// It can be Dense, Sparse, Triangular, Diagonal, SelfAdjoint, Band, etc.
+template<typename StorageKind> struct storage_kind_to_shape;
+
+template<> struct storage_kind_to_shape<Dense>                  { typedef DenseShape Shape;           };
+template<> struct storage_kind_to_shape<SolverStorage>          { typedef SolverShape Shape;           };
+template<> struct storage_kind_to_shape<PermutationStorage>     { typedef PermutationShape Shape;     };
+template<> struct storage_kind_to_shape<TranspositionsStorage>  { typedef TranspositionsShape Shape;  };
+
+// Evaluators have to be specialized with respect to various criteria such as:
+//  - storage/structure/shape
+//  - scalar type
+//  - etc.
+// Therefore, we need specialization of evaluator providing additional template arguments for each kind of evaluators.
+// We currently distinguish the following kind of evaluators:
+// - unary_evaluator    for expressions taking only one arguments (CwiseUnaryOp, CwiseUnaryView, Transpose, MatrixWrapper, ArrayWrapper, Reverse, Replicate)
+// - binary_evaluator   for expression taking two arguments (CwiseBinaryOp)
+// - ternary_evaluator   for expression taking three arguments (CwiseTernaryOp)
+// - product_evaluator  for linear algebra products (Product); special case of binary_evaluator because it requires additional tags for dispatching.
+// - mapbase_evaluator  for Map, Block, Ref
+// - block_evaluator    for Block (special dispatching to a mapbase_evaluator or unary_evaluator)
+
+template< typename T,
+          typename Arg1Kind   = typename evaluator_traits<typename T::Arg1>::Kind,
+          typename Arg2Kind   = typename evaluator_traits<typename T::Arg2>::Kind,
+          typename Arg3Kind   = typename evaluator_traits<typename T::Arg3>::Kind,
+          typename Arg1Scalar = typename traits<typename T::Arg1>::Scalar,
+          typename Arg2Scalar = typename traits<typename T::Arg2>::Scalar,
+          typename Arg3Scalar = typename traits<typename T::Arg3>::Scalar> struct ternary_evaluator;
+
+template< typename T,
+          typename LhsKind   = typename evaluator_traits<typename T::Lhs>::Kind,
+          typename RhsKind   = typename evaluator_traits<typename T::Rhs>::Kind,
+          typename LhsScalar = typename traits<typename T::Lhs>::Scalar,
+          typename RhsScalar = typename traits<typename T::Rhs>::Scalar> struct binary_evaluator;
+
+template< typename T,
+          typename Kind   = typename evaluator_traits<typename T::NestedExpression>::Kind,
+          typename Scalar = typename T::Scalar> struct unary_evaluator;
+
+// evaluator_traits<T> contains traits for evaluator<T>
+
+template<typename T>
+struct evaluator_traits_base
+{
+  // by default, get evaluator kind and shape from storage
+  typedef typename storage_kind_to_evaluator_kind<typename traits<T>::StorageKind>::Kind Kind;
+  typedef typename storage_kind_to_shape<typename traits<T>::StorageKind>::Shape Shape;
+};
+
+// Default evaluator traits
+template<typename T>
+struct evaluator_traits : public evaluator_traits_base<T>
+{
+};
+
+template<typename T, typename Shape = typename evaluator_traits<T>::Shape >
+struct evaluator_assume_aliasing {
+  static const bool value = false;
+};
+
+// By default, we assume a unary expression:
+template<typename T>
+struct evaluator : public unary_evaluator<T>
+{
+  typedef unary_evaluator<T> Base;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const T& xpr) : Base(xpr) {}
+};
+
+
+// TODO: Think about const-correctness
+template<typename T>
+struct evaluator<const T>
+  : evaluator<T>
+{
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const T& xpr) : evaluator<T>(xpr) {}
+};
+
+// ---------- base class for all evaluators ----------
+
+template<typename ExpressionType>
+struct evaluator_base
+{
+  // TODO that's not very nice to have to propagate all these traits. They are currently only needed to handle outer,inner indices.
+  typedef traits<ExpressionType> ExpressionTraits;
+
+  enum {
+    Alignment = 0
+  };
+  // noncopyable:
+  // Don't make this class inherit noncopyable as this kills EBO (Empty Base Optimization)
+  // and make complex evaluator much larger than then should do.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE evaluator_base() {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~evaluator_base() {}
+private:
+  EIGEN_DEVICE_FUNC evaluator_base(const evaluator_base&);
+  EIGEN_DEVICE_FUNC const evaluator_base& operator=(const evaluator_base&);
+};
+
+// -------------------- Matrix and Array --------------------
+//
+// evaluator<PlainObjectBase> is a common base class for the
+// Matrix and Array evaluators.
+// Here we directly specialize evaluator. This is not really a unary expression, and it is, by definition, dense,
+// so no need for more sophisticated dispatching.
+
+// this helper permits to completely eliminate m_outerStride if it is known at compiletime.
+template<typename Scalar,int OuterStride> class plainobjectbase_evaluator_data {
+public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr)
+  {
+#ifndef EIGEN_INTERNAL_DEBUGGING
+    EIGEN_UNUSED_VARIABLE(outerStride);
+#endif
+    eigen_internal_assert(outerStride==OuterStride);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+  Index outerStride() const EIGEN_NOEXCEPT { return OuterStride; }
+  const Scalar *data;
+};
+
+template<typename Scalar> class plainobjectbase_evaluator_data<Scalar,Dynamic> {
+public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Index outerStride() const { return m_outerStride; }
+  const Scalar *data;
+protected:
+  Index m_outerStride;
+};
+
+template<typename Derived>
+struct evaluator<PlainObjectBase<Derived> >
+  : evaluator_base<Derived>
+{
+  typedef PlainObjectBase<Derived> PlainObjectType;
+  typedef typename PlainObjectType::Scalar Scalar;
+  typedef typename PlainObjectType::CoeffReturnType CoeffReturnType;
+
+  enum {
+    IsRowMajor = PlainObjectType::IsRowMajor,
+    IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime,
+    RowsAtCompileTime = PlainObjectType::RowsAtCompileTime,
+    ColsAtCompileTime = PlainObjectType::ColsAtCompileTime,
+
+    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    Flags = traits<Derived>::EvaluatorFlags,
+    Alignment = traits<Derived>::Alignment
+  };
+  enum {
+    // We do not need to know the outer stride for vectors
+    OuterStrideAtCompileTime = IsVectorAtCompileTime  ? 0
+                                                      : int(IsRowMajor) ? ColsAtCompileTime
+                                                                        : RowsAtCompileTime
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  evaluator()
+    : m_d(0,OuterStrideAtCompileTime)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const PlainObjectType& m)
+    : m_d(m.data(),IsVectorAtCompileTime ? 0 : m.outerStride())
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    if (IsRowMajor)
+      return m_d.data[row * m_d.outerStride() + col];
+    else
+      return m_d.data[row + col * m_d.outerStride()];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_d.data[index];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index col)
+  {
+    if (IsRowMajor)
+      return const_cast<Scalar*>(m_d.data)[row * m_d.outerStride() + col];
+    else
+      return const_cast<Scalar*>(m_d.data)[row + col * m_d.outerStride()];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index index)
+  {
+    return const_cast<Scalar*>(m_d.data)[index];
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    if (IsRowMajor)
+      return ploadt<PacketType, LoadMode>(m_d.data + row * m_d.outerStride() + col);
+    else
+      return ploadt<PacketType, LoadMode>(m_d.data + row + col * m_d.outerStride());
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    return ploadt<PacketType, LoadMode>(m_d.data + index);
+  }
+
+  template<int StoreMode,typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index row, Index col, const PacketType& x)
+  {
+    if (IsRowMajor)
+      return pstoret<Scalar, PacketType, StoreMode>
+	            (const_cast<Scalar*>(m_d.data) + row * m_d.outerStride() + col, x);
+    else
+      return pstoret<Scalar, PacketType, StoreMode>
+                    (const_cast<Scalar*>(m_d.data) + row + col * m_d.outerStride(), x);
+  }
+
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketType& x)
+  {
+    return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + index, x);
+  }
+
+protected:
+
+  plainobjectbase_evaluator_data<Scalar,OuterStrideAtCompileTime> m_d;
+};
+
+template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+struct evaluator<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
+  : evaluator<PlainObjectBase<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> > >
+{
+  typedef Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  evaluator() {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const XprType& m)
+    : evaluator<PlainObjectBase<XprType> >(m)
+  { }
+};
+
+template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+struct evaluator<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
+  : evaluator<PlainObjectBase<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> > >
+{
+  typedef Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  evaluator() {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const XprType& m)
+    : evaluator<PlainObjectBase<XprType> >(m)
+  { }
+};
+
+// -------------------- Transpose --------------------
+
+template<typename ArgType>
+struct unary_evaluator<Transpose<ArgType>, IndexBased>
+  : evaluator_base<Transpose<ArgType> >
+{
+  typedef Transpose<ArgType> XprType;
+
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    Flags = evaluator<ArgType>::Flags ^ RowMajorBit,
+    Alignment = evaluator<ArgType>::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {}
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_argImpl.coeff(col, row);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_argImpl.coeff(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index col)
+  {
+    return m_argImpl.coeffRef(col, row);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  typename XprType::Scalar& coeffRef(Index index)
+  {
+    return m_argImpl.coeffRef(index);
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    return m_argImpl.template packet<LoadMode,PacketType>(col, row);
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    return m_argImpl.template packet<LoadMode,PacketType>(index);
+  }
+
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index row, Index col, const PacketType& x)
+  {
+    m_argImpl.template writePacket<StoreMode,PacketType>(col, row, x);
+  }
+
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketType& x)
+  {
+    m_argImpl.template writePacket<StoreMode,PacketType>(index, x);
+  }
+
+protected:
+  evaluator<ArgType> m_argImpl;
+};
+
+// -------------------- CwiseNullaryOp --------------------
+// Like Matrix and Array, this is not really a unary expression, so we directly specialize evaluator.
+// Likewise, there is not need to more sophisticated dispatching here.
+
+template<typename Scalar,typename NullaryOp,
+         bool has_nullary = has_nullary_operator<NullaryOp>::value,
+         bool has_unary   = has_unary_operator<NullaryOp>::value,
+         bool has_binary  = has_binary_operator<NullaryOp>::value>
+struct nullary_wrapper
+{
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j) const { return op(i,j); }
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const { return op(i); }
+
+  template <typename T, typename IndexType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j) const { return op.template packetOp<T>(i,j); }
+  template <typename T, typename IndexType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i) const { return op.template packetOp<T>(i); }
+};
+
+template<typename Scalar,typename NullaryOp>
+struct nullary_wrapper<Scalar,NullaryOp,true,false,false>
+{
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType=0, IndexType=0) const { return op(); }
+  template <typename T, typename IndexType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType=0, IndexType=0) const { return op.template packetOp<T>(); }
+};
+
+template<typename Scalar,typename NullaryOp>
+struct nullary_wrapper<Scalar,NullaryOp,false,false,true>
+{
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j=0) const { return op(i,j); }
+  template <typename T, typename IndexType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j=0) const { return op.template packetOp<T>(i,j); }
+};
+
+// We need the following specialization for vector-only functors assigned to a runtime vector,
+// for instance, using linspace and assigning a RowVectorXd to a MatrixXd or even a row of a MatrixXd.
+// In this case, i==0 and j is used for the actual iteration.
+template<typename Scalar,typename NullaryOp>
+struct nullary_wrapper<Scalar,NullaryOp,false,true,false>
+{
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j) const {
+    eigen_assert(i==0 || j==0);
+    return op(i+j);
+  }
+  template <typename T, typename IndexType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j) const {
+    eigen_assert(i==0 || j==0);
+    return op.template packetOp<T>(i+j);
+  }
+
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const { return op(i); }
+  template <typename T, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i) const { return op.template packetOp<T>(i); }
+};
+
+template<typename Scalar,typename NullaryOp>
+struct nullary_wrapper<Scalar,NullaryOp,false,false,false> {};
+
+#if 0 && EIGEN_COMP_MSVC>0
+// Disable this ugly workaround. This is now handled in traits<Ref>::match,
+// but this piece of code might still become handly if some other weird compilation
+// erros pop up again.
+
+// MSVC exhibits a weird compilation error when
+// compiling:
+//    Eigen::MatrixXf A = MatrixXf::Random(3,3);
+//    Ref<const MatrixXf> R = 2.f*A;
+// and that has_*ary_operator<scalar_constant_op<float>> have not been instantiated yet.
+// The "problem" is that evaluator<2.f*A> is instantiated by traits<Ref>::match<2.f*A>
+// and at that time has_*ary_operator<T> returns true regardless of T.
+// Then nullary_wrapper is badly instantiated as nullary_wrapper<.,.,true,true,true>.
+// The trick is thus to defer the proper instantiation of nullary_wrapper when coeff(),
+// and packet() are really instantiated as implemented below:
+
+// This is a simple wrapper around Index to enforce the re-instantiation of
+// has_*ary_operator when needed.
+template<typename T> struct nullary_wrapper_workaround_msvc {
+  nullary_wrapper_workaround_msvc(const T&);
+  operator T()const;
+};
+
+template<typename Scalar,typename NullaryOp>
+struct nullary_wrapper<Scalar,NullaryOp,true,true,true>
+{
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j) const {
+    return nullary_wrapper<Scalar,NullaryOp,
+    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().operator()(op,i,j);
+  }
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const {
+    return nullary_wrapper<Scalar,NullaryOp,
+    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().operator()(op,i);
+  }
+
+  template <typename T, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j) const {
+    return nullary_wrapper<Scalar,NullaryOp,
+    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().template packetOp<T>(op,i,j);
+  }
+  template <typename T, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i) const {
+    return nullary_wrapper<Scalar,NullaryOp,
+    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().template packetOp<T>(op,i);
+  }
+};
+#endif // MSVC workaround
+
+template<typename NullaryOp, typename PlainObjectType>
+struct evaluator<CwiseNullaryOp<NullaryOp,PlainObjectType> >
+  : evaluator_base<CwiseNullaryOp<NullaryOp,PlainObjectType> >
+{
+  typedef CwiseNullaryOp<NullaryOp,PlainObjectType> XprType;
+  typedef typename internal::remove_all<PlainObjectType>::type PlainObjectTypeCleaned;
+
+  enum {
+    CoeffReadCost = internal::functor_traits<NullaryOp>::Cost,
+
+    Flags = (evaluator<PlainObjectTypeCleaned>::Flags
+          &  (  HereditaryBits
+              | (functor_has_linear_access<NullaryOp>::ret  ? LinearAccessBit : 0)
+              | (functor_traits<NullaryOp>::PacketAccess    ? PacketAccessBit : 0)))
+          | (functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit),
+    Alignment = AlignedMax
+  };
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& n)
+    : m_functor(n.functor()), m_wrapper()
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(IndexType row, IndexType col) const
+  {
+    return m_wrapper(m_functor, row, col);
+  }
+
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(IndexType index) const
+  {
+    return m_wrapper(m_functor,index);
+  }
+
+  template<int LoadMode, typename PacketType, typename IndexType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(IndexType row, IndexType col) const
+  {
+    return m_wrapper.template packetOp<PacketType>(m_functor, row, col);
+  }
+
+  template<int LoadMode, typename PacketType, typename IndexType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(IndexType index) const
+  {
+    return m_wrapper.template packetOp<PacketType>(m_functor, index);
+  }
+
+protected:
+  const NullaryOp m_functor;
+  const internal::nullary_wrapper<CoeffReturnType,NullaryOp> m_wrapper;
+};
+
+// -------------------- CwiseUnaryOp --------------------
+
+template<typename UnaryOp, typename ArgType>
+struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased >
+  : evaluator_base<CwiseUnaryOp<UnaryOp, ArgType> >
+{
+  typedef CwiseUnaryOp<UnaryOp, ArgType> XprType;
+
+  enum {
+    CoeffReadCost = int(evaluator<ArgType>::CoeffReadCost) + int(functor_traits<UnaryOp>::Cost),
+
+    Flags = evaluator<ArgType>::Flags
+          & (HereditaryBits | LinearAccessBit | (functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0)),
+    Alignment = evaluator<ArgType>::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit unary_evaluator(const XprType& op) : m_d(op)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<UnaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_d.func()(m_d.argImpl.coeff(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_d.func()(m_d.argImpl.coeff(index));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    return m_d.func().packetOp(m_d.argImpl.template packet<LoadMode, PacketType>(row, col));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    return m_d.func().packetOp(m_d.argImpl.template packet<LoadMode, PacketType>(index));
+  }
+
+protected:
+
+  // this helper permits to completely eliminate the functor if it is empty
+  struct Data
+  {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Data(const XprType& xpr) : op(xpr.functor()), argImpl(xpr.nestedExpression()) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const UnaryOp& func() const { return op; }
+    UnaryOp op;
+    evaluator<ArgType> argImpl;
+  };
+
+  Data m_d;
+};
+
+// -------------------- CwiseTernaryOp --------------------
+
+// this is a ternary expression
+template<typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
+struct evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >
+  : public ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >
+{
+  typedef CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> XprType;
+  typedef ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> > Base;
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
+};
+
+template<typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
+struct ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>, IndexBased, IndexBased>
+  : evaluator_base<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >
+{
+  typedef CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> XprType;
+
+  enum {
+    CoeffReadCost = int(evaluator<Arg1>::CoeffReadCost) + int(evaluator<Arg2>::CoeffReadCost) + int(evaluator<Arg3>::CoeffReadCost) + int(functor_traits<TernaryOp>::Cost),
+
+    Arg1Flags = evaluator<Arg1>::Flags,
+    Arg2Flags = evaluator<Arg2>::Flags,
+    Arg3Flags = evaluator<Arg3>::Flags,
+    SameType = is_same<typename Arg1::Scalar,typename Arg2::Scalar>::value && is_same<typename Arg1::Scalar,typename Arg3::Scalar>::value,
+    StorageOrdersAgree = (int(Arg1Flags)&RowMajorBit)==(int(Arg2Flags)&RowMajorBit) && (int(Arg1Flags)&RowMajorBit)==(int(Arg3Flags)&RowMajorBit),
+    Flags0 = (int(Arg1Flags) | int(Arg2Flags) | int(Arg3Flags)) & (
+        HereditaryBits
+        | (int(Arg1Flags) & int(Arg2Flags) & int(Arg3Flags) &
+           ( (StorageOrdersAgree ? LinearAccessBit : 0)
+           | (functor_traits<TernaryOp>::PacketAccess && StorageOrdersAgree && SameType ? PacketAccessBit : 0)
+           )
+        )
+     ),
+    Flags = (Flags0 & ~RowMajorBit) | (Arg1Flags & RowMajorBit),
+    Alignment = EIGEN_PLAIN_ENUM_MIN(
+        EIGEN_PLAIN_ENUM_MIN(evaluator<Arg1>::Alignment, evaluator<Arg2>::Alignment),
+        evaluator<Arg3>::Alignment)
+  };
+
+  EIGEN_DEVICE_FUNC explicit ternary_evaluator(const XprType& xpr) : m_d(xpr)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<TernaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_d.func()(m_d.arg1Impl.coeff(row, col), m_d.arg2Impl.coeff(row, col), m_d.arg3Impl.coeff(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_d.func()(m_d.arg1Impl.coeff(index), m_d.arg2Impl.coeff(index), m_d.arg3Impl.coeff(index));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    return m_d.func().packetOp(m_d.arg1Impl.template packet<LoadMode,PacketType>(row, col),
+                               m_d.arg2Impl.template packet<LoadMode,PacketType>(row, col),
+                               m_d.arg3Impl.template packet<LoadMode,PacketType>(row, col));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    return m_d.func().packetOp(m_d.arg1Impl.template packet<LoadMode,PacketType>(index),
+                               m_d.arg2Impl.template packet<LoadMode,PacketType>(index),
+                               m_d.arg3Impl.template packet<LoadMode,PacketType>(index));
+  }
+
+protected:
+  // this helper permits to completely eliminate the functor if it is empty
+  struct Data
+  {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Data(const XprType& xpr) : op(xpr.functor()), arg1Impl(xpr.arg1()), arg2Impl(xpr.arg2()), arg3Impl(xpr.arg3()) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TernaryOp& func() const { return op; }
+    TernaryOp op;
+    evaluator<Arg1> arg1Impl;
+    evaluator<Arg2> arg2Impl;
+    evaluator<Arg3> arg3Impl;
+  };
+
+  Data m_d;
+};
+
+// -------------------- CwiseBinaryOp --------------------
+
+// this is a binary expression
+template<typename BinaryOp, typename Lhs, typename Rhs>
+struct evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
+  : public binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  typedef binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> > Base;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const XprType& xpr) : Base(xpr) {}
+};
+
+template<typename BinaryOp, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBased>
+  : evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+
+  enum {
+    CoeffReadCost = int(evaluator<Lhs>::CoeffReadCost) + int(evaluator<Rhs>::CoeffReadCost) + int(functor_traits<BinaryOp>::Cost),
+
+    LhsFlags = evaluator<Lhs>::Flags,
+    RhsFlags = evaluator<Rhs>::Flags,
+    SameType = is_same<typename Lhs::Scalar,typename Rhs::Scalar>::value,
+    StorageOrdersAgree = (int(LhsFlags)&RowMajorBit)==(int(RhsFlags)&RowMajorBit),
+    Flags0 = (int(LhsFlags) | int(RhsFlags)) & (
+        HereditaryBits
+      | (int(LhsFlags) & int(RhsFlags) &
+           ( (StorageOrdersAgree ? LinearAccessBit : 0)
+           | (functor_traits<BinaryOp>::PacketAccess && StorageOrdersAgree && SameType ? PacketAccessBit : 0)
+           )
+        )
+     ),
+    Flags = (Flags0 & ~RowMajorBit) | (LhsFlags & RowMajorBit),
+    Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<Lhs>::Alignment,evaluator<Rhs>::Alignment)
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit binary_evaluator(const XprType& xpr) : m_d(xpr)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_d.func()(m_d.lhsImpl.coeff(row, col), m_d.rhsImpl.coeff(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_d.func()(m_d.lhsImpl.coeff(index), m_d.rhsImpl.coeff(index));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    return m_d.func().packetOp(m_d.lhsImpl.template packet<LoadMode,PacketType>(row, col),
+                               m_d.rhsImpl.template packet<LoadMode,PacketType>(row, col));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    return m_d.func().packetOp(m_d.lhsImpl.template packet<LoadMode,PacketType>(index),
+                               m_d.rhsImpl.template packet<LoadMode,PacketType>(index));
+  }
+
+protected:
+
+  // this helper permits to completely eliminate the functor if it is empty
+  struct Data
+  {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Data(const XprType& xpr) : op(xpr.functor()), lhsImpl(xpr.lhs()), rhsImpl(xpr.rhs()) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const BinaryOp& func() const { return op; }
+    BinaryOp op;
+    evaluator<Lhs> lhsImpl;
+    evaluator<Rhs> rhsImpl;
+  };
+
+  Data m_d;
+};
+
+// -------------------- CwiseUnaryView --------------------
+
+template<typename UnaryOp, typename ArgType>
+struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType>, IndexBased>
+  : evaluator_base<CwiseUnaryView<UnaryOp, ArgType> >
+{
+  typedef CwiseUnaryView<UnaryOp, ArgType> XprType;
+
+  enum {
+    CoeffReadCost = int(evaluator<ArgType>::CoeffReadCost) + int(functor_traits<UnaryOp>::Cost),
+
+    Flags = (evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit)),
+
+    Alignment = 0 // FIXME it is not very clear why alignment is necessarily lost...
+  };
+
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op) : m_d(op)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<UnaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_d.func()(m_d.argImpl.coeff(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_d.func()(m_d.argImpl.coeff(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index col)
+  {
+    return m_d.func()(m_d.argImpl.coeffRef(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index index)
+  {
+    return m_d.func()(m_d.argImpl.coeffRef(index));
+  }
+
+protected:
+
+  // this helper permits to completely eliminate the functor if it is empty
+  struct Data
+  {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Data(const XprType& xpr) : op(xpr.functor()), argImpl(xpr.nestedExpression()) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const UnaryOp& func() const { return op; }
+    UnaryOp op;
+    evaluator<ArgType> argImpl;
+  };
+
+  Data m_d;
+};
+
+// -------------------- Map --------------------
+
+// FIXME perhaps the PlainObjectType could be provided by Derived::PlainObject ?
+// but that might complicate template specialization
+template<typename Derived, typename PlainObjectType>
+struct mapbase_evaluator;
+
+template<typename Derived, typename PlainObjectType>
+struct mapbase_evaluator : evaluator_base<Derived>
+{
+  typedef Derived  XprType;
+  typedef typename XprType::PointerType PointerType;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  enum {
+    IsRowMajor = XprType::RowsAtCompileTime,
+    ColsAtCompileTime = XprType::ColsAtCompileTime,
+    CoeffReadCost = NumTraits<Scalar>::ReadCost
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit mapbase_evaluator(const XprType& map)
+    : m_data(const_cast<PointerType>(map.data())),
+      m_innerStride(map.innerStride()),
+      m_outerStride(map.outerStride())
+  {
+    EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(evaluator<Derived>::Flags&PacketAccessBit, internal::inner_stride_at_compile_time<Derived>::ret==1),
+                        PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_data[col * colStride() + row * rowStride()];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_data[index * m_innerStride.value()];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index col)
+  {
+    return m_data[col * colStride() + row * rowStride()];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index index)
+  {
+    return m_data[index * m_innerStride.value()];
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
+    return internal::ploadt<PacketType, LoadMode>(ptr);
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    return internal::ploadt<PacketType, LoadMode>(m_data + index * m_innerStride.value());
+  }
+
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index row, Index col, const PacketType& x)
+  {
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
+    return internal::pstoret<Scalar, PacketType, StoreMode>(ptr, x);
+  }
+
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketType& x)
+  {
+    internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
+  }
+protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+  Index rowStride() const EIGEN_NOEXCEPT {
+    return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value();
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+  Index colStride() const EIGEN_NOEXCEPT {
+     return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value();
+  }
+
+  PointerType m_data;
+  const internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
+  const internal::variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride;
+};
+
+template<typename PlainObjectType, int MapOptions, typename StrideType>
+struct evaluator<Map<PlainObjectType, MapOptions, StrideType> >
+  : public mapbase_evaluator<Map<PlainObjectType, MapOptions, StrideType>, PlainObjectType>
+{
+  typedef Map<PlainObjectType, MapOptions, StrideType> XprType;
+  typedef typename XprType::Scalar Scalar;
+  // TODO: should check for smaller packet types once we can handle multi-sized packet types
+  typedef typename packet_traits<Scalar>::type PacketScalar;
+
+  enum {
+    InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
+                             ? int(PlainObjectType::InnerStrideAtCompileTime)
+                             : int(StrideType::InnerStrideAtCompileTime),
+    OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
+                             ? int(PlainObjectType::OuterStrideAtCompileTime)
+                             : int(StrideType::OuterStrideAtCompileTime),
+    HasNoInnerStride = InnerStrideAtCompileTime == 1,
+    HasNoOuterStride = StrideType::OuterStrideAtCompileTime == 0,
+    HasNoStride = HasNoInnerStride && HasNoOuterStride,
+    IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic,
+
+    PacketAccessMask = bool(HasNoInnerStride) ? ~int(0) : ~int(PacketAccessBit),
+    LinearAccessMask = bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime) ? ~int(0) : ~int(LinearAccessBit),
+    Flags = int( evaluator<PlainObjectType>::Flags) & (LinearAccessMask&PacketAccessMask),
+
+    Alignment = int(MapOptions)&int(AlignedMask)
+  };
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& map)
+    : mapbase_evaluator<XprType, PlainObjectType>(map)
+  { }
+};
+
+// -------------------- Ref --------------------
+
+template<typename PlainObjectType, int RefOptions, typename StrideType>
+struct evaluator<Ref<PlainObjectType, RefOptions, StrideType> >
+  : public mapbase_evaluator<Ref<PlainObjectType, RefOptions, StrideType>, PlainObjectType>
+{
+  typedef Ref<PlainObjectType, RefOptions, StrideType> XprType;
+
+  enum {
+    Flags = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Flags,
+    Alignment = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const XprType& ref)
+    : mapbase_evaluator<XprType, PlainObjectType>(ref)
+  { }
+};
+
+// -------------------- Block --------------------
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel,
+         bool HasDirectAccess = internal::has_direct_access<ArgType>::ret> struct block_evaluator;
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
+  : block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel>
+{
+  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+  typedef typename XprType::Scalar Scalar;
+  // TODO: should check for smaller packet types once we can handle multi-sized packet types
+  typedef typename packet_traits<Scalar>::type PacketScalar;
+
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+
+    RowsAtCompileTime = traits<XprType>::RowsAtCompileTime,
+    ColsAtCompileTime = traits<XprType>::ColsAtCompileTime,
+    MaxRowsAtCompileTime = traits<XprType>::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = traits<XprType>::MaxColsAtCompileTime,
+
+    ArgTypeIsRowMajor = (int(evaluator<ArgType>::Flags)&RowMajorBit) != 0,
+    IsRowMajor = (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1) ? 1
+               : (MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1) ? 0
+               : ArgTypeIsRowMajor,
+    HasSameStorageOrderAsArgType = (IsRowMajor == ArgTypeIsRowMajor),
+    InnerSize = IsRowMajor ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
+    InnerStrideAtCompileTime = HasSameStorageOrderAsArgType
+                             ? int(inner_stride_at_compile_time<ArgType>::ret)
+                             : int(outer_stride_at_compile_time<ArgType>::ret),
+    OuterStrideAtCompileTime = HasSameStorageOrderAsArgType
+                             ? int(outer_stride_at_compile_time<ArgType>::ret)
+                             : int(inner_stride_at_compile_time<ArgType>::ret),
+    MaskPacketAccessBit = (InnerStrideAtCompileTime == 1 || HasSameStorageOrderAsArgType) ? PacketAccessBit : 0,
+
+    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,
+    FlagsRowMajorBit = XprType::Flags&RowMajorBit,
+    Flags0 = evaluator<ArgType>::Flags & ( (HereditaryBits & ~RowMajorBit) |
+                                           DirectAccessBit |
+                                           MaskPacketAccessBit),
+    Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit,
+
+    PacketAlignment = unpacket_traits<PacketScalar>::alignment,
+    Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic)
+                             && (OuterStrideAtCompileTime!=0)
+                             && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0,
+    Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ArgType>::Alignment, Alignment0)
+  };
+  typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const XprType& block) : block_evaluator_type(block)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+};
+
+// no direct-access => dispatch to a unary evaluator
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /*HasDirectAccess*/ false>
+  : unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
+{
+  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit block_evaluator(const XprType& block)
+    : unary_evaluator<XprType>(block)
+  {}
+};
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBased>
+  : evaluator_base<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
+{
+  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit unary_evaluator(const XprType& block)
+    : m_argImpl(block.nestedExpression()),
+      m_startRow(block.startRow()),
+      m_startCol(block.startCol()),
+      m_linear_offset(ForwardLinearAccess?(ArgType::IsRowMajor ? block.startRow()*block.nestedExpression().cols() + block.startCol() : block.startCol()*block.nestedExpression().rows() + block.startRow()):0)
+  { }
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  enum {
+    RowsAtCompileTime = XprType::RowsAtCompileTime,
+    ForwardLinearAccess = (InnerPanel || int(XprType::IsRowMajor)==int(ArgType::IsRowMajor)) && bool(evaluator<ArgType>::Flags&LinearAccessBit)
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_argImpl.coeff(m_startRow.value() + row, m_startCol.value() + col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return linear_coeff_impl(index, bool_constant<ForwardLinearAccess>());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index col)
+  {
+    return m_argImpl.coeffRef(m_startRow.value() + row, m_startCol.value() + col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index index)
+  {
+    return linear_coeffRef_impl(index, bool_constant<ForwardLinearAccess>());
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    return m_argImpl.template packet<LoadMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col);
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    if (ForwardLinearAccess)
+      return m_argImpl.template packet<LoadMode,PacketType>(m_linear_offset.value() + index);
+    else
+      return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
+                                         RowsAtCompileTime == 1 ? index : 0);
+  }
+
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index row, Index col, const PacketType& x)
+  {
+    return m_argImpl.template writePacket<StoreMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col, x);
+  }
+
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketType& x)
+  {
+    if (ForwardLinearAccess)
+      return m_argImpl.template writePacket<StoreMode,PacketType>(m_linear_offset.value() + index, x);
+    else
+      return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
+                                              RowsAtCompileTime == 1 ? index : 0,
+                                              x);
+  }
+
+protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType linear_coeff_impl(Index index, internal::true_type /* ForwardLinearAccess */) const
+  {
+    return m_argImpl.coeff(m_linear_offset.value() + index);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType linear_coeff_impl(Index index, internal::false_type /* not ForwardLinearAccess */) const
+  {
+    return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& linear_coeffRef_impl(Index index, internal::true_type /* ForwardLinearAccess */)
+  {
+    return m_argImpl.coeffRef(m_linear_offset.value() + index);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& linear_coeffRef_impl(Index index, internal::false_type /* not ForwardLinearAccess */)
+  {
+    return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
+  }
+
+  evaluator<ArgType> m_argImpl;
+  const variable_if_dynamic<Index, (ArgType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
+  const variable_if_dynamic<Index, (ArgType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
+  const variable_if_dynamic<Index, ForwardLinearAccess ? Dynamic : 0> m_linear_offset;
+};
+
+// TODO: This evaluator does not actually use the child evaluator;
+// all action is via the data() as returned by the Block expression.
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /* HasDirectAccess */ true>
+  : mapbase_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>,
+                      typename Block<ArgType, BlockRows, BlockCols, InnerPanel>::PlainObject>
+{
+  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+  typedef typename XprType::Scalar Scalar;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit block_evaluator(const XprType& block)
+    : mapbase_evaluator<XprType, typename XprType::PlainObject>(block)
+  {
+    // TODO: for the 3.3 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime
+    eigen_assert(((internal::UIntPtr(block.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator<XprType>::Alignment)) == 0) && "data is not aligned");
+  }
+};
+
+
+// -------------------- Select --------------------
+// NOTE shall we introduce a ternary_evaluator?
+
+// TODO enable vectorization for Select
+template<typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
+struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
+  : evaluator_base<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
+{
+  typedef Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> XprType;
+  enum {
+    CoeffReadCost = evaluator<ConditionMatrixType>::CoeffReadCost
+                  + EIGEN_PLAIN_ENUM_MAX(evaluator<ThenMatrixType>::CoeffReadCost,
+                                         evaluator<ElseMatrixType>::CoeffReadCost),
+
+    Flags = (unsigned int)evaluator<ThenMatrixType>::Flags & evaluator<ElseMatrixType>::Flags & HereditaryBits,
+
+    Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ThenMatrixType>::Alignment, evaluator<ElseMatrixType>::Alignment)
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const XprType& select)
+    : m_conditionImpl(select.conditionMatrix()),
+      m_thenImpl(select.thenMatrix()),
+      m_elseImpl(select.elseMatrix())
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    if (m_conditionImpl.coeff(row, col))
+      return m_thenImpl.coeff(row, col);
+    else
+      return m_elseImpl.coeff(row, col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    if (m_conditionImpl.coeff(index))
+      return m_thenImpl.coeff(index);
+    else
+      return m_elseImpl.coeff(index);
+  }
+
+protected:
+  evaluator<ConditionMatrixType> m_conditionImpl;
+  evaluator<ThenMatrixType> m_thenImpl;
+  evaluator<ElseMatrixType> m_elseImpl;
+};
+
+
+// -------------------- Replicate --------------------
+
+template<typename ArgType, int RowFactor, int ColFactor>
+struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
+  : evaluator_base<Replicate<ArgType, RowFactor, ColFactor> >
+{
+  typedef Replicate<ArgType, RowFactor, ColFactor> XprType;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  enum {
+    Factor = (RowFactor==Dynamic || ColFactor==Dynamic) ? Dynamic : RowFactor*ColFactor
+  };
+  typedef typename internal::nested_eval<ArgType,Factor>::type ArgTypeNested;
+  typedef typename internal::remove_all<ArgTypeNested>::type ArgTypeNestedCleaned;
+
+  enum {
+    CoeffReadCost = evaluator<ArgTypeNestedCleaned>::CoeffReadCost,
+    LinearAccessMask = XprType::IsVectorAtCompileTime ? LinearAccessBit : 0,
+    Flags = (evaluator<ArgTypeNestedCleaned>::Flags & (HereditaryBits|LinearAccessMask) & ~RowMajorBit) | (traits<XprType>::Flags & RowMajorBit),
+
+    Alignment = evaluator<ArgTypeNestedCleaned>::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit unary_evaluator(const XprType& replicate)
+    : m_arg(replicate.nestedExpression()),
+      m_argImpl(m_arg),
+      m_rows(replicate.nestedExpression().rows()),
+      m_cols(replicate.nestedExpression().cols())
+  {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    // try to avoid using modulo; this is a pure optimization strategy
+    const Index actual_row = internal::traits<XprType>::RowsAtCompileTime==1 ? 0
+                           : RowFactor==1 ? row
+                           : row % m_rows.value();
+    const Index actual_col = internal::traits<XprType>::ColsAtCompileTime==1 ? 0
+                           : ColFactor==1 ? col
+                           : col % m_cols.value();
+
+    return m_argImpl.coeff(actual_row, actual_col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    // try to avoid using modulo; this is a pure optimization strategy
+    const Index actual_index = internal::traits<XprType>::RowsAtCompileTime==1
+                                  ? (ColFactor==1 ?  index : index%m_cols.value())
+                                  : (RowFactor==1 ?  index : index%m_rows.value());
+
+    return m_argImpl.coeff(actual_index);
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    const Index actual_row = internal::traits<XprType>::RowsAtCompileTime==1 ? 0
+                           : RowFactor==1 ? row
+                           : row % m_rows.value();
+    const Index actual_col = internal::traits<XprType>::ColsAtCompileTime==1 ? 0
+                           : ColFactor==1 ? col
+                           : col % m_cols.value();
+
+    return m_argImpl.template packet<LoadMode,PacketType>(actual_row, actual_col);
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    const Index actual_index = internal::traits<XprType>::RowsAtCompileTime==1
+                                  ? (ColFactor==1 ?  index : index%m_cols.value())
+                                  : (RowFactor==1 ?  index : index%m_rows.value());
+
+    return m_argImpl.template packet<LoadMode,PacketType>(actual_index);
+  }
+
+protected:
+  const ArgTypeNested m_arg;
+  evaluator<ArgTypeNestedCleaned> m_argImpl;
+  const variable_if_dynamic<Index, ArgType::RowsAtCompileTime> m_rows;
+  const variable_if_dynamic<Index, ArgType::ColsAtCompileTime> m_cols;
+};
+
+// -------------------- MatrixWrapper and ArrayWrapper --------------------
+//
+// evaluator_wrapper_base<T> is a common base class for the
+// MatrixWrapper and ArrayWrapper evaluators.
+
+template<typename XprType>
+struct evaluator_wrapper_base
+  : evaluator_base<XprType>
+{
+  typedef typename remove_all<typename XprType::NestedExpressionType>::type ArgType;
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    Flags = evaluator<ArgType>::Flags,
+    Alignment = evaluator<ArgType>::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {}
+
+  typedef typename ArgType::Scalar Scalar;
+  typedef typename ArgType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_argImpl.coeff(row, col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_argImpl.coeff(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index col)
+  {
+    return m_argImpl.coeffRef(row, col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index index)
+  {
+    return m_argImpl.coeffRef(index);
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    return m_argImpl.template packet<LoadMode,PacketType>(row, col);
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    return m_argImpl.template packet<LoadMode,PacketType>(index);
+  }
+
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index row, Index col, const PacketType& x)
+  {
+    m_argImpl.template writePacket<StoreMode>(row, col, x);
+  }
+
+  template<int StoreMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketType& x)
+  {
+    m_argImpl.template writePacket<StoreMode>(index, x);
+  }
+
+protected:
+  evaluator<ArgType> m_argImpl;
+};
+
+template<typename TArgType>
+struct unary_evaluator<MatrixWrapper<TArgType> >
+  : evaluator_wrapper_base<MatrixWrapper<TArgType> >
+{
+  typedef MatrixWrapper<TArgType> XprType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit unary_evaluator(const XprType& wrapper)
+    : evaluator_wrapper_base<MatrixWrapper<TArgType> >(wrapper.nestedExpression())
+  { }
+};
+
+template<typename TArgType>
+struct unary_evaluator<ArrayWrapper<TArgType> >
+  : evaluator_wrapper_base<ArrayWrapper<TArgType> >
+{
+  typedef ArrayWrapper<TArgType> XprType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit unary_evaluator(const XprType& wrapper)
+    : evaluator_wrapper_base<ArrayWrapper<TArgType> >(wrapper.nestedExpression())
+  { }
+};
+
+
+// -------------------- Reverse --------------------
+
+// defined in Reverse.h:
+template<typename PacketType, bool ReversePacket> struct reverse_packet_cond;
+
+template<typename ArgType, int Direction>
+struct unary_evaluator<Reverse<ArgType, Direction> >
+  : evaluator_base<Reverse<ArgType, Direction> >
+{
+  typedef Reverse<ArgType, Direction> XprType;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  enum {
+    IsRowMajor = XprType::IsRowMajor,
+    IsColMajor = !IsRowMajor,
+    ReverseRow = (Direction == Vertical)   || (Direction == BothDirections),
+    ReverseCol = (Direction == Horizontal) || (Direction == BothDirections),
+    ReversePacket = (Direction == BothDirections)
+                    || ((Direction == Vertical)   && IsColMajor)
+                    || ((Direction == Horizontal) && IsRowMajor),
+
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+
+    // let's enable LinearAccess only with vectorization because of the product overhead
+    // FIXME enable DirectAccess with negative strides?
+    Flags0 = evaluator<ArgType>::Flags,
+    LinearAccess = ( (Direction==BothDirections) && (int(Flags0)&PacketAccessBit) )
+                  || ((ReverseRow && XprType::ColsAtCompileTime==1) || (ReverseCol && XprType::RowsAtCompileTime==1))
+                 ? LinearAccessBit : 0,
+
+    Flags = int(Flags0) & (HereditaryBits | PacketAccessBit | LinearAccess),
+
+    Alignment = 0 // FIXME in some rare cases, Alignment could be preserved, like a Vector4f.
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit unary_evaluator(const XprType& reverse)
+    : m_argImpl(reverse.nestedExpression()),
+      m_rows(ReverseRow ? reverse.nestedExpression().rows() : 1),
+      m_cols(ReverseCol ? reverse.nestedExpression().cols() : 1)
+  { }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_argImpl.coeff(ReverseRow ? m_rows.value() - row - 1 : row,
+                           ReverseCol ? m_cols.value() - col - 1 : col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_argImpl.coeff(m_rows.value() * m_cols.value() - index - 1);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index col)
+  {
+    return m_argImpl.coeffRef(ReverseRow ? m_rows.value() - row - 1 : row,
+                              ReverseCol ? m_cols.value() - col - 1 : col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index index)
+  {
+    return m_argImpl.coeffRef(m_rows.value() * m_cols.value() - index - 1);
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index row, Index col) const
+  {
+    enum {
+      PacketSize = unpacket_traits<PacketType>::size,
+      OffsetRow  = ReverseRow && IsColMajor ? PacketSize : 1,
+      OffsetCol  = ReverseCol && IsRowMajor ? PacketSize : 1
+    };
+    typedef internal::reverse_packet_cond<PacketType,ReversePacket> reverse_packet;
+    return reverse_packet::run(m_argImpl.template packet<LoadMode,PacketType>(
+                                  ReverseRow ? m_rows.value() - row - OffsetRow : row,
+                                  ReverseCol ? m_cols.value() - col - OffsetCol : col));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  PacketType packet(Index index) const
+  {
+    enum { PacketSize = unpacket_traits<PacketType>::size };
+    return preverse(m_argImpl.template packet<LoadMode,PacketType>(m_rows.value() * m_cols.value() - index - PacketSize));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index row, Index col, const PacketType& x)
+  {
+    // FIXME we could factorize some code with packet(i,j)
+    enum {
+      PacketSize = unpacket_traits<PacketType>::size,
+      OffsetRow  = ReverseRow && IsColMajor ? PacketSize : 1,
+      OffsetCol  = ReverseCol && IsRowMajor ? PacketSize : 1
+    };
+    typedef internal::reverse_packet_cond<PacketType,ReversePacket> reverse_packet;
+    m_argImpl.template writePacket<LoadMode>(
+                                  ReverseRow ? m_rows.value() - row - OffsetRow : row,
+                                  ReverseCol ? m_cols.value() - col - OffsetCol : col,
+                                  reverse_packet::run(x));
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketType& x)
+  {
+    enum { PacketSize = unpacket_traits<PacketType>::size };
+    m_argImpl.template writePacket<LoadMode>
+      (m_rows.value() * m_cols.value() - index - PacketSize, preverse(x));
+  }
+
+protected:
+  evaluator<ArgType> m_argImpl;
+
+  // If we do not reverse rows, then we do not need to know the number of rows; same for columns
+  // Nonetheless, in this case it is important to set to 1 such that the coeff(index) method works fine for vectors.
+  const variable_if_dynamic<Index, ReverseRow ? ArgType::RowsAtCompileTime : 1> m_rows;
+  const variable_if_dynamic<Index, ReverseCol ? ArgType::ColsAtCompileTime : 1> m_cols;
+};
+
+
+// -------------------- Diagonal --------------------
+
+template<typename ArgType, int DiagIndex>
+struct evaluator<Diagonal<ArgType, DiagIndex> >
+  : evaluator_base<Diagonal<ArgType, DiagIndex> >
+{
+  typedef Diagonal<ArgType, DiagIndex> XprType;
+
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+
+    Flags = (unsigned int)(evaluator<ArgType>::Flags & (HereditaryBits | DirectAccessBit) & ~RowMajorBit) | LinearAccessBit,
+
+    Alignment = 0
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const XprType& diagonal)
+    : m_argImpl(diagonal.nestedExpression()),
+      m_index(diagonal.index())
+  { }
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index) const
+  {
+    return m_argImpl.coeff(row + rowOffset(), row + colOffset());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index index) const
+  {
+    return m_argImpl.coeff(index + rowOffset(), index + colOffset());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index)
+  {
+    return m_argImpl.coeffRef(row + rowOffset(), row + colOffset());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index index)
+  {
+    return m_argImpl.coeffRef(index + rowOffset(), index + colOffset());
+  }
+
+protected:
+  evaluator<ArgType> m_argImpl;
+  const internal::variable_if_dynamicindex<Index, XprType::DiagIndex> m_index;
+
+private:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+  Index rowOffset() const { return m_index.value() > 0 ? 0 : -m_index.value(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+  Index colOffset() const { return m_index.value() > 0 ? m_index.value() : 0; }
+};
+
+
+//----------------------------------------------------------------------
+// deprecated code
+//----------------------------------------------------------------------
+
+// -------------------- EvalToTemp --------------------
+
+// expression class for evaluating nested expression to a temporary
+
+template<typename ArgType> class EvalToTemp;
+
+template<typename ArgType>
+struct traits<EvalToTemp<ArgType> >
+  : public traits<ArgType>
+{ };
+
+template<typename ArgType>
+class EvalToTemp
+  : public dense_xpr_base<EvalToTemp<ArgType> >::type
+{
+ public:
+
+  typedef typename dense_xpr_base<EvalToTemp>::type Base;
+  EIGEN_GENERIC_PUBLIC_INTERFACE(EvalToTemp)
+
+  explicit EvalToTemp(const ArgType& arg)
+    : m_arg(arg)
+  { }
+
+  const ArgType& arg() const
+  {
+    return m_arg;
+  }
+
+  EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT
+  {
+    return m_arg.rows();
+  }
+
+  EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT
+  {
+    return m_arg.cols();
+  }
+
+ private:
+  const ArgType& m_arg;
+};
+
+template<typename ArgType>
+struct evaluator<EvalToTemp<ArgType> >
+  : public evaluator<typename ArgType::PlainObject>
+{
+  typedef EvalToTemp<ArgType>                   XprType;
+  typedef typename ArgType::PlainObject         PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
+    : m_result(xpr.arg())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+  }
+
+  // This constructor is used when nesting an EvalTo evaluator in another evaluator
+  EIGEN_DEVICE_FUNC evaluator(const ArgType& arg)
+    : m_result(arg)
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+  }
+
+protected:
+  PlainObject m_result;
+};
+
+} // namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COREEVALUATORS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/CoreIterators.h b/src/3rdparty/eigen/Eigen/src/Core/CoreIterators.h
new file mode 100644
index 000000000..b96719681
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/CoreIterators.h
@@ -0,0 +1,132 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COREITERATORS_H
+#define EIGEN_COREITERATORS_H
+
+namespace Eigen { 
+
+/* This file contains the respective InnerIterator definition of the expressions defined in Eigen/Core
+ */
+
+namespace internal {
+
+template<typename XprType, typename EvaluatorKind>
+class inner_iterator_selector;
+
+}
+
+/** \class InnerIterator
+  * \brief An InnerIterator allows to loop over the element of any matrix expression.
+  * 
+  * \warning To be used with care because an evaluator is constructed every time an InnerIterator iterator is constructed.
+  * 
+  * TODO: add a usage example
+  */
+template<typename XprType>
+class InnerIterator
+{
+protected:
+  typedef internal::inner_iterator_selector<XprType, typename internal::evaluator_traits<XprType>::Kind> IteratorType;
+  typedef internal::evaluator<XprType> EvaluatorType;
+  typedef typename internal::traits<XprType>::Scalar Scalar;
+public:
+  /** Construct an iterator over the \a outerId -th row or column of \a xpr */
+  InnerIterator(const XprType &xpr, const Index &outerId)
+    : m_eval(xpr), m_iter(m_eval, outerId, xpr.innerSize())
+  {}
+  
+  /// \returns the value of the current coefficient.
+  EIGEN_STRONG_INLINE Scalar value() const          { return m_iter.value(); }
+  /** Increment the iterator \c *this to the next non-zero coefficient.
+    * Explicit zeros are not skipped over. To skip explicit zeros, see class SparseView
+    */
+  EIGEN_STRONG_INLINE InnerIterator& operator++()   { m_iter.operator++(); return *this; }
+  EIGEN_STRONG_INLINE InnerIterator& operator+=(Index i) { m_iter.operator+=(i); return *this; }
+  EIGEN_STRONG_INLINE InnerIterator operator+(Index i) 
+  { InnerIterator result(*this); result+=i; return result; }
+    
+
+  /// \returns the column or row index of the current coefficient.
+  EIGEN_STRONG_INLINE Index index() const           { return m_iter.index(); }
+  /// \returns the row index of the current coefficient.
+  EIGEN_STRONG_INLINE Index row() const             { return m_iter.row(); }
+  /// \returns the column index of the current coefficient.
+  EIGEN_STRONG_INLINE Index col() const             { return m_iter.col(); }
+  /// \returns \c true if the iterator \c *this still references a valid coefficient.
+  EIGEN_STRONG_INLINE operator bool() const         { return m_iter; }
+  
+protected:
+  EvaluatorType m_eval;
+  IteratorType m_iter;
+private:
+  // If you get here, then you're not using the right InnerIterator type, e.g.:
+  //   SparseMatrix<double,RowMajor> A;
+  //   SparseMatrix<double>::InnerIterator it(A,0);
+  template<typename T> InnerIterator(const EigenBase<T>&,Index outer);
+};
+
+namespace internal {
+
+// Generic inner iterator implementation for dense objects
+template<typename XprType>
+class inner_iterator_selector<XprType, IndexBased>
+{
+protected:
+  typedef evaluator<XprType> EvaluatorType;
+  typedef typename traits<XprType>::Scalar Scalar;
+  enum { IsRowMajor = (XprType::Flags&RowMajorBit)==RowMajorBit };
+  
+public:
+  EIGEN_STRONG_INLINE inner_iterator_selector(const EvaluatorType &eval, const Index &outerId, const Index &innerSize)
+    : m_eval(eval), m_inner(0), m_outer(outerId), m_end(innerSize)
+  {}
+
+  EIGEN_STRONG_INLINE Scalar value() const
+  {
+    return (IsRowMajor) ? m_eval.coeff(m_outer, m_inner)
+                        : m_eval.coeff(m_inner, m_outer);
+  }
+
+  EIGEN_STRONG_INLINE inner_iterator_selector& operator++() { m_inner++; return *this; }
+
+  EIGEN_STRONG_INLINE Index index() const { return m_inner; }
+  inline Index row() const { return IsRowMajor ? m_outer : index(); }
+  inline Index col() const { return IsRowMajor ? index() : m_outer; }
+
+  EIGEN_STRONG_INLINE operator bool() const { return m_inner < m_end && m_inner>=0; }
+
+protected:
+  const EvaluatorType& m_eval;
+  Index m_inner;
+  const Index m_outer;
+  const Index m_end;
+};
+
+// For iterator-based evaluator, inner-iterator is already implemented as
+// evaluator<>::InnerIterator
+template<typename XprType>
+class inner_iterator_selector<XprType, IteratorBased>
+ : public evaluator<XprType>::InnerIterator
+{
+protected:
+  typedef typename evaluator<XprType>::InnerIterator Base;
+  typedef evaluator<XprType> EvaluatorType;
+  
+public:
+  EIGEN_STRONG_INLINE inner_iterator_selector(const EvaluatorType &eval, const Index &outerId, const Index &/*innerSize*/)
+    : Base(eval, outerId)
+  {}  
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COREITERATORS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/CwiseBinaryOp.h b/src/3rdparty/eigen/Eigen/src/Core/CwiseBinaryOp.h
new file mode 100644
index 000000000..2202b1cc6
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/CwiseBinaryOp.h
@@ -0,0 +1,183 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CWISE_BINARY_OP_H
+#define EIGEN_CWISE_BINARY_OP_H
+
+namespace Eigen {
+
+namespace internal {
+template<typename BinaryOp, typename Lhs, typename Rhs>
+struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
+{
+  // we must not inherit from traits<Lhs> since it has
+  // the potential to cause problems with MSVC
+  typedef typename remove_all<Lhs>::type Ancestor;
+  typedef typename traits<Ancestor>::XprKind XprKind;
+  enum {
+    RowsAtCompileTime = traits<Ancestor>::RowsAtCompileTime,
+    ColsAtCompileTime = traits<Ancestor>::ColsAtCompileTime,
+    MaxRowsAtCompileTime = traits<Ancestor>::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = traits<Ancestor>::MaxColsAtCompileTime
+  };
+
+  // even though we require Lhs and Rhs to have the same scalar type (see CwiseBinaryOp constructor),
+  // we still want to handle the case when the result type is different.
+  typedef typename result_of<
+                     BinaryOp(
+                       const typename Lhs::Scalar&,
+                       const typename Rhs::Scalar&
+                     )
+                   >::type Scalar;
+  typedef typename cwise_promote_storage_type<typename traits<Lhs>::StorageKind,
+                                              typename traits<Rhs>::StorageKind,
+                                              BinaryOp>::ret StorageKind;
+  typedef typename promote_index_type<typename traits<Lhs>::StorageIndex,
+                                      typename traits<Rhs>::StorageIndex>::type StorageIndex;
+  typedef typename Lhs::Nested LhsNested;
+  typedef typename Rhs::Nested RhsNested;
+  typedef typename remove_reference<LhsNested>::type _LhsNested;
+  typedef typename remove_reference<RhsNested>::type _RhsNested;
+  enum {
+    Flags = cwise_promote_storage_order<typename traits<Lhs>::StorageKind,typename traits<Rhs>::StorageKind,_LhsNested::Flags & RowMajorBit,_RhsNested::Flags & RowMajorBit>::value
+  };
+};
+} // end namespace internal
+
+template<typename BinaryOp, typename Lhs, typename Rhs, typename StorageKind>
+class CwiseBinaryOpImpl;
+
+/** \class CwiseBinaryOp
+  * \ingroup Core_Module
+  *
+  * \brief Generic expression where a coefficient-wise binary operator is applied to two expressions
+  *
+  * \tparam BinaryOp template functor implementing the operator
+  * \tparam LhsType the type of the left-hand side
+  * \tparam RhsType the type of the right-hand side
+  *
+  * This class represents an expression  where a coefficient-wise binary operator is applied to two expressions.
+  * It is the return type of binary operators, by which we mean only those binary operators where
+  * both the left-hand side and the right-hand side are Eigen expressions.
+  * For example, the return type of matrix1+matrix2 is a CwiseBinaryOp.
+  *
+  * Most of the time, this is the only way that it is used, so you typically don't have to name
+  * CwiseBinaryOp types explicitly.
+  *
+  * \sa MatrixBase::binaryExpr(const MatrixBase<OtherDerived> &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp
+  */
+template<typename BinaryOp, typename LhsType, typename RhsType>
+class CwiseBinaryOp :
+  public CwiseBinaryOpImpl<
+          BinaryOp, LhsType, RhsType,
+          typename internal::cwise_promote_storage_type<typename internal::traits<LhsType>::StorageKind,
+                                                        typename internal::traits<RhsType>::StorageKind,
+                                                        BinaryOp>::ret>,
+  internal::no_assignment_operator
+{
+  public:
+
+    typedef typename internal::remove_all<BinaryOp>::type Functor;
+    typedef typename internal::remove_all<LhsType>::type Lhs;
+    typedef typename internal::remove_all<RhsType>::type Rhs;
+
+    typedef typename CwiseBinaryOpImpl<
+        BinaryOp, LhsType, RhsType,
+        typename internal::cwise_promote_storage_type<typename internal::traits<LhsType>::StorageKind,
+                                                      typename internal::traits<Rhs>::StorageKind,
+                                                      BinaryOp>::ret>::Base Base;
+    EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseBinaryOp)
+
+    typedef typename internal::ref_selector<LhsType>::type LhsNested;
+    typedef typename internal::ref_selector<RhsType>::type RhsNested;
+    typedef typename internal::remove_reference<LhsNested>::type _LhsNested;
+    typedef typename internal::remove_reference<RhsNested>::type _RhsNested;
+
+#if EIGEN_COMP_MSVC && EIGEN_HAS_CXX11
+    //Required for Visual Studio or the Copy constructor will probably not get inlined!
+    EIGEN_STRONG_INLINE
+    CwiseBinaryOp(const CwiseBinaryOp<BinaryOp,LhsType,RhsType>&) = default;
+#endif
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp())
+      : m_lhs(aLhs), m_rhs(aRhs), m_functor(func)
+    {
+      EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename Rhs::Scalar);
+      // require the sizes to match
+      EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Lhs, Rhs)
+      eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols());
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT {
+      // return the fixed size type if available to enable compile time optimizations
+      return internal::traits<typename internal::remove_all<LhsNested>::type>::RowsAtCompileTime==Dynamic ? m_rhs.rows() : m_lhs.rows();
+    }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT {
+      // return the fixed size type if available to enable compile time optimizations
+      return internal::traits<typename internal::remove_all<LhsNested>::type>::ColsAtCompileTime==Dynamic ? m_rhs.cols() : m_lhs.cols();
+    }
+
+    /** \returns the left hand side nested expression */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const _LhsNested& lhs() const { return m_lhs; }
+    /** \returns the right hand side nested expression */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const _RhsNested& rhs() const { return m_rhs; }
+    /** \returns the functor representing the binary operation */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const BinaryOp& functor() const { return m_functor; }
+
+  protected:
+    LhsNested m_lhs;
+    RhsNested m_rhs;
+    const BinaryOp m_functor;
+};
+
+// Generic API dispatcher
+template<typename BinaryOp, typename Lhs, typename Rhs, typename StorageKind>
+class CwiseBinaryOpImpl
+  : public internal::generic_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type
+{
+public:
+  typedef typename internal::generic_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type Base;
+};
+
+/** replaces \c *this by \c *this - \a other.
+  *
+  * \returns a reference to \c *this
+  */
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
+MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
+{
+  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
+}
+
+/** replaces \c *this by \c *this + \a other.
+  *
+  * \returns a reference to \c *this
+  */
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
+MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
+{
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_CWISE_BINARY_OP_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/CwiseNullaryOp.h b/src/3rdparty/eigen/Eigen/src/Core/CwiseNullaryOp.h
new file mode 100644
index 000000000..289ec510a
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/CwiseNullaryOp.h
@@ -0,0 +1,1001 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CWISE_NULLARY_OP_H
+#define EIGEN_CWISE_NULLARY_OP_H
+
+namespace Eigen {
+
+namespace internal {
+template<typename NullaryOp, typename PlainObjectType>
+struct traits<CwiseNullaryOp<NullaryOp, PlainObjectType> > : traits<PlainObjectType>
+{
+  enum {
+    Flags = traits<PlainObjectType>::Flags & RowMajorBit
+  };
+};
+
+} // namespace internal
+
+/** \class CwiseNullaryOp
+  * \ingroup Core_Module
+  *
+  * \brief Generic expression of a matrix where all coefficients are defined by a functor
+  *
+  * \tparam NullaryOp template functor implementing the operator
+  * \tparam PlainObjectType the underlying plain matrix/array type
+  *
+  * This class represents an expression of a generic nullary operator.
+  * It is the return type of the Ones(), Zero(), Constant(), Identity() and Random() methods,
+  * and most of the time this is the only way it is used.
+  *
+  * However, if you want to write a function returning such an expression, you
+  * will need to use this class.
+  *
+  * The functor NullaryOp must expose one of the following method:
+    <table class="manual">
+    <tr            ><td>\c operator()() </td><td>if the procedural generation does not depend on the coefficient entries (e.g., random numbers)</td></tr>
+    <tr class="alt"><td>\c operator()(Index i)</td><td>if the procedural generation makes sense for vectors only and that it depends on the coefficient index \c i (e.g., linspace) </td></tr>
+    <tr            ><td>\c operator()(Index i,Index j)</td><td>if the procedural generation depends on the matrix coordinates \c i, \c j (e.g., to generate a checkerboard with 0 and 1)</td></tr>
+    </table>
+  * It is also possible to expose the last two operators if the generation makes sense for matrices but can be optimized for vectors.
+  *
+  * See DenseBase::NullaryExpr(Index,const CustomNullaryOp&) for an example binding
+  * C++11 random number generators.
+  *
+  * A nullary expression can also be used to implement custom sophisticated matrix manipulations
+  * that cannot be covered by the existing set of natively supported matrix manipulations.
+  * See this \ref TopicCustomizing_NullaryExpr "page" for some examples and additional explanations
+  * on the behavior of CwiseNullaryOp.
+  *
+  * \sa class CwiseUnaryOp, class CwiseBinaryOp, DenseBase::NullaryExpr
+  */
+template<typename NullaryOp, typename PlainObjectType>
+class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp, PlainObjectType> >::type, internal::no_assignment_operator
+{
+  public:
+
+    typedef typename internal::dense_xpr_base<CwiseNullaryOp>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(CwiseNullaryOp)
+
+    EIGEN_DEVICE_FUNC
+    CwiseNullaryOp(Index rows, Index cols, const NullaryOp& func = NullaryOp())
+      : m_rows(rows), m_cols(cols), m_functor(func)
+    {
+      eigen_assert(rows >= 0
+            && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows)
+            &&  cols >= 0
+            && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols));
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const { return m_rows.value(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const { return m_cols.value(); }
+
+    /** \returns the functor representing the nullary operation */
+    EIGEN_DEVICE_FUNC
+    const NullaryOp& functor() const { return m_functor; }
+
+  protected:
+    const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_rows;
+    const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_cols;
+    const NullaryOp m_functor;
+};
+
+
+/** \returns an expression of a matrix defined by a custom functor \a func
+  *
+  * The parameters \a rows and \a cols are the number of rows and of columns of
+  * the returned matrix. Must be compatible with this MatrixBase type.
+  *
+  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
+  * it is redundant to pass \a rows and \a cols as arguments, so Zero() should be used
+  * instead.
+  *
+  * The template parameter \a CustomNullaryOp is the type of the functor.
+  *
+  * \sa class CwiseNullaryOp
+  */
+template<typename Derived>
+template<typename CustomNullaryOp>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const CwiseNullaryOp<CustomNullaryOp,typename DenseBase<Derived>::PlainObject>
+#else
+const CwiseNullaryOp<CustomNullaryOp,PlainObject>
+#endif
+DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func)
+{
+  return CwiseNullaryOp<CustomNullaryOp, PlainObject>(rows, cols, func);
+}
+
+/** \returns an expression of a matrix defined by a custom functor \a func
+  *
+  * The parameter \a size is the size of the returned vector.
+  * Must be compatible with this MatrixBase type.
+  *
+  * \only_for_vectors
+  *
+  * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
+  * it is redundant to pass \a size as argument, so Zero() should be used
+  * instead.
+  *
+  * The template parameter \a CustomNullaryOp is the type of the functor.
+  *
+  * Here is an example with C++11 random generators: \include random_cpp11.cpp
+  * Output: \verbinclude random_cpp11.out
+  *
+  * \sa class CwiseNullaryOp
+  */
+template<typename Derived>
+template<typename CustomNullaryOp>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+#else
+const CwiseNullaryOp<CustomNullaryOp, PlainObject>
+#endif
+DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  if(RowsAtCompileTime == 1) return CwiseNullaryOp<CustomNullaryOp, PlainObject>(1, size, func);
+  else return CwiseNullaryOp<CustomNullaryOp, PlainObject>(size, 1, func);
+}
+
+/** \returns an expression of a matrix defined by a custom functor \a func
+  *
+  * This variant is only for fixed-size DenseBase types. For dynamic-size types, you
+  * need to use the variants taking size arguments.
+  *
+  * The template parameter \a CustomNullaryOp is the type of the functor.
+  *
+  * \sa class CwiseNullaryOp
+  */
+template<typename Derived>
+template<typename CustomNullaryOp>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+#else
+const CwiseNullaryOp<CustomNullaryOp, PlainObject>
+#endif
+DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
+{
+  return CwiseNullaryOp<CustomNullaryOp, PlainObject>(RowsAtCompileTime, ColsAtCompileTime, func);
+}
+
+/** \returns an expression of a constant matrix of value \a value
+  *
+  * The parameters \a rows and \a cols are the number of rows and of columns of
+  * the returned matrix. Must be compatible with this DenseBase type.
+  *
+  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
+  * it is redundant to pass \a rows and \a cols as arguments, so Zero() should be used
+  * instead.
+  *
+  * The template parameter \a CustomNullaryOp is the type of the functor.
+  *
+  * \sa class CwiseNullaryOp
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
+{
+  return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_constant_op<Scalar>(value));
+}
+
+/** \returns an expression of a constant matrix of value \a value
+  *
+  * The parameter \a size is the size of the returned vector.
+  * Must be compatible with this DenseBase type.
+  *
+  * \only_for_vectors
+  *
+  * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
+  * it is redundant to pass \a size as argument, so Zero() should be used
+  * instead.
+  *
+  * The template parameter \a CustomNullaryOp is the type of the functor.
+  *
+  * \sa class CwiseNullaryOp
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+DenseBase<Derived>::Constant(Index size, const Scalar& value)
+{
+  return DenseBase<Derived>::NullaryExpr(size, internal::scalar_constant_op<Scalar>(value));
+}
+
+/** \returns an expression of a constant matrix of value \a value
+  *
+  * This variant is only for fixed-size DenseBase types. For dynamic-size types, you
+  * need to use the variants taking size arguments.
+  *
+  * The template parameter \a CustomNullaryOp is the type of the functor.
+  *
+  * \sa class CwiseNullaryOp
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+DenseBase<Derived>::Constant(const Scalar& value)
+{
+  EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
+  return DenseBase<Derived>::NullaryExpr(RowsAtCompileTime, ColsAtCompileTime, internal::scalar_constant_op<Scalar>(value));
+}
+
+/** \deprecated because of accuracy loss. In Eigen 3.3, it is an alias for LinSpaced(Index,const Scalar&,const Scalar&)
+  *
+  * \only_for_vectors
+  *
+  * Example: \include DenseBase_LinSpaced_seq_deprecated.cpp
+  * Output: \verbinclude DenseBase_LinSpaced_seq_deprecated.out
+  *
+  * \sa LinSpaced(Index,const Scalar&, const Scalar&), setLinSpaced(Index,const Scalar&,const Scalar&)
+  */
+template<typename Derived>
+EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar>(low,high,size));
+}
+
+/** \deprecated because of accuracy loss. In Eigen 3.3, it is an alias for LinSpaced(const Scalar&,const Scalar&)
+  *
+  * \sa LinSpaced(const Scalar&, const Scalar&)
+  */
+template<typename Derived>
+EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
+  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar>(low,high,Derived::SizeAtCompileTime));
+}
+
+/**
+  * \brief Sets a linearly spaced vector.
+  *
+  * The function generates 'size' equally spaced values in the closed interval [low,high].
+  * When size is set to 1, a vector of length 1 containing 'high' is returned.
+  *
+  * \only_for_vectors
+  *
+  * Example: \include DenseBase_LinSpaced.cpp
+  * Output: \verbinclude DenseBase_LinSpaced.out
+  *
+  * For integer scalar types, an even spacing is possible if and only if the length of the range,
+  * i.e., \c high-low is a scalar multiple of \c size-1, or if \c size is a scalar multiple of the
+  * number of values \c high-low+1 (meaning each value can be repeated the same number of time).
+  * If one of these two considions is not satisfied, then \c high is lowered to the largest value
+  * satisfying one of this constraint.
+  * Here are some examples:
+  *
+  * Example: \include DenseBase_LinSpacedInt.cpp
+  * Output: \verbinclude DenseBase_LinSpacedInt.out
+  *
+  * \sa setLinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar>(low,high,size));
+}
+
+/**
+  * \copydoc DenseBase::LinSpaced(Index, const Scalar&, const Scalar&)
+  * Special version for fixed size types which does not require the size parameter.
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
+  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar>(low,high,Derived::SizeAtCompileTime));
+}
+
+/** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */
+template<typename Derived>
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApproxToConstant
+(const Scalar& val, const RealScalar& prec) const
+{
+  typename internal::nested_eval<Derived,1>::type self(derived());
+  for(Index j = 0; j < cols(); ++j)
+    for(Index i = 0; i < rows(); ++i)
+      if(!internal::isApprox(self.coeff(i, j), val, prec))
+        return false;
+  return true;
+}
+
+/** This is just an alias for isApproxToConstant().
+  *
+  * \returns true if all coefficients in this matrix are approximately equal to \a value, to within precision \a prec */
+template<typename Derived>
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isConstant
+(const Scalar& val, const RealScalar& prec) const
+{
+  return isApproxToConstant(val, prec);
+}
+
+/** Alias for setConstant(): sets all coefficients in this expression to \a val.
+  *
+  * \sa setConstant(), Constant(), class CwiseNullaryOp
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
+{
+  setConstant(val);
+}
+
+/** Sets all coefficients in this expression to value \a val.
+  *
+  * \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(), Constant(), class CwiseNullaryOp, setZero(), setOnes()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
+{
+  return derived() = Constant(rows(), cols(), val);
+}
+
+/** Resizes to the given \a size, and sets all coefficients in this expression to the given value \a val.
+  *
+  * \only_for_vectors
+  *
+  * Example: \include Matrix_setConstant_int.cpp
+  * Output: \verbinclude Matrix_setConstant_int.out
+  *
+  * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
+{
+  resize(size);
+  return setConstant(val);
+}
+
+/** Resizes to the given size, and sets all coefficients in this expression to the given value \a val.
+  *
+  * \param rows the new number of rows
+  * \param cols the new number of columns
+  * \param val the value to which all coefficients are set
+  *
+  * Example: \include Matrix_setConstant_int_int.cpp
+  * Output: \verbinclude Matrix_setConstant_int_int.out
+  *
+  * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
+{
+  resize(rows, cols);
+  return setConstant(val);
+}
+
+/** Resizes to the given size, changing only the number of columns, and sets all
+  * coefficients in this expression to the given value \a val. For the parameter
+  * of type NoChange_t, just pass the special value \c NoChange.
+  *
+  * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setConstant(NoChange_t, Index cols, const Scalar& val)
+{
+  return setConstant(rows(), cols, val);
+}
+
+/** Resizes to the given size, changing only the number of rows, and sets all
+  * coefficients in this expression to the given value \a val. For the parameter
+  * of type NoChange_t, just pass the special value \c NoChange.
+  *
+  * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setConstant(Index rows, NoChange_t, const Scalar& val)
+{
+  return setConstant(rows, cols(), val);
+}
+
+
+/**
+  * \brief Sets a linearly spaced vector.
+  *
+  * The function generates 'size' equally spaced values in the closed interval [low,high].
+  * When size is set to 1, a vector of length 1 containing 'high' is returned.
+  *
+  * \only_for_vectors
+  *
+  * Example: \include DenseBase_setLinSpaced.cpp
+  * Output: \verbinclude DenseBase_setLinSpaced.out
+  *
+  * For integer scalar types, do not miss the explanations on the definition
+  * of \link LinSpaced(Index,const Scalar&,const Scalar&) even spacing \endlink.
+  *
+  * \sa LinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar>(low,high,newSize));
+}
+
+/**
+  * \brief Sets a linearly spaced vector.
+  *
+  * The function fills \c *this with equally spaced values in the closed interval [low,high].
+  * When size is set to 1, a vector of length 1 containing 'high' is returned.
+  *
+  * \only_for_vectors
+  *
+  * For integer scalar types, do not miss the explanations on the definition
+  * of \link LinSpaced(Index,const Scalar&,const Scalar&) even spacing \endlink.
+  *
+  * \sa LinSpaced(Index,const Scalar&,const Scalar&), setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return setLinSpaced(size(), low, high);
+}
+
+// zero:
+
+/** \returns an expression of a zero matrix.
+  *
+  * The parameters \a rows and \a cols are the number of rows and of columns of
+  * the returned matrix. Must be compatible with this MatrixBase type.
+  *
+  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
+  * it is redundant to pass \a rows and \a cols as arguments, so Zero() should be used
+  * instead.
+  *
+  * Example: \include MatrixBase_zero_int_int.cpp
+  * Output: \verbinclude MatrixBase_zero_int_int.out
+  *
+  * \sa Zero(), Zero(Index)
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+DenseBase<Derived>::Zero(Index rows, Index cols)
+{
+  return Constant(rows, cols, Scalar(0));
+}
+
+/** \returns an expression of a zero vector.
+  *
+  * The parameter \a size is the size of the returned vector.
+  * Must be compatible with this MatrixBase type.
+  *
+  * \only_for_vectors
+  *
+  * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
+  * it is redundant to pass \a size as argument, so Zero() should be used
+  * instead.
+  *
+  * Example: \include MatrixBase_zero_int.cpp
+  * Output: \verbinclude MatrixBase_zero_int.out
+  *
+  * \sa Zero(), Zero(Index,Index)
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+DenseBase<Derived>::Zero(Index size)
+{
+  return Constant(size, Scalar(0));
+}
+
+/** \returns an expression of a fixed-size zero matrix or vector.
+  *
+  * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
+  * need to use the variants taking size arguments.
+  *
+  * Example: \include MatrixBase_zero.cpp
+  * Output: \verbinclude MatrixBase_zero.out
+  *
+  * \sa Zero(Index), Zero(Index,Index)
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+DenseBase<Derived>::Zero()
+{
+  return Constant(Scalar(0));
+}
+
+/** \returns true if *this is approximately equal to the zero matrix,
+  *          within the precision given by \a prec.
+  *
+  * Example: \include MatrixBase_isZero.cpp
+  * Output: \verbinclude MatrixBase_isZero.out
+  *
+  * \sa class CwiseNullaryOp, Zero()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isZero(const RealScalar& prec) const
+{
+  typename internal::nested_eval<Derived,1>::type self(derived());
+  for(Index j = 0; j < cols(); ++j)
+    for(Index i = 0; i < rows(); ++i)
+      if(!internal::isMuchSmallerThan(self.coeff(i, j), static_cast<Scalar>(1), prec))
+        return false;
+  return true;
+}
+
+/** Sets all coefficients in this expression to zero.
+  *
+  * Example: \include MatrixBase_setZero.cpp
+  * Output: \verbinclude MatrixBase_setZero.out
+  *
+  * \sa class CwiseNullaryOp, Zero()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
+{
+  return setConstant(Scalar(0));
+}
+
+/** Resizes to the given \a size, and sets all coefficients in this expression to zero.
+  *
+  * \only_for_vectors
+  *
+  * Example: \include Matrix_setZero_int.cpp
+  * Output: \verbinclude Matrix_setZero_int.out
+  *
+  * \sa DenseBase::setZero(), setZero(Index,Index), class CwiseNullaryOp, DenseBase::Zero()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setZero(Index newSize)
+{
+  resize(newSize);
+  return setConstant(Scalar(0));
+}
+
+/** Resizes to the given size, and sets all coefficients in this expression to zero.
+  *
+  * \param rows the new number of rows
+  * \param cols the new number of columns
+  *
+  * Example: \include Matrix_setZero_int_int.cpp
+  * Output: \verbinclude Matrix_setZero_int_int.out
+  *
+  * \sa DenseBase::setZero(), setZero(Index), class CwiseNullaryOp, DenseBase::Zero()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setZero(Index rows, Index cols)
+{
+  resize(rows, cols);
+  return setConstant(Scalar(0));
+}
+
+/** Resizes to the given size, changing only the number of columns, and sets all
+  * coefficients in this expression to zero. For the parameter of type NoChange_t,
+  * just pass the special value \c NoChange.
+  *
+  * \sa DenseBase::setZero(), setZero(Index), setZero(Index, Index), setZero(Index, NoChange_t), class CwiseNullaryOp, DenseBase::Zero()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setZero(NoChange_t, Index cols)
+{
+  return setZero(rows(), cols);
+}
+
+/** Resizes to the given size, changing only the number of rows, and sets all
+  * coefficients in this expression to zero. For the parameter of type NoChange_t,
+  * just pass the special value \c NoChange.
+  *
+  * \sa DenseBase::setZero(), setZero(Index), setZero(Index, Index), setZero(NoChange_t, Index), class CwiseNullaryOp, DenseBase::Zero()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setZero(Index rows, NoChange_t)
+{
+  return setZero(rows, cols());
+}
+
+// ones:
+
+/** \returns an expression of a matrix where all coefficients equal one.
+  *
+  * The parameters \a rows and \a cols are the number of rows and of columns of
+  * the returned matrix. Must be compatible with this MatrixBase type.
+  *
+  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
+  * it is redundant to pass \a rows and \a cols as arguments, so Ones() should be used
+  * instead.
+  *
+  * Example: \include MatrixBase_ones_int_int.cpp
+  * Output: \verbinclude MatrixBase_ones_int_int.out
+  *
+  * \sa Ones(), Ones(Index), isOnes(), class Ones
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+DenseBase<Derived>::Ones(Index rows, Index cols)
+{
+  return Constant(rows, cols, Scalar(1));
+}
+
+/** \returns an expression of a vector where all coefficients equal one.
+  *
+  * The parameter \a newSize is the size of the returned vector.
+  * Must be compatible with this MatrixBase type.
+  *
+  * \only_for_vectors
+  *
+  * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
+  * it is redundant to pass \a size as argument, so Ones() should be used
+  * instead.
+  *
+  * Example: \include MatrixBase_ones_int.cpp
+  * Output: \verbinclude MatrixBase_ones_int.out
+  *
+  * \sa Ones(), Ones(Index,Index), isOnes(), class Ones
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+DenseBase<Derived>::Ones(Index newSize)
+{
+  return Constant(newSize, Scalar(1));
+}
+
+/** \returns an expression of a fixed-size matrix or vector where all coefficients equal one.
+  *
+  * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
+  * need to use the variants taking size arguments.
+  *
+  * Example: \include MatrixBase_ones.cpp
+  * Output: \verbinclude MatrixBase_ones.out
+  *
+  * \sa Ones(Index), Ones(Index,Index), isOnes(), class Ones
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+DenseBase<Derived>::Ones()
+{
+  return Constant(Scalar(1));
+}
+
+/** \returns true if *this is approximately equal to the matrix where all coefficients
+  *          are equal to 1, within the precision given by \a prec.
+  *
+  * Example: \include MatrixBase_isOnes.cpp
+  * Output: \verbinclude MatrixBase_isOnes.out
+  *
+  * \sa class CwiseNullaryOp, Ones()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isOnes
+(const RealScalar& prec) const
+{
+  return isApproxToConstant(Scalar(1), prec);
+}
+
+/** Sets all coefficients in this expression to one.
+  *
+  * Example: \include MatrixBase_setOnes.cpp
+  * Output: \verbinclude MatrixBase_setOnes.out
+  *
+  * \sa class CwiseNullaryOp, Ones()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
+{
+  return setConstant(Scalar(1));
+}
+
+/** Resizes to the given \a newSize, and sets all coefficients in this expression to one.
+  *
+  * \only_for_vectors
+  *
+  * Example: \include Matrix_setOnes_int.cpp
+  * Output: \verbinclude Matrix_setOnes_int.out
+  *
+  * \sa MatrixBase::setOnes(), setOnes(Index,Index), class CwiseNullaryOp, MatrixBase::Ones()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setOnes(Index newSize)
+{
+  resize(newSize);
+  return setConstant(Scalar(1));
+}
+
+/** Resizes to the given size, and sets all coefficients in this expression to one.
+  *
+  * \param rows the new number of rows
+  * \param cols the new number of columns
+  *
+  * Example: \include Matrix_setOnes_int_int.cpp
+  * Output: \verbinclude Matrix_setOnes_int_int.out
+  *
+  * \sa MatrixBase::setOnes(), setOnes(Index), class CwiseNullaryOp, MatrixBase::Ones()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
+{
+  resize(rows, cols);
+  return setConstant(Scalar(1));
+}
+
+/** Resizes to the given size, changing only the number of rows, and sets all
+  * coefficients in this expression to one. For the parameter of type NoChange_t,
+  * just pass the special value \c NoChange.
+  *
+ * \sa MatrixBase::setOnes(), setOnes(Index), setOnes(Index, Index), setOnes(NoChange_t, Index), class CwiseNullaryOp, MatrixBase::Ones()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setOnes(Index rows, NoChange_t)
+{
+  return setOnes(rows, cols());
+}
+
+/** Resizes to the given size, changing only the number of columns, and sets all
+  * coefficients in this expression to one. For the parameter of type NoChange_t,
+  * just pass the special value \c NoChange.
+  *
+ * \sa MatrixBase::setOnes(), setOnes(Index), setOnes(Index, Index), setOnes(Index, NoChange_t) class CwiseNullaryOp, MatrixBase::Ones()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setOnes(NoChange_t, Index cols)
+{
+  return setOnes(rows(), cols);
+}
+
+// Identity:
+
+/** \returns an expression of the identity matrix (not necessarily square).
+  *
+  * The parameters \a rows and \a cols are the number of rows and of columns of
+  * the returned matrix. Must be compatible with this MatrixBase type.
+  *
+  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
+  * it is redundant to pass \a rows and \a cols as arguments, so Identity() should be used
+  * instead.
+  *
+  * Example: \include MatrixBase_identity_int_int.cpp
+  * Output: \verbinclude MatrixBase_identity_int_int.out
+  *
+  * \sa Identity(), setIdentity(), isIdentity()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
+MatrixBase<Derived>::Identity(Index rows, Index cols)
+{
+  return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_identity_op<Scalar>());
+}
+
+/** \returns an expression of the identity matrix (not necessarily square).
+  *
+  * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
+  * need to use the variant taking size arguments.
+  *
+  * Example: \include MatrixBase_identity.cpp
+  * Output: \verbinclude MatrixBase_identity.out
+  *
+  * \sa Identity(Index,Index), setIdentity(), isIdentity()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
+MatrixBase<Derived>::Identity()
+{
+  EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
+  return MatrixBase<Derived>::NullaryExpr(RowsAtCompileTime, ColsAtCompileTime, internal::scalar_identity_op<Scalar>());
+}
+
+/** \returns true if *this is approximately equal to the identity matrix
+  *          (not necessarily square),
+  *          within the precision given by \a prec.
+  *
+  * Example: \include MatrixBase_isIdentity.cpp
+  * Output: \verbinclude MatrixBase_isIdentity.out
+  *
+  * \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), setIdentity()
+  */
+template<typename Derived>
+bool MatrixBase<Derived>::isIdentity
+(const RealScalar& prec) const
+{
+  typename internal::nested_eval<Derived,1>::type self(derived());
+  for(Index j = 0; j < cols(); ++j)
+  {
+    for(Index i = 0; i < rows(); ++i)
+    {
+      if(i == j)
+      {
+        if(!internal::isApprox(self.coeff(i, j), static_cast<Scalar>(1), prec))
+          return false;
+      }
+      else
+      {
+        if(!internal::isMuchSmallerThan(self.coeff(i, j), static_cast<RealScalar>(1), prec))
+          return false;
+      }
+    }
+  }
+  return true;
+}
+
+namespace internal {
+
+template<typename Derived, bool Big = (Derived::SizeAtCompileTime>=16)>
+struct setIdentity_impl
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Derived& run(Derived& m)
+  {
+    return m = Derived::Identity(m.rows(), m.cols());
+  }
+};
+
+template<typename Derived>
+struct setIdentity_impl<Derived, true>
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Derived& run(Derived& m)
+  {
+    m.setZero();
+    const Index size = numext::mini(m.rows(), m.cols());
+    for(Index i = 0; i < size; ++i) m.coeffRef(i,i) = typename Derived::Scalar(1);
+    return m;
+  }
+};
+
+} // end namespace internal
+
+/** Writes the identity expression (not necessarily square) into *this.
+  *
+  * Example: \include MatrixBase_setIdentity.cpp
+  * Output: \verbinclude MatrixBase_setIdentity.out
+  *
+  * \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), isIdentity()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
+{
+  return internal::setIdentity_impl<Derived>::run(derived());
+}
+
+/** \brief Resizes to the given size, and writes the identity expression (not necessarily square) into *this.
+  *
+  * \param rows the new number of rows
+  * \param cols the new number of columns
+  *
+  * Example: \include Matrix_setIdentity_int_int.cpp
+  * Output: \verbinclude Matrix_setIdentity_int_int.out
+  *
+  * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
+{
+  derived().resize(rows, cols);
+  return setIdentity();
+}
+
+/** \returns an expression of the i-th unit (basis) vector.
+  *
+  * \only_for_vectors
+  *
+  * \sa MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return BasisReturnType(SquareMatrixType::Identity(newSize,newSize), i);
+}
+
+/** \returns an expression of the i-th unit (basis) vector.
+  *
+  * \only_for_vectors
+  *
+  * This variant is for fixed-size vector only.
+  *
+  * \sa MatrixBase::Unit(Index,Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return BasisReturnType(SquareMatrixType::Identity(),i);
+}
+
+/** \returns an expression of the X axis unit vector (1{,0}^*)
+  *
+  * \only_for_vectors
+  *
+  * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX()
+{ return Derived::Unit(0); }
+
+/** \returns an expression of the Y axis unit vector (0,1{,0}^*)
+  *
+  * \only_for_vectors
+  *
+  * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY()
+{ return Derived::Unit(1); }
+
+/** \returns an expression of the Z axis unit vector (0,0,1{,0}^*)
+  *
+  * \only_for_vectors
+  *
+  * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ()
+{ return Derived::Unit(2); }
+
+/** \returns an expression of the W axis unit vector (0,0,0,1)
+  *
+  * \only_for_vectors
+  *
+  * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
+{ return Derived::Unit(3); }
+
+/** \brief Set the coefficients of \c *this to the i-th unit (basis) vector
+  *
+  * \param i index of the unique coefficient to be set to 1
+  *
+  * \only_for_vectors
+  *
+  * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index i)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  eigen_assert(i<size());
+  derived().setZero();
+  derived().coeffRef(i) = Scalar(1);
+  return derived();
+}
+
+/** \brief Resizes to the given \a newSize, and writes the i-th unit (basis) vector into *this.
+  *
+  * \param newSize the new size of the vector
+  * \param i index of the unique coefficient to be set to 1
+  *
+  * \only_for_vectors
+  *
+  * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index newSize, Index i)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  eigen_assert(i<newSize);
+  derived().resize(newSize);
+  return setUnit(i);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_CWISE_NULLARY_OP_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/CwiseTernaryOp.h b/src/3rdparty/eigen/Eigen/src/Core/CwiseTernaryOp.h
new file mode 100644
index 000000000..9f3576fec
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/CwiseTernaryOp.h
@@ -0,0 +1,197 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CWISE_TERNARY_OP_H
+#define EIGEN_CWISE_TERNARY_OP_H
+
+namespace Eigen {
+
+namespace internal {
+template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
+struct traits<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> > {
+  // we must not inherit from traits<Arg1> since it has
+  // the potential to cause problems with MSVC
+  typedef typename remove_all<Arg1>::type Ancestor;
+  typedef typename traits<Ancestor>::XprKind XprKind;
+  enum {
+    RowsAtCompileTime = traits<Ancestor>::RowsAtCompileTime,
+    ColsAtCompileTime = traits<Ancestor>::ColsAtCompileTime,
+    MaxRowsAtCompileTime = traits<Ancestor>::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = traits<Ancestor>::MaxColsAtCompileTime
+  };
+
+  // even though we require Arg1, Arg2, and Arg3 to have the same scalar type
+  // (see CwiseTernaryOp constructor),
+  // we still want to handle the case when the result type is different.
+  typedef typename result_of<TernaryOp(
+      const typename Arg1::Scalar&, const typename Arg2::Scalar&,
+      const typename Arg3::Scalar&)>::type Scalar;
+
+  typedef typename internal::traits<Arg1>::StorageKind StorageKind;
+  typedef typename internal::traits<Arg1>::StorageIndex StorageIndex;
+
+  typedef typename Arg1::Nested Arg1Nested;
+  typedef typename Arg2::Nested Arg2Nested;
+  typedef typename Arg3::Nested Arg3Nested;
+  typedef typename remove_reference<Arg1Nested>::type _Arg1Nested;
+  typedef typename remove_reference<Arg2Nested>::type _Arg2Nested;
+  typedef typename remove_reference<Arg3Nested>::type _Arg3Nested;
+  enum { Flags = _Arg1Nested::Flags & RowMajorBit };
+};
+}  // end namespace internal
+
+template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3,
+          typename StorageKind>
+class CwiseTernaryOpImpl;
+
+/** \class CwiseTernaryOp
+  * \ingroup Core_Module
+  *
+  * \brief Generic expression where a coefficient-wise ternary operator is
+ * applied to two expressions
+  *
+  * \tparam TernaryOp template functor implementing the operator
+  * \tparam Arg1Type the type of the first argument
+  * \tparam Arg2Type the type of the second argument
+  * \tparam Arg3Type the type of the third argument
+  *
+  * This class represents an expression where a coefficient-wise ternary
+ * operator is applied to three expressions.
+  * It is the return type of ternary operators, by which we mean only those
+ * ternary operators where
+  * all three arguments are Eigen expressions.
+  * For example, the return type of betainc(matrix1, matrix2, matrix3) is a
+ * CwiseTernaryOp.
+  *
+  * Most of the time, this is the only way that it is used, so you typically
+ * don't have to name
+  * CwiseTernaryOp types explicitly.
+  *
+  * \sa MatrixBase::ternaryExpr(const MatrixBase<Argument2> &, const
+ * MatrixBase<Argument3> &, const CustomTernaryOp &) const, class CwiseBinaryOp,
+ * class CwiseUnaryOp, class CwiseNullaryOp
+  */
+template <typename TernaryOp, typename Arg1Type, typename Arg2Type,
+          typename Arg3Type>
+class CwiseTernaryOp : public CwiseTernaryOpImpl<
+                           TernaryOp, Arg1Type, Arg2Type, Arg3Type,
+                           typename internal::traits<Arg1Type>::StorageKind>,
+                       internal::no_assignment_operator
+{
+ public:
+  typedef typename internal::remove_all<Arg1Type>::type Arg1;
+  typedef typename internal::remove_all<Arg2Type>::type Arg2;
+  typedef typename internal::remove_all<Arg3Type>::type Arg3;
+
+  typedef typename CwiseTernaryOpImpl<
+      TernaryOp, Arg1Type, Arg2Type, Arg3Type,
+      typename internal::traits<Arg1Type>::StorageKind>::Base Base;
+  EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseTernaryOp)
+
+  typedef typename internal::ref_selector<Arg1Type>::type Arg1Nested;
+  typedef typename internal::ref_selector<Arg2Type>::type Arg2Nested;
+  typedef typename internal::ref_selector<Arg3Type>::type Arg3Nested;
+  typedef typename internal::remove_reference<Arg1Nested>::type _Arg1Nested;
+  typedef typename internal::remove_reference<Arg2Nested>::type _Arg2Nested;
+  typedef typename internal::remove_reference<Arg3Nested>::type _Arg3Nested;
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE CwiseTernaryOp(const Arg1& a1, const Arg2& a2,
+                                     const Arg3& a3,
+                                     const TernaryOp& func = TernaryOp())
+      : m_arg1(a1), m_arg2(a2), m_arg3(a3), m_functor(func) {
+    // require the sizes to match
+    EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Arg1, Arg2)
+    EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Arg1, Arg3)
+
+    // The index types should match
+    EIGEN_STATIC_ASSERT((internal::is_same<
+                         typename internal::traits<Arg1Type>::StorageKind,
+                         typename internal::traits<Arg2Type>::StorageKind>::value),
+                        STORAGE_KIND_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<
+                         typename internal::traits<Arg1Type>::StorageKind,
+                         typename internal::traits<Arg3Type>::StorageKind>::value),
+                        STORAGE_KIND_MUST_MATCH)
+
+    eigen_assert(a1.rows() == a2.rows() && a1.cols() == a2.cols() &&
+                 a1.rows() == a3.rows() && a1.cols() == a3.cols());
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Index rows() const {
+    // return the fixed size type if available to enable compile time
+    // optimizations
+    if (internal::traits<typename internal::remove_all<Arg1Nested>::type>::
+                RowsAtCompileTime == Dynamic &&
+        internal::traits<typename internal::remove_all<Arg2Nested>::type>::
+                RowsAtCompileTime == Dynamic)
+      return m_arg3.rows();
+    else if (internal::traits<typename internal::remove_all<Arg1Nested>::type>::
+                     RowsAtCompileTime == Dynamic &&
+             internal::traits<typename internal::remove_all<Arg3Nested>::type>::
+                     RowsAtCompileTime == Dynamic)
+      return m_arg2.rows();
+    else
+      return m_arg1.rows();
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Index cols() const {
+    // return the fixed size type if available to enable compile time
+    // optimizations
+    if (internal::traits<typename internal::remove_all<Arg1Nested>::type>::
+                ColsAtCompileTime == Dynamic &&
+        internal::traits<typename internal::remove_all<Arg2Nested>::type>::
+                ColsAtCompileTime == Dynamic)
+      return m_arg3.cols();
+    else if (internal::traits<typename internal::remove_all<Arg1Nested>::type>::
+                     ColsAtCompileTime == Dynamic &&
+             internal::traits<typename internal::remove_all<Arg3Nested>::type>::
+                     ColsAtCompileTime == Dynamic)
+      return m_arg2.cols();
+    else
+      return m_arg1.cols();
+  }
+
+  /** \returns the first argument nested expression */
+  EIGEN_DEVICE_FUNC
+  const _Arg1Nested& arg1() const { return m_arg1; }
+  /** \returns the first argument nested expression */
+  EIGEN_DEVICE_FUNC
+  const _Arg2Nested& arg2() const { return m_arg2; }
+  /** \returns the third argument nested expression */
+  EIGEN_DEVICE_FUNC
+  const _Arg3Nested& arg3() const { return m_arg3; }
+  /** \returns the functor representing the ternary operation */
+  EIGEN_DEVICE_FUNC
+  const TernaryOp& functor() const { return m_functor; }
+
+ protected:
+  Arg1Nested m_arg1;
+  Arg2Nested m_arg2;
+  Arg3Nested m_arg3;
+  const TernaryOp m_functor;
+};
+
+// Generic API dispatcher
+template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3,
+          typename StorageKind>
+class CwiseTernaryOpImpl
+    : public internal::generic_xpr_base<
+          CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >::type {
+ public:
+  typedef typename internal::generic_xpr_base<
+      CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >::type Base;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CWISE_TERNARY_OP_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/CwiseUnaryOp.h b/src/3rdparty/eigen/Eigen/src/Core/CwiseUnaryOp.h
new file mode 100644
index 000000000..e68c4f748
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/CwiseUnaryOp.h
@@ -0,0 +1,103 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CWISE_UNARY_OP_H
+#define EIGEN_CWISE_UNARY_OP_H
+
+namespace Eigen {
+
+namespace internal {
+template<typename UnaryOp, typename XprType>
+struct traits<CwiseUnaryOp<UnaryOp, XprType> >
+ : traits<XprType>
+{
+  typedef typename result_of<
+                     UnaryOp(const typename XprType::Scalar&)
+                   >::type Scalar;
+  typedef typename XprType::Nested XprTypeNested;
+  typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
+  enum {
+    Flags = _XprTypeNested::Flags & RowMajorBit
+  };
+};
+}
+
+template<typename UnaryOp, typename XprType, typename StorageKind>
+class CwiseUnaryOpImpl;
+
+/** \class CwiseUnaryOp
+  * \ingroup Core_Module
+  *
+  * \brief Generic expression where a coefficient-wise unary operator is applied to an expression
+  *
+  * \tparam UnaryOp template functor implementing the operator
+  * \tparam XprType the type of the expression to which we are applying the unary operator
+  *
+  * This class represents an expression where a unary operator is applied to an expression.
+  * It is the return type of all operations taking exactly 1 input expression, regardless of the
+  * presence of other inputs such as scalars. For example, the operator* in the expression 3*matrix
+  * is considered unary, because only the right-hand side is an expression, and its
+  * return type is a specialization of CwiseUnaryOp.
+  *
+  * Most of the time, this is the only way that it is used, so you typically don't have to name
+  * CwiseUnaryOp types explicitly.
+  *
+  * \sa MatrixBase::unaryExpr(const CustomUnaryOp &) const, class CwiseBinaryOp, class CwiseNullaryOp
+  */
+template<typename UnaryOp, typename XprType>
+class CwiseUnaryOp : public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal::traits<XprType>::StorageKind>, internal::no_assignment_operator
+{
+  public:
+
+    typedef typename CwiseUnaryOpImpl<UnaryOp, XprType,typename internal::traits<XprType>::StorageKind>::Base Base;
+    EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryOp)
+    typedef typename internal::ref_selector<XprType>::type XprTypeNested;
+    typedef typename internal::remove_all<XprType>::type NestedExpression;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    explicit CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
+      : m_xpr(xpr), m_functor(func) {}
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_xpr.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_xpr.cols(); }
+
+    /** \returns the functor representing the unary operation */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const UnaryOp& functor() const { return m_functor; }
+
+    /** \returns the nested expression */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const typename internal::remove_all<XprTypeNested>::type&
+    nestedExpression() const { return m_xpr; }
+
+    /** \returns the nested expression */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    typename internal::remove_all<XprTypeNested>::type&
+    nestedExpression() { return m_xpr; }
+
+  protected:
+    XprTypeNested m_xpr;
+    const UnaryOp m_functor;
+};
+
+// Generic API dispatcher
+template<typename UnaryOp, typename XprType, typename StorageKind>
+class CwiseUnaryOpImpl
+  : public internal::generic_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type
+{
+public:
+  typedef typename internal::generic_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type Base;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CWISE_UNARY_OP_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/CwiseUnaryView.h b/src/3rdparty/eigen/Eigen/src/Core/CwiseUnaryView.h
new file mode 100644
index 000000000..a06d7621e
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/CwiseUnaryView.h
@@ -0,0 +1,132 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CWISE_UNARY_VIEW_H
+#define EIGEN_CWISE_UNARY_VIEW_H
+
+namespace Eigen {
+
+namespace internal {
+template<typename ViewOp, typename MatrixType>
+struct traits<CwiseUnaryView<ViewOp, MatrixType> >
+ : traits<MatrixType>
+{
+  typedef typename result_of<
+                     ViewOp(const typename traits<MatrixType>::Scalar&)
+                   >::type Scalar;
+  typedef typename MatrixType::Nested MatrixTypeNested;
+  typedef typename remove_all<MatrixTypeNested>::type _MatrixTypeNested;
+  enum {
+    FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
+    Flags = traits<_MatrixTypeNested>::Flags & (RowMajorBit | FlagsLvalueBit | DirectAccessBit), // FIXME DirectAccessBit should not be handled by expressions
+    MatrixTypeInnerStride =  inner_stride_at_compile_time<MatrixType>::ret,
+    // need to cast the sizeof's from size_t to int explicitly, otherwise:
+    // "error: no integral type can represent all of the enumerator values
+    InnerStrideAtCompileTime = MatrixTypeInnerStride == Dynamic
+                             ? int(Dynamic)
+                             : int(MatrixTypeInnerStride) * int(sizeof(typename traits<MatrixType>::Scalar) / sizeof(Scalar)),
+    OuterStrideAtCompileTime = outer_stride_at_compile_time<MatrixType>::ret == Dynamic
+                             ? int(Dynamic)
+                             : outer_stride_at_compile_time<MatrixType>::ret * int(sizeof(typename traits<MatrixType>::Scalar) / sizeof(Scalar))
+  };
+};
+}
+
+template<typename ViewOp, typename MatrixType, typename StorageKind>
+class CwiseUnaryViewImpl;
+
+/** \class CwiseUnaryView
+  * \ingroup Core_Module
+  *
+  * \brief Generic lvalue expression of a coefficient-wise unary operator of a matrix or a vector
+  *
+  * \tparam ViewOp template functor implementing the view
+  * \tparam MatrixType the type of the matrix we are applying the unary operator
+  *
+  * This class represents a lvalue expression of a generic unary view operator of a matrix or a vector.
+  * It is the return type of real() and imag(), and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::unaryViewExpr(const CustomUnaryOp &) const, class CwiseUnaryOp
+  */
+template<typename ViewOp, typename MatrixType>
+class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename internal::traits<MatrixType>::StorageKind>
+{
+  public:
+
+    typedef typename CwiseUnaryViewImpl<ViewOp, MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base;
+    EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryView)
+    typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested;
+    typedef typename internal::remove_all<MatrixType>::type NestedExpression;
+
+    explicit EIGEN_DEVICE_FUNC inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp())
+      : m_matrix(mat), m_functor(func) {}
+
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryView)
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+
+    /** \returns the functor representing unary operation */
+    EIGEN_DEVICE_FUNC const ViewOp& functor() const { return m_functor; }
+
+    /** \returns the nested expression */
+    EIGEN_DEVICE_FUNC const typename internal::remove_all<MatrixTypeNested>::type&
+    nestedExpression() const { return m_matrix; }
+
+    /** \returns the nested expression */
+    EIGEN_DEVICE_FUNC typename internal::remove_reference<MatrixTypeNested>::type&
+    nestedExpression() { return m_matrix; }
+
+  protected:
+    MatrixTypeNested m_matrix;
+    ViewOp m_functor;
+};
+
+// Generic API dispatcher
+template<typename ViewOp, typename XprType, typename StorageKind>
+class CwiseUnaryViewImpl
+  : public internal::generic_xpr_base<CwiseUnaryView<ViewOp, XprType> >::type
+{
+public:
+  typedef typename internal::generic_xpr_base<CwiseUnaryView<ViewOp, XprType> >::type Base;
+};
+
+template<typename ViewOp, typename MatrixType>
+class CwiseUnaryViewImpl<ViewOp,MatrixType,Dense>
+  : public internal::dense_xpr_base< CwiseUnaryView<ViewOp, MatrixType> >::type
+{
+  public:
+
+    typedef CwiseUnaryView<ViewOp, MatrixType> Derived;
+    typedef typename internal::dense_xpr_base< CwiseUnaryView<ViewOp, MatrixType> >::type Base;
+
+    EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryViewImpl)
+
+    EIGEN_DEVICE_FUNC inline Scalar* data() { return &(this->coeffRef(0)); }
+    EIGEN_DEVICE_FUNC inline const Scalar* data() const { return &(this->coeff(0)); }
+
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const
+    {
+      return derived().nestedExpression().innerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const
+    {
+      return derived().nestedExpression().outerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
+    }
+  protected:
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(CwiseUnaryViewImpl)
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CWISE_UNARY_VIEW_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/DenseBase.h b/src/3rdparty/eigen/Eigen/src/Core/DenseBase.h
new file mode 100644
index 000000000..9b16db68d
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/DenseBase.h
@@ -0,0 +1,701 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_DENSEBASE_H
+#define EIGEN_DENSEBASE_H
+
+namespace Eigen {
+
+namespace internal {
+
+// The index type defined by EIGEN_DEFAULT_DENSE_INDEX_TYPE must be a signed type.
+// This dummy function simply aims at checking that at compile time.
+static inline void check_DenseIndex_is_signed() {
+  EIGEN_STATIC_ASSERT(NumTraits<DenseIndex>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE)
+}
+
+} // end namespace internal
+
+/** \class DenseBase
+  * \ingroup Core_Module
+  *
+  * \brief Base class for all dense matrices, vectors, and arrays
+  *
+  * This class is the base that is inherited by all dense objects (matrix, vector, arrays,
+  * and related expression types). The common Eigen API for dense objects is contained in this class.
+  *
+  * \tparam Derived is the derived type, e.g., a matrix type or an expression.
+  *
+  * This class can be extended with the help of the plugin mechanism described on the page
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_DENSEBASE_PLUGIN.
+  *
+  * \sa \blank \ref TopicClassHierarchy
+  */
+template<typename Derived> class DenseBase
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  : public DenseCoeffsBase<Derived, internal::accessors_level<Derived>::value>
+#else
+  : public DenseCoeffsBase<Derived,DirectWriteAccessors>
+#endif // not EIGEN_PARSED_BY_DOXYGEN
+{
+  public:
+
+    /** Inner iterator type to iterate over the coefficients of a row or column.
+      * \sa class InnerIterator
+      */
+    typedef Eigen::InnerIterator<Derived> InnerIterator;
+
+    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+
+    /**
+      * \brief The type used to store indices
+      * \details This typedef is relevant for types that store multiple indices such as
+      *          PermutationMatrix or Transpositions, otherwise it defaults to Eigen::Index
+      * \sa \blank \ref TopicPreprocessorDirectives, Eigen::Index, SparseMatrixBase.
+     */
+    typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
+
+    /** The numeric type of the expression' coefficients, e.g. float, double, int or std::complex<float>, etc. */
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+
+    /** The numeric type of the expression' coefficients, e.g. float, double, int or std::complex<float>, etc.
+      *
+      * It is an alias for the Scalar type */
+    typedef Scalar value_type;
+
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef DenseCoeffsBase<Derived, internal::accessors_level<Derived>::value> Base;
+
+    using Base::derived;
+    using Base::const_cast_derived;
+    using Base::rows;
+    using Base::cols;
+    using Base::size;
+    using Base::rowIndexByOuterInner;
+    using Base::colIndexByOuterInner;
+    using Base::coeff;
+    using Base::coeffByOuterInner;
+    using Base::operator();
+    using Base::operator[];
+    using Base::x;
+    using Base::y;
+    using Base::z;
+    using Base::w;
+    using Base::stride;
+    using Base::innerStride;
+    using Base::outerStride;
+    using Base::rowStride;
+    using Base::colStride;
+    typedef typename Base::CoeffReturnType CoeffReturnType;
+
+    enum {
+
+      RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
+        /**< The number of rows at compile-time. This is just a copy of the value provided
+          * by the \a Derived type. If a value is not known at compile-time,
+          * it is set to the \a Dynamic constant.
+          * \sa MatrixBase::rows(), MatrixBase::cols(), ColsAtCompileTime, SizeAtCompileTime */
+
+      ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+        /**< The number of columns at compile-time. This is just a copy of the value provided
+          * by the \a Derived type. If a value is not known at compile-time,
+          * it is set to the \a Dynamic constant.
+          * \sa MatrixBase::rows(), MatrixBase::cols(), RowsAtCompileTime, SizeAtCompileTime */
+
+
+      SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
+                                                   internal::traits<Derived>::ColsAtCompileTime>::ret),
+        /**< This is equal to the number of coefficients, i.e. the number of
+          * rows times the number of columns, or to \a Dynamic if this is not
+          * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */
+
+      MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
+        /**< This value is equal to the maximum possible number of rows that this expression
+          * might have. If this expression might have an arbitrarily high number of rows,
+          * this value is set to \a Dynamic.
+          *
+          * This value is useful to know when evaluating an expression, in order to determine
+          * whether it is possible to avoid doing a dynamic memory allocation.
+          *
+          * \sa RowsAtCompileTime, MaxColsAtCompileTime, MaxSizeAtCompileTime
+          */
+
+      MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
+        /**< This value is equal to the maximum possible number of columns that this expression
+          * might have. If this expression might have an arbitrarily high number of columns,
+          * this value is set to \a Dynamic.
+          *
+          * This value is useful to know when evaluating an expression, in order to determine
+          * whether it is possible to avoid doing a dynamic memory allocation.
+          *
+          * \sa ColsAtCompileTime, MaxRowsAtCompileTime, MaxSizeAtCompileTime
+          */
+
+      MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
+                                                      internal::traits<Derived>::MaxColsAtCompileTime>::ret),
+        /**< This value is equal to the maximum possible number of coefficients that this expression
+          * might have. If this expression might have an arbitrarily high number of coefficients,
+          * this value is set to \a Dynamic.
+          *
+          * This value is useful to know when evaluating an expression, in order to determine
+          * whether it is possible to avoid doing a dynamic memory allocation.
+          *
+          * \sa SizeAtCompileTime, MaxRowsAtCompileTime, MaxColsAtCompileTime
+          */
+
+      IsVectorAtCompileTime = internal::traits<Derived>::RowsAtCompileTime == 1
+                           || internal::traits<Derived>::ColsAtCompileTime == 1,
+        /**< This is set to true if either the number of rows or the number of
+          * columns is known at compile-time to be equal to 1. Indeed, in that case,
+          * we are dealing with a column-vector (if there is only one column) or with
+          * a row-vector (if there is only one row). */
+
+      NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2,
+        /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors,
+         * and 2 for matrices.
+         */
+
+      Flags = internal::traits<Derived>::Flags,
+        /**< This stores expression \ref flags flags which may or may not be inherited by new expressions
+          * constructed from this one. See the \ref flags "list of flags".
+          */
+
+      IsRowMajor = int(Flags) & RowMajorBit, /**< True if this expression has row-major storage order. */
+
+      InnerSizeAtCompileTime = int(IsVectorAtCompileTime) ? int(SizeAtCompileTime)
+                             : int(IsRowMajor) ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
+
+      InnerStrideAtCompileTime = internal::inner_stride_at_compile_time<Derived>::ret,
+      OuterStrideAtCompileTime = internal::outer_stride_at_compile_time<Derived>::ret
+    };
+
+    typedef typename internal::find_best_packet<Scalar,SizeAtCompileTime>::type PacketScalar;
+
+    enum { IsPlainObjectBase = 0 };
+
+    /** The plain matrix type corresponding to this expression.
+      * \sa PlainObject */
+    typedef Matrix<typename internal::traits<Derived>::Scalar,
+                internal::traits<Derived>::RowsAtCompileTime,
+                internal::traits<Derived>::ColsAtCompileTime,
+                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
+                internal::traits<Derived>::MaxRowsAtCompileTime,
+                internal::traits<Derived>::MaxColsAtCompileTime
+          > PlainMatrix;
+
+    /** The plain array type corresponding to this expression.
+      * \sa PlainObject */
+    typedef Array<typename internal::traits<Derived>::Scalar,
+                internal::traits<Derived>::RowsAtCompileTime,
+                internal::traits<Derived>::ColsAtCompileTime,
+                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
+                internal::traits<Derived>::MaxRowsAtCompileTime,
+                internal::traits<Derived>::MaxColsAtCompileTime
+          > PlainArray;
+
+    /** \brief The plain matrix or array type corresponding to this expression.
+      *
+      * This is not necessarily exactly the return type of eval(). In the case of plain matrices,
+      * the return type of eval() is a const reference to a matrix, not a matrix! It is however guaranteed
+      * that the return type of eval() is either PlainObject or const PlainObject&.
+      */
+    typedef typename internal::conditional<internal::is_same<typename internal::traits<Derived>::XprKind,MatrixXpr >::value,
+                                 PlainMatrix, PlainArray>::type PlainObject;
+
+    /** \returns the number of nonzero coefficients which is in practice the number
+      * of stored coefficients. */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index nonZeros() const { return size(); }
+
+    /** \returns the outer size.
+      *
+      * \note For a vector, this returns just 1. For a matrix (non-vector), this is the major dimension
+      * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of columns for a
+      * column-major matrix, and the number of rows for a row-major matrix. */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index outerSize() const
+    {
+      return IsVectorAtCompileTime ? 1
+           : int(IsRowMajor) ? this->rows() : this->cols();
+    }
+
+    /** \returns the inner size.
+      *
+      * \note For a vector, this is just the size. For a matrix (non-vector), this is the minor dimension
+      * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a
+      * column-major matrix, and the number of columns for a row-major matrix. */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index innerSize() const
+    {
+      return IsVectorAtCompileTime ? this->size()
+           : int(IsRowMajor) ? this->cols() : this->rows();
+    }
+
+    /** Only plain matrices/arrays, not expressions, may be resized; therefore the only useful resize methods are
+      * Matrix::resize() and Array::resize(). The present method only asserts that the new size equals the old size, and does
+      * nothing else.
+      */
+    EIGEN_DEVICE_FUNC
+    void resize(Index newSize)
+    {
+      EIGEN_ONLY_USED_FOR_DEBUG(newSize);
+      eigen_assert(newSize == this->size()
+                && "DenseBase::resize() does not actually allow to resize.");
+    }
+    /** Only plain matrices/arrays, not expressions, may be resized; therefore the only useful resize methods are
+      * Matrix::resize() and Array::resize(). The present method only asserts that the new size equals the old size, and does
+      * nothing else.
+      */
+    EIGEN_DEVICE_FUNC
+    void resize(Index rows, Index cols)
+    {
+      EIGEN_ONLY_USED_FOR_DEBUG(rows);
+      EIGEN_ONLY_USED_FOR_DEBUG(cols);
+      eigen_assert(rows == this->rows() && cols == this->cols()
+                && "DenseBase::resize() does not actually allow to resize.");
+    }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** \internal Represents a matrix with all coefficients equal to one another*/
+    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,PlainObject> ConstantReturnType;
+    /** \internal \deprecated Represents a vector with linearly spaced coefficients that allows sequential access only. */
+    EIGEN_DEPRECATED typedef CwiseNullaryOp<internal::linspaced_op<Scalar>,PlainObject> SequentialLinSpacedReturnType;
+    /** \internal Represents a vector with linearly spaced coefficients that allows random access. */
+    typedef CwiseNullaryOp<internal::linspaced_op<Scalar>,PlainObject> RandomAccessLinSpacedReturnType;
+    /** \internal the return type of MatrixBase::eigenvalues() */
+    typedef Matrix<typename NumTraits<typename internal::traits<Derived>::Scalar>::Real, internal::traits<Derived>::ColsAtCompileTime, 1> EigenvaluesReturnType;
+
+#endif // not EIGEN_PARSED_BY_DOXYGEN
+
+    /** Copies \a other into *this. \returns a reference to *this. */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator=(const DenseBase<OtherDerived>& other);
+
+    /** Special case of the template operator=, in order to prevent the compiler
+      * from generating a default operator= (issue hit with g++ 4.1)
+      */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator=(const DenseBase& other);
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    Derived& operator=(const EigenBase<OtherDerived> &other);
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    Derived& operator+=(const EigenBase<OtherDerived> &other);
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    Derived& operator-=(const EigenBase<OtherDerived> &other);
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    Derived& operator=(const ReturnByValue<OtherDerived>& func);
+
+    /** \internal
+      * Copies \a other into *this without evaluating other. \returns a reference to *this. */
+    template<typename OtherDerived>
+    /** \deprecated */
+    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC
+    Derived& lazyAssign(const DenseBase<OtherDerived>& other);
+
+    EIGEN_DEVICE_FUNC
+    CommaInitializer<Derived> operator<< (const Scalar& s);
+
+    template<unsigned int Added,unsigned int Removed>
+    /** \deprecated it now returns \c *this */
+    EIGEN_DEPRECATED
+    const Derived& flagged() const
+    { return derived(); }
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    CommaInitializer<Derived> operator<< (const DenseBase<OtherDerived>& other);
+
+    typedef Transpose<Derived> TransposeReturnType;
+    EIGEN_DEVICE_FUNC
+    TransposeReturnType transpose();
+    typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
+    EIGEN_DEVICE_FUNC
+    ConstTransposeReturnType transpose() const;
+    EIGEN_DEVICE_FUNC
+    void transposeInPlace();
+
+    EIGEN_DEVICE_FUNC static const ConstantReturnType
+    Constant(Index rows, Index cols, const Scalar& value);
+    EIGEN_DEVICE_FUNC static const ConstantReturnType
+    Constant(Index size, const Scalar& value);
+    EIGEN_DEVICE_FUNC static const ConstantReturnType
+    Constant(const Scalar& value);
+
+    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
+    LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high);
+    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
+    LinSpaced(Sequential_t, const Scalar& low, const Scalar& high);
+
+    EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
+    LinSpaced(Index size, const Scalar& low, const Scalar& high);
+    EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
+    LinSpaced(const Scalar& low, const Scalar& high);
+
+    template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
+    static const CwiseNullaryOp<CustomNullaryOp, PlainObject>
+    NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func);
+    template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
+    static const CwiseNullaryOp<CustomNullaryOp, PlainObject>
+    NullaryExpr(Index size, const CustomNullaryOp& func);
+    template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
+    static const CwiseNullaryOp<CustomNullaryOp, PlainObject>
+    NullaryExpr(const CustomNullaryOp& func);
+
+    EIGEN_DEVICE_FUNC static const ConstantReturnType Zero(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC static const ConstantReturnType Zero(Index size);
+    EIGEN_DEVICE_FUNC static const ConstantReturnType Zero();
+    EIGEN_DEVICE_FUNC static const ConstantReturnType Ones(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC static const ConstantReturnType Ones(Index size);
+    EIGEN_DEVICE_FUNC static const ConstantReturnType Ones();
+
+    EIGEN_DEVICE_FUNC void fill(const Scalar& value);
+    EIGEN_DEVICE_FUNC Derived& setConstant(const Scalar& value);
+    EIGEN_DEVICE_FUNC Derived& setLinSpaced(Index size, const Scalar& low, const Scalar& high);
+    EIGEN_DEVICE_FUNC Derived& setLinSpaced(const Scalar& low, const Scalar& high);
+    EIGEN_DEVICE_FUNC Derived& setZero();
+    EIGEN_DEVICE_FUNC Derived& setOnes();
+    EIGEN_DEVICE_FUNC Derived& setRandom();
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC
+    bool isApprox(const DenseBase<OtherDerived>& other,
+                  const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    EIGEN_DEVICE_FUNC
+    bool isMuchSmallerThan(const RealScalar& other,
+                           const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC
+    bool isMuchSmallerThan(const DenseBase<OtherDerived>& other,
+                           const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+
+    EIGEN_DEVICE_FUNC bool isApproxToConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    EIGEN_DEVICE_FUNC bool isConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    EIGEN_DEVICE_FUNC bool isZero(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    EIGEN_DEVICE_FUNC bool isOnes(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+
+    inline bool hasNaN() const;
+    inline bool allFinite() const;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator*=(const Scalar& other);
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator/=(const Scalar& other);
+
+    typedef typename internal::add_const_on_value_type<typename internal::eval<Derived>::type>::type EvalReturnType;
+    /** \returns the matrix or vector obtained by evaluating this expression.
+      *
+      * Notice that in the case of a plain matrix or vector (not an expression) this function just returns
+      * a const reference, in order to avoid a useless copy.
+      *
+      * \warning Be careful with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink.
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE EvalReturnType eval() const
+    {
+      // Even though MSVC does not honor strong inlining when the return type
+      // is a dynamic matrix, we desperately need strong inlining for fixed
+      // size types on MSVC.
+      return typename internal::eval<Derived>::type(derived());
+    }
+
+    /** swaps *this with the expression \a other.
+      *
+      */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    void swap(const DenseBase<OtherDerived>& other)
+    {
+      EIGEN_STATIC_ASSERT(!OtherDerived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+      eigen_assert(rows()==other.rows() && cols()==other.cols());
+      call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op<Scalar>());
+    }
+
+    /** swaps *this with the matrix or array \a other.
+      *
+      */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    void swap(PlainObjectBase<OtherDerived>& other)
+    {
+      eigen_assert(rows()==other.rows() && cols()==other.cols());
+      call_assignment(derived(), other.derived(), internal::swap_assign_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC inline const NestByValue<Derived> nestByValue() const;
+    EIGEN_DEVICE_FUNC inline const ForceAlignedAccess<Derived> forceAlignedAccess() const;
+    EIGEN_DEVICE_FUNC inline ForceAlignedAccess<Derived> forceAlignedAccess();
+    template<bool Enable> EIGEN_DEVICE_FUNC
+    inline const typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf() const;
+    template<bool Enable> EIGEN_DEVICE_FUNC
+    inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf();
+
+    EIGEN_DEVICE_FUNC Scalar sum() const;
+    EIGEN_DEVICE_FUNC Scalar mean() const;
+    EIGEN_DEVICE_FUNC Scalar trace() const;
+
+    EIGEN_DEVICE_FUNC Scalar prod() const;
+
+    template<int NaNPropagation>
+    EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar minCoeff() const;
+    template<int NaNPropagation>
+    EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar maxCoeff() const;
+
+
+    // By default, the fastest version with undefined NaN propagation semantics is
+    // used.
+    // TODO(rmlarsen): Replace with default template argument when we move to
+    // c++11 or beyond.
+    EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar minCoeff() const {
+      return minCoeff<PropagateFast>();
+    }
+    EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar maxCoeff() const {
+      return maxCoeff<PropagateFast>();
+    }
+
+    template<int NaNPropagation, typename IndexType>
+    EIGEN_DEVICE_FUNC
+    typename internal::traits<Derived>::Scalar minCoeff(IndexType* row, IndexType* col) const;
+    template<int NaNPropagation, typename IndexType>
+    EIGEN_DEVICE_FUNC
+    typename internal::traits<Derived>::Scalar maxCoeff(IndexType* row, IndexType* col) const;
+    template<int NaNPropagation, typename IndexType>
+    EIGEN_DEVICE_FUNC
+    typename internal::traits<Derived>::Scalar minCoeff(IndexType* index) const;
+    template<int NaNPropagation, typename IndexType>
+    EIGEN_DEVICE_FUNC
+    typename internal::traits<Derived>::Scalar maxCoeff(IndexType* index) const;
+
+    // TODO(rmlarsen): Replace these methods with a default template argument.
+    template<typename IndexType>
+    EIGEN_DEVICE_FUNC inline
+    typename internal::traits<Derived>::Scalar minCoeff(IndexType* row, IndexType* col) const {
+      return minCoeff<PropagateFast>(row, col);
+    }
+    template<typename IndexType>
+    EIGEN_DEVICE_FUNC inline
+    typename internal::traits<Derived>::Scalar maxCoeff(IndexType* row, IndexType* col) const {
+      return maxCoeff<PropagateFast>(row, col);
+    }
+    template<typename IndexType>
+     EIGEN_DEVICE_FUNC inline
+    typename internal::traits<Derived>::Scalar minCoeff(IndexType* index) const {
+      return minCoeff<PropagateFast>(index);
+    }
+    template<typename IndexType>
+    EIGEN_DEVICE_FUNC inline
+    typename internal::traits<Derived>::Scalar maxCoeff(IndexType* index) const {
+      return maxCoeff<PropagateFast>(index);
+    }
+  
+    template<typename BinaryOp>
+    EIGEN_DEVICE_FUNC
+    Scalar redux(const BinaryOp& func) const;
+
+    template<typename Visitor>
+    EIGEN_DEVICE_FUNC
+    void visit(Visitor& func) const;
+
+    /** \returns a WithFormat proxy object allowing to print a matrix the with given
+      * format \a fmt.
+      *
+      * See class IOFormat for some examples.
+      *
+      * \sa class IOFormat, class WithFormat
+      */
+    inline const WithFormat<Derived> format(const IOFormat& fmt) const
+    {
+      return WithFormat<Derived>(derived(), fmt);
+    }
+
+    /** \returns the unique coefficient of a 1x1 expression */
+    EIGEN_DEVICE_FUNC
+    CoeffReturnType value() const
+    {
+      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
+      eigen_assert(this->rows() == 1 && this->cols() == 1);
+      return derived().coeff(0,0);
+    }
+
+    EIGEN_DEVICE_FUNC bool all() const;
+    EIGEN_DEVICE_FUNC bool any() const;
+    EIGEN_DEVICE_FUNC Index count() const;
+
+    typedef VectorwiseOp<Derived, Horizontal> RowwiseReturnType;
+    typedef const VectorwiseOp<const Derived, Horizontal> ConstRowwiseReturnType;
+    typedef VectorwiseOp<Derived, Vertical> ColwiseReturnType;
+    typedef const VectorwiseOp<const Derived, Vertical> ConstColwiseReturnType;
+
+    /** \returns a VectorwiseOp wrapper of *this for broadcasting and partial reductions
+    *
+    * Example: \include MatrixBase_rowwise.cpp
+    * Output: \verbinclude MatrixBase_rowwise.out
+    *
+    * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
+    */
+    //Code moved here due to a CUDA compiler bug
+    EIGEN_DEVICE_FUNC inline ConstRowwiseReturnType rowwise() const {
+      return ConstRowwiseReturnType(derived());
+    }
+    EIGEN_DEVICE_FUNC RowwiseReturnType rowwise();
+
+    /** \returns a VectorwiseOp wrapper of *this broadcasting and partial reductions
+    *
+    * Example: \include MatrixBase_colwise.cpp
+    * Output: \verbinclude MatrixBase_colwise.out
+    *
+    * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
+    */
+    EIGEN_DEVICE_FUNC inline ConstColwiseReturnType colwise() const {
+      return ConstColwiseReturnType(derived());
+    }
+    EIGEN_DEVICE_FUNC ColwiseReturnType colwise();
+
+    typedef CwiseNullaryOp<internal::scalar_random_op<Scalar>,PlainObject> RandomReturnType;
+    static const RandomReturnType Random(Index rows, Index cols);
+    static const RandomReturnType Random(Index size);
+    static const RandomReturnType Random();
+
+    template<typename ThenDerived,typename ElseDerived>
+    inline EIGEN_DEVICE_FUNC const Select<Derived,ThenDerived,ElseDerived>
+    select(const DenseBase<ThenDerived>& thenMatrix,
+           const DenseBase<ElseDerived>& elseMatrix) const;
+
+    template<typename ThenDerived>
+    inline EIGEN_DEVICE_FUNC const Select<Derived,ThenDerived, typename ThenDerived::ConstantReturnType>
+    select(const DenseBase<ThenDerived>& thenMatrix, const typename ThenDerived::Scalar& elseScalar) const;
+
+    template<typename ElseDerived>
+    inline EIGEN_DEVICE_FUNC const Select<Derived, typename ElseDerived::ConstantReturnType, ElseDerived >
+    select(const typename ElseDerived::Scalar& thenScalar, const DenseBase<ElseDerived>& elseMatrix) const;
+
+    template<int p> RealScalar lpNorm() const;
+
+    template<int RowFactor, int ColFactor>
+    EIGEN_DEVICE_FUNC
+    const Replicate<Derived,RowFactor,ColFactor> replicate() const;
+    /**
+    * \return an expression of the replication of \c *this
+    *
+    * Example: \include MatrixBase_replicate_int_int.cpp
+    * Output: \verbinclude MatrixBase_replicate_int_int.out
+    *
+    * \sa VectorwiseOp::replicate(), DenseBase::replicate<int,int>(), class Replicate
+    */
+    //Code moved here due to a CUDA compiler bug
+    EIGEN_DEVICE_FUNC
+    const Replicate<Derived, Dynamic, Dynamic> replicate(Index rowFactor, Index colFactor) const
+    {
+      return Replicate<Derived, Dynamic, Dynamic>(derived(), rowFactor, colFactor);
+    }
+
+    typedef Reverse<Derived, BothDirections> ReverseReturnType;
+    typedef const Reverse<const Derived, BothDirections> ConstReverseReturnType;
+    EIGEN_DEVICE_FUNC ReverseReturnType reverse();
+    /** This is the const version of reverse(). */
+    //Code moved here due to a CUDA compiler bug
+    EIGEN_DEVICE_FUNC ConstReverseReturnType reverse() const
+    {
+      return ConstReverseReturnType(derived());
+    }
+    EIGEN_DEVICE_FUNC void reverseInPlace();
+
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
+    /** STL-like <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">RandomAccessIterator</a>
+      * iterator type as returned by the begin() and end() methods.
+      */
+    typedef random_access_iterator_type iterator;
+    /** This is the const version of iterator (aka read-only) */
+    typedef random_access_iterator_type const_iterator;
+    #else
+    typedef typename internal::conditional< (Flags&DirectAccessBit)==DirectAccessBit,
+                                            internal::pointer_based_stl_iterator<Derived>,
+                                            internal::generic_randaccess_stl_iterator<Derived>
+                                          >::type iterator_type;
+
+    typedef typename internal::conditional< (Flags&DirectAccessBit)==DirectAccessBit,
+                                            internal::pointer_based_stl_iterator<const Derived>,
+                                            internal::generic_randaccess_stl_iterator<const Derived>
+                                          >::type const_iterator_type;
+
+    // Stl-style iterators are supported only for vectors.
+
+    typedef typename internal::conditional< IsVectorAtCompileTime,
+                                            iterator_type,
+                                            void
+                                          >::type iterator;
+
+    typedef typename internal::conditional< IsVectorAtCompileTime,
+                                            const_iterator_type,
+                                            void
+                                          >::type const_iterator;
+    #endif
+
+    inline iterator begin();
+    inline const_iterator begin() const;
+    inline const_iterator cbegin() const;
+    inline iterator end();
+    inline const_iterator end() const;
+    inline const_iterator cend() const;
+
+#define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::DenseBase
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
+#   include "../plugins/CommonCwiseUnaryOps.h"
+#   include "../plugins/BlockMethods.h"
+#   include "../plugins/IndexedViewMethods.h"
+#   include "../plugins/ReshapedMethods.h"
+#   ifdef EIGEN_DENSEBASE_PLUGIN
+#     include EIGEN_DENSEBASE_PLUGIN
+#   endif
+#undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
+#undef EIGEN_DOC_UNARY_ADDONS
+
+    // disable the use of evalTo for dense objects with a nice compilation error
+    template<typename Dest>
+    EIGEN_DEVICE_FUNC
+    inline void evalTo(Dest& ) const
+    {
+      EIGEN_STATIC_ASSERT((internal::is_same<Dest,void>::value),THE_EVAL_EVALTO_FUNCTION_SHOULD_NEVER_BE_CALLED_FOR_DENSE_OBJECTS);
+    }
+
+  protected:
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(DenseBase)
+    /** Default constructor. Do nothing. */
+    EIGEN_DEVICE_FUNC DenseBase()
+    {
+      /* Just checks for self-consistency of the flags.
+       * Only do it when debugging Eigen, as this borders on paranoia and could slow compilation down
+       */
+#ifdef EIGEN_INTERNAL_DEBUGGING
+      EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, int(IsRowMajor))
+                        && EIGEN_IMPLIES(MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1, int(!IsRowMajor))),
+                          INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION)
+#endif
+    }
+
+  private:
+    EIGEN_DEVICE_FUNC explicit DenseBase(int);
+    EIGEN_DEVICE_FUNC DenseBase(int,int);
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC explicit DenseBase(const DenseBase<OtherDerived>&);
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_DENSEBASE_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/DenseCoeffsBase.h b/src/3rdparty/eigen/Eigen/src/Core/DenseCoeffsBase.h
new file mode 100644
index 000000000..37fcdb591
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/DenseCoeffsBase.h
@@ -0,0 +1,685 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_DENSECOEFFSBASE_H
+#define EIGEN_DENSECOEFFSBASE_H
+
+namespace Eigen {
+
+namespace internal {
+template<typename T> struct add_const_on_value_type_if_arithmetic
+{
+  typedef typename conditional<is_arithmetic<T>::value, T, typename add_const_on_value_type<T>::type>::type type;
+};
+}
+
+/** \brief Base class providing read-only coefficient access to matrices and arrays.
+  * \ingroup Core_Module
+  * \tparam Derived Type of the derived class
+  *
+  * \note #ReadOnlyAccessors Constant indicating read-only access
+  *
+  * This class defines the \c operator() \c const function and friends, which can be used to read specific
+  * entries of a matrix or array.
+  *
+  * \sa DenseCoeffsBase<Derived, WriteAccessors>, DenseCoeffsBase<Derived, DirectAccessors>,
+  *     \ref TopicClassHierarchy
+  */
+template<typename Derived>
+class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
+{
+  public:
+    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+
+    // Explanation for this CoeffReturnType typedef.
+    // - This is the return type of the coeff() method.
+    // - The LvalueBit means exactly that we can offer a coeffRef() method, which means exactly that we can get references
+    // to coeffs, which means exactly that we can have coeff() return a const reference (as opposed to returning a value).
+    // - The is_artihmetic check is required since "const int", "const double", etc. will cause warnings on some systems
+    // while the declaration of "const T", where T is a non arithmetic type does not. Always returning "const Scalar&" is
+    // not possible, since the underlying expressions might not offer a valid address the reference could be referring to.
+    typedef typename internal::conditional<bool(internal::traits<Derived>::Flags&LvalueBit),
+                         const Scalar&,
+                         typename internal::conditional<internal::is_arithmetic<Scalar>::value, Scalar, const Scalar>::type
+                     >::type CoeffReturnType;
+
+    typedef typename internal::add_const_on_value_type_if_arithmetic<
+                         typename internal::packet_traits<Scalar>::type
+                     >::type PacketReturnType;
+
+    typedef EigenBase<Derived> Base;
+    using Base::rows;
+    using Base::cols;
+    using Base::size;
+    using Base::derived;
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner) const
+    {
+      return int(Derived::RowsAtCompileTime) == 1 ? 0
+          : int(Derived::ColsAtCompileTime) == 1 ? inner
+          : int(Derived::Flags)&RowMajorBit ? outer
+          : inner;
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner) const
+    {
+      return int(Derived::ColsAtCompileTime) == 1 ? 0
+          : int(Derived::RowsAtCompileTime) == 1 ? inner
+          : int(Derived::Flags)&RowMajorBit ? inner
+          : outer;
+    }
+
+    /** Short version: don't use this function, use
+      * \link operator()(Index,Index) const \endlink instead.
+      *
+      * Long version: this function is similar to
+      * \link operator()(Index,Index) const \endlink, but without the assertion.
+      * Use this for limiting the performance cost of debugging code when doing
+      * repeated coefficient access. Only use this when it is guaranteed that the
+      * parameters \a row and \a col are in range.
+      *
+      * If EIGEN_INTERNAL_DEBUGGING is defined, an assertion will be made, making this
+      * function equivalent to \link operator()(Index,Index) const \endlink.
+      *
+      * \sa operator()(Index,Index) const, coeffRef(Index,Index), coeff(Index) const
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const
+    {
+      eigen_internal_assert(row >= 0 && row < rows()
+                         && col >= 0 && col < cols());
+      return internal::evaluator<Derived>(derived()).coeff(row,col);
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
+    {
+      return coeff(rowIndexByOuterInner(outer, inner),
+                   colIndexByOuterInner(outer, inner));
+    }
+
+    /** \returns the coefficient at given the given row and column.
+      *
+      * \sa operator()(Index,Index), operator[](Index)
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE CoeffReturnType operator()(Index row, Index col) const
+    {
+      eigen_assert(row >= 0 && row < rows()
+          && col >= 0 && col < cols());
+      return coeff(row, col);
+    }
+
+    /** Short version: don't use this function, use
+      * \link operator[](Index) const \endlink instead.
+      *
+      * Long version: this function is similar to
+      * \link operator[](Index) const \endlink, but without the assertion.
+      * Use this for limiting the performance cost of debugging code when doing
+      * repeated coefficient access. Only use this when it is guaranteed that the
+      * parameter \a index is in range.
+      *
+      * If EIGEN_INTERNAL_DEBUGGING is defined, an assertion will be made, making this
+      * function equivalent to \link operator[](Index) const \endlink.
+      *
+      * \sa operator[](Index) const, coeffRef(Index), coeff(Index,Index) const
+      */
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE CoeffReturnType
+    coeff(Index index) const
+    {
+      EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
+                          THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
+      eigen_internal_assert(index >= 0 && index < size());
+      return internal::evaluator<Derived>(derived()).coeff(index);
+    }
+
+
+    /** \returns the coefficient at given index.
+      *
+      * This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
+      *
+      * \sa operator[](Index), operator()(Index,Index) const, x() const, y() const,
+      * z() const, w() const
+      */
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE CoeffReturnType
+    operator[](Index index) const
+    {
+      EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
+                          THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
+      eigen_assert(index >= 0 && index < size());
+      return coeff(index);
+    }
+
+    /** \returns the coefficient at given index.
+      *
+      * This is synonymous to operator[](Index) const.
+      *
+      * This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
+      *
+      * \sa operator[](Index), operator()(Index,Index) const, x() const, y() const,
+      * z() const, w() const
+      */
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE CoeffReturnType
+    operator()(Index index) const
+    {
+      eigen_assert(index >= 0 && index < size());
+      return coeff(index);
+    }
+
+    /** equivalent to operator[](0).  */
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE CoeffReturnType
+    x() const { return (*this)[0]; }
+
+    /** equivalent to operator[](1).  */
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE CoeffReturnType
+    y() const
+    {
+      EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=2, OUT_OF_RANGE_ACCESS);
+      return (*this)[1];
+    }
+
+    /** equivalent to operator[](2).  */
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE CoeffReturnType
+    z() const
+    {
+      EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=3, OUT_OF_RANGE_ACCESS);
+      return (*this)[2];
+    }
+
+    /** equivalent to operator[](3).  */
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE CoeffReturnType
+    w() const
+    {
+      EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=4, OUT_OF_RANGE_ACCESS);
+      return (*this)[3];
+    }
+
+    /** \internal
+      * \returns the packet of coefficients starting at the given row and column. It is your responsibility
+      * to ensure that a packet really starts there. This method is only available on expressions having the
+      * PacketAccessBit.
+      *
+      * The \a LoadMode parameter may have the value \a #Aligned or \a #Unaligned. Its effect is to select
+      * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
+      * starting at an address which is a multiple of the packet size.
+      */
+
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketReturnType packet(Index row, Index col) const
+    {
+      typedef typename internal::packet_traits<Scalar>::type DefaultPacketType;
+      eigen_internal_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
+      return internal::evaluator<Derived>(derived()).template packet<LoadMode,DefaultPacketType>(row,col);
+    }
+
+
+    /** \internal */
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketReturnType packetByOuterInner(Index outer, Index inner) const
+    {
+      return packet<LoadMode>(rowIndexByOuterInner(outer, inner),
+                              colIndexByOuterInner(outer, inner));
+    }
+
+    /** \internal
+      * \returns the packet of coefficients starting at the given index. It is your responsibility
+      * to ensure that a packet really starts there. This method is only available on expressions having the
+      * PacketAccessBit and the LinearAccessBit.
+      *
+      * The \a LoadMode parameter may have the value \a #Aligned or \a #Unaligned. Its effect is to select
+      * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
+      * starting at an address which is a multiple of the packet size.
+      */
+
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+    {
+      EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
+                          THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
+      typedef typename internal::packet_traits<Scalar>::type DefaultPacketType;
+      eigen_internal_assert(index >= 0 && index < size());
+      return internal::evaluator<Derived>(derived()).template packet<LoadMode,DefaultPacketType>(index);
+    }
+
+  protected:
+    // explanation: DenseBase is doing "using ..." on the methods from DenseCoeffsBase.
+    // But some methods are only available in the DirectAccess case.
+    // So we add dummy methods here with these names, so that "using... " doesn't fail.
+    // It's not private so that the child class DenseBase can access them, and it's not public
+    // either since it's an implementation detail, so has to be protected.
+    void coeffRef();
+    void coeffRefByOuterInner();
+    void writePacket();
+    void writePacketByOuterInner();
+    void copyCoeff();
+    void copyCoeffByOuterInner();
+    void copyPacket();
+    void copyPacketByOuterInner();
+    void stride();
+    void innerStride();
+    void outerStride();
+    void rowStride();
+    void colStride();
+};
+
+/** \brief Base class providing read/write coefficient access to matrices and arrays.
+  * \ingroup Core_Module
+  * \tparam Derived Type of the derived class
+  *
+  * \note #WriteAccessors Constant indicating read/write access
+  *
+  * This class defines the non-const \c operator() function and friends, which can be used to write specific
+  * entries of a matrix or array. This class inherits DenseCoeffsBase<Derived, ReadOnlyAccessors> which
+  * defines the const variant for reading specific entries.
+  *
+  * \sa DenseCoeffsBase<Derived, DirectAccessors>, \ref TopicClassHierarchy
+  */
+template<typename Derived>
+class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived, ReadOnlyAccessors>
+{
+  public:
+
+    typedef DenseCoeffsBase<Derived, ReadOnlyAccessors> Base;
+
+    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    using Base::coeff;
+    using Base::rows;
+    using Base::cols;
+    using Base::size;
+    using Base::derived;
+    using Base::rowIndexByOuterInner;
+    using Base::colIndexByOuterInner;
+    using Base::operator[];
+    using Base::operator();
+    using Base::x;
+    using Base::y;
+    using Base::z;
+    using Base::w;
+
+    /** Short version: don't use this function, use
+      * \link operator()(Index,Index) \endlink instead.
+      *
+      * Long version: this function is similar to
+      * \link operator()(Index,Index) \endlink, but without the assertion.
+      * Use this for limiting the performance cost of debugging code when doing
+      * repeated coefficient access. Only use this when it is guaranteed that the
+      * parameters \a row and \a col are in range.
+      *
+      * If EIGEN_INTERNAL_DEBUGGING is defined, an assertion will be made, making this
+      * function equivalent to \link operator()(Index,Index) \endlink.
+      *
+      * \sa operator()(Index,Index), coeff(Index, Index) const, coeffRef(Index)
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col)
+    {
+      eigen_internal_assert(row >= 0 && row < rows()
+                         && col >= 0 && col < cols());
+      return internal::evaluator<Derived>(derived()).coeffRef(row,col);
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar&
+    coeffRefByOuterInner(Index outer, Index inner)
+    {
+      return coeffRef(rowIndexByOuterInner(outer, inner),
+                      colIndexByOuterInner(outer, inner));
+    }
+
+    /** \returns a reference to the coefficient at given the given row and column.
+      *
+      * \sa operator[](Index)
+      */
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar&
+    operator()(Index row, Index col)
+    {
+      eigen_assert(row >= 0 && row < rows()
+          && col >= 0 && col < cols());
+      return coeffRef(row, col);
+    }
+
+
+    /** Short version: don't use this function, use
+      * \link operator[](Index) \endlink instead.
+      *
+      * Long version: this function is similar to
+      * \link operator[](Index) \endlink, but without the assertion.
+      * Use this for limiting the performance cost of debugging code when doing
+      * repeated coefficient access. Only use this when it is guaranteed that the
+      * parameters \a row and \a col are in range.
+      *
+      * If EIGEN_INTERNAL_DEBUGGING is defined, an assertion will be made, making this
+      * function equivalent to \link operator[](Index) \endlink.
+      *
+      * \sa operator[](Index), coeff(Index) const, coeffRef(Index,Index)
+      */
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar&
+    coeffRef(Index index)
+    {
+      EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
+                          THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
+      eigen_internal_assert(index >= 0 && index < size());
+      return internal::evaluator<Derived>(derived()).coeffRef(index);
+    }
+
+    /** \returns a reference to the coefficient at given index.
+      *
+      * This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
+      *
+      * \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
+      */
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar&
+    operator[](Index index)
+    {
+      EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
+                          THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
+      eigen_assert(index >= 0 && index < size());
+      return coeffRef(index);
+    }
+
+    /** \returns a reference to the coefficient at given index.
+      *
+      * This is synonymous to operator[](Index).
+      *
+      * This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
+      *
+      * \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
+      */
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar&
+    operator()(Index index)
+    {
+      eigen_assert(index >= 0 && index < size());
+      return coeffRef(index);
+    }
+
+    /** equivalent to operator[](0).  */
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar&
+    x() { return (*this)[0]; }
+
+    /** equivalent to operator[](1).  */
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar&
+    y()
+    {
+      EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=2, OUT_OF_RANGE_ACCESS);
+      return (*this)[1];
+    }
+
+    /** equivalent to operator[](2).  */
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar&
+    z()
+    {
+      EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=3, OUT_OF_RANGE_ACCESS);
+      return (*this)[2];
+    }
+
+    /** equivalent to operator[](3).  */
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar&
+    w()
+    {
+      EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=4, OUT_OF_RANGE_ACCESS);
+      return (*this)[3];
+    }
+};
+
+/** \brief Base class providing direct read-only coefficient access to matrices and arrays.
+  * \ingroup Core_Module
+  * \tparam Derived Type of the derived class
+  *
+  * \note #DirectAccessors Constant indicating direct access
+  *
+  * This class defines functions to work with strides which can be used to access entries directly. This class
+  * inherits DenseCoeffsBase<Derived, ReadOnlyAccessors> which defines functions to access entries read-only using
+  * \c operator() .
+  *
+  * \sa \blank \ref TopicClassHierarchy
+  */
+template<typename Derived>
+class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived, ReadOnlyAccessors>
+{
+  public:
+
+    typedef DenseCoeffsBase<Derived, ReadOnlyAccessors> Base;
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    using Base::rows;
+    using Base::cols;
+    using Base::size;
+    using Base::derived;
+
+    /** \returns the pointer increment between two consecutive elements within a slice in the inner direction.
+      *
+      * \sa outerStride(), rowStride(), colStride()
+      */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const
+    {
+      return derived().innerStride();
+    }
+
+    /** \returns the pointer increment between two consecutive inner slices (for example, between two consecutive columns
+      *          in a column-major matrix).
+      *
+      * \sa innerStride(), rowStride(), colStride()
+      */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const
+    {
+      return derived().outerStride();
+    }
+
+    // FIXME shall we remove it ?
+    EIGEN_CONSTEXPR inline Index stride() const
+    {
+      return Derived::IsVectorAtCompileTime ? innerStride() : outerStride();
+    }
+
+    /** \returns the pointer increment between two consecutive rows.
+      *
+      * \sa innerStride(), outerStride(), colStride()
+      */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rowStride() const
+    {
+      return Derived::IsRowMajor ? outerStride() : innerStride();
+    }
+
+    /** \returns the pointer increment between two consecutive columns.
+      *
+      * \sa innerStride(), outerStride(), rowStride()
+      */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index colStride() const
+    {
+      return Derived::IsRowMajor ? innerStride() : outerStride();
+    }
+};
+
+/** \brief Base class providing direct read/write coefficient access to matrices and arrays.
+  * \ingroup Core_Module
+  * \tparam Derived Type of the derived class
+  *
+  * \note #DirectWriteAccessors Constant indicating direct access
+  *
+  * This class defines functions to work with strides which can be used to access entries directly. This class
+  * inherits DenseCoeffsBase<Derived, WriteAccessors> which defines functions to access entries read/write using
+  * \c operator().
+  *
+  * \sa \blank \ref TopicClassHierarchy
+  */
+template<typename Derived>
+class DenseCoeffsBase<Derived, DirectWriteAccessors>
+  : public DenseCoeffsBase<Derived, WriteAccessors>
+{
+  public:
+
+    typedef DenseCoeffsBase<Derived, WriteAccessors> Base;
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    using Base::rows;
+    using Base::cols;
+    using Base::size;
+    using Base::derived;
+
+    /** \returns the pointer increment between two consecutive elements within a slice in the inner direction.
+      *
+      * \sa outerStride(), rowStride(), colStride()
+      */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT
+    {
+      return derived().innerStride();
+    }
+
+    /** \returns the pointer increment between two consecutive inner slices (for example, between two consecutive columns
+      *          in a column-major matrix).
+      *
+      * \sa innerStride(), rowStride(), colStride()
+      */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT
+    {
+      return derived().outerStride();
+    }
+
+    // FIXME shall we remove it ?
+    EIGEN_CONSTEXPR inline Index stride() const EIGEN_NOEXCEPT
+    {
+      return Derived::IsVectorAtCompileTime ? innerStride() : outerStride();
+    }
+
+    /** \returns the pointer increment between two consecutive rows.
+      *
+      * \sa innerStride(), outerStride(), colStride()
+      */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rowStride() const EIGEN_NOEXCEPT
+    {
+      return Derived::IsRowMajor ? outerStride() : innerStride();
+    }
+
+    /** \returns the pointer increment between two consecutive columns.
+      *
+      * \sa innerStride(), outerStride(), rowStride()
+      */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index colStride() const EIGEN_NOEXCEPT
+    {
+      return Derived::IsRowMajor ? innerStride() : outerStride();
+    }
+};
+
+namespace internal {
+
+template<int Alignment, typename Derived, bool JustReturnZero>
+struct first_aligned_impl
+{
+  static EIGEN_CONSTEXPR inline Index run(const Derived&) EIGEN_NOEXCEPT
+  { return 0; }
+};
+
+template<int Alignment, typename Derived>
+struct first_aligned_impl<Alignment, Derived, false>
+{
+  static inline Index run(const Derived& m)
+  {
+    return internal::first_aligned<Alignment>(m.data(), m.size());
+  }
+};
+
+/** \internal \returns the index of the first element of the array stored by \a m that is properly aligned with respect to \a Alignment for vectorization.
+  *
+  * \tparam Alignment requested alignment in Bytes.
+  *
+  * There is also the variant first_aligned(const Scalar*, Integer) defined in Memory.h. See it for more
+  * documentation.
+  */
+template<int Alignment, typename Derived>
+static inline Index first_aligned(const DenseBase<Derived>& m)
+{
+  enum { ReturnZero = (int(evaluator<Derived>::Alignment) >= Alignment) || !(Derived::Flags & DirectAccessBit) };
+  return first_aligned_impl<Alignment, Derived, ReturnZero>::run(m.derived());
+}
+
+template<typename Derived>
+static inline Index first_default_aligned(const DenseBase<Derived>& m)
+{
+  typedef typename Derived::Scalar Scalar;
+  typedef typename packet_traits<Scalar>::type DefaultPacketType;
+  return internal::first_aligned<int(unpacket_traits<DefaultPacketType>::alignment),Derived>(m);
+}
+
+template<typename Derived, bool HasDirectAccess = has_direct_access<Derived>::ret>
+struct inner_stride_at_compile_time
+{
+  enum { ret = traits<Derived>::InnerStrideAtCompileTime };
+};
+
+template<typename Derived>
+struct inner_stride_at_compile_time<Derived, false>
+{
+  enum { ret = 0 };
+};
+
+template<typename Derived, bool HasDirectAccess = has_direct_access<Derived>::ret>
+struct outer_stride_at_compile_time
+{
+  enum { ret = traits<Derived>::OuterStrideAtCompileTime };
+};
+
+template<typename Derived>
+struct outer_stride_at_compile_time<Derived, false>
+{
+  enum { ret = 0 };
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_DENSECOEFFSBASE_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/DenseStorage.h b/src/3rdparty/eigen/Eigen/src/Core/DenseStorage.h
new file mode 100644
index 000000000..08ef6c530
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/DenseStorage.h
@@ -0,0 +1,652 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2010-2013 Hauke Heibel <hauke.heibel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIXSTORAGE_H
+#define EIGEN_MATRIXSTORAGE_H
+
+#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+  #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X) X; EIGEN_DENSE_STORAGE_CTOR_PLUGIN;
+#else
+  #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X)
+#endif
+
+namespace Eigen {
+
+namespace internal {
+
+struct constructor_without_unaligned_array_assert {};
+
+template<typename T, int Size>
+EIGEN_DEVICE_FUNC
+void check_static_allocation_size()
+{
+  // if EIGEN_STACK_ALLOCATION_LIMIT is defined to 0, then no limit
+  #if EIGEN_STACK_ALLOCATION_LIMIT
+  EIGEN_STATIC_ASSERT(Size * sizeof(T) <= EIGEN_STACK_ALLOCATION_LIMIT, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG);
+  #endif
+}
+
+/** \internal
+  * Static array. If the MatrixOrArrayOptions require auto-alignment, the array will be automatically aligned:
+  * to 16 bytes boundary if the total size is a multiple of 16 bytes.
+  */
+template <typename T, int Size, int MatrixOrArrayOptions,
+          int Alignment = (MatrixOrArrayOptions&DontAlign) ? 0
+                        : compute_default_alignment<T,Size>::value >
+struct plain_array
+{
+  T array[Size];
+
+  EIGEN_DEVICE_FUNC
+  plain_array()
+  {
+    check_static_allocation_size<T,Size>();
+  }
+
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert)
+  {
+    check_static_allocation_size<T,Size>();
+  }
+};
+
+#if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
+  #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask)
+#elif EIGEN_GNUC_AT_LEAST(4,7)
+  // GCC 4.7 is too aggressive in its optimizations and remove the alignment test based on the fact the array is declared to be aligned.
+  // See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900
+  // Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined:
+  template<typename PtrType>
+  EIGEN_ALWAYS_INLINE PtrType eigen_unaligned_array_assert_workaround_gcc47(PtrType array) { return array; }
+  #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
+    eigen_assert((internal::UIntPtr(eigen_unaligned_array_assert_workaround_gcc47(array)) & (sizemask)) == 0 \
+              && "this assertion is explained here: " \
+              "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
+              " **** READ THIS WEB PAGE !!! ****");
+#else
+  #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
+    eigen_assert((internal::UIntPtr(array) & (sizemask)) == 0 \
+              && "this assertion is explained here: " \
+              "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
+              " **** READ THIS WEB PAGE !!! ****");
+#endif
+
+template <typename T, int Size, int MatrixOrArrayOptions>
+struct plain_array<T, Size, MatrixOrArrayOptions, 8>
+{
+  EIGEN_ALIGN_TO_BOUNDARY(8) T array[Size];
+
+  EIGEN_DEVICE_FUNC
+  plain_array()
+  {
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(7);
+    check_static_allocation_size<T,Size>();
+  }
+
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert)
+  {
+    check_static_allocation_size<T,Size>();
+  }
+};
+
+template <typename T, int Size, int MatrixOrArrayOptions>
+struct plain_array<T, Size, MatrixOrArrayOptions, 16>
+{
+  EIGEN_ALIGN_TO_BOUNDARY(16) T array[Size];
+
+  EIGEN_DEVICE_FUNC
+  plain_array()
+  {
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(15);
+    check_static_allocation_size<T,Size>();
+  }
+
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert)
+  {
+    check_static_allocation_size<T,Size>();
+  }
+};
+
+template <typename T, int Size, int MatrixOrArrayOptions>
+struct plain_array<T, Size, MatrixOrArrayOptions, 32>
+{
+  EIGEN_ALIGN_TO_BOUNDARY(32) T array[Size];
+
+  EIGEN_DEVICE_FUNC
+  plain_array()
+  {
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(31);
+    check_static_allocation_size<T,Size>();
+  }
+
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert)
+  {
+    check_static_allocation_size<T,Size>();
+  }
+};
+
+template <typename T, int Size, int MatrixOrArrayOptions>
+struct plain_array<T, Size, MatrixOrArrayOptions, 64>
+{
+  EIGEN_ALIGN_TO_BOUNDARY(64) T array[Size];
+
+  EIGEN_DEVICE_FUNC
+  plain_array()
+  {
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(63);
+    check_static_allocation_size<T,Size>();
+  }
+
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert)
+  {
+    check_static_allocation_size<T,Size>();
+  }
+};
+
+template <typename T, int MatrixOrArrayOptions, int Alignment>
+struct plain_array<T, 0, MatrixOrArrayOptions, Alignment>
+{
+  T array[1];
+  EIGEN_DEVICE_FUNC plain_array() {}
+  EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {}
+};
+
+struct plain_array_helper {
+  template<typename T, int Size, int MatrixOrArrayOptions, int Alignment>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  static void copy(const plain_array<T, Size, MatrixOrArrayOptions, Alignment>& src, const Eigen::Index size,
+                         plain_array<T, Size, MatrixOrArrayOptions, Alignment>& dst) {
+    smart_copy(src.array, src.array + size, dst.array);
+  }
+  
+  template<typename T, int Size, int MatrixOrArrayOptions, int Alignment>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  static void swap(plain_array<T, Size, MatrixOrArrayOptions, Alignment>& a, const Eigen::Index a_size,
+                   plain_array<T, Size, MatrixOrArrayOptions, Alignment>& b, const Eigen::Index b_size) {
+    if (a_size < b_size) {
+      std::swap_ranges(b.array, b.array + a_size, a.array);
+      smart_move(b.array + a_size, b.array + b_size, a.array + a_size);
+    } else if (a_size > b_size) {
+      std::swap_ranges(a.array, a.array + b_size, b.array);
+      smart_move(a.array + b_size, a.array + a_size, b.array + b_size);
+    } else {
+      std::swap_ranges(a.array, a.array + a_size, b.array);
+    }
+  }
+};
+
+} // end namespace internal
+
+/** \internal
+  *
+  * \class DenseStorage
+  * \ingroup Core_Module
+  *
+  * \brief Stores the data of a matrix
+  *
+  * This class stores the data of fixed-size, dynamic-size or mixed matrices
+  * in a way as compact as possible.
+  *
+  * \sa Matrix
+  */
+template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseStorage;
+
+// purely fixed-size matrix
+template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseStorage
+{
+    internal::plain_array<T,Size,_Options> m_data;
+  public:
+    EIGEN_DEVICE_FUNC DenseStorage() {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
+    }
+    EIGEN_DEVICE_FUNC
+    explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
+      : m_data(internal::constructor_without_unaligned_array_assert()) {}
+#if !EIGEN_HAS_CXX11 || defined(EIGEN_DENSE_STORAGE_CTOR_PLUGIN)
+    EIGEN_DEVICE_FUNC
+    DenseStorage(const DenseStorage& other) : m_data(other.m_data) {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
+    }
+#else
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage&) = default;
+#endif
+#if !EIGEN_HAS_CXX11
+    EIGEN_DEVICE_FUNC
+    DenseStorage& operator=(const DenseStorage& other)
+    {
+      if (this != &other) m_data = other.m_data;
+      return *this;
+    }
+#else
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) = default;
+#endif
+#if EIGEN_HAS_RVALUE_REFERENCES
+#if !EIGEN_HAS_CXX11
+    EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
+      : m_data(std::move(other.m_data))
+    {
+    }
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
+    {
+      if (this != &other)
+        m_data = std::move(other.m_data);
+      return *this;
+    }
+#else
+    EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&&) = default;
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(DenseStorage&&) = default;
+#endif
+#endif
+    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+      eigen_internal_assert(size==rows*cols && rows==_Rows && cols==_Cols);
+      EIGEN_UNUSED_VARIABLE(size);
+      EIGEN_UNUSED_VARIABLE(rows);
+      EIGEN_UNUSED_VARIABLE(cols);
+    }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      numext::swap(m_data, other.m_data);
+    }
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return _Rows;}
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) EIGEN_NOEXCEPT {return _Cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC void resize(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
+};
+
+// null matrix
+template<typename T, int _Rows, int _Cols, int _Options> class DenseStorage<T, 0, _Rows, _Cols, _Options>
+{
+  public:
+    EIGEN_DEVICE_FUNC DenseStorage() {}
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) {}
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage&) {}
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) { return *this; }
+    EIGEN_DEVICE_FUNC DenseStorage(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& ) {}
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return _Rows;}
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) EIGEN_NOEXCEPT {return _Cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC void resize(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC const T *data() const { return 0; }
+    EIGEN_DEVICE_FUNC T *data() { return 0; }
+};
+
+// more specializations for null matrices; these are necessary to resolve ambiguities
+template<typename T, int _Options> class DenseStorage<T, 0, Dynamic, Dynamic, _Options>
+: public DenseStorage<T, 0, 0, 0, _Options> { };
+
+template<typename T, int _Rows, int _Options> class DenseStorage<T, 0, _Rows, Dynamic, _Options>
+: public DenseStorage<T, 0, 0, 0, _Options> { };
+
+template<typename T, int _Cols, int _Options> class DenseStorage<T, 0, Dynamic, _Cols, _Options>
+: public DenseStorage<T, 0, 0, 0, _Options> { };
+
+// dynamic-size matrix with fixed-size storage
+template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic, Dynamic, _Options>
+{
+    internal::plain_array<T,Size,_Options> m_data;
+    Index m_rows;
+    Index m_cols;
+  public:
+    EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0), m_cols(0) {}
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
+      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0), m_cols(0) {}
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
+      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(other.m_rows), m_cols(other.m_cols)
+    {
+      internal::plain_array_helper::copy(other.m_data, m_rows * m_cols, m_data);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
+    {
+      if (this != &other)
+      {
+        m_rows = other.m_rows;
+        m_cols = other.m_cols;
+        internal::plain_array_helper::copy(other.m_data, m_rows * m_cols, m_data);
+      }
+      return *this;
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
+    {
+      internal::plain_array_helper::swap(m_data, m_rows * m_cols, other.m_data, other.m_rows * other.m_cols);
+      numext::swap(m_rows,other.m_rows);
+      numext::swap(m_cols,other.m_cols);
+    }
+    EIGEN_DEVICE_FUNC Index rows() const {return m_rows;}
+    EIGEN_DEVICE_FUNC Index cols() const {return m_cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
+    EIGEN_DEVICE_FUNC void resize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
+};
+
+// dynamic-size matrix with fixed-size storage and fixed width
+template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Size, Dynamic, _Cols, _Options>
+{
+    internal::plain_array<T,Size,_Options> m_data;
+    Index m_rows;
+  public:
+    EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0) {}
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
+      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0) {}
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
+      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(other.m_rows)
+    {
+      internal::plain_array_helper::copy(other.m_data, m_rows * _Cols, m_data);
+    }
+    
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
+    {
+      if (this != &other)
+      {
+        m_rows = other.m_rows;
+        internal::plain_array_helper::copy(other.m_data, m_rows * _Cols, m_data);
+      }
+      return *this;
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
+    { 
+      internal::plain_array_helper::swap(m_data, m_rows * _Cols, other.m_data, other.m_rows * _Cols);
+      numext::swap(m_rows, other.m_rows);
+    }
+    EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;}
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols(void) const EIGEN_NOEXCEPT {return _Cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) { m_rows = rows; }
+    EIGEN_DEVICE_FUNC void resize(Index, Index rows, Index) { m_rows = rows; }
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
+};
+
+// dynamic-size matrix with fixed-size storage and fixed height
+template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Size, _Rows, Dynamic, _Options>
+{
+    internal::plain_array<T,Size,_Options> m_data;
+    Index m_cols;
+  public:
+    EIGEN_DEVICE_FUNC DenseStorage() : m_cols(0) {}
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
+      : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(0) {}
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) 
+      : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(other.m_cols)
+    {
+      internal::plain_array_helper::copy(other.m_data, _Rows * m_cols, m_data);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
+    {
+      if (this != &other)
+      {
+        m_cols = other.m_cols;
+        internal::plain_array_helper::copy(other.m_data, _Rows * m_cols, m_data);
+      }
+      return *this;
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) {}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      internal::plain_array_helper::swap(m_data, _Rows * m_cols, other.m_data, _Rows * other.m_cols);
+      numext::swap(m_cols, other.m_cols);
+    }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows(void) const EIGEN_NOEXCEPT {return _Rows;}
+    EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
+    EIGEN_DEVICE_FUNC void resize(Index, Index, Index cols) { m_cols = cols; }
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
+};
+
+// purely dynamic matrix.
+template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynamic, _Options>
+{
+    T *m_data;
+    Index m_rows;
+    Index m_cols;
+  public:
+    EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0), m_cols(0) {}
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
+       : m_data(0), m_rows(0), m_cols(0) {}
+    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols)
+      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows), m_cols(cols)
+    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+      eigen_internal_assert(size==rows*cols && rows>=0 && cols >=0);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
+      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*other.m_cols))
+      , m_rows(other.m_rows)
+      , m_cols(other.m_cols)
+    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*m_cols)
+      internal::smart_copy(other.m_data, other.m_data+other.m_rows*other.m_cols, m_data);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
+    {
+      if (this != &other)
+      {
+        DenseStorage tmp(other);
+        this->swap(tmp);
+      }
+      return *this;
+    }
+#if EIGEN_HAS_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
+    DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
+      : m_data(std::move(other.m_data))
+      , m_rows(std::move(other.m_rows))
+      , m_cols(std::move(other.m_cols))
+    {
+      other.m_data = nullptr;
+      other.m_rows = 0;
+      other.m_cols = 0;
+    }
+    EIGEN_DEVICE_FUNC
+    DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
+    {
+      numext::swap(m_data, other.m_data);
+      numext::swap(m_rows, other.m_rows);
+      numext::swap(m_cols, other.m_cols);
+      return *this;
+    }
+#endif
+    EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
+    {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_rows,other.m_rows);
+      numext::swap(m_cols,other.m_cols);
+    }
+    EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;}
+    EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;}
+    void conservativeResize(Index size, Index rows, Index cols)
+    {
+      m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*m_cols);
+      m_rows = rows;
+      m_cols = cols;
+    }
+    EIGEN_DEVICE_FUNC void resize(Index size, Index rows, Index cols)
+    {
+      if(size != m_rows*m_cols)
+      {
+        internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols);
+        if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative
+          m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
+        else
+          m_data = 0;
+        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+      }
+      m_rows = rows;
+      m_cols = cols;
+    }
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data; }
+};
+
+// matrix with dynamic width and fixed height (so that matrix has dynamic size).
+template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Rows, Dynamic, _Options>
+{
+    T *m_data;
+    Index m_cols;
+  public:
+    EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_cols(0) {}
+    explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
+    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(cols)
+    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+      eigen_internal_assert(size==rows*cols && rows==_Rows && cols >=0);
+      EIGEN_UNUSED_VARIABLE(rows);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
+      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(_Rows*other.m_cols))
+      , m_cols(other.m_cols)
+    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_cols*_Rows)
+      internal::smart_copy(other.m_data, other.m_data+_Rows*m_cols, m_data);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
+    {
+      if (this != &other)
+      {
+        DenseStorage tmp(other);
+        this->swap(tmp);
+      }
+      return *this;
+    }
+#if EIGEN_HAS_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
+    DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
+      : m_data(std::move(other.m_data))
+      , m_cols(std::move(other.m_cols))
+    {
+      other.m_data = nullptr;
+      other.m_cols = 0;
+    }
+    EIGEN_DEVICE_FUNC
+    DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
+    {
+      numext::swap(m_data, other.m_data);
+      numext::swap(m_cols, other.m_cols);
+      return *this;
+    }
+#endif
+    EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_cols,other.m_cols);
+    }
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return _Rows;}
+    EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols)
+    {
+      m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, _Rows*m_cols);
+      m_cols = cols;
+    }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(Index size, Index, Index cols)
+    {
+      if(size != _Rows*m_cols)
+      {
+        internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols);
+        if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative
+          m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
+        else
+          m_data = 0;
+        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+      }
+      m_cols = cols;
+    }
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data; }
+};
+
+// matrix with dynamic height and fixed width (so that matrix has dynamic size).
+template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dynamic, _Cols, _Options>
+{
+    T *m_data;
+    Index m_rows;
+  public:
+    EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0) {}
+    explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
+    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows)
+    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+      eigen_internal_assert(size==rows*cols && rows>=0 && cols == _Cols);
+      EIGEN_UNUSED_VARIABLE(cols);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
+      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*_Cols))
+      , m_rows(other.m_rows)
+    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*_Cols)
+      internal::smart_copy(other.m_data, other.m_data+other.m_rows*_Cols, m_data);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
+    {
+      if (this != &other)
+      {
+        DenseStorage tmp(other);
+        this->swap(tmp);
+      }
+      return *this;
+    }
+#if EIGEN_HAS_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
+    DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
+      : m_data(std::move(other.m_data))
+      , m_rows(std::move(other.m_rows))
+    {
+      other.m_data = nullptr;
+      other.m_rows = 0;
+    }
+    EIGEN_DEVICE_FUNC
+    DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
+    {
+      numext::swap(m_data, other.m_data);
+      numext::swap(m_rows, other.m_rows);
+      return *this;
+    }
+#endif
+    EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_rows,other.m_rows);
+    }
+    EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;}
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) {return _Cols;}
+    void conservativeResize(Index size, Index rows, Index)
+    {
+      m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*_Cols);
+      m_rows = rows;
+    }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(Index size, Index rows, Index)
+    {
+      if(size != m_rows*_Cols)
+      {
+        internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows);
+        if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative
+          m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
+        else
+          m_data = 0;
+        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+      }
+      m_rows = rows;
+    }
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data; }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATRIX_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/Diagonal.h b/src/3rdparty/eigen/Eigen/src/Core/Diagonal.h
new file mode 100644
index 000000000..3112d2c16
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/Diagonal.h
@@ -0,0 +1,258 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_DIAGONAL_H
+#define EIGEN_DIAGONAL_H
+
+namespace Eigen {
+
+/** \class Diagonal
+  * \ingroup Core_Module
+  *
+  * \brief Expression of a diagonal/subdiagonal/superdiagonal in a matrix
+  *
+  * \param MatrixType the type of the object in which we are taking a sub/main/super diagonal
+  * \param DiagIndex the index of the sub/super diagonal. The default is 0 and it means the main diagonal.
+  *              A positive value means a superdiagonal, a negative value means a subdiagonal.
+  *              You can also use DynamicIndex so the index can be set at runtime.
+  *
+  * The matrix is not required to be square.
+  *
+  * This class represents an expression of the main diagonal, or any sub/super diagonal
+  * of a square matrix. It is the return type of MatrixBase::diagonal() and MatrixBase::diagonal(Index) and most of the
+  * time this is the only way it is used.
+  *
+  * \sa MatrixBase::diagonal(), MatrixBase::diagonal(Index)
+  */
+
+namespace internal {
+template<typename MatrixType, int DiagIndex>
+struct traits<Diagonal<MatrixType,DiagIndex> >
+ : traits<MatrixType>
+{
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
+  typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
+  typedef typename MatrixType::StorageKind StorageKind;
+  enum {
+    RowsAtCompileTime = (int(DiagIndex) == DynamicIndex || int(MatrixType::SizeAtCompileTime) == Dynamic) ? Dynamic
+                      : (EIGEN_PLAIN_ENUM_MIN(MatrixType::RowsAtCompileTime - EIGEN_PLAIN_ENUM_MAX(-DiagIndex, 0),
+                                              MatrixType::ColsAtCompileTime - EIGEN_PLAIN_ENUM_MAX( DiagIndex, 0))),
+    ColsAtCompileTime = 1,
+    MaxRowsAtCompileTime = int(MatrixType::MaxSizeAtCompileTime) == Dynamic ? Dynamic
+                         : DiagIndex == DynamicIndex ? EIGEN_SIZE_MIN_PREFER_FIXED(MatrixType::MaxRowsAtCompileTime,
+                                                                              MatrixType::MaxColsAtCompileTime)
+                         : (EIGEN_PLAIN_ENUM_MIN(MatrixType::MaxRowsAtCompileTime - EIGEN_PLAIN_ENUM_MAX(-DiagIndex, 0),
+                                                 MatrixType::MaxColsAtCompileTime - EIGEN_PLAIN_ENUM_MAX( DiagIndex, 0))),
+    MaxColsAtCompileTime = 1,
+    MaskLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
+    Flags = (unsigned int)_MatrixTypeNested::Flags & (RowMajorBit | MaskLvalueBit | DirectAccessBit) & ~RowMajorBit, // FIXME DirectAccessBit should not be handled by expressions
+    MatrixTypeOuterStride = outer_stride_at_compile_time<MatrixType>::ret,
+    InnerStrideAtCompileTime = MatrixTypeOuterStride == Dynamic ? Dynamic : MatrixTypeOuterStride+1,
+    OuterStrideAtCompileTime = 0
+  };
+};
+}
+
+template<typename MatrixType, int _DiagIndex> class Diagonal
+   : public internal::dense_xpr_base< Diagonal<MatrixType,_DiagIndex> >::type
+{
+  public:
+
+    enum { DiagIndex = _DiagIndex };
+    typedef typename internal::dense_xpr_base<Diagonal>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal)
+
+    EIGEN_DEVICE_FUNC
+    explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index)
+    {
+      eigen_assert( a_index <= m_matrix.cols() && -a_index <= m_matrix.rows() );
+    }
+
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal)
+
+    EIGEN_DEVICE_FUNC
+    inline Index rows() const
+    {
+      return m_index.value()<0 ? numext::mini<Index>(m_matrix.cols(),m_matrix.rows()+m_index.value())
+                               : numext::mini<Index>(m_matrix.rows(),m_matrix.cols()-m_index.value());
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return 1; }
+
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT {
+      return m_matrix.outerStride() + 1;
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return 0; }
+
+    typedef typename internal::conditional<
+                       internal::is_lvalue<MatrixType>::value,
+                       Scalar,
+                       const Scalar
+                     >::type ScalarWithConstIfNotLvalue;
+
+    EIGEN_DEVICE_FUNC
+    inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.coeffRef(rowOffset(), colOffset())); }
+    EIGEN_DEVICE_FUNC
+    inline const Scalar* data() const { return &(m_matrix.coeffRef(rowOffset(), colOffset())); }
+
+    EIGEN_DEVICE_FUNC
+    inline Scalar& coeffRef(Index row, Index)
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
+      return m_matrix.coeffRef(row+rowOffset(), row+colOffset());
+    }
+
+    EIGEN_DEVICE_FUNC
+    inline const Scalar& coeffRef(Index row, Index) const
+    {
+      return m_matrix.coeffRef(row+rowOffset(), row+colOffset());
+    }
+
+    EIGEN_DEVICE_FUNC
+    inline CoeffReturnType coeff(Index row, Index) const
+    {
+      return m_matrix.coeff(row+rowOffset(), row+colOffset());
+    }
+
+    EIGEN_DEVICE_FUNC
+    inline Scalar& coeffRef(Index idx)
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
+      return m_matrix.coeffRef(idx+rowOffset(), idx+colOffset());
+    }
+
+    EIGEN_DEVICE_FUNC
+    inline const Scalar& coeffRef(Index idx) const
+    {
+      return m_matrix.coeffRef(idx+rowOffset(), idx+colOffset());
+    }
+
+    EIGEN_DEVICE_FUNC
+    inline CoeffReturnType coeff(Index idx) const
+    {
+      return m_matrix.coeff(idx+rowOffset(), idx+colOffset());
+    }
+
+    EIGEN_DEVICE_FUNC
+    inline const typename internal::remove_all<typename MatrixType::Nested>::type&
+    nestedExpression() const
+    {
+      return m_matrix;
+    }
+
+    EIGEN_DEVICE_FUNC
+    inline Index index() const
+    {
+      return m_index.value();
+    }
+
+  protected:
+    typename internal::ref_selector<MatrixType>::non_const_type m_matrix;
+    const internal::variable_if_dynamicindex<Index, DiagIndex> m_index;
+
+  private:
+    // some compilers may fail to optimize std::max etc in case of compile-time constants...
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index absDiagIndex() const EIGEN_NOEXCEPT { return m_index.value()>0 ? m_index.value() : -m_index.value(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rowOffset() const EIGEN_NOEXCEPT { return m_index.value()>0 ? 0 : -m_index.value(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index colOffset() const EIGEN_NOEXCEPT { return m_index.value()>0 ? m_index.value() : 0; }
+    // trigger a compile-time error if someone try to call packet
+    template<int LoadMode> typename MatrixType::PacketReturnType packet(Index) const;
+    template<int LoadMode> typename MatrixType::PacketReturnType packet(Index,Index) const;
+};
+
+/** \returns an expression of the main diagonal of the matrix \c *this
+  *
+  * \c *this is not required to be square.
+  *
+  * Example: \include MatrixBase_diagonal.cpp
+  * Output: \verbinclude MatrixBase_diagonal.out
+  *
+  * \sa class Diagonal */
+template<typename Derived>
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalReturnType
+MatrixBase<Derived>::diagonal()
+{
+  return DiagonalReturnType(derived());
+}
+
+/** This is the const version of diagonal(). */
+template<typename Derived>
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalReturnType
+MatrixBase<Derived>::diagonal() const
+{
+  return ConstDiagonalReturnType(derived());
+}
+
+/** \returns an expression of the \a DiagIndex-th sub or super diagonal of the matrix \c *this
+  *
+  * \c *this is not required to be square.
+  *
+  * The template parameter \a DiagIndex represent a super diagonal if \a DiagIndex > 0
+  * and a sub diagonal otherwise. \a DiagIndex == 0 is equivalent to the main diagonal.
+  *
+  * Example: \include MatrixBase_diagonal_int.cpp
+  * Output: \verbinclude MatrixBase_diagonal_int.out
+  *
+  * \sa MatrixBase::diagonal(), class Diagonal */
+template<typename Derived>
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
+MatrixBase<Derived>::diagonal(Index index)
+{
+  return DiagonalDynamicIndexReturnType(derived(), index);
+}
+
+/** This is the const version of diagonal(Index). */
+template<typename Derived>
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
+MatrixBase<Derived>::diagonal(Index index) const
+{
+  return ConstDiagonalDynamicIndexReturnType(derived(), index);
+}
+
+/** \returns an expression of the \a DiagIndex-th sub or super diagonal of the matrix \c *this
+  *
+  * \c *this is not required to be square.
+  *
+  * The template parameter \a DiagIndex represent a super diagonal if \a DiagIndex > 0
+  * and a sub diagonal otherwise. \a DiagIndex == 0 is equivalent to the main diagonal.
+  *
+  * Example: \include MatrixBase_diagonal_template_int.cpp
+  * Output: \verbinclude MatrixBase_diagonal_template_int.out
+  *
+  * \sa MatrixBase::diagonal(), class Diagonal */
+template<typename Derived>
+template<int Index_>
+EIGEN_DEVICE_FUNC
+inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index_>::Type
+MatrixBase<Derived>::diagonal()
+{
+  return typename DiagonalIndexReturnType<Index_>::Type(derived());
+}
+
+/** This is the const version of diagonal<int>(). */
+template<typename Derived>
+template<int Index_>
+EIGEN_DEVICE_FUNC
+inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index_>::Type
+MatrixBase<Derived>::diagonal() const
+{
+  return typename ConstDiagonalIndexReturnType<Index_>::Type(derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_DIAGONAL_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/DiagonalMatrix.h b/src/3rdparty/eigen/Eigen/src/Core/DiagonalMatrix.h
new file mode 100644
index 000000000..542685c65
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/DiagonalMatrix.h
@@ -0,0 +1,391 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2007-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_DIAGONALMATRIX_H
+#define EIGEN_DIAGONALMATRIX_H
+
+namespace Eigen { 
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename Derived>
+class DiagonalBase : public EigenBase<Derived>
+{
+  public:
+    typedef typename internal::traits<Derived>::DiagonalVectorType DiagonalVectorType;
+    typedef typename DiagonalVectorType::Scalar Scalar;
+    typedef typename DiagonalVectorType::RealScalar RealScalar;
+    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+    typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
+
+    enum {
+      RowsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
+      ColsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
+      MaxRowsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
+      MaxColsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
+      IsVectorAtCompileTime = 0,
+      Flags = NoPreferredStorageOrderBit
+    };
+
+    typedef Matrix<Scalar, RowsAtCompileTime, ColsAtCompileTime, 0, MaxRowsAtCompileTime, MaxColsAtCompileTime> DenseMatrixType;
+    typedef DenseMatrixType DenseType;
+    typedef DiagonalMatrix<Scalar,DiagonalVectorType::SizeAtCompileTime,DiagonalVectorType::MaxSizeAtCompileTime> PlainObject;
+
+    EIGEN_DEVICE_FUNC
+    inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
+    EIGEN_DEVICE_FUNC
+    inline Derived& derived() { return *static_cast<Derived*>(this); }
+
+    EIGEN_DEVICE_FUNC
+    DenseMatrixType toDenseMatrix() const { return derived(); }
+
+    EIGEN_DEVICE_FUNC
+    inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); }
+    EIGEN_DEVICE_FUNC
+    inline DiagonalVectorType& diagonal() { return derived().diagonal(); }
+
+    EIGEN_DEVICE_FUNC
+    inline Index rows() const { return diagonal().size(); }
+    EIGEN_DEVICE_FUNC
+    inline Index cols() const { return diagonal().size(); }
+
+    template<typename MatrixDerived>
+    EIGEN_DEVICE_FUNC
+    const Product<Derived,MatrixDerived,LazyProduct>
+    operator*(const MatrixBase<MatrixDerived> &matrix) const
+    {
+      return Product<Derived, MatrixDerived, LazyProduct>(derived(),matrix.derived());
+    }
+
+    typedef DiagonalWrapper<const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const DiagonalVectorType> > InverseReturnType;
+    EIGEN_DEVICE_FUNC
+    inline const InverseReturnType
+    inverse() const
+    {
+      return InverseReturnType(diagonal().cwiseInverse());
+    }
+    
+    EIGEN_DEVICE_FUNC
+    inline const DiagonalWrapper<const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DiagonalVectorType,Scalar,product) >
+    operator*(const Scalar& scalar) const
+    {
+      return DiagonalWrapper<const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DiagonalVectorType,Scalar,product) >(diagonal() * scalar);
+    }
+    EIGEN_DEVICE_FUNC
+    friend inline const DiagonalWrapper<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,DiagonalVectorType,product) >
+    operator*(const Scalar& scalar, const DiagonalBase& other)
+    {
+      return DiagonalWrapper<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,DiagonalVectorType,product) >(scalar * other.diagonal());
+    }
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
+    inline unspecified_expression_type
+    #else
+    inline const DiagonalWrapper<const EIGEN_CWISE_BINARY_RETURN_TYPE(DiagonalVectorType,typename OtherDerived::DiagonalVectorType,sum) >
+    #endif
+    operator+(const DiagonalBase<OtherDerived>& other) const
+    {
+      return (diagonal() + other.diagonal()).asDiagonal();
+    }
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
+    inline unspecified_expression_type
+    #else
+    inline const DiagonalWrapper<const EIGEN_CWISE_BINARY_RETURN_TYPE(DiagonalVectorType,typename OtherDerived::DiagonalVectorType,difference) >
+    #endif
+    operator-(const DiagonalBase<OtherDerived>& other) const
+    {
+      return (diagonal() - other.diagonal()).asDiagonal();
+    }
+};
+
+#endif
+
+/** \class DiagonalMatrix
+  * \ingroup Core_Module
+  *
+  * \brief Represents a diagonal matrix with its storage
+  *
+  * \param _Scalar the type of coefficients
+  * \param SizeAtCompileTime the dimension of the matrix, or Dynamic
+  * \param MaxSizeAtCompileTime the dimension of the matrix, or Dynamic. This parameter is optional and defaults
+  *        to SizeAtCompileTime. Most of the time, you do not need to specify it.
+  *
+  * \sa class DiagonalWrapper
+  */
+
+namespace internal {
+template<typename _Scalar, int SizeAtCompileTime, int MaxSizeAtCompileTime>
+struct traits<DiagonalMatrix<_Scalar,SizeAtCompileTime,MaxSizeAtCompileTime> >
+ : traits<Matrix<_Scalar,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
+{
+  typedef Matrix<_Scalar,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1> DiagonalVectorType;
+  typedef DiagonalShape StorageKind;
+  enum {
+    Flags = LvalueBit | NoPreferredStorageOrderBit
+  };
+};
+}
+template<typename _Scalar, int SizeAtCompileTime, int MaxSizeAtCompileTime>
+class DiagonalMatrix
+  : public DiagonalBase<DiagonalMatrix<_Scalar,SizeAtCompileTime,MaxSizeAtCompileTime> >
+{
+  public:
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    typedef typename internal::traits<DiagonalMatrix>::DiagonalVectorType DiagonalVectorType;
+    typedef const DiagonalMatrix& Nested;
+    typedef _Scalar Scalar;
+    typedef typename internal::traits<DiagonalMatrix>::StorageKind StorageKind;
+    typedef typename internal::traits<DiagonalMatrix>::StorageIndex StorageIndex;
+    #endif
+
+  protected:
+
+    DiagonalVectorType m_diagonal;
+
+  public:
+
+    /** const version of diagonal(). */
+    EIGEN_DEVICE_FUNC
+    inline const DiagonalVectorType& diagonal() const { return m_diagonal; }
+    /** \returns a reference to the stored vector of diagonal coefficients. */
+    EIGEN_DEVICE_FUNC
+    inline DiagonalVectorType& diagonal() { return m_diagonal; }
+
+    /** Default constructor without initialization */
+    EIGEN_DEVICE_FUNC
+    inline DiagonalMatrix() {}
+
+    /** Constructs a diagonal matrix with given dimension  */
+    EIGEN_DEVICE_FUNC
+    explicit inline DiagonalMatrix(Index dim) : m_diagonal(dim) {}
+
+    /** 2D constructor. */
+    EIGEN_DEVICE_FUNC
+    inline DiagonalMatrix(const Scalar& x, const Scalar& y) : m_diagonal(x,y) {}
+
+    /** 3D constructor. */
+    EIGEN_DEVICE_FUNC
+    inline DiagonalMatrix(const Scalar& x, const Scalar& y, const Scalar& z) : m_diagonal(x,y,z) {}
+
+    #if EIGEN_HAS_CXX11
+    /** \brief Construct a diagonal matrix with fixed size from an arbitrary number of coefficients. \cpp11
+      * 
+      * There exists C++98 anologue constructors for fixed-size diagonal matrices having 2 or 3 coefficients.
+      * 
+      * \warning To construct a diagonal matrix of fixed size, the number of values passed to this 
+      * constructor must match the fixed dimension of \c *this.
+      * 
+      * \sa DiagonalMatrix(const Scalar&, const Scalar&)
+      * \sa DiagonalMatrix(const Scalar&, const Scalar&, const Scalar&)
+      */
+    template <typename... ArgTypes>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    DiagonalMatrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const ArgTypes&... args)
+      : m_diagonal(a0, a1, a2, args...) {}
+
+    /** \brief Constructs a DiagonalMatrix and initializes it by elements given by an initializer list of initializer
+      * lists \cpp11
+      */
+    EIGEN_DEVICE_FUNC
+    explicit EIGEN_STRONG_INLINE DiagonalMatrix(const std::initializer_list<std::initializer_list<Scalar>>& list)
+      : m_diagonal(list) {}
+    #endif  // EIGEN_HAS_CXX11
+
+    /** Copy constructor. */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    inline DiagonalMatrix(const DiagonalBase<OtherDerived>& other) : m_diagonal(other.diagonal()) {}
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** copy constructor. prevent a default copy constructor from hiding the other templated constructor */
+    inline DiagonalMatrix(const DiagonalMatrix& other) : m_diagonal(other.diagonal()) {}
+    #endif
+
+    /** generic constructor from expression of the diagonal coefficients */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    explicit inline DiagonalMatrix(const MatrixBase<OtherDerived>& other) : m_diagonal(other)
+    {}
+
+    /** Copy operator. */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    DiagonalMatrix& operator=(const DiagonalBase<OtherDerived>& other)
+    {
+      m_diagonal = other.diagonal();
+      return *this;
+    }
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** This is a special case of the templated operator=. Its purpose is to
+      * prevent a default operator= from hiding the templated operator=.
+      */
+    EIGEN_DEVICE_FUNC
+    DiagonalMatrix& operator=(const DiagonalMatrix& other)
+    {
+      m_diagonal = other.diagonal();
+      return *this;
+    }
+    #endif
+
+    /** Resizes to given size. */
+    EIGEN_DEVICE_FUNC
+    inline void resize(Index size) { m_diagonal.resize(size); }
+    /** Sets all coefficients to zero. */
+    EIGEN_DEVICE_FUNC
+    inline void setZero() { m_diagonal.setZero(); }
+    /** Resizes and sets all coefficients to zero. */
+    EIGEN_DEVICE_FUNC
+    inline void setZero(Index size) { m_diagonal.setZero(size); }
+    /** Sets this matrix to be the identity matrix of the current size. */
+    EIGEN_DEVICE_FUNC
+    inline void setIdentity() { m_diagonal.setOnes(); }
+    /** Sets this matrix to be the identity matrix of the given size. */
+    EIGEN_DEVICE_FUNC
+    inline void setIdentity(Index size) { m_diagonal.setOnes(size); }
+};
+
+/** \class DiagonalWrapper
+  * \ingroup Core_Module
+  *
+  * \brief Expression of a diagonal matrix
+  *
+  * \param _DiagonalVectorType the type of the vector of diagonal coefficients
+  *
+  * This class is an expression of a diagonal matrix, but not storing its own vector of diagonal coefficients,
+  * instead wrapping an existing vector expression. It is the return type of MatrixBase::asDiagonal()
+  * and most of the time this is the only way that it is used.
+  *
+  * \sa class DiagonalMatrix, class DiagonalBase, MatrixBase::asDiagonal()
+  */
+
+namespace internal {
+template<typename _DiagonalVectorType>
+struct traits<DiagonalWrapper<_DiagonalVectorType> >
+{
+  typedef _DiagonalVectorType DiagonalVectorType;
+  typedef typename DiagonalVectorType::Scalar Scalar;
+  typedef typename DiagonalVectorType::StorageIndex StorageIndex;
+  typedef DiagonalShape StorageKind;
+  typedef typename traits<DiagonalVectorType>::XprKind XprKind;
+  enum {
+    RowsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
+    ColsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
+    MaxRowsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
+    MaxColsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
+    Flags =  (traits<DiagonalVectorType>::Flags & LvalueBit) | NoPreferredStorageOrderBit
+  };
+};
+}
+
+template<typename _DiagonalVectorType>
+class DiagonalWrapper
+  : public DiagonalBase<DiagonalWrapper<_DiagonalVectorType> >, internal::no_assignment_operator
+{
+  public:
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    typedef _DiagonalVectorType DiagonalVectorType;
+    typedef DiagonalWrapper Nested;
+    #endif
+
+    /** Constructor from expression of diagonal coefficients to wrap. */
+    EIGEN_DEVICE_FUNC
+    explicit inline DiagonalWrapper(DiagonalVectorType& a_diagonal) : m_diagonal(a_diagonal) {}
+
+    /** \returns a const reference to the wrapped expression of diagonal coefficients. */
+    EIGEN_DEVICE_FUNC
+    const DiagonalVectorType& diagonal() const { return m_diagonal; }
+
+  protected:
+    typename DiagonalVectorType::Nested m_diagonal;
+};
+
+/** \returns a pseudo-expression of a diagonal matrix with *this as vector of diagonal coefficients
+  *
+  * \only_for_vectors
+  *
+  * Example: \include MatrixBase_asDiagonal.cpp
+  * Output: \verbinclude MatrixBase_asDiagonal.out
+  *
+  * \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal()
+  **/
+template<typename Derived>
+EIGEN_DEVICE_FUNC inline const DiagonalWrapper<const Derived>
+MatrixBase<Derived>::asDiagonal() const
+{
+  return DiagonalWrapper<const Derived>(derived());
+}
+
+/** \returns true if *this is approximately equal to a diagonal matrix,
+  *          within the precision given by \a prec.
+  *
+  * Example: \include MatrixBase_isDiagonal.cpp
+  * Output: \verbinclude MatrixBase_isDiagonal.out
+  *
+  * \sa asDiagonal()
+  */
+template<typename Derived>
+bool MatrixBase<Derived>::isDiagonal(const RealScalar& prec) const
+{
+  if(cols() != rows()) return false;
+  RealScalar maxAbsOnDiagonal = static_cast<RealScalar>(-1);
+  for(Index j = 0; j < cols(); ++j)
+  {
+    RealScalar absOnDiagonal = numext::abs(coeff(j,j));
+    if(absOnDiagonal > maxAbsOnDiagonal) maxAbsOnDiagonal = absOnDiagonal;
+  }
+  for(Index j = 0; j < cols(); ++j)
+    for(Index i = 0; i < j; ++i)
+    {
+      if(!internal::isMuchSmallerThan(coeff(i, j), maxAbsOnDiagonal, prec)) return false;
+      if(!internal::isMuchSmallerThan(coeff(j, i), maxAbsOnDiagonal, prec)) return false;
+    }
+  return true;
+}
+
+namespace internal {
+
+template<> struct storage_kind_to_shape<DiagonalShape> { typedef DiagonalShape Shape; };
+
+struct Diagonal2Dense {};
+
+template<> struct AssignmentKind<DenseShape,DiagonalShape> { typedef Diagonal2Dense Kind; };
+
+// Diagonal matrix to Dense assignment
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense>
+{
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+    
+    dst.setZero();
+    dst.diagonal() = src.diagonal();
+  }
+  
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  { dst.diagonal() += src.diagonal(); }
+  
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  { dst.diagonal() -= src.diagonal(); }
+};
+
+} // namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_DIAGONALMATRIX_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/DiagonalProduct.h b/src/3rdparty/eigen/Eigen/src/Core/DiagonalProduct.h
new file mode 100644
index 000000000..7911d1cd1
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/DiagonalProduct.h
@@ -0,0 +1,28 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2007-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_DIAGONALPRODUCT_H
+#define EIGEN_DIAGONALPRODUCT_H
+
+namespace Eigen { 
+
+/** \returns the diagonal matrix product of \c *this by the diagonal matrix \a diagonal.
+  */
+template<typename Derived>
+template<typename DiagonalDerived>
+EIGEN_DEVICE_FUNC inline const Product<Derived, DiagonalDerived, LazyProduct>
+MatrixBase<Derived>::operator*(const DiagonalBase<DiagonalDerived> &a_diagonal) const
+{
+  return Product<Derived, DiagonalDerived, LazyProduct>(derived(),a_diagonal.derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_DIAGONALPRODUCT_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/Dot.h b/src/3rdparty/eigen/Eigen/src/Core/Dot.h
new file mode 100644
index 000000000..5c3441b92
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/Dot.h
@@ -0,0 +1,318 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008, 2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_DOT_H
+#define EIGEN_DOT_H
+
+namespace Eigen { 
+
+namespace internal {
+
+// helper function for dot(). The problem is that if we put that in the body of dot(), then upon calling dot
+// with mismatched types, the compiler emits errors about failing to instantiate cwiseProduct BEFORE
+// looking at the static assertions. Thus this is a trick to get better compile errors.
+template<typename T, typename U,
+// the NeedToTranspose condition here is taken straight from Assign.h
+         bool NeedToTranspose = T::IsVectorAtCompileTime
+                && U::IsVectorAtCompileTime
+                && ((int(T::RowsAtCompileTime) == 1 && int(U::ColsAtCompileTime) == 1)
+                      |  // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
+                         // revert to || as soon as not needed anymore.
+                    (int(T::ColsAtCompileTime) == 1 && int(U::RowsAtCompileTime) == 1))
+>
+struct dot_nocheck
+{
+  typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
+  typedef typename conj_prod::result_type ResScalar;
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE
+  static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
+  {
+    return a.template binaryExpr<conj_prod>(b).sum();
+  }
+};
+
+template<typename T, typename U>
+struct dot_nocheck<T, U, true>
+{
+  typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
+  typedef typename conj_prod::result_type ResScalar;
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE
+  static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
+  {
+    return a.transpose().template binaryExpr<conj_prod>(b).sum();
+  }
+};
+
+} // end namespace internal
+
+/** \fn MatrixBase::dot
+  * \returns the dot product of *this with other.
+  *
+  * \only_for_vectors
+  *
+  * \note If the scalar type is complex numbers, then this function returns the hermitian
+  * (sesquilinear) dot product, conjugate-linear in the first variable and linear in the
+  * second variable.
+  *
+  * \sa squaredNorm(), norm()
+  */
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE
+typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
+MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+  EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived,OtherDerived)
+#if !(defined(EIGEN_NO_STATIC_ASSERT) && defined(EIGEN_NO_DEBUG))
+  typedef internal::scalar_conj_product_op<Scalar,typename OtherDerived::Scalar> func;
+  EIGEN_CHECK_BINARY_COMPATIBILIY(func,Scalar,typename OtherDerived::Scalar);
+#endif
+  
+  eigen_assert(size() == other.size());
+
+  return internal::dot_nocheck<Derived,OtherDerived>::run(*this, other);
+}
+
+//---------- implementation of L2 norm and related functions ----------
+
+/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the squared Frobenius norm.
+  * In both cases, it consists in the sum of the square of all the matrix entries.
+  * For vectors, this is also equals to the dot product of \c *this with itself.
+  *
+  * \sa dot(), norm(), lpNorm()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
+{
+  return numext::real((*this).cwiseAbs2().sum());
+}
+
+/** \returns, for vectors, the \em l2 norm of \c *this, and for matrices the Frobenius norm.
+  * In both cases, it consists in the square root of the sum of the square of all the matrix entries.
+  * For vectors, this is also equals to the square root of the dot product of \c *this with itself.
+  *
+  * \sa lpNorm(), dot(), squaredNorm()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
+{
+  return numext::sqrt(squaredNorm());
+}
+
+/** \returns an expression of the quotient of \c *this by its own norm.
+  *
+  * \warning If the input vector is too small (i.e., this->norm()==0),
+  *          then this function returns a copy of the input.
+  *
+  * \only_for_vectors
+  *
+  * \sa norm(), normalize()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
+MatrixBase<Derived>::normalized() const
+{
+  typedef typename internal::nested_eval<Derived,2>::type _Nested;
+  _Nested n(derived());
+  RealScalar z = n.squaredNorm();
+  // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
+  if(z>RealScalar(0))
+    return n / numext::sqrt(z);
+  else
+    return n;
+}
+
+/** Normalizes the vector, i.e. divides it by its own norm.
+  *
+  * \only_for_vectors
+  *
+  * \warning If the input vector is too small (i.e., this->norm()==0), then \c *this is left unchanged.
+  *
+  * \sa norm(), normalized()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
+{
+  RealScalar z = squaredNorm();
+  // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
+  if(z>RealScalar(0))
+    derived() /= numext::sqrt(z);
+}
+
+/** \returns an expression of the quotient of \c *this by its own norm while avoiding underflow and overflow.
+  *
+  * \only_for_vectors
+  *
+  * This method is analogue to the normalized() method, but it reduces the risk of
+  * underflow and overflow when computing the norm.
+  *
+  * \warning If the input vector is too small (i.e., this->norm()==0),
+  *          then this function returns a copy of the input.
+  *
+  * \sa stableNorm(), stableNormalize(), normalized()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
+MatrixBase<Derived>::stableNormalized() const
+{
+  typedef typename internal::nested_eval<Derived,3>::type _Nested;
+  _Nested n(derived());
+  RealScalar w = n.cwiseAbs().maxCoeff();
+  RealScalar z = (n/w).squaredNorm();
+  if(z>RealScalar(0))
+    return n / (numext::sqrt(z)*w);
+  else
+    return n;
+}
+
+/** Normalizes the vector while avoid underflow and overflow
+  *
+  * \only_for_vectors
+  *
+  * This method is analogue to the normalize() method, but it reduces the risk of
+  * underflow and overflow when computing the norm.
+  *
+  * \warning If the input vector is too small (i.e., this->norm()==0), then \c *this is left unchanged.
+  *
+  * \sa stableNorm(), stableNormalized(), normalize()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::stableNormalize()
+{
+  RealScalar w = cwiseAbs().maxCoeff();
+  RealScalar z = (derived()/w).squaredNorm();
+  if(z>RealScalar(0))
+    derived() /= numext::sqrt(z)*w;
+}
+
+//---------- implementation of other norms ----------
+
+namespace internal {
+
+template<typename Derived, int p>
+struct lpNorm_selector
+{
+  typedef typename NumTraits<typename traits<Derived>::Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const MatrixBase<Derived>& m)
+  {
+    EIGEN_USING_STD(pow)
+    return pow(m.cwiseAbs().array().pow(p).sum(), RealScalar(1)/p);
+  }
+};
+
+template<typename Derived>
+struct lpNorm_selector<Derived, 1>
+{
+  EIGEN_DEVICE_FUNC
+  static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
+  {
+    return m.cwiseAbs().sum();
+  }
+};
+
+template<typename Derived>
+struct lpNorm_selector<Derived, 2>
+{
+  EIGEN_DEVICE_FUNC
+  static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
+  {
+    return m.norm();
+  }
+};
+
+template<typename Derived>
+struct lpNorm_selector<Derived, Infinity>
+{
+  typedef typename NumTraits<typename traits<Derived>::Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const MatrixBase<Derived>& m)
+  {
+    if(Derived::SizeAtCompileTime==0 || (Derived::SizeAtCompileTime==Dynamic && m.size()==0))
+      return RealScalar(0);
+    return m.cwiseAbs().maxCoeff();
+  }
+};
+
+} // end namespace internal
+
+/** \returns the \b coefficient-wise \f$ \ell^p \f$ norm of \c *this, that is, returns the p-th root of the sum of the p-th powers of the absolute values
+  *          of the coefficients of \c *this. If \a p is the special value \a Eigen::Infinity, this function returns the \f$ \ell^\infty \f$
+  *          norm, that is the maximum of the absolute values of the coefficients of \c *this.
+  *
+  * In all cases, if \c *this is empty, then the value 0 is returned.
+  *
+  * \note For matrices, this function does not compute the <a href="https://en.wikipedia.org/wiki/Operator_norm">operator-norm</a>. That is, if \c *this is a matrix, then its coefficients are interpreted as a 1D vector. Nonetheless, you can easily compute the 1-norm and \f$\infty\f$-norm matrix operator norms using \link TutorialReductionsVisitorsBroadcastingReductionsNorm partial reductions \endlink.
+  *
+  * \sa norm()
+  */
+template<typename Derived>
+template<int p>
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+EIGEN_DEVICE_FUNC inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+#else
+EIGEN_DEVICE_FUNC MatrixBase<Derived>::RealScalar
+#endif
+MatrixBase<Derived>::lpNorm() const
+{
+  return internal::lpNorm_selector<Derived, p>::run(*this);
+}
+
+//---------- implementation of isOrthogonal / isUnitary ----------
+
+/** \returns true if *this is approximately orthogonal to \a other,
+  *          within the precision given by \a prec.
+  *
+  * Example: \include MatrixBase_isOrthogonal.cpp
+  * Output: \verbinclude MatrixBase_isOrthogonal.out
+  */
+template<typename Derived>
+template<typename OtherDerived>
+bool MatrixBase<Derived>::isOrthogonal
+(const MatrixBase<OtherDerived>& other, const RealScalar& prec) const
+{
+  typename internal::nested_eval<Derived,2>::type nested(derived());
+  typename internal::nested_eval<OtherDerived,2>::type otherNested(other.derived());
+  return numext::abs2(nested.dot(otherNested)) <= prec * prec * nested.squaredNorm() * otherNested.squaredNorm();
+}
+
+/** \returns true if *this is approximately an unitary matrix,
+  *          within the precision given by \a prec. In the case where the \a Scalar
+  *          type is real numbers, a unitary matrix is an orthogonal matrix, whence the name.
+  *
+  * \note This can be used to check whether a family of vectors forms an orthonormal basis.
+  *       Indeed, \c m.isUnitary() returns true if and only if the columns (equivalently, the rows) of m form an
+  *       orthonormal basis.
+  *
+  * Example: \include MatrixBase_isUnitary.cpp
+  * Output: \verbinclude MatrixBase_isUnitary.out
+  */
+template<typename Derived>
+bool MatrixBase<Derived>::isUnitary(const RealScalar& prec) const
+{
+  typename internal::nested_eval<Derived,1>::type self(derived());
+  for(Index i = 0; i < cols(); ++i)
+  {
+    if(!internal::isApprox(self.col(i).squaredNorm(), static_cast<RealScalar>(1), prec))
+      return false;
+    for(Index j = 0; j < i; ++j)
+      if(!internal::isMuchSmallerThan(self.col(i).dot(self.col(j)), static_cast<Scalar>(1), prec))
+        return false;
+  }
+  return true;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_DOT_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/EigenBase.h b/src/3rdparty/eigen/Eigen/src/Core/EigenBase.h
new file mode 100644
index 000000000..6b3c7d374
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/EigenBase.h
@@ -0,0 +1,160 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EIGENBASE_H
+#define EIGEN_EIGENBASE_H
+
+namespace Eigen {
+
+/** \class EigenBase
+  * \ingroup Core_Module
+  *
+  * Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T).
+  *
+  * In other words, an EigenBase object is an object that can be copied into a MatrixBase.
+  *
+  * Besides MatrixBase-derived classes, this also includes special matrix classes such as diagonal matrices, etc.
+  *
+  * Notice that this class is trivial, it is only used to disambiguate overloaded functions.
+  *
+  * \sa \blank \ref TopicClassHierarchy
+  */
+template<typename Derived> struct EigenBase
+{
+//   typedef typename internal::plain_matrix_type<Derived>::type PlainObject;
+
+  /** \brief The interface type of indices
+    * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
+    * \sa StorageIndex, \ref TopicPreprocessorDirectives.
+    * DEPRECATED: Since Eigen 3.3, its usage is deprecated. Use Eigen::Index instead.
+    * Deprecation is not marked with a doxygen comment because there are too many existing usages to add the deprecation attribute.
+    */
+  typedef Eigen::Index Index;
+
+  // FIXME is it needed?
+  typedef typename internal::traits<Derived>::StorageKind StorageKind;
+
+  /** \returns a reference to the derived object */
+  EIGEN_DEVICE_FUNC
+  Derived& derived() { return *static_cast<Derived*>(this); }
+  /** \returns a const reference to the derived object */
+  EIGEN_DEVICE_FUNC
+  const Derived& derived() const { return *static_cast<const Derived*>(this); }
+
+  EIGEN_DEVICE_FUNC
+  inline Derived& const_cast_derived() const
+  { return *static_cast<Derived*>(const_cast<EigenBase*>(this)); }
+  EIGEN_DEVICE_FUNC
+  inline const Derived& const_derived() const
+  { return *static_cast<const Derived*>(this); }
+
+  /** \returns the number of rows. \sa cols(), RowsAtCompileTime */
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); }
+  /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  inline Index cols() const EIGEN_NOEXCEPT { return derived().cols(); }
+  /** \returns the number of coefficients, which is rows()*cols().
+    * \sa rows(), cols(), SizeAtCompileTime. */
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  inline Index size() const EIGEN_NOEXCEPT { return rows() * cols(); }
+
+  /** \internal Don't use it, but do the equivalent: \code dst = *this; \endcode */
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC
+  inline void evalTo(Dest& dst) const
+  { derived().evalTo(dst); }
+
+  /** \internal Don't use it, but do the equivalent: \code dst += *this; \endcode */
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC
+  inline void addTo(Dest& dst) const
+  {
+    // This is the default implementation,
+    // derived class can reimplement it in a more optimized way.
+    typename Dest::PlainObject res(rows(),cols());
+    evalTo(res);
+    dst += res;
+  }
+
+  /** \internal Don't use it, but do the equivalent: \code dst -= *this; \endcode */
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC
+  inline void subTo(Dest& dst) const
+  {
+    // This is the default implementation,
+    // derived class can reimplement it in a more optimized way.
+    typename Dest::PlainObject res(rows(),cols());
+    evalTo(res);
+    dst -= res;
+  }
+
+  /** \internal Don't use it, but do the equivalent: \code dst.applyOnTheRight(*this); \endcode */
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC inline void applyThisOnTheRight(Dest& dst) const
+  {
+    // This is the default implementation,
+    // derived class can reimplement it in a more optimized way.
+    dst = dst * this->derived();
+  }
+
+  /** \internal Don't use it, but do the equivalent: \code dst.applyOnTheLeft(*this); \endcode */
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC inline void applyThisOnTheLeft(Dest& dst) const
+  {
+    // This is the default implementation,
+    // derived class can reimplement it in a more optimized way.
+    dst = this->derived() * dst;
+  }
+
+};
+
+/***************************************************************************
+* Implementation of matrix base methods
+***************************************************************************/
+
+/** \brief Copies the generic expression \a other into *this.
+  *
+  * \details The expression must provide a (templated) evalTo(Derived& dst) const
+  * function which does the actual job. In practice, this allows any user to write
+  * its own special matrix without having to modify MatrixBase
+  *
+  * \returns a reference to *this.
+  */
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
+Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
+{
+  call_assignment(derived(), other.derived());
+  return derived();
+}
+
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
+Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
+{
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
+}
+
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
+Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
+{
+  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_EIGENBASE_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/ForceAlignedAccess.h b/src/3rdparty/eigen/Eigen/src/Core/ForceAlignedAccess.h
new file mode 100644
index 000000000..817a43afc
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/ForceAlignedAccess.h
@@ -0,0 +1,150 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_FORCEALIGNEDACCESS_H
+#define EIGEN_FORCEALIGNEDACCESS_H
+
+namespace Eigen {
+
+/** \class ForceAlignedAccess
+  * \ingroup Core_Module
+  *
+  * \brief Enforce aligned packet loads and stores regardless of what is requested
+  *
+  * \param ExpressionType the type of the object of which we are forcing aligned packet access
+  *
+  * This class is the return type of MatrixBase::forceAlignedAccess()
+  * and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::forceAlignedAccess()
+  */
+
+namespace internal {
+template<typename ExpressionType>
+struct traits<ForceAlignedAccess<ExpressionType> > : public traits<ExpressionType>
+{};
+}
+
+template<typename ExpressionType> class ForceAlignedAccess
+  : public internal::dense_xpr_base< ForceAlignedAccess<ExpressionType> >::type
+{
+  public:
+
+    typedef typename internal::dense_xpr_base<ForceAlignedAccess>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(ForceAlignedAccess)
+
+    EIGEN_DEVICE_FUNC explicit inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {}
+
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT { return m_expression.innerStride(); }
+
+    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const
+    {
+      return m_expression.coeff(row, col);
+    }
+
+    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
+    {
+      return m_expression.const_cast_derived().coeffRef(row, col);
+    }
+
+    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const
+    {
+      return m_expression.coeff(index);
+    }
+
+    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
+    {
+      return m_expression.const_cast_derived().coeffRef(index);
+    }
+
+    template<int LoadMode>
+    inline const PacketScalar packet(Index row, Index col) const
+    {
+      return m_expression.template packet<Aligned>(row, col);
+    }
+
+    template<int LoadMode>
+    inline void writePacket(Index row, Index col, const PacketScalar& x)
+    {
+      m_expression.const_cast_derived().template writePacket<Aligned>(row, col, x);
+    }
+
+    template<int LoadMode>
+    inline const PacketScalar packet(Index index) const
+    {
+      return m_expression.template packet<Aligned>(index);
+    }
+
+    template<int LoadMode>
+    inline void writePacket(Index index, const PacketScalar& x)
+    {
+      m_expression.const_cast_derived().template writePacket<Aligned>(index, x);
+    }
+
+    EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
+
+  protected:
+    const ExpressionType& m_expression;
+
+  private:
+    ForceAlignedAccess& operator=(const ForceAlignedAccess&);
+};
+
+/** \returns an expression of *this with forced aligned access
+  * \sa forceAlignedAccessIf(),class ForceAlignedAccess
+  */
+template<typename Derived>
+inline const ForceAlignedAccess<Derived>
+MatrixBase<Derived>::forceAlignedAccess() const
+{
+  return ForceAlignedAccess<Derived>(derived());
+}
+
+/** \returns an expression of *this with forced aligned access
+  * \sa forceAlignedAccessIf(), class ForceAlignedAccess
+  */
+template<typename Derived>
+inline ForceAlignedAccess<Derived>
+MatrixBase<Derived>::forceAlignedAccess()
+{
+  return ForceAlignedAccess<Derived>(derived());
+}
+
+/** \returns an expression of *this with forced aligned access if \a Enable is true.
+  * \sa forceAlignedAccess(), class ForceAlignedAccess
+  */
+template<typename Derived>
+template<bool Enable>
+inline typename internal::add_const_on_value_type<typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type>::type
+MatrixBase<Derived>::forceAlignedAccessIf() const
+{
+  return derived();  // FIXME This should not work but apparently is never used
+}
+
+/** \returns an expression of *this with forced aligned access if \a Enable is true.
+  * \sa forceAlignedAccess(), class ForceAlignedAccess
+  */
+template<typename Derived>
+template<bool Enable>
+inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type
+MatrixBase<Derived>::forceAlignedAccessIf()
+{
+  return derived();  // FIXME This should not work but apparently is never used
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_FORCEALIGNEDACCESS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/Fuzzy.h b/src/3rdparty/eigen/Eigen/src/Core/Fuzzy.h
new file mode 100644
index 000000000..43aa49b2b
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/Fuzzy.h
@@ -0,0 +1,155 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_FUZZY_H
+#define EIGEN_FUZZY_H
+
+namespace Eigen { 
+
+namespace internal
+{
+
+template<typename Derived, typename OtherDerived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
+struct isApprox_selector
+{
+  EIGEN_DEVICE_FUNC
+  static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar& prec)
+  {
+    typename internal::nested_eval<Derived,2>::type nested(x);
+    typename internal::nested_eval<OtherDerived,2>::type otherNested(y);
+    return (nested - otherNested).cwiseAbs2().sum() <= prec * prec * numext::mini(nested.cwiseAbs2().sum(), otherNested.cwiseAbs2().sum());
+  }
+};
+
+template<typename Derived, typename OtherDerived>
+struct isApprox_selector<Derived, OtherDerived, true>
+{
+  EIGEN_DEVICE_FUNC
+  static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar&)
+  {
+    return x.matrix() == y.matrix();
+  }
+};
+
+template<typename Derived, typename OtherDerived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
+struct isMuchSmallerThan_object_selector
+{
+  EIGEN_DEVICE_FUNC
+  static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar& prec)
+  {
+    return x.cwiseAbs2().sum() <= numext::abs2(prec) * y.cwiseAbs2().sum();
+  }
+};
+
+template<typename Derived, typename OtherDerived>
+struct isMuchSmallerThan_object_selector<Derived, OtherDerived, true>
+{
+  EIGEN_DEVICE_FUNC
+  static bool run(const Derived& x, const OtherDerived&, const typename Derived::RealScalar&)
+  {
+    return x.matrix() == Derived::Zero(x.rows(), x.cols()).matrix();
+  }
+};
+
+template<typename Derived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
+struct isMuchSmallerThan_scalar_selector
+{
+  EIGEN_DEVICE_FUNC
+  static bool run(const Derived& x, const typename Derived::RealScalar& y, const typename Derived::RealScalar& prec)
+  {
+    return x.cwiseAbs2().sum() <= numext::abs2(prec * y);
+  }
+};
+
+template<typename Derived>
+struct isMuchSmallerThan_scalar_selector<Derived, true>
+{
+  EIGEN_DEVICE_FUNC
+  static bool run(const Derived& x, const typename Derived::RealScalar&, const typename Derived::RealScalar&)
+  {
+    return x.matrix() == Derived::Zero(x.rows(), x.cols()).matrix();
+  }
+};
+
+} // end namespace internal
+
+
+/** \returns \c true if \c *this is approximately equal to \a other, within the precision
+  * determined by \a prec.
+  *
+  * \note The fuzzy compares are done multiplicatively. Two vectors \f$ v \f$ and \f$ w \f$
+  * are considered to be approximately equal within precision \f$ p \f$ if
+  * \f[ \Vert v - w \Vert \leqslant p\,\min(\Vert v\Vert, \Vert w\Vert). \f]
+  * For matrices, the comparison is done using the Hilbert-Schmidt norm (aka Frobenius norm
+  * L2 norm).
+  *
+  * \note Because of the multiplicativeness of this comparison, one can't use this function
+  * to check whether \c *this is approximately equal to the zero matrix or vector.
+  * Indeed, \c isApprox(zero) returns false unless \c *this itself is exactly the zero matrix
+  * or vector. If you want to test whether \c *this is zero, use internal::isMuchSmallerThan(const
+  * RealScalar&, RealScalar) instead.
+  *
+  * \sa internal::isMuchSmallerThan(const RealScalar&, RealScalar) const
+  */
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApprox(
+  const DenseBase<OtherDerived>& other,
+  const RealScalar& prec
+) const
+{
+  return internal::isApprox_selector<Derived, OtherDerived>::run(derived(), other.derived(), prec);
+}
+
+/** \returns \c true if the norm of \c *this is much smaller than \a other,
+  * within the precision determined by \a prec.
+  *
+  * \note The fuzzy compares are done multiplicatively. A vector \f$ v \f$ is
+  * considered to be much smaller than \f$ x \f$ within precision \f$ p \f$ if
+  * \f[ \Vert v \Vert \leqslant p\,\vert x\vert. \f]
+  *
+  * For matrices, the comparison is done using the Hilbert-Schmidt norm. For this reason,
+  * the value of the reference scalar \a other should come from the Hilbert-Schmidt norm
+  * of a reference matrix of same dimensions.
+  *
+  * \sa isApprox(), isMuchSmallerThan(const DenseBase<OtherDerived>&, RealScalar) const
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
+  const typename NumTraits<Scalar>::Real& other,
+  const RealScalar& prec
+) const
+{
+  return internal::isMuchSmallerThan_scalar_selector<Derived>::run(derived(), other, prec);
+}
+
+/** \returns \c true if the norm of \c *this is much smaller than the norm of \a other,
+  * within the precision determined by \a prec.
+  *
+  * \note The fuzzy compares are done multiplicatively. A vector \f$ v \f$ is
+  * considered to be much smaller than a vector \f$ w \f$ within precision \f$ p \f$ if
+  * \f[ \Vert v \Vert \leqslant p\,\Vert w\Vert. \f]
+  * For matrices, the comparison is done using the Hilbert-Schmidt norm.
+  *
+  * \sa isApprox(), isMuchSmallerThan(const RealScalar&, RealScalar) const
+  */
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
+  const DenseBase<OtherDerived>& other,
+  const RealScalar& prec
+) const
+{
+  return internal::isMuchSmallerThan_object_selector<Derived, OtherDerived>::run(derived(), other.derived(), prec);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_FUZZY_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/GeneralProduct.h b/src/3rdparty/eigen/Eigen/src/Core/GeneralProduct.h
new file mode 100644
index 000000000..6906aa75d
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/GeneralProduct.h
@@ -0,0 +1,465 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERAL_PRODUCT_H
+#define EIGEN_GENERAL_PRODUCT_H
+
+namespace Eigen {
+
+enum {
+  Large = 2,
+  Small = 3
+};
+
+// Define the threshold value to fallback from the generic matrix-matrix product
+// implementation (heavy) to the lightweight coeff-based product one.
+// See generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
+// in products/GeneralMatrixMatrix.h for more details.
+// TODO This threshold should also be used in the compile-time selector below.
+#ifndef EIGEN_GEMM_TO_COEFFBASED_THRESHOLD
+// This default value has been obtained on a Haswell architecture.
+#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 20
+#endif
+
+namespace internal {
+
+template<int Rows, int Cols, int Depth> struct product_type_selector;
+
+template<int Size, int MaxSize> struct product_size_category
+{
+  enum {
+    #ifndef EIGEN_GPU_COMPILE_PHASE
+    is_large = MaxSize == Dynamic ||
+               Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
+               (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
+    #else
+    is_large = 0,
+    #endif
+    value = is_large  ? Large
+          : Size == 1 ? 1
+                      : Small
+  };
+};
+
+template<typename Lhs, typename Rhs> struct product_type
+{
+  typedef typename remove_all<Lhs>::type _Lhs;
+  typedef typename remove_all<Rhs>::type _Rhs;
+  enum {
+    MaxRows = traits<_Lhs>::MaxRowsAtCompileTime,
+    Rows    = traits<_Lhs>::RowsAtCompileTime,
+    MaxCols = traits<_Rhs>::MaxColsAtCompileTime,
+    Cols    = traits<_Rhs>::ColsAtCompileTime,
+    MaxDepth = EIGEN_SIZE_MIN_PREFER_FIXED(traits<_Lhs>::MaxColsAtCompileTime,
+                                           traits<_Rhs>::MaxRowsAtCompileTime),
+    Depth = EIGEN_SIZE_MIN_PREFER_FIXED(traits<_Lhs>::ColsAtCompileTime,
+                                        traits<_Rhs>::RowsAtCompileTime)
+  };
+
+  // the splitting into different lines of code here, introducing the _select enums and the typedef below,
+  // is to work around an internal compiler error with gcc 4.1 and 4.2.
+private:
+  enum {
+    rows_select = product_size_category<Rows,MaxRows>::value,
+    cols_select = product_size_category<Cols,MaxCols>::value,
+    depth_select = product_size_category<Depth,MaxDepth>::value
+  };
+  typedef product_type_selector<rows_select, cols_select, depth_select> selector;
+
+public:
+  enum {
+    value = selector::ret,
+    ret = selector::ret
+  };
+#ifdef EIGEN_DEBUG_PRODUCT
+  static void debug()
+  {
+      EIGEN_DEBUG_VAR(Rows);
+      EIGEN_DEBUG_VAR(Cols);
+      EIGEN_DEBUG_VAR(Depth);
+      EIGEN_DEBUG_VAR(rows_select);
+      EIGEN_DEBUG_VAR(cols_select);
+      EIGEN_DEBUG_VAR(depth_select);
+      EIGEN_DEBUG_VAR(value);
+  }
+#endif
+};
+
+/* The following allows to select the kind of product at compile time
+ * based on the three dimensions of the product.
+ * This is a compile time mapping from {1,Small,Large}^3 -> {product types} */
+// FIXME I'm not sure the current mapping is the ideal one.
+template<int M, int N>  struct product_type_selector<M,N,1>              { enum { ret = OuterProduct }; };
+template<int M>         struct product_type_selector<M, 1, 1>            { enum { ret = LazyCoeffBasedProductMode }; };
+template<int N>         struct product_type_selector<1, N, 1>            { enum { ret = LazyCoeffBasedProductMode }; };
+template<int Depth>     struct product_type_selector<1,    1,    Depth>  { enum { ret = InnerProduct }; };
+template<>              struct product_type_selector<1,    1,    1>      { enum { ret = InnerProduct }; };
+template<>              struct product_type_selector<Small,1,    Small>  { enum { ret = CoeffBasedProductMode }; };
+template<>              struct product_type_selector<1,    Small,Small>  { enum { ret = CoeffBasedProductMode }; };
+template<>              struct product_type_selector<Small,Small,Small>  { enum { ret = CoeffBasedProductMode }; };
+template<>              struct product_type_selector<Small, Small, 1>    { enum { ret = LazyCoeffBasedProductMode }; };
+template<>              struct product_type_selector<Small, Large, 1>    { enum { ret = LazyCoeffBasedProductMode }; };
+template<>              struct product_type_selector<Large, Small, 1>    { enum { ret = LazyCoeffBasedProductMode }; };
+template<>              struct product_type_selector<1,    Large,Small>  { enum { ret = CoeffBasedProductMode }; };
+template<>              struct product_type_selector<1,    Large,Large>  { enum { ret = GemvProduct }; };
+template<>              struct product_type_selector<1,    Small,Large>  { enum { ret = CoeffBasedProductMode }; };
+template<>              struct product_type_selector<Large,1,    Small>  { enum { ret = CoeffBasedProductMode }; };
+template<>              struct product_type_selector<Large,1,    Large>  { enum { ret = GemvProduct }; };
+template<>              struct product_type_selector<Small,1,    Large>  { enum { ret = CoeffBasedProductMode }; };
+template<>              struct product_type_selector<Small,Small,Large>  { enum { ret = GemmProduct }; };
+template<>              struct product_type_selector<Large,Small,Large>  { enum { ret = GemmProduct }; };
+template<>              struct product_type_selector<Small,Large,Large>  { enum { ret = GemmProduct }; };
+template<>              struct product_type_selector<Large,Large,Large>  { enum { ret = GemmProduct }; };
+template<>              struct product_type_selector<Large,Small,Small>  { enum { ret = CoeffBasedProductMode }; };
+template<>              struct product_type_selector<Small,Large,Small>  { enum { ret = CoeffBasedProductMode }; };
+template<>              struct product_type_selector<Large,Large,Small>  { enum { ret = GemmProduct }; };
+
+} // end namespace internal
+
+/***********************************************************************
+*  Implementation of Inner Vector Vector Product
+***********************************************************************/
+
+// FIXME : maybe the "inner product" could return a Scalar
+// instead of a 1x1 matrix ??
+// Pro: more natural for the user
+// Cons: this could be a problem if in a meta unrolled algorithm a matrix-matrix
+// product ends up to a row-vector times col-vector product... To tackle this use
+// case, we could have a specialization for Block<MatrixType,1,1> with: operator=(Scalar x);
+
+/***********************************************************************
+*  Implementation of Outer Vector Vector Product
+***********************************************************************/
+
+/***********************************************************************
+*  Implementation of General Matrix Vector Product
+***********************************************************************/
+
+/*  According to the shape/flags of the matrix we have to distinghish 3 different cases:
+ *   1 - the matrix is col-major, BLAS compatible and M is large => call fast BLAS-like colmajor routine
+ *   2 - the matrix is row-major, BLAS compatible and N is large => call fast BLAS-like rowmajor routine
+ *   3 - all other cases are handled using a simple loop along the outer-storage direction.
+ *  Therefore we need a lower level meta selector.
+ *  Furthermore, if the matrix is the rhs, then the product has to be transposed.
+ */
+namespace internal {
+
+template<int Side, int StorageOrder, bool BlasCompatible>
+struct gemv_dense_selector;
+
+} // end namespace internal
+
+namespace internal {
+
+template<typename Scalar,int Size,int MaxSize,bool Cond> struct gemv_static_vector_if;
+
+template<typename Scalar,int Size,int MaxSize>
+struct gemv_static_vector_if<Scalar,Size,MaxSize,false>
+{
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
+};
+
+template<typename Scalar,int Size>
+struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
+{
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { return 0; }
+};
+
+template<typename Scalar,int Size,int MaxSize>
+struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
+{
+  enum {
+    ForceAlignment  = internal::packet_traits<Scalar>::Vectorizable,
+    PacketSize      = internal::packet_traits<Scalar>::size
+  };
+  #if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
+  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0,EIGEN_PLAIN_ENUM_MIN(AlignedMax,PacketSize)> m_data;
+  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
+  #else
+  // Some architectures cannot align on the stack,
+  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
+  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?EIGEN_MAX_ALIGN_BYTES:0),0> m_data;
+  EIGEN_STRONG_INLINE Scalar* data() {
+    return ForceAlignment
+            ? reinterpret_cast<Scalar*>((internal::UIntPtr(m_data.array) & ~(std::size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES)
+            : m_data.array;
+  }
+  #endif
+};
+
+// The vector is on the left => transposition
+template<int StorageOrder, bool BlasCompatible>
+struct gemv_dense_selector<OnTheLeft,StorageOrder,BlasCompatible>
+{
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
+  {
+    Transpose<Dest> destT(dest);
+    enum { OtherStorageOrder = StorageOrder == RowMajor ? ColMajor : RowMajor };
+    gemv_dense_selector<OnTheRight,OtherStorageOrder,BlasCompatible>
+      ::run(rhs.transpose(), lhs.transpose(), destT, alpha);
+  }
+};
+
+template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
+{
+  template<typename Lhs, typename Rhs, typename Dest>
+  static inline void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
+  {
+    typedef typename Lhs::Scalar   LhsScalar;
+    typedef typename Rhs::Scalar   RhsScalar;
+    typedef typename Dest::Scalar  ResScalar;
+    typedef typename Dest::RealScalar  RealScalar;
+    
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+  
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
+
+    ActualLhsType actualLhs = LhsBlasTraits::extract(lhs);
+    ActualRhsType actualRhs = RhsBlasTraits::extract(rhs);
+
+    ResScalar actualAlpha = combine_scalar_factors(alpha, lhs, rhs);
+
+    // make sure Dest is a compile-time vector type (bug 1166)
+    typedef typename conditional<Dest::IsVectorAtCompileTime, Dest, typename Dest::ColXpr>::type ActualDest;
+
+    enum {
+      // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
+      // on, the other hand it is good for the cache to pack the vector anyways...
+      EvalToDestAtCompileTime = (ActualDest::InnerStrideAtCompileTime==1),
+      ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex),
+      MightCannotUseDest = ((!EvalToDestAtCompileTime) || ComplexByReal) && (ActualDest::MaxSizeAtCompileTime!=0)
+    };
+
+    typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
+    RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
+
+    if(!MightCannotUseDest)
+    {
+      // shortcut if we are sure to be able to use dest directly,
+      // this ease the compiler to generate cleaner and more optimzized code for most common cases
+      general_matrix_vector_product
+          <Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
+          actualLhs.rows(), actualLhs.cols(),
+          LhsMapper(actualLhs.data(), actualLhs.outerStride()),
+          RhsMapper(actualRhs.data(), actualRhs.innerStride()),
+          dest.data(), 1,
+          compatibleAlpha);
+    }
+    else
+    {
+      gemv_static_vector_if<ResScalar,ActualDest::SizeAtCompileTime,ActualDest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;
+
+      const bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
+      const bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
+
+      ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
+                                                    evalToDest ? dest.data() : static_dest.data());
+
+      if(!evalToDest)
+      {
+        #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+        Index size = dest.size();
+        EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+        #endif
+        if(!alphaIsCompatible)
+        {
+          MappedDest(actualDestPtr, dest.size()).setZero();
+          compatibleAlpha = RhsScalar(1);
+        }
+        else
+          MappedDest(actualDestPtr, dest.size()) = dest;
+      }
+
+      general_matrix_vector_product
+          <Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
+          actualLhs.rows(), actualLhs.cols(),
+          LhsMapper(actualLhs.data(), actualLhs.outerStride()),
+          RhsMapper(actualRhs.data(), actualRhs.innerStride()),
+          actualDestPtr, 1,
+          compatibleAlpha);
+
+      if (!evalToDest)
+      {
+        if(!alphaIsCompatible)
+          dest.matrix() += actualAlpha * MappedDest(actualDestPtr, dest.size());
+        else
+          dest = MappedDest(actualDestPtr, dest.size());
+      }
+    }
+  }
+};
+
+template<> struct gemv_dense_selector<OnTheRight,RowMajor,true>
+{
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
+  {
+    typedef typename Lhs::Scalar   LhsScalar;
+    typedef typename Rhs::Scalar   RhsScalar;
+    typedef typename Dest::Scalar  ResScalar;
+    
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+    typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
+
+    typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
+    typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
+
+    ResScalar actualAlpha = combine_scalar_factors(alpha, lhs, rhs);
+
+    enum {
+      // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
+      // on, the other hand it is good for the cache to pack the vector anyways...
+      DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1 || ActualRhsTypeCleaned::MaxSizeAtCompileTime==0
+    };
+
+    gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
+
+    ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,actualRhs.size(),
+        DirectlyUseRhs ? const_cast<RhsScalar*>(actualRhs.data()) : static_rhs.data());
+
+    if(!DirectlyUseRhs)
+    {
+      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      Index size = actualRhs.size();
+      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      #endif
+      Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
+    }
+
+    typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
+    general_matrix_vector_product
+        <Index,LhsScalar,LhsMapper,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
+        actualLhs.rows(), actualLhs.cols(),
+        LhsMapper(actualLhs.data(), actualLhs.outerStride()),
+        RhsMapper(actualRhsPtr, 1),
+        dest.data(), dest.col(0).innerStride(), //NOTE  if dest is not a vector at compile-time, then dest.innerStride() might be wrong. (bug 1166)
+        actualAlpha);
+  }
+};
+
+template<> struct gemv_dense_selector<OnTheRight,ColMajor,false>
+{
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
+  {
+    EIGEN_STATIC_ASSERT((!nested_eval<Lhs,1>::Evaluate),EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE);
+    // TODO if rhs is large enough it might be beneficial to make sure that dest is sequentially stored in memory, otherwise use a temp
+    typename nested_eval<Rhs,1>::type actual_rhs(rhs);
+    const Index size = rhs.rows();
+    for(Index k=0; k<size; ++k)
+      dest += (alpha*actual_rhs.coeff(k)) * lhs.col(k);
+  }
+};
+
+template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
+{
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
+  {
+    EIGEN_STATIC_ASSERT((!nested_eval<Lhs,1>::Evaluate),EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE);
+    typename nested_eval<Rhs,Lhs::RowsAtCompileTime>::type actual_rhs(rhs);
+    const Index rows = dest.rows();
+    for(Index i=0; i<rows; ++i)
+      dest.coeffRef(i) += alpha * (lhs.row(i).cwiseProduct(actual_rhs.transpose())).sum();
+  }
+};
+
+} // end namespace internal
+
+/***************************************************************************
+* Implementation of matrix base methods
+***************************************************************************/
+
+/** \returns the matrix product of \c *this and \a other.
+  *
+  * \note If instead of the matrix product you want the coefficient-wise product, see Cwise::operator*().
+  *
+  * \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*()
+  */
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const Product<Derived, OtherDerived>
+MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
+{
+  // A note regarding the function declaration: In MSVC, this function will sometimes
+  // not be inlined since DenseStorage is an unwindable object for dynamic
+  // matrices and product types are holding a member to store the result.
+  // Thus it does not help tagging this function with EIGEN_STRONG_INLINE.
+  enum {
+    ProductIsValid =  Derived::ColsAtCompileTime==Dynamic
+                   || OtherDerived::RowsAtCompileTime==Dynamic
+                   || int(Derived::ColsAtCompileTime)==int(OtherDerived::RowsAtCompileTime),
+    AreVectors = Derived::IsVectorAtCompileTime && OtherDerived::IsVectorAtCompileTime,
+    SameSizes = EIGEN_PREDICATE_SAME_MATRIX_SIZE(Derived,OtherDerived)
+  };
+  // note to the lost user:
+  //    * for a dot product use: v1.dot(v2)
+  //    * for a coeff-wise product use: v1.cwiseProduct(v2)
+  EIGEN_STATIC_ASSERT(ProductIsValid || !(AreVectors && SameSizes),
+    INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS)
+  EIGEN_STATIC_ASSERT(ProductIsValid || !(SameSizes && !AreVectors),
+    INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
+  EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT)
+#ifdef EIGEN_DEBUG_PRODUCT
+  internal::product_type<Derived,OtherDerived>::debug();
+#endif
+
+  return Product<Derived, OtherDerived>(derived(), other.derived());
+}
+
+/** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
+  *
+  * The returned product will behave like any other expressions: the coefficients of the product will be
+  * computed once at a time as requested. This might be useful in some extremely rare cases when only
+  * a small and no coherent fraction of the result's coefficients have to be computed.
+  *
+  * \warning This version of the matrix product can be much much slower. So use it only if you know
+  * what you are doing and that you measured a true speed improvement.
+  *
+  * \sa operator*(const MatrixBase&)
+  */
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const Product<Derived,OtherDerived,LazyProduct>
+MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
+{
+  enum {
+    ProductIsValid =  Derived::ColsAtCompileTime==Dynamic
+                   || OtherDerived::RowsAtCompileTime==Dynamic
+                   || int(Derived::ColsAtCompileTime)==int(OtherDerived::RowsAtCompileTime),
+    AreVectors = Derived::IsVectorAtCompileTime && OtherDerived::IsVectorAtCompileTime,
+    SameSizes = EIGEN_PREDICATE_SAME_MATRIX_SIZE(Derived,OtherDerived)
+  };
+  // note to the lost user:
+  //    * for a dot product use: v1.dot(v2)
+  //    * for a coeff-wise product use: v1.cwiseProduct(v2)
+  EIGEN_STATIC_ASSERT(ProductIsValid || !(AreVectors && SameSizes),
+    INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS)
+  EIGEN_STATIC_ASSERT(ProductIsValid || !(SameSizes && !AreVectors),
+    INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
+  EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT)
+
+  return Product<Derived,OtherDerived,LazyProduct>(derived(), other.derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_PRODUCT_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/GenericPacketMath.h b/src/3rdparty/eigen/Eigen/src/Core/GenericPacketMath.h
new file mode 100644
index 000000000..cf677a190
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/GenericPacketMath.h
@@ -0,0 +1,1040 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERIC_PACKET_MATH_H
+#define EIGEN_GENERIC_PACKET_MATH_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal
+  * \file GenericPacketMath.h
+  *
+  * Default implementation for types not supported by the vectorization.
+  * In practice these functions are provided to make easier the writing
+  * of generic vectorized code.
+  */
+
+#ifndef EIGEN_DEBUG_ALIGNED_LOAD
+#define EIGEN_DEBUG_ALIGNED_LOAD
+#endif
+
+#ifndef EIGEN_DEBUG_UNALIGNED_LOAD
+#define EIGEN_DEBUG_UNALIGNED_LOAD
+#endif
+
+#ifndef EIGEN_DEBUG_ALIGNED_STORE
+#define EIGEN_DEBUG_ALIGNED_STORE
+#endif
+
+#ifndef EIGEN_DEBUG_UNALIGNED_STORE
+#define EIGEN_DEBUG_UNALIGNED_STORE
+#endif
+
+struct default_packet_traits
+{
+  enum {
+    HasHalfPacket = 0,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasAbsDiff   = 0,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 1,
+    HasBlend     = 0,
+    // This flag is used to indicate whether packet comparison is supported.
+    // pcmp_eq, pcmp_lt and pcmp_le should be defined for it to be true.
+    HasCmp       = 0,
+
+    HasDiv    = 0,
+    HasSqrt   = 0,
+    HasRsqrt  = 0,
+    HasExp    = 0,
+    HasExpm1  = 0,
+    HasLog    = 0,
+    HasLog1p  = 0,
+    HasLog10  = 0,
+    HasPow    = 0,
+
+    HasSin    = 0,
+    HasCos    = 0,
+    HasTan    = 0,
+    HasASin   = 0,
+    HasACos   = 0,
+    HasATan   = 0,
+    HasSinh   = 0,
+    HasCosh   = 0,
+    HasTanh   = 0,
+    HasLGamma = 0,
+    HasDiGamma = 0,
+    HasZeta = 0,
+    HasPolygamma = 0,
+    HasErf = 0,
+    HasErfc = 0,
+    HasNdtri = 0,
+    HasBessel = 0,
+    HasIGamma = 0,
+    HasIGammaDerA = 0,
+    HasGammaSampleDerAlpha = 0,
+    HasIGammac = 0,
+    HasBetaInc = 0,
+
+    HasRound  = 0,
+    HasRint   = 0,
+    HasFloor  = 0,
+    HasCeil   = 0,
+    HasSign   = 0
+  };
+};
+
+template<typename T> struct packet_traits : default_packet_traits
+{
+  typedef T type;
+  typedef T half;
+  enum {
+    Vectorizable = 0,
+    size = 1,
+    AlignedOnScalar = 0,
+    HasHalfPacket = 0
+  };
+  enum {
+    HasAdd    = 0,
+    HasSub    = 0,
+    HasMul    = 0,
+    HasNegate = 0,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasConj   = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<typename T> struct packet_traits<const T> : packet_traits<T> { };
+
+template<typename T> struct unpacket_traits
+{
+  typedef T type;
+  typedef T half;
+  enum
+  {
+    size = 1,
+    alignment = 1,
+    vectorizable = false,
+    masked_load_available=false,
+    masked_store_available=false
+  };
+};
+
+template<typename T> struct unpacket_traits<const T> : unpacket_traits<T> { };
+
+template <typename Src, typename Tgt> struct type_casting_traits {
+  enum {
+    VectorizedCast = 0,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+/** \internal Wrapper to ensure that multiple packet types can map to the same
+    same underlying vector type. */
+template<typename T, int unique_id = 0>
+struct eigen_packet_wrapper
+{
+  EIGEN_ALWAYS_INLINE operator T&() { return m_val; }
+  EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; }
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {}
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {}
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) {
+    m_val = v;
+    return *this;
+  }
+
+  T m_val;
+};
+
+
+/** \internal A convenience utility for determining if the type is a scalar.
+ * This is used to enable some generic packet implementations.
+ */
+template<typename Packet>
+struct is_scalar {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  enum {
+    value = internal::is_same<Packet, Scalar>::value
+  };
+};
+
+/** \internal \returns static_cast<TgtType>(a) (coeff-wise) */
+template <typename SrcPacket, typename TgtPacket>
+EIGEN_DEVICE_FUNC inline TgtPacket
+pcast(const SrcPacket& a) {
+  return static_cast<TgtPacket>(a);
+}
+template <typename SrcPacket, typename TgtPacket>
+EIGEN_DEVICE_FUNC inline TgtPacket
+pcast(const SrcPacket& a, const SrcPacket& /*b*/) {
+  return static_cast<TgtPacket>(a);
+}
+template <typename SrcPacket, typename TgtPacket>
+EIGEN_DEVICE_FUNC inline TgtPacket
+pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const SrcPacket& /*d*/) {
+  return static_cast<TgtPacket>(a);
+}
+template <typename SrcPacket, typename TgtPacket>
+EIGEN_DEVICE_FUNC inline TgtPacket
+pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const SrcPacket& /*d*/,
+      const SrcPacket& /*e*/, const SrcPacket& /*f*/, const SrcPacket& /*g*/, const SrcPacket& /*h*/) {
+  return static_cast<TgtPacket>(a);
+}
+
+/** \internal \returns reinterpret_cast<Target>(a) */
+template <typename Target, typename Packet>
+EIGEN_DEVICE_FUNC inline Target
+preinterpret(const Packet& a); /* { return reinterpret_cast<const Target&>(a); } */
+
+/** \internal \returns a + b (coeff-wise) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+padd(const Packet& a, const Packet& b) { return a+b; }
+// Avoid compiler warning for boolean algebra.
+template<> EIGEN_DEVICE_FUNC inline bool
+padd(const bool& a, const bool& b) { return a || b; }
+
+/** \internal \returns a - b (coeff-wise) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+psub(const Packet& a, const Packet& b) { return a-b; }
+
+/** \internal \returns -a (coeff-wise) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pnegate(const Packet& a) { return -a; }
+
+template<> EIGEN_DEVICE_FUNC inline bool
+pnegate(const bool& a) { return !a; }
+
+/** \internal \returns conj(a) (coeff-wise) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pconj(const Packet& a) { return numext::conj(a); }
+
+/** \internal \returns a * b (coeff-wise) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pmul(const Packet& a, const Packet& b) { return a*b; }
+// Avoid compiler warning for boolean algebra.
+template<> EIGEN_DEVICE_FUNC inline bool
+pmul(const bool& a, const bool& b) { return a && b; }
+
+/** \internal \returns a / b (coeff-wise) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pdiv(const Packet& a, const Packet& b) { return a/b; }
+
+// In the generic case, memset to all one bits.
+template<typename Packet, typename EnableIf = void>
+struct ptrue_impl {
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/){
+    Packet b;
+    memset(static_cast<void*>(&b), 0xff, sizeof(Packet));
+    return b;
+  }
+};
+
+// For non-trivial scalars, set to Scalar(1) (i.e. a non-zero value).
+// Although this is technically not a valid bitmask, the scalar path for pselect
+// uses a comparison to zero, so this should still work in most cases. We don't
+// have another option, since the scalar type requires initialization.
+template<typename T>
+struct ptrue_impl<T, 
+    typename internal::enable_if<is_scalar<T>::value && NumTraits<T>::RequireInitialization>::type > {
+  static EIGEN_DEVICE_FUNC inline T run(const T& /*a*/){
+    return T(1);
+  }
+};
+
+/** \internal \returns one bits. */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+ptrue(const Packet& a) {
+  return ptrue_impl<Packet>::run(a);
+}
+
+// In the general case, memset to zero.
+template<typename Packet, typename EnableIf = void>
+struct pzero_impl {
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/) {
+    Packet b;
+    memset(static_cast<void*>(&b), 0x00, sizeof(Packet));
+    return b;
+  }
+};
+
+// For scalars, explicitly set to Scalar(0), since the underlying representation
+// for zero may not consist of all-zero bits.
+template<typename T>
+struct pzero_impl<T,
+    typename internal::enable_if<is_scalar<T>::value>::type> {
+  static EIGEN_DEVICE_FUNC inline T run(const T& /*a*/) {
+    return T(0);
+  }
+};
+
+/** \internal \returns packet of zeros */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pzero(const Packet& a) {
+  return pzero_impl<Packet>::run(a);
+}
+
+/** \internal \returns a <= b as a bit mask */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pcmp_le(const Packet& a, const Packet& b)  { return a<=b ? ptrue(a) : pzero(a); }
+
+/** \internal \returns a < b as a bit mask */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pcmp_lt(const Packet& a, const Packet& b)  { return a<b ? ptrue(a) : pzero(a); }
+
+/** \internal \returns a == b as a bit mask */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pcmp_eq(const Packet& a, const Packet& b) { return a==b ? ptrue(a) : pzero(a); }
+
+/** \internal \returns a < b or a==NaN or b==NaN as a bit mask */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pcmp_lt_or_nan(const Packet& a, const Packet& b) { return a>=b ? pzero(a) : ptrue(a); }
+
+template<typename T>
+struct bit_and {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const {
+    return a & b;
+  }
+};
+
+template<typename T>
+struct bit_or {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const {
+    return a | b;
+  }
+};
+
+template<typename T>
+struct bit_xor {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const {
+    return a ^ b;
+  }
+};
+
+template<typename T>
+struct bit_not {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a) const {
+    return ~a;
+  }
+};
+
+// Use operators &, |, ^, ~.
+template<typename T>
+struct operator_bitwise_helper {
+  EIGEN_DEVICE_FUNC static inline T bitwise_and(const T& a, const T& b) { return bit_and<T>()(a, b); }
+  EIGEN_DEVICE_FUNC static inline T bitwise_or(const T& a, const T& b) { return bit_or<T>()(a, b); }
+  EIGEN_DEVICE_FUNC static inline T bitwise_xor(const T& a, const T& b) { return bit_xor<T>()(a, b); }
+  EIGEN_DEVICE_FUNC static inline T bitwise_not(const T& a) { return bit_not<T>()(a); }
+};
+
+// Apply binary operations byte-by-byte
+template<typename T>
+struct bytewise_bitwise_helper {
+  EIGEN_DEVICE_FUNC static inline T bitwise_and(const T& a, const T& b) {
+    return binary(a, b, bit_and<unsigned char>());
+  }
+  EIGEN_DEVICE_FUNC static inline T bitwise_or(const T& a, const T& b) { 
+    return binary(a, b, bit_or<unsigned char>());
+   }
+  EIGEN_DEVICE_FUNC static inline T bitwise_xor(const T& a, const T& b) {
+    return binary(a, b, bit_xor<unsigned char>());
+  }
+  EIGEN_DEVICE_FUNC static inline T bitwise_not(const T& a) { 
+    return unary(a,bit_not<unsigned char>());
+   }
+  
+ private:
+  template<typename Op>
+  EIGEN_DEVICE_FUNC static inline T unary(const T& a, Op op) {
+    const unsigned char* a_ptr = reinterpret_cast<const unsigned char*>(&a);
+    T c;
+    unsigned char* c_ptr = reinterpret_cast<unsigned char*>(&c);
+    for (size_t i = 0; i < sizeof(T); ++i) {
+      *c_ptr++ = op(*a_ptr++);
+    }
+    return c;
+  }
+
+  template<typename Op>
+  EIGEN_DEVICE_FUNC static inline T binary(const T& a, const T& b, Op op) {
+    const unsigned char* a_ptr = reinterpret_cast<const unsigned char*>(&a);
+    const unsigned char* b_ptr = reinterpret_cast<const unsigned char*>(&b);
+    T c;
+    unsigned char* c_ptr = reinterpret_cast<unsigned char*>(&c);
+    for (size_t i = 0; i < sizeof(T); ++i) {
+      *c_ptr++ = op(*a_ptr++, *b_ptr++);
+    }
+    return c;
+  }
+};
+
+// In the general case, use byte-by-byte manipulation.
+template<typename T, typename EnableIf = void>
+struct bitwise_helper : public bytewise_bitwise_helper<T> {};
+
+// For integers or non-trivial scalars, use binary operators.
+template<typename T>
+struct bitwise_helper<T,
+  typename internal::enable_if<
+    is_scalar<T>::value && (NumTraits<T>::IsInteger || NumTraits<T>::RequireInitialization)>::type
+  > : public operator_bitwise_helper<T> {};
+
+/** \internal \returns the bitwise and of \a a and \a b */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pand(const Packet& a, const Packet& b) {
+  return bitwise_helper<Packet>::bitwise_and(a, b);
+}
+
+/** \internal \returns the bitwise or of \a a and \a b */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+por(const Packet& a, const Packet& b) {
+  return bitwise_helper<Packet>::bitwise_or(a, b);
+}
+
+/** \internal \returns the bitwise xor of \a a and \a b */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pxor(const Packet& a, const Packet& b) {
+  return bitwise_helper<Packet>::bitwise_xor(a, b);
+}
+
+/** \internal \returns the bitwise not of \a a */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pnot(const Packet& a) {
+  return bitwise_helper<Packet>::bitwise_not(a);
+}
+
+/** \internal \returns the bitwise and of \a a and not \a b */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pandnot(const Packet& a, const Packet& b) { return pand(a, pnot(b)); }
+
+// In the general case, use bitwise select.
+template<typename Packet, typename EnableIf = void>
+struct pselect_impl {
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) {
+    return por(pand(a,mask),pandnot(b,mask));
+  }
+};
+
+// For scalars, use ternary select.
+template<typename Packet>
+struct pselect_impl<Packet, 
+    typename internal::enable_if<is_scalar<Packet>::value>::type > {
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) {
+    return numext::equal_strict(mask, Packet(0)) ? b : a;
+  }
+};
+
+/** \internal \returns \a or \b for each field in packet according to \mask */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pselect(const Packet& mask, const Packet& a, const Packet& b) {
+  return pselect_impl<Packet>::run(mask, a, b);
+}
+
+template<> EIGEN_DEVICE_FUNC inline bool pselect<bool>(
+    const bool& cond, const bool& a, const bool& b) {
+  return cond ? a : b;
+}
+
+/** \internal \returns the min or of \a a and \a b (coeff-wise)
+    If either \a a or \a b are NaN, the result is implementation defined. */
+template<int NaNPropagation>
+struct pminmax_impl {
+  template <typename Packet, typename Op>
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) {
+    return op(a,b);
+  }
+};
+
+/** \internal \returns the min or max of \a a and \a b (coeff-wise)
+    If either \a a or \a b are NaN, NaN is returned. */
+template<>
+struct pminmax_impl<PropagateNaN> {
+  template <typename Packet, typename Op>
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) {
+  Packet not_nan_mask_a = pcmp_eq(a, a);
+  Packet not_nan_mask_b = pcmp_eq(b, b);
+  return pselect(not_nan_mask_a,
+                 pselect(not_nan_mask_b, op(a, b), b),
+                 a);
+  }
+};
+
+/** \internal \returns the min or max of \a a and \a b (coeff-wise)
+    If both \a a and \a b are NaN, NaN is returned.
+    Equivalent to std::fmin(a, b).  */
+template<>
+struct pminmax_impl<PropagateNumbers> {
+  template <typename Packet, typename Op>
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) {
+  Packet not_nan_mask_a = pcmp_eq(a, a);
+  Packet not_nan_mask_b = pcmp_eq(b, b);
+  return pselect(not_nan_mask_a,
+                 pselect(not_nan_mask_b, op(a, b), a),
+                 b);
+  }
+};
+
+
+#ifndef SYCL_DEVICE_ONLY
+#define EIGEN_BINARY_OP_NAN_PROPAGATION(Type, Func) Func
+#else
+#define EIGEN_BINARY_OP_NAN_PROPAGATION(Type, Func) \
+[](const Type& a, const Type& b) { \
+        return Func(a, b);}
+#endif
+
+/** \internal \returns the min of \a a and \a b  (coeff-wise).
+    If \a a or \b b is NaN, the return value is implementation defined. */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pmin(const Packet& a, const Packet& b) { return numext::mini(a,b); }
+
+/** \internal \returns the min of \a a and \a b  (coeff-wise).
+    NaNPropagation determines the NaN propagation semantics. */
+template <int NaNPropagation, typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pmin(const Packet& a, const Packet& b) {
+  return pminmax_impl<NaNPropagation>::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet, (pmin<Packet>)));
+}
+
+/** \internal \returns the max of \a a and \a b  (coeff-wise)
+    If \a a or \b b is NaN, the return value is implementation defined. */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pmax(const Packet& a, const Packet& b) { return numext::maxi(a, b); }
+
+/** \internal \returns the max of \a a and \a b  (coeff-wise).
+    NaNPropagation determines the NaN propagation semantics. */
+template <int NaNPropagation, typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pmax(const Packet& a, const Packet& b) {
+  return pminmax_impl<NaNPropagation>::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet,(pmax<Packet>)));
+}
+
+/** \internal \returns the absolute value of \a a */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pabs(const Packet& a) { return numext::abs(a); }
+template<> EIGEN_DEVICE_FUNC inline unsigned int
+pabs(const unsigned int& a) { return a; }
+template<> EIGEN_DEVICE_FUNC inline unsigned long
+pabs(const unsigned long& a) { return a; }
+template<> EIGEN_DEVICE_FUNC inline unsigned long long
+pabs(const unsigned long long& a) { return a; }
+
+/** \internal \returns the addsub value of \a a,b */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+paddsub(const Packet& a, const Packet& b) {
+  return pselect(peven_mask(a), padd(a, b), psub(a, b));
+ }
+
+/** \internal \returns the phase angle of \a a */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+parg(const Packet& a) { using numext::arg; return arg(a); }
+
+
+/** \internal \returns \a a logically shifted by N bits to the right */
+template<int N> EIGEN_DEVICE_FUNC inline int
+parithmetic_shift_right(const int& a) { return a >> N; }
+template<int N> EIGEN_DEVICE_FUNC inline long int
+parithmetic_shift_right(const long int& a) { return a >> N; }
+
+/** \internal \returns \a a arithmetically shifted by N bits to the right */
+template<int N> EIGEN_DEVICE_FUNC inline int
+plogical_shift_right(const int& a) { return static_cast<int>(static_cast<unsigned int>(a) >> N); }
+template<int N> EIGEN_DEVICE_FUNC inline long int
+plogical_shift_right(const long int& a) { return static_cast<long>(static_cast<unsigned long>(a) >> N); }
+
+/** \internal \returns \a a shifted by N bits to the left */
+template<int N> EIGEN_DEVICE_FUNC inline int
+plogical_shift_left(const int& a) { return a << N; }
+template<int N> EIGEN_DEVICE_FUNC inline long int
+plogical_shift_left(const long int& a) { return a << N; }
+
+/** \internal \returns the significant and exponent of the underlying floating point numbers
+  * See https://en.cppreference.com/w/cpp/numeric/math/frexp
+  */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pfrexp(const Packet& a, Packet& exponent) {
+  int exp;
+  EIGEN_USING_STD(frexp);
+  Packet result = static_cast<Packet>(frexp(a, &exp));
+  exponent = static_cast<Packet>(exp);
+  return result;
+}
+
+/** \internal \returns a * 2^((int)exponent)
+  * See https://en.cppreference.com/w/cpp/numeric/math/ldexp
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pldexp(const Packet &a, const Packet &exponent) {
+  EIGEN_USING_STD(ldexp)
+  return static_cast<Packet>(ldexp(a, static_cast<int>(exponent)));
+}
+
+/** \internal \returns the min of \a a and \a b  (coeff-wise) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pabsdiff(const Packet& a, const Packet& b) { return pselect(pcmp_lt(a, b), psub(b, a), psub(a, b)); }
+
+/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
+
+/** \internal \returns a packet version of \a *from, (un-aligned load) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
+
+/** \internal \returns a packet version of \a *from, (un-aligned masked load)
+ * There is no generic implementation. We only have implementations for specialized
+ * cases. Generic case should not be called.
+ */
+template<typename Packet> EIGEN_DEVICE_FUNC inline
+typename enable_if<unpacket_traits<Packet>::masked_load_available, Packet>::type
+ploadu(const typename unpacket_traits<Packet>::type* from, typename unpacket_traits<Packet>::mask_t umask);
+
+/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
+
+/** \internal \returns a packet with constant coefficients set from bits */
+template<typename Packet,typename BitsType> EIGEN_DEVICE_FUNC inline Packet
+pset1frombits(BitsType a);
+
+/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pload1(const typename unpacket_traits<Packet>::type  *a) { return pset1<Packet>(*a); }
+
+/** \internal \returns a packet with elements of \a *from duplicated.
+  * For instance, for a packet of 8 elements, 4 scalars will be read from \a *from and
+  * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
+  * Currently, this function is only used for scalar * complex products.
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
+ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
+
+/** \internal \returns a packet with elements of \a *from quadrupled.
+  * For instance, for a packet of 8 elements, 2 scalars will be read from \a *from and
+  * replicated to form: {from[0],from[0],from[0],from[0],from[1],from[1],from[1],from[1]}
+  * Currently, this function is only used in matrix products.
+  * For packet-size smaller or equal to 4, this function is equivalent to pload1
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+ploadquad(const typename unpacket_traits<Packet>::type* from)
+{ return pload1<Packet>(from); }
+
+/** \internal equivalent to
+  * \code
+  * a0 = pload1(a+0);
+  * a1 = pload1(a+1);
+  * a2 = pload1(a+2);
+  * a3 = pload1(a+3);
+  * \endcode
+  * \sa pset1, pload1, ploaddup, pbroadcast2
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC
+inline void pbroadcast4(const typename unpacket_traits<Packet>::type *a,
+                        Packet& a0, Packet& a1, Packet& a2, Packet& a3)
+{
+  a0 = pload1<Packet>(a+0);
+  a1 = pload1<Packet>(a+1);
+  a2 = pload1<Packet>(a+2);
+  a3 = pload1<Packet>(a+3);
+}
+
+/** \internal equivalent to
+  * \code
+  * a0 = pload1(a+0);
+  * a1 = pload1(a+1);
+  * \endcode
+  * \sa pset1, pload1, ploaddup, pbroadcast4
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC
+inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a,
+                        Packet& a0, Packet& a1)
+{
+  a0 = pload1<Packet>(a+0);
+  a1 = pload1<Packet>(a+1);
+}
+
+/** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
+plset(const typename unpacket_traits<Packet>::type& a) { return a; }
+
+/** \internal \returns a packet with constant coefficients \a a, e.g.: (x, 0, x, 0),
+     where x is the value of all 1-bits. */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+peven_mask(const Packet& /*a*/) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  const size_t n = unpacket_traits<Packet>::size;
+  EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Scalar elements[n];
+  for(size_t i = 0; i < n; ++i) {
+    memset(elements+i, ((i & 1) == 0 ? 0xff : 0), sizeof(Scalar));
+  }
+  return ploadu<Packet>(elements);
+}
+
+
+/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from)
+{ (*to) = from; }
+
+/** \internal copy the packet \a from to \a *to, (un-aligned store) */
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
+{  (*to) = from; }
+
+/** \internal copy the packet \a from to \a *to, (un-aligned store with a mask)
+ * There is no generic implementation. We only have implementations for specialized
+ * cases. Generic case should not be called.
+ */
+template<typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline
+typename enable_if<unpacket_traits<Packet>::masked_store_available, void>::type
+pstoreu(Scalar* to, const Packet& from, typename unpacket_traits<Packet>::mask_t umask);
+
+ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/)
+ { return ploadu<Packet>(from); }
+
+ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index /*stride*/)
+ { pstore(to, from); }
+
+/** \internal tries to do cache prefetching of \a addr */
+template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
+{
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+  // do nothing
+#elif defined(EIGEN_CUDA_ARCH)
+#if defined(__LP64__) || EIGEN_OS_WIN64
+  // 64-bit pointer operand constraint for inlined asm
+  asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
+#else
+  // 32-bit pointer operand constraint for inlined asm
+  asm(" prefetch.L1 [ %1 ];" : "=r"(addr) : "r"(addr));
+#endif
+#elif (!EIGEN_COMP_MSVC) && (EIGEN_COMP_GNUC || EIGEN_COMP_CLANG || EIGEN_COMP_ICC)
+  __builtin_prefetch(addr);
+#endif
+}
+
+/** \internal \returns the reversed elements of \a a*/
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
+{ return a; }
+
+/** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a)
+{
+  return Packet(numext::imag(a),numext::real(a));
+}
+
+/**************************
+* Special math functions
+***************************/
+
+/** \internal \returns the sine of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet psin(const Packet& a) { EIGEN_USING_STD(sin); return sin(a); }
+
+/** \internal \returns the cosine of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pcos(const Packet& a) { EIGEN_USING_STD(cos); return cos(a); }
+
+/** \internal \returns the tan of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet ptan(const Packet& a) { EIGEN_USING_STD(tan); return tan(a); }
+
+/** \internal \returns the arc sine of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pasin(const Packet& a) { EIGEN_USING_STD(asin); return asin(a); }
+
+/** \internal \returns the arc cosine of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pacos(const Packet& a) { EIGEN_USING_STD(acos); return acos(a); }
+
+/** \internal \returns the arc tangent of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet patan(const Packet& a) { EIGEN_USING_STD(atan); return atan(a); }
+
+/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet psinh(const Packet& a) { EIGEN_USING_STD(sinh); return sinh(a); }
+
+/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pcosh(const Packet& a) { EIGEN_USING_STD(cosh); return cosh(a); }
+
+/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet ptanh(const Packet& a) { EIGEN_USING_STD(tanh); return tanh(a); }
+
+/** \internal \returns the exp of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pexp(const Packet& a) { EIGEN_USING_STD(exp); return exp(a); }
+
+/** \internal \returns the expm1 of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pexpm1(const Packet& a) { return numext::expm1(a); }
+
+/** \internal \returns the log of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plog(const Packet& a) { EIGEN_USING_STD(log); return log(a); }
+
+/** \internal \returns the log1p of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plog1p(const Packet& a) { return numext::log1p(a); }
+
+/** \internal \returns the log10 of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plog10(const Packet& a) { EIGEN_USING_STD(log10); return log10(a); }
+
+/** \internal \returns the log10 of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plog2(const Packet& a) {
+  typedef typename internal::unpacket_traits<Packet>::type Scalar;
+  return pmul(pset1<Packet>(Scalar(EIGEN_LOG2E)), plog(a)); 
+}
+
+/** \internal \returns the square-root of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet psqrt(const Packet& a) { return numext::sqrt(a); }
+
+/** \internal \returns the reciprocal square-root of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet prsqrt(const Packet& a) {
+  typedef typename internal::unpacket_traits<Packet>::type Scalar;
+  return pdiv(pset1<Packet>(Scalar(1)), psqrt(a));
+}
+
+/** \internal \returns the rounded value of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pround(const Packet& a) { using numext::round; return round(a); }
+
+/** \internal \returns the floor of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pfloor(const Packet& a) { using numext::floor; return floor(a); }
+
+/** \internal \returns the rounded value of \a a (coeff-wise) with current
+ * rounding mode */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet print(const Packet& a) { using numext::rint; return rint(a); }
+
+/** \internal \returns the ceil of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); }
+
+/** \internal \returns the first element of a packet */
+template<typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type
+pfirst(const Packet& a)
+{ return a; }
+
+/** \internal \returns the sum of the elements of upper and lower half of \a a if \a a is larger than 4.
+  * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7}
+  * For packet-size smaller or equal to 4, this boils down to a noop.
+  */
+template<typename Packet>
+EIGEN_DEVICE_FUNC inline typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
+predux_half_dowto4(const Packet& a)
+{ return a; }
+
+// Slow generic implementation of Packet reduction.
+template <typename Packet, typename Op>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type
+predux_helper(const Packet& a, Op op) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  const size_t n = unpacket_traits<Packet>::size;
+  EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Scalar elements[n];
+  pstoreu<Scalar>(elements, a);
+  for(size_t k = n / 2; k > 0; k /= 2)  {
+    for(size_t i = 0; i < k; ++i) {
+      elements[i] = op(elements[i], elements[i + k]);
+    }
+  }
+  return elements[0];
+}
+
+/** \internal \returns the sum of the elements of \a a*/
+template<typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type
+predux(const Packet& a)
+{
+  return a;
+}
+
+/** \internal \returns the product of the elements of \a a */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(
+    const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type Scalar; 
+  return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmul<Scalar>)));
+}
+
+/** \internal \returns the min of the elements of \a a */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(
+    const Packet &a) {
+  typedef typename unpacket_traits<Packet>::type Scalar; 
+  return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin<PropagateFast, Scalar>)));
+}
+
+template <int NaNPropagation, typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(
+    const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type Scalar; 
+  return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin<NaNPropagation, Scalar>)));
+}
+
+/** \internal \returns the min of the elements of \a a */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(
+    const Packet &a) {
+  typedef typename unpacket_traits<Packet>::type Scalar; 
+  return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax<PropagateFast, Scalar>)));
+}
+
+template <int NaNPropagation, typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(
+    const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type Scalar; 
+  return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax<NaNPropagation, Scalar>)));
+}
+
+#undef EIGEN_BINARY_OP_NAN_PROPAGATION
+
+/** \internal \returns true if all coeffs of \a a means "true"
+  * It is supposed to be called on values returned by pcmp_*.
+  */
+// not needed yet
+// template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_all(const Packet& a)
+// { return bool(a); }
+
+/** \internal \returns true if any coeffs of \a a means "true"
+  * It is supposed to be called on values returned by pcmp_*.
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_any(const Packet& a)
+{
+  // Dirty but generic implementation where "true" is assumed to be non 0 and all the sames.
+  // It is expected that "true" is either:
+  //  - Scalar(1)
+  //  - bits full of ones (NaN for floats),
+  //  - or first bit equals to 1 (1 for ints, smallest denormal for floats).
+  // For all these cases, taking the sum is just fine, and this boils down to a no-op for scalars.
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  return numext::not_equal_strict(predux(a), Scalar(0));
+}
+
+/***************************************************************************
+* The following functions might not have to be overwritten for vectorized types
+***************************************************************************/
+
+/** \internal copy a packet with constant coefficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned */
+// NOTE: this function must really be templated on the packet type (think about different packet types for the same scalar type)
+template<typename Packet>
+inline void pstore1(typename unpacket_traits<Packet>::type* to, const typename unpacket_traits<Packet>::type& a)
+{
+  pstore(to, pset1<Packet>(a));
+}
+
+/** \internal \returns a * b + c (coeff-wise) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pmadd(const Packet&  a,
+         const Packet&  b,
+         const Packet&  c)
+{ return padd(pmul(a, b),c); }
+
+/** \internal \returns a packet version of \a *from.
+  * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
+template<typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_traits<Packet>::type* from)
+{
+  if(Alignment >= unpacket_traits<Packet>::alignment)
+    return pload<Packet>(from);
+  else
+    return ploadu<Packet>(from);
+}
+
+/** \internal copy the packet \a from to \a *to.
+  * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
+template<typename Scalar, typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& from)
+{
+  if(Alignment >= unpacket_traits<Packet>::alignment)
+    pstore(to, from);
+  else
+    pstoreu(to, from);
+}
+
+/** \internal \returns a packet version of \a *from.
+  * Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the
+  * hardware if available to speedup the loading of data that won't be modified
+  * by the current computation.
+  */
+template<typename Packet, int LoadMode>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
+{
+  return ploadt<Packet, LoadMode>(from);
+}
+
+/***************************************************************************
+* Fast complex products (GCC generates a function call which is very slow)
+***************************************************************************/
+
+// Eigen+CUDA does not support complexes.
+#if !defined(EIGEN_GPUCC)
+
+template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
+{ return std::complex<float>(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); }
+
+template<> inline std::complex<double> pmul(const std::complex<double>& a, const std::complex<double>& b)
+{ return std::complex<double>(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); }
+
+#endif
+
+
+/***************************************************************************
+ * PacketBlock, that is a collection of N packets where the number of words
+ * in the packet is a multiple of N.
+***************************************************************************/
+template <typename Packet,int N=unpacket_traits<Packet>::size> struct PacketBlock {
+  Packet packet[N];
+};
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet,1>& /*kernel*/) {
+  // Nothing to do in the scalar case, i.e. a 1x1 matrix.
+}
+
+/***************************************************************************
+ * Selector, i.e. vector of N boolean values used to select (i.e. blend)
+ * words from 2 packets.
+***************************************************************************/
+template <size_t N> struct Selector {
+  bool select[N];
+};
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pblend(const Selector<unpacket_traits<Packet>::size>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
+  return ifPacket.select[0] ? thenPacket : elsePacket;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERIC_PACKET_MATH_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/GlobalFunctions.h b/src/3rdparty/eigen/Eigen/src/Core/GlobalFunctions.h
new file mode 100644
index 000000000..629af94b9
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/GlobalFunctions.h
@@ -0,0 +1,194 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GLOBAL_FUNCTIONS_H
+#define EIGEN_GLOBAL_FUNCTIONS_H
+
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+
+#define EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(NAME,FUNCTOR,DOC_OP,DOC_DETAILS) \
+  /** \returns an expression of the coefficient-wise DOC_OP of \a x
+
+    DOC_DETAILS
+
+    \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_##NAME">Math functions</a>, class CwiseUnaryOp
+    */ \
+  template<typename Derived> \
+  inline const Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived> \
+  NAME(const Eigen::ArrayBase<Derived>& x);
+
+#else
+
+#define EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(NAME,FUNCTOR,DOC_OP,DOC_DETAILS) \
+  template<typename Derived> \
+  inline const Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived> \
+  (NAME)(const Eigen::ArrayBase<Derived>& x) { \
+    return Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived>(x.derived()); \
+  }
+
+#endif // EIGEN_PARSED_BY_DOXYGEN
+
+#define EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(NAME,FUNCTOR) \
+  \
+  template<typename Derived> \
+  struct NAME##_retval<ArrayBase<Derived> > \
+  { \
+    typedef const Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived> type; \
+  }; \
+  template<typename Derived> \
+  struct NAME##_impl<ArrayBase<Derived> > \
+  { \
+    static inline typename NAME##_retval<ArrayBase<Derived> >::type run(const Eigen::ArrayBase<Derived>& x) \
+    { \
+      return typename NAME##_retval<ArrayBase<Derived> >::type(x.derived()); \
+    } \
+  };
+
+namespace Eigen
+{
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(real,scalar_real_op,real part,\sa ArrayBase::real)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(imag,scalar_imag_op,imaginary part,\sa ArrayBase::imag)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(conj,scalar_conjugate_op,complex conjugate,\sa ArrayBase::conjugate)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(inverse,scalar_inverse_op,inverse,\sa ArrayBase::inverse)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sin,scalar_sin_op,sine,\sa ArrayBase::sin)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cos,scalar_cos_op,cosine,\sa ArrayBase::cos)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan,scalar_tan_op,tangent,\sa ArrayBase::tan)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atan,scalar_atan_op,arc-tangent,\sa ArrayBase::atan)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin,scalar_asin_op,arc-sine,\sa ArrayBase::asin)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos,scalar_acos_op,arc-consine,\sa ArrayBase::acos)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh)
+#if EIGEN_HAS_CXX11_MATH
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asinh,scalar_asinh_op,inverse hyperbolic sine,\sa ArrayBase::asinh)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acosh,scalar_acosh_op,inverse hyperbolic cosine,\sa ArrayBase::acosh)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atanh,scalar_atanh_op,inverse hyperbolic tangent,\sa ArrayBase::atanh)
+#endif
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(logistic,scalar_logistic_op,logistic function,\sa ArrayBase::logistic)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op,error function,\sa ArrayBase::erf)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op,complement error function,\sa ArrayBase::erfc)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ndtri,scalar_ndtri_op,inverse normal distribution function,\sa ArrayBase::ndtri)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op,exponential,\sa ArrayBase::exp)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(expm1,scalar_expm1_op,exponential of a value minus 1,\sa ArrayBase::expm1)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op,natural logarithm,\sa Eigen::log10 DOXCOMMA ArrayBase::log)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log1p,scalar_log1p_op,natural logarithm of 1 plus the value,\sa ArrayBase::log1p)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op,base 10 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log10)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log2,scalar_log2_op,base 2 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log2)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op,absolute value,\sa ArrayBase::abs DOXCOMMA MatrixBase::cwiseAbs)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs2,scalar_abs2_op,squared absolute value,\sa ArrayBase::abs2 DOXCOMMA MatrixBase::cwiseAbs2)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op,complex argument,\sa ArrayBase::arg DOXCOMMA MatrixBase::cwiseArg)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sqrt,scalar_sqrt_op,square root,\sa ArrayBase::sqrt DOXCOMMA MatrixBase::cwiseSqrt)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rsqrt,scalar_rsqrt_op,reciprocal square root,\sa ArrayBase::rsqrt)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(square,scalar_square_op,square (power 2),\sa Eigen::abs2 DOXCOMMA Eigen::pow DOXCOMMA ArrayBase::square)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cube,scalar_cube_op,cube (power 3),\sa Eigen::pow DOXCOMMA ArrayBase::cube)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rint,scalar_rint_op,nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(round,scalar_round_op,nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(floor,scalar_floor_op,nearest integer not greater than the giben value,\sa Eigen::ceil DOXCOMMA ArrayBase::floor)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ceil,scalar_ceil_op,nearest integer not less than the giben value,\sa Eigen::floor DOXCOMMA ArrayBase::ceil)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isnan,scalar_isnan_op,not-a-number test,\sa Eigen::isinf DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isnan)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op,infinite value test,\sa Eigen::isnan DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isinf)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op,finite value test,\sa Eigen::isinf DOXCOMMA Eigen::isnan DOXCOMMA ArrayBase::isfinite)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op,sign (or 0),\sa ArrayBase::sign)
+
+  /** \returns an expression of the coefficient-wise power of \a x to the given constant \a exponent.
+    *
+    * \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression (\c Derived::Scalar).
+    *
+    * \sa ArrayBase::pow()
+    *
+    * \relates ArrayBase
+    */
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+  template<typename Derived,typename ScalarExponent>
+  inline const CwiseBinaryOp<internal::scalar_pow_op<Derived::Scalar,ScalarExponent>,Derived,Constant<ScalarExponent> >
+  pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent);
+#else
+  template <typename Derived,typename ScalarExponent>
+  EIGEN_DEVICE_FUNC inline
+  EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(
+    const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg<typename Derived::Scalar
+                                                 EIGEN_COMMA ScalarExponent EIGEN_COMMA
+                                                 EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent)>::type,pow))
+  pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent)
+  {
+    typedef typename internal::promote_scalar_arg<typename Derived::Scalar,ScalarExponent,
+                                                  EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent)>::type PromotedExponent;
+    return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedExponent,pow)(x.derived(),
+           typename internal::plain_constant_type<Derived,PromotedExponent>::type(x.derived().rows(), x.derived().cols(), internal::scalar_constant_op<PromotedExponent>(exponent)));
+  }
+#endif
+
+  /** \returns an expression of the coefficient-wise power of \a x to the given array of \a exponents.
+    *
+    * This function computes the coefficient-wise power.
+    *
+    * Example: \include Cwise_array_power_array.cpp
+    * Output: \verbinclude Cwise_array_power_array.out
+    *
+    * \sa ArrayBase::pow()
+    *
+    * \relates ArrayBase
+    */
+  template<typename Derived,typename ExponentDerived>
+  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>
+  pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents)
+  {
+    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>(
+      x.derived(),
+      exponents.derived()
+    );
+  }
+
+  /** \returns an expression of the coefficient-wise power of the scalar \a x to the given array of \a exponents.
+    *
+    * This function computes the coefficient-wise power between a scalar and an array of exponents.
+    *
+    * \tparam Scalar is the scalar type of \a x. It must be compatible with the scalar type of the given array expression (\c Derived::Scalar).
+    *
+    * Example: \include Cwise_scalar_power_array.cpp
+    * Output: \verbinclude Cwise_scalar_power_array.out
+    *
+    * \sa ArrayBase::pow()
+    *
+    * \relates ArrayBase
+    */
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+  template<typename Scalar,typename Derived>
+  inline const CwiseBinaryOp<internal::scalar_pow_op<Scalar,Derived::Scalar>,Constant<Scalar>,Derived>
+  pow(const Scalar& x,const Eigen::ArrayBase<Derived>& x);
+#else
+  template <typename Scalar, typename Derived>
+  EIGEN_DEVICE_FUNC inline
+  EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(
+    const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg<typename Derived::Scalar
+                                                 EIGEN_COMMA Scalar EIGEN_COMMA
+                                                 EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar)>::type,Derived,pow))
+  pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents) {
+    typedef typename internal::promote_scalar_arg<typename Derived::Scalar,Scalar,
+                                                  EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar)>::type PromotedScalar;
+    return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedScalar,Derived,pow)(
+           typename internal::plain_constant_type<Derived,PromotedScalar>::type(exponents.derived().rows(), exponents.derived().cols(), internal::scalar_constant_op<PromotedScalar>(x)), exponents.derived());
+  }
+#endif
+
+
+  namespace internal
+  {
+    EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(real,scalar_real_op)
+    EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(imag,scalar_imag_op)
+    EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(abs2,scalar_abs2_op)
+  }
+}
+
+// TODO: cleanly disable those functions that are not supported on Array (numext::real_ref, internal::random, internal::isApprox...)
+
+#endif // EIGEN_GLOBAL_FUNCTIONS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/IO.h b/src/3rdparty/eigen/Eigen/src/Core/IO.h
new file mode 100644
index 000000000..e81c31521
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/IO.h
@@ -0,0 +1,258 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_IO_H
+#define EIGEN_IO_H
+
+namespace Eigen { 
+
+enum { DontAlignCols = 1 };
+enum { StreamPrecision = -1,
+       FullPrecision = -2 };
+
+namespace internal {
+template<typename Derived>
+std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& fmt);
+}
+
+/** \class IOFormat
+  * \ingroup Core_Module
+  *
+  * \brief Stores a set of parameters controlling the way matrices are printed
+  *
+  * List of available parameters:
+  *  - \b precision number of digits for floating point values, or one of the special constants \c StreamPrecision and \c FullPrecision.
+  *                 The default is the special value \c StreamPrecision which means to use the
+  *                 stream's own precision setting, as set for instance using \c cout.precision(3). The other special value
+  *                 \c FullPrecision means that the number of digits will be computed to match the full precision of each floating-point
+  *                 type.
+  *  - \b flags an OR-ed combination of flags, the default value is 0, the only currently available flag is \c DontAlignCols which
+  *             allows to disable the alignment of columns, resulting in faster code.
+  *  - \b coeffSeparator string printed between two coefficients of the same row
+  *  - \b rowSeparator string printed between two rows
+  *  - \b rowPrefix string printed at the beginning of each row
+  *  - \b rowSuffix string printed at the end of each row
+  *  - \b matPrefix string printed at the beginning of the matrix
+  *  - \b matSuffix string printed at the end of the matrix
+  *  - \b fill character printed to fill the empty space in aligned columns
+  *
+  * Example: \include IOFormat.cpp
+  * Output: \verbinclude IOFormat.out
+  *
+  * \sa DenseBase::format(), class WithFormat
+  */
+struct IOFormat
+{
+  /** Default constructor, see class IOFormat for the meaning of the parameters */
+  IOFormat(int _precision = StreamPrecision, int _flags = 0,
+    const std::string& _coeffSeparator = " ",
+    const std::string& _rowSeparator = "\n", const std::string& _rowPrefix="", const std::string& _rowSuffix="",
+    const std::string& _matPrefix="", const std::string& _matSuffix="", const char _fill=' ')
+  : matPrefix(_matPrefix), matSuffix(_matSuffix), rowPrefix(_rowPrefix), rowSuffix(_rowSuffix), rowSeparator(_rowSeparator),
+    rowSpacer(""), coeffSeparator(_coeffSeparator), fill(_fill), precision(_precision), flags(_flags)
+  {
+    // TODO check if rowPrefix, rowSuffix or rowSeparator contains a newline
+    // don't add rowSpacer if columns are not to be aligned
+    if((flags & DontAlignCols))
+      return;
+    int i = int(matSuffix.length())-1;
+    while (i>=0 && matSuffix[i]!='\n')
+    {
+      rowSpacer += ' ';
+      i--;
+    }
+  }
+  std::string matPrefix, matSuffix;
+  std::string rowPrefix, rowSuffix, rowSeparator, rowSpacer;
+  std::string coeffSeparator;
+  char fill;
+  int precision;
+  int flags;
+};
+
+/** \class WithFormat
+  * \ingroup Core_Module
+  *
+  * \brief Pseudo expression providing matrix output with given format
+  *
+  * \tparam ExpressionType the type of the object on which IO stream operations are performed
+  *
+  * This class represents an expression with stream operators controlled by a given IOFormat.
+  * It is the return type of DenseBase::format()
+  * and most of the time this is the only way it is used.
+  *
+  * See class IOFormat for some examples.
+  *
+  * \sa DenseBase::format(), class IOFormat
+  */
+template<typename ExpressionType>
+class WithFormat
+{
+  public:
+
+    WithFormat(const ExpressionType& matrix, const IOFormat& format)
+      : m_matrix(matrix), m_format(format)
+    {}
+
+    friend std::ostream & operator << (std::ostream & s, const WithFormat& wf)
+    {
+      return internal::print_matrix(s, wf.m_matrix.eval(), wf.m_format);
+    }
+
+  protected:
+    typename ExpressionType::Nested m_matrix;
+    IOFormat m_format;
+};
+
+namespace internal {
+
+// NOTE: This helper is kept for backward compatibility with previous code specializing
+//       this internal::significant_decimals_impl structure. In the future we should directly
+//       call digits10() which has been introduced in July 2016 in 3.3.
+template<typename Scalar>
+struct significant_decimals_impl
+{
+  static inline int run()
+  {
+    return NumTraits<Scalar>::digits10();
+  }
+};
+
+/** \internal
+  * print the matrix \a _m to the output stream \a s using the output format \a fmt */
+template<typename Derived>
+std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& fmt)
+{
+  using internal::is_same;
+  using internal::conditional;
+
+  if(_m.size() == 0)
+  {
+    s << fmt.matPrefix << fmt.matSuffix;
+    return s;
+  }
+  
+  typename Derived::Nested m = _m;
+  typedef typename Derived::Scalar Scalar;
+  typedef typename
+      conditional<
+          is_same<Scalar, char>::value ||
+            is_same<Scalar, unsigned char>::value ||
+            is_same<Scalar, numext::int8_t>::value ||
+            is_same<Scalar, numext::uint8_t>::value,
+          int,
+          typename conditional<
+              is_same<Scalar, std::complex<char> >::value ||
+                is_same<Scalar, std::complex<unsigned char> >::value ||
+                is_same<Scalar, std::complex<numext::int8_t> >::value ||
+                is_same<Scalar, std::complex<numext::uint8_t> >::value,
+              std::complex<int>,
+              const Scalar&
+            >::type
+        >::type PrintType;
+
+  Index width = 0;
+
+  std::streamsize explicit_precision;
+  if(fmt.precision == StreamPrecision)
+  {
+    explicit_precision = 0;
+  }
+  else if(fmt.precision == FullPrecision)
+  {
+    if (NumTraits<Scalar>::IsInteger)
+    {
+      explicit_precision = 0;
+    }
+    else
+    {
+      explicit_precision = significant_decimals_impl<Scalar>::run();
+    }
+  }
+  else
+  {
+    explicit_precision = fmt.precision;
+  }
+
+  std::streamsize old_precision = 0;
+  if(explicit_precision) old_precision = s.precision(explicit_precision);
+
+  bool align_cols = !(fmt.flags & DontAlignCols);
+  if(align_cols)
+  {
+    // compute the largest width
+    for(Index j = 0; j < m.cols(); ++j)
+      for(Index i = 0; i < m.rows(); ++i)
+      {
+        std::stringstream sstr;
+        sstr.copyfmt(s);
+        sstr << static_cast<PrintType>(m.coeff(i,j));
+        width = std::max<Index>(width, Index(sstr.str().length()));
+      }
+  }
+  std::streamsize old_width = s.width();
+  char old_fill_character = s.fill();
+  s << fmt.matPrefix;
+  for(Index i = 0; i < m.rows(); ++i)
+  {
+    if (i)
+      s << fmt.rowSpacer;
+    s << fmt.rowPrefix;
+    if(width) {
+      s.fill(fmt.fill);
+      s.width(width);
+    }
+    s << static_cast<PrintType>(m.coeff(i, 0));
+    for(Index j = 1; j < m.cols(); ++j)
+    {
+      s << fmt.coeffSeparator;
+      if(width) {
+        s.fill(fmt.fill);
+        s.width(width);
+      }
+      s << static_cast<PrintType>(m.coeff(i, j));
+    }
+    s << fmt.rowSuffix;
+    if( i < m.rows() - 1)
+      s << fmt.rowSeparator;
+  }
+  s << fmt.matSuffix;
+  if(explicit_precision) s.precision(old_precision);
+  if(width) {
+    s.fill(old_fill_character);
+    s.width(old_width);
+  }
+  return s;
+}
+
+} // end namespace internal
+
+/** \relates DenseBase
+  *
+  * Outputs the matrix, to the given stream.
+  *
+  * If you wish to print the matrix with a format different than the default, use DenseBase::format().
+  *
+  * It is also possible to change the default format by defining EIGEN_DEFAULT_IO_FORMAT before including Eigen headers.
+  * If not defined, this will automatically be defined to Eigen::IOFormat(), that is the Eigen::IOFormat with default parameters.
+  *
+  * \sa DenseBase::format()
+  */
+template<typename Derived>
+std::ostream & operator <<
+(std::ostream & s,
+ const DenseBase<Derived> & m)
+{
+  return internal::print_matrix(s, m.eval(), EIGEN_DEFAULT_IO_FORMAT);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_IO_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/IndexedView.h b/src/3rdparty/eigen/Eigen/src/Core/IndexedView.h
new file mode 100644
index 000000000..08476251d
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/IndexedView.h
@@ -0,0 +1,237 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_INDEXED_VIEW_H
+#define EIGEN_INDEXED_VIEW_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename XprType, typename RowIndices, typename ColIndices>
+struct traits<IndexedView<XprType, RowIndices, ColIndices> >
+ : traits<XprType>
+{
+  enum {
+    RowsAtCompileTime = int(array_size<RowIndices>::value),
+    ColsAtCompileTime = int(array_size<ColIndices>::value),
+    MaxRowsAtCompileTime = RowsAtCompileTime != Dynamic ? int(RowsAtCompileTime) : Dynamic,
+    MaxColsAtCompileTime = ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime) : Dynamic,
+
+    XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0,
+    IsRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
+               : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
+               : XprTypeIsRowMajor,
+
+    RowIncr = int(get_compile_time_incr<RowIndices>::value),
+    ColIncr = int(get_compile_time_incr<ColIndices>::value),
+    InnerIncr = IsRowMajor ? ColIncr : RowIncr,
+    OuterIncr = IsRowMajor ? RowIncr : ColIncr,
+
+    HasSameStorageOrderAsXprType = (IsRowMajor == XprTypeIsRowMajor),
+    XprInnerStride = HasSameStorageOrderAsXprType ? int(inner_stride_at_compile_time<XprType>::ret) : int(outer_stride_at_compile_time<XprType>::ret),
+    XprOuterstride = HasSameStorageOrderAsXprType ? int(outer_stride_at_compile_time<XprType>::ret) : int(inner_stride_at_compile_time<XprType>::ret),
+
+    InnerSize = XprTypeIsRowMajor ? ColsAtCompileTime : RowsAtCompileTime,
+    IsBlockAlike = InnerIncr==1 && OuterIncr==1,
+    IsInnerPannel = HasSameStorageOrderAsXprType && is_same<AllRange<InnerSize>,typename conditional<XprTypeIsRowMajor,ColIndices,RowIndices>::type>::value,
+
+    InnerStrideAtCompileTime = InnerIncr<0 || InnerIncr==DynamicIndex || XprInnerStride==Dynamic ? Dynamic : XprInnerStride * InnerIncr,
+    OuterStrideAtCompileTime = OuterIncr<0 || OuterIncr==DynamicIndex || XprOuterstride==Dynamic ? Dynamic : XprOuterstride * OuterIncr,
+
+    ReturnAsScalar = is_same<RowIndices,SingleRange>::value && is_same<ColIndices,SingleRange>::value,
+    ReturnAsBlock = (!ReturnAsScalar) && IsBlockAlike,
+    ReturnAsIndexedView = (!ReturnAsScalar) && (!ReturnAsBlock),
+
+    // FIXME we deal with compile-time strides if and only if we have DirectAccessBit flag,
+    // but this is too strict regarding negative strides...
+    DirectAccessMask = (int(InnerIncr)!=UndefinedIncr && int(OuterIncr)!=UndefinedIncr && InnerIncr>=0 && OuterIncr>=0) ? DirectAccessBit : 0,
+    FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
+    FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
+    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
+    Flags = (traits<XprType>::Flags & (HereditaryBits | DirectAccessMask )) | FlagsLvalueBit | FlagsRowMajorBit | FlagsLinearAccessBit
+  };
+
+  typedef Block<XprType,RowsAtCompileTime,ColsAtCompileTime,IsInnerPannel> BlockType;
+};
+
+}
+
+template<typename XprType, typename RowIndices, typename ColIndices, typename StorageKind>
+class IndexedViewImpl;
+
+
+/** \class IndexedView
+  * \ingroup Core_Module
+  *
+  * \brief Expression of a non-sequential sub-matrix defined by arbitrary sequences of row and column indices
+  *
+  * \tparam XprType the type of the expression in which we are taking the intersections of sub-rows and sub-columns
+  * \tparam RowIndices the type of the object defining the sequence of row indices
+  * \tparam ColIndices the type of the object defining the sequence of column indices
+  *
+  * This class represents an expression of a sub-matrix (or sub-vector) defined as the intersection
+  * of sub-sets of rows and columns, that are themself defined by generic sequences of row indices \f$ \{r_0,r_1,..r_{m-1}\} \f$
+  * and column indices \f$ \{c_0,c_1,..c_{n-1} \}\f$. Let \f$ A \f$  be the nested matrix, then the resulting matrix \f$ B \f$ has \c m
+  * rows and \c n columns, and its entries are given by: \f$ B(i,j) = A(r_i,c_j) \f$.
+  *
+  * The \c RowIndices and \c ColIndices types must be compatible with the following API:
+  * \code
+  * <integral type> operator[](Index) const;
+  * Index size() const;
+  * \endcode
+  *
+  * Typical supported types thus include:
+  *  - std::vector<int>
+  *  - std::valarray<int>
+  *  - std::array<int>
+  *  - Plain C arrays: int[N]
+  *  - Eigen::ArrayXi
+  *  - decltype(ArrayXi::LinSpaced(...))
+  *  - Any view/expressions of the previous types
+  *  - Eigen::ArithmeticSequence
+  *  - Eigen::internal::AllRange      (helper for Eigen::all)
+  *  - Eigen::internal::SingleRange  (helper for single index)
+  *  - etc.
+  *
+  * In typical usages of %Eigen, this class should never be used directly. It is the return type of
+  * DenseBase::operator()(const RowIndices&, const ColIndices&).
+  *
+  * \sa class Block
+  */
+template<typename XprType, typename RowIndices, typename ColIndices>
+class IndexedView : public IndexedViewImpl<XprType, RowIndices, ColIndices, typename internal::traits<XprType>::StorageKind>
+{
+public:
+  typedef typename IndexedViewImpl<XprType, RowIndices, ColIndices, typename internal::traits<XprType>::StorageKind>::Base Base;
+  EIGEN_GENERIC_PUBLIC_INTERFACE(IndexedView)
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(IndexedView)
+
+  typedef typename internal::ref_selector<XprType>::non_const_type MatrixTypeNested;
+  typedef typename internal::remove_all<XprType>::type NestedExpression;
+
+  template<typename T0, typename T1>
+  IndexedView(XprType& xpr, const T0& rowIndices, const T1& colIndices)
+    : m_xpr(xpr), m_rowIndices(rowIndices), m_colIndices(colIndices)
+  {}
+
+  /** \returns number of rows */
+  Index rows() const { return internal::size(m_rowIndices); }
+
+  /** \returns number of columns */
+  Index cols() const { return internal::size(m_colIndices); }
+
+  /** \returns the nested expression */
+  const typename internal::remove_all<XprType>::type&
+  nestedExpression() const { return m_xpr; }
+
+  /** \returns the nested expression */
+  typename internal::remove_reference<XprType>::type&
+  nestedExpression() { return m_xpr; }
+
+  /** \returns a const reference to the object storing/generating the row indices */
+  const RowIndices& rowIndices() const { return m_rowIndices; }
+
+  /** \returns a const reference to the object storing/generating the column indices */
+  const ColIndices& colIndices() const { return m_colIndices; }
+
+protected:
+  MatrixTypeNested m_xpr;
+  RowIndices m_rowIndices;
+  ColIndices m_colIndices;
+};
+
+
+// Generic API dispatcher
+template<typename XprType, typename RowIndices, typename ColIndices, typename StorageKind>
+class IndexedViewImpl
+  : public internal::generic_xpr_base<IndexedView<XprType, RowIndices, ColIndices> >::type
+{
+public:
+  typedef typename internal::generic_xpr_base<IndexedView<XprType, RowIndices, ColIndices> >::type Base;
+};
+
+namespace internal {
+
+
+template<typename ArgType, typename RowIndices, typename ColIndices>
+struct unary_evaluator<IndexedView<ArgType, RowIndices, ColIndices>, IndexBased>
+  : evaluator_base<IndexedView<ArgType, RowIndices, ColIndices> >
+{
+  typedef IndexedView<ArgType, RowIndices, ColIndices> XprType;
+
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost /* TODO + cost of row/col index */,
+
+    FlagsLinearAccessBit = (traits<XprType>::RowsAtCompileTime == 1 || traits<XprType>::ColsAtCompileTime == 1) ? LinearAccessBit : 0,
+
+    FlagsRowMajorBit = traits<XprType>::FlagsRowMajorBit, 
+
+    Flags = (evaluator<ArgType>::Flags & (HereditaryBits & ~RowMajorBit /*| LinearAccessBit | DirectAccessBit*/)) | FlagsLinearAccessBit | FlagsRowMajorBit,
+
+    Alignment = 0
+  };
+
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_xpr(xpr)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_argImpl.coeff(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index row, Index col)
+  {
+    return m_argImpl.coeffRef(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& coeffRef(Index index)
+  {
+    EIGEN_STATIC_ASSERT_LVALUE(XprType)
+    Index row = XprType::RowsAtCompileTime == 1 ? 0 : index;
+    Index col = XprType::RowsAtCompileTime == 1 ? index : 0;
+    return m_argImpl.coeffRef( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const Scalar& coeffRef(Index index) const
+  {
+    Index row = XprType::RowsAtCompileTime == 1 ? 0 : index;
+    Index col = XprType::RowsAtCompileTime == 1 ? index : 0;
+    return m_argImpl.coeffRef( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const CoeffReturnType coeff(Index index) const
+  {
+    Index row = XprType::RowsAtCompileTime == 1 ? 0 : index;
+    Index col = XprType::RowsAtCompileTime == 1 ? index : 0;
+    return m_argImpl.coeff( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
+  }
+
+protected:
+
+  evaluator<ArgType> m_argImpl;
+  const XprType& m_xpr;
+
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_INDEXED_VIEW_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/Inverse.h b/src/3rdparty/eigen/Eigen/src/Core/Inverse.h
new file mode 100644
index 000000000..c514438c4
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/Inverse.h
@@ -0,0 +1,117 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_INVERSE_H
+#define EIGEN_INVERSE_H
+
+namespace Eigen {
+
+template<typename XprType,typename StorageKind> class InverseImpl;
+
+namespace internal {
+
+template<typename XprType>
+struct traits<Inverse<XprType> >
+  : traits<typename XprType::PlainObject>
+{
+  typedef typename XprType::PlainObject PlainObject;
+  typedef traits<PlainObject> BaseTraits;
+  enum {
+    Flags = BaseTraits::Flags & RowMajorBit
+  };
+};
+
+} // end namespace internal
+
+/** \class Inverse
+  *
+  * \brief Expression of the inverse of another expression
+  *
+  * \tparam XprType the type of the expression we are taking the inverse
+  *
+  * This class represents an abstract expression of A.inverse()
+  * and most of the time this is the only way it is used.
+  *
+  */
+template<typename XprType>
+class Inverse : public InverseImpl<XprType,typename internal::traits<XprType>::StorageKind>
+{
+public:
+  typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename XprType::Scalar                            Scalar;
+  typedef typename internal::ref_selector<XprType>::type      XprTypeNested;
+  typedef typename internal::remove_all<XprTypeNested>::type  XprTypeNestedCleaned;
+  typedef typename internal::ref_selector<Inverse>::type Nested;
+  typedef typename internal::remove_all<XprType>::type NestedExpression;
+
+  explicit EIGEN_DEVICE_FUNC Inverse(const XprType &xpr)
+    : m_xpr(xpr)
+  {}
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR  Index rows() const EIGEN_NOEXCEPT { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR  Index cols() const EIGEN_NOEXCEPT { return m_xpr.rows(); }
+
+  EIGEN_DEVICE_FUNC const XprTypeNestedCleaned& nestedExpression() const { return m_xpr; }
+
+protected:
+  XprTypeNested m_xpr;
+};
+
+// Generic API dispatcher
+template<typename XprType, typename StorageKind>
+class InverseImpl
+  : public internal::generic_xpr_base<Inverse<XprType> >::type
+{
+public:
+  typedef typename internal::generic_xpr_base<Inverse<XprType> >::type Base;
+  typedef typename XprType::Scalar Scalar;
+private:
+
+  Scalar coeff(Index row, Index col) const;
+  Scalar coeff(Index i) const;
+};
+
+namespace internal {
+
+/** \internal
+  * \brief Default evaluator for Inverse expression.
+  *
+  * This default evaluator for Inverse expression simply evaluate the inverse into a temporary
+  * by a call to internal::call_assignment_no_alias.
+  * Therefore, inverse implementers only have to specialize Assignment<Dst,Inverse<...>, ...> for
+  * there own nested expression.
+  *
+  * \sa class Inverse
+  */
+template<typename ArgType>
+struct unary_evaluator<Inverse<ArgType> >
+  : public evaluator<typename Inverse<ArgType>::PlainObject>
+{
+  typedef Inverse<ArgType> InverseType;
+  typedef typename InverseType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  enum { Flags = Base::Flags | EvalBeforeNestingBit };
+
+  unary_evaluator(const InverseType& inv_xpr)
+    : m_result(inv_xpr.rows(), inv_xpr.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    internal::call_assignment_no_alias(m_result, inv_xpr);
+  }
+
+protected:
+  PlainObject m_result;
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_INVERSE_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/Map.h b/src/3rdparty/eigen/Eigen/src/Core/Map.h
new file mode 100644
index 000000000..218cc157f
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/Map.h
@@ -0,0 +1,171 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MAP_H
+#define EIGEN_MAP_H
+
+namespace Eigen {
+
+namespace internal {
+template<typename PlainObjectType, int MapOptions, typename StrideType>
+struct traits<Map<PlainObjectType, MapOptions, StrideType> >
+  : public traits<PlainObjectType>
+{
+  typedef traits<PlainObjectType> TraitsBase;
+  enum {
+    PlainObjectTypeInnerSize = ((traits<PlainObjectType>::Flags&RowMajorBit)==RowMajorBit)
+                             ? PlainObjectType::ColsAtCompileTime
+                             : PlainObjectType::RowsAtCompileTime,
+
+    InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
+                             ? int(PlainObjectType::InnerStrideAtCompileTime)
+                             : int(StrideType::InnerStrideAtCompileTime),
+    OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
+                             ? (InnerStrideAtCompileTime==Dynamic || PlainObjectTypeInnerSize==Dynamic
+                                ? Dynamic
+                                : int(InnerStrideAtCompileTime) * int(PlainObjectTypeInnerSize))
+                             : int(StrideType::OuterStrideAtCompileTime),
+    Alignment = int(MapOptions)&int(AlignedMask),
+    Flags0 = TraitsBase::Flags & (~NestByRefBit),
+    Flags = is_lvalue<PlainObjectType>::value ? int(Flags0) : (int(Flags0) & ~LvalueBit)
+  };
+private:
+  enum { Options }; // Expressions don't have Options
+};
+}
+
+/** \class Map
+  * \ingroup Core_Module
+  *
+  * \brief A matrix or vector expression mapping an existing array of data.
+  *
+  * \tparam PlainObjectType the equivalent matrix type of the mapped data
+  * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned.
+  *                The default is \c #Unaligned.
+  * \tparam StrideType optionally specifies strides. By default, Map assumes the memory layout
+  *                   of an ordinary, contiguous array. This can be overridden by specifying strides.
+  *                   The type passed here must be a specialization of the Stride template, see examples below.
+  *
+  * This class represents a matrix or vector expression mapping an existing array of data.
+  * It can be used to let Eigen interface without any overhead with non-Eigen data structures,
+  * such as plain C arrays or structures from other libraries. By default, it assumes that the
+  * data is laid out contiguously in memory. You can however override this by explicitly specifying
+  * inner and outer strides.
+  *
+  * Here's an example of simply mapping a contiguous array as a \ref TopicStorageOrders "column-major" matrix:
+  * \include Map_simple.cpp
+  * Output: \verbinclude Map_simple.out
+  *
+  * If you need to map non-contiguous arrays, you can do so by specifying strides:
+  *
+  * Here's an example of mapping an array as a vector, specifying an inner stride, that is, the pointer
+  * increment between two consecutive coefficients. Here, we're specifying the inner stride as a compile-time
+  * fixed value.
+  * \include Map_inner_stride.cpp
+  * Output: \verbinclude Map_inner_stride.out
+  *
+  * Here's an example of mapping an array while specifying an outer stride. Here, since we're mapping
+  * as a column-major matrix, 'outer stride' means the pointer increment between two consecutive columns.
+  * Here, we're specifying the outer stride as a runtime parameter. Note that here \c OuterStride<> is
+  * a short version of \c OuterStride<Dynamic> because the default template parameter of OuterStride
+  * is  \c Dynamic
+  * \include Map_outer_stride.cpp
+  * Output: \verbinclude Map_outer_stride.out
+  *
+  * For more details and for an example of specifying both an inner and an outer stride, see class Stride.
+  *
+  * \b Tip: to change the array of data mapped by a Map object, you can use the C++
+  * placement new syntax:
+  *
+  * Example: \include Map_placement_new.cpp
+  * Output: \verbinclude Map_placement_new.out
+  *
+  * This class is the return type of PlainObjectBase::Map() but can also be used directly.
+  *
+  * \sa PlainObjectBase::Map(), \ref TopicStorageOrders
+  */
+template<typename PlainObjectType, int MapOptions, typename StrideType> class Map
+  : public MapBase<Map<PlainObjectType, MapOptions, StrideType> >
+{
+  public:
+
+    typedef MapBase<Map> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Map)
+
+    typedef typename Base::PointerType PointerType;
+    typedef PointerType PointerArgType;
+    EIGEN_DEVICE_FUNC
+    inline PointerType cast_to_pointer_type(PointerArgType ptr) { return ptr; }
+
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const
+    {
+      return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const
+    {
+      return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
+           : internal::traits<Map>::OuterStrideAtCompileTime != Dynamic ? Index(internal::traits<Map>::OuterStrideAtCompileTime)
+           : IsVectorAtCompileTime ? (this->size() * innerStride())
+           : int(Flags)&RowMajorBit ? (this->cols() * innerStride())
+           : (this->rows() * innerStride());
+    }
+
+    /** Constructor in the fixed-size case.
+      *
+      * \param dataPtr pointer to the array to map
+      * \param stride optional Stride object, passing the strides.
+      */
+    EIGEN_DEVICE_FUNC
+    explicit inline Map(PointerArgType dataPtr, const StrideType& stride = StrideType())
+      : Base(cast_to_pointer_type(dataPtr)), m_stride(stride)
+    {
+      PlainObjectType::Base::_check_template_params();
+    }
+
+    /** Constructor in the dynamic-size vector case.
+      *
+      * \param dataPtr pointer to the array to map
+      * \param size the size of the vector expression
+      * \param stride optional Stride object, passing the strides.
+      */
+    EIGEN_DEVICE_FUNC
+    inline Map(PointerArgType dataPtr, Index size, const StrideType& stride = StrideType())
+      : Base(cast_to_pointer_type(dataPtr), size), m_stride(stride)
+    {
+      PlainObjectType::Base::_check_template_params();
+    }
+
+    /** Constructor in the dynamic-size matrix case.
+      *
+      * \param dataPtr pointer to the array to map
+      * \param rows the number of rows of the matrix expression
+      * \param cols the number of columns of the matrix expression
+      * \param stride optional Stride object, passing the strides.
+      */
+    EIGEN_DEVICE_FUNC
+    inline Map(PointerArgType dataPtr, Index rows, Index cols, const StrideType& stride = StrideType())
+      : Base(cast_to_pointer_type(dataPtr), rows, cols), m_stride(stride)
+    {
+      PlainObjectType::Base::_check_template_params();
+    }
+
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map)
+
+  protected:
+    StrideType m_stride;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_MAP_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/MapBase.h b/src/3rdparty/eigen/Eigen/src/Core/MapBase.h
new file mode 100644
index 000000000..d856447f0
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/MapBase.h
@@ -0,0 +1,310 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MAPBASE_H
+#define EIGEN_MAPBASE_H
+
+#define EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived) \
+      EIGEN_STATIC_ASSERT((int(internal::evaluator<Derived>::Flags) & LinearAccessBit) || Derived::IsVectorAtCompileTime, \
+                          YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT)
+
+namespace Eigen {
+
+/** \ingroup Core_Module
+  *
+  * \brief Base class for dense Map and Block expression with direct access
+  *
+  * This base class provides the const low-level accessors (e.g. coeff, coeffRef) of dense
+  * Map and Block objects with direct access.
+  * Typical users do not have to directly deal with this class.
+  *
+  * This class can be extended by through the macro plugin \c EIGEN_MAPBASE_PLUGIN.
+  * See \link TopicCustomizing_Plugins customizing Eigen \endlink for details.
+  *
+  * The \c Derived class has to provide the following two methods describing the memory layout:
+  *  \code Index innerStride() const; \endcode
+  *  \code Index outerStride() const; \endcode
+  *
+  * \sa class Map, class Block
+  */
+template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
+  : public internal::dense_xpr_base<Derived>::type
+{
+  public:
+
+    typedef typename internal::dense_xpr_base<Derived>::type Base;
+    enum {
+      RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
+      ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+      InnerStrideAtCompileTime = internal::traits<Derived>::InnerStrideAtCompileTime,
+      SizeAtCompileTime = Base::SizeAtCompileTime
+    };
+
+    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef typename internal::conditional<
+                         bool(internal::is_lvalue<Derived>::value),
+                         Scalar *,
+                         const Scalar *>::type
+                     PointerType;
+
+    using Base::derived;
+//    using Base::RowsAtCompileTime;
+//    using Base::ColsAtCompileTime;
+//    using Base::SizeAtCompileTime;
+    using Base::MaxRowsAtCompileTime;
+    using Base::MaxColsAtCompileTime;
+    using Base::MaxSizeAtCompileTime;
+    using Base::IsVectorAtCompileTime;
+    using Base::Flags;
+    using Base::IsRowMajor;
+
+    using Base::rows;
+    using Base::cols;
+    using Base::size;
+    using Base::coeff;
+    using Base::coeffRef;
+    using Base::lazyAssign;
+    using Base::eval;
+
+    using Base::innerStride;
+    using Base::outerStride;
+    using Base::rowStride;
+    using Base::colStride;
+
+    // bug 217 - compile error on ICC 11.1
+    using Base::operator=;
+
+    typedef typename Base::CoeffReturnType CoeffReturnType;
+
+    /** \copydoc DenseBase::rows() */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_rows.value(); }
+    /** \copydoc DenseBase::cols() */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_cols.value(); }
+
+    /** Returns a pointer to the first coefficient of the matrix or vector.
+      *
+      * \note When addressing this data, make sure to honor the strides returned by innerStride() and outerStride().
+      *
+      * \sa innerStride(), outerStride()
+      */
+    EIGEN_DEVICE_FUNC inline const Scalar* data() const { return m_data; }
+
+    /** \copydoc PlainObjectBase::coeff(Index,Index) const */
+    EIGEN_DEVICE_FUNC
+    inline const Scalar& coeff(Index rowId, Index colId) const
+    {
+      return m_data[colId * colStride() + rowId * rowStride()];
+    }
+
+    /** \copydoc PlainObjectBase::coeff(Index) const */
+    EIGEN_DEVICE_FUNC
+    inline const Scalar& coeff(Index index) const
+    {
+      EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
+      return m_data[index * innerStride()];
+    }
+
+    /** \copydoc PlainObjectBase::coeffRef(Index,Index) const */
+    EIGEN_DEVICE_FUNC
+    inline const Scalar& coeffRef(Index rowId, Index colId) const
+    {
+      return this->m_data[colId * colStride() + rowId * rowStride()];
+    }
+
+    /** \copydoc PlainObjectBase::coeffRef(Index) const */
+    EIGEN_DEVICE_FUNC
+    inline const Scalar& coeffRef(Index index) const
+    {
+      EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
+      return this->m_data[index * innerStride()];
+    }
+
+    /** \internal */
+    template<int LoadMode>
+    inline PacketScalar packet(Index rowId, Index colId) const
+    {
+      return internal::ploadt<PacketScalar, LoadMode>
+               (m_data + (colId * colStride() + rowId * rowStride()));
+    }
+
+    /** \internal */
+    template<int LoadMode>
+    inline PacketScalar packet(Index index) const
+    {
+      EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
+      return internal::ploadt<PacketScalar, LoadMode>(m_data + index * innerStride());
+    }
+
+    /** \internal Constructor for fixed size matrices or vectors */
+    EIGEN_DEVICE_FUNC
+    explicit inline MapBase(PointerType dataPtr) : m_data(dataPtr), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime)
+    {
+      EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
+      checkSanity<Derived>();
+    }
+
+    /** \internal Constructor for dynamically sized vectors */
+    EIGEN_DEVICE_FUNC
+    inline MapBase(PointerType dataPtr, Index vecSize)
+            : m_data(dataPtr),
+              m_rows(RowsAtCompileTime == Dynamic ? vecSize : Index(RowsAtCompileTime)),
+              m_cols(ColsAtCompileTime == Dynamic ? vecSize : Index(ColsAtCompileTime))
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+      eigen_assert(vecSize >= 0);
+      eigen_assert(dataPtr == 0 || SizeAtCompileTime == Dynamic || SizeAtCompileTime == vecSize);
+      checkSanity<Derived>();
+    }
+
+    /** \internal Constructor for dynamically sized matrices */
+    EIGEN_DEVICE_FUNC
+    inline MapBase(PointerType dataPtr, Index rows, Index cols)
+            : m_data(dataPtr), m_rows(rows), m_cols(cols)
+    {
+      eigen_assert( (dataPtr == 0)
+              || (   rows >= 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows)
+                  && cols >= 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols)));
+      checkSanity<Derived>();
+    }
+
+    #ifdef EIGEN_MAPBASE_PLUGIN
+    #include EIGEN_MAPBASE_PLUGIN
+    #endif
+
+  protected:
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase)
+
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    void checkSanity(typename internal::enable_if<(internal::traits<T>::Alignment>0),void*>::type = 0) const
+    {
+#if EIGEN_MAX_ALIGN_BYTES>0
+      // innerStride() is not set yet when this function is called, so we optimistically assume the lowest plausible value:
+      const Index minInnerStride = InnerStrideAtCompileTime == Dynamic ? 1 : Index(InnerStrideAtCompileTime);
+      EIGEN_ONLY_USED_FOR_DEBUG(minInnerStride);
+      eigen_assert((   ((internal::UIntPtr(m_data) % internal::traits<Derived>::Alignment) == 0)
+                    || (cols() * rows() * minInnerStride * sizeof(Scalar)) < internal::traits<Derived>::Alignment ) && "data is not aligned");
+#endif
+    }
+
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    void checkSanity(typename internal::enable_if<internal::traits<T>::Alignment==0,void*>::type = 0) const
+    {}
+
+    PointerType m_data;
+    const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_rows;
+    const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_cols;
+};
+
+/** \ingroup Core_Module
+  *
+  * \brief Base class for non-const dense Map and Block expression with direct access
+  *
+  * This base class provides the non-const low-level accessors (e.g. coeff and coeffRef) of
+  * dense Map and Block objects with direct access.
+  * It inherits MapBase<Derived, ReadOnlyAccessors> which defines the const variant for reading specific entries.
+  *
+  * \sa class Map, class Block
+  */
+template<typename Derived> class MapBase<Derived, WriteAccessors>
+  : public MapBase<Derived, ReadOnlyAccessors>
+{
+    typedef MapBase<Derived, ReadOnlyAccessors> ReadOnlyMapBase;
+  public:
+
+    typedef MapBase<Derived, ReadOnlyAccessors> Base;
+
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::PacketScalar PacketScalar;
+    typedef typename Base::StorageIndex StorageIndex;
+    typedef typename Base::PointerType PointerType;
+
+    using Base::derived;
+    using Base::rows;
+    using Base::cols;
+    using Base::size;
+    using Base::coeff;
+    using Base::coeffRef;
+
+    using Base::innerStride;
+    using Base::outerStride;
+    using Base::rowStride;
+    using Base::colStride;
+
+    typedef typename internal::conditional<
+                    internal::is_lvalue<Derived>::value,
+                    Scalar,
+                    const Scalar
+                  >::type ScalarWithConstIfNotLvalue;
+
+    EIGEN_DEVICE_FUNC
+    inline const Scalar* data() const { return this->m_data; }
+    EIGEN_DEVICE_FUNC
+    inline ScalarWithConstIfNotLvalue* data() { return this->m_data; } // no const-cast here so non-const-correct code will give a compile error
+
+    EIGEN_DEVICE_FUNC
+    inline ScalarWithConstIfNotLvalue& coeffRef(Index row, Index col)
+    {
+      return this->m_data[col * colStride() + row * rowStride()];
+    }
+
+    EIGEN_DEVICE_FUNC
+    inline ScalarWithConstIfNotLvalue& coeffRef(Index index)
+    {
+      EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
+      return this->m_data[index * innerStride()];
+    }
+
+    template<int StoreMode>
+    inline void writePacket(Index row, Index col, const PacketScalar& val)
+    {
+      internal::pstoret<Scalar, PacketScalar, StoreMode>
+               (this->m_data + (col * colStride() + row * rowStride()), val);
+    }
+
+    template<int StoreMode>
+    inline void writePacket(Index index, const PacketScalar& val)
+    {
+      EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
+      internal::pstoret<Scalar, PacketScalar, StoreMode>
+                (this->m_data + index * innerStride(), val);
+    }
+
+    EIGEN_DEVICE_FUNC explicit inline MapBase(PointerType dataPtr) : Base(dataPtr) {}
+    EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index vecSize) : Base(dataPtr, vecSize) {}
+    EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index rows, Index cols) : Base(dataPtr, rows, cols) {}
+
+    EIGEN_DEVICE_FUNC
+    Derived& operator=(const MapBase& other)
+    {
+      ReadOnlyMapBase::Base::operator=(other);
+      return derived();
+    }
+
+    // In theory we could simply refer to Base:Base::operator=, but MSVC does not like Base::Base,
+    // see bugs 821 and 920.
+    using ReadOnlyMapBase::Base::operator=;
+  protected:
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase)
+};
+
+#undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS
+
+} // end namespace Eigen
+
+#endif // EIGEN_MAPBASE_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/MathFunctions.h b/src/3rdparty/eigen/Eigen/src/Core/MathFunctions.h
new file mode 100644
index 000000000..61b78f4f2
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/MathFunctions.h
@@ -0,0 +1,2057 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATHFUNCTIONS_H
+#define EIGEN_MATHFUNCTIONS_H
+
+// TODO this should better be moved to NumTraits
+// Source: WolframAlpha
+#define EIGEN_PI    3.141592653589793238462643383279502884197169399375105820974944592307816406L
+#define EIGEN_LOG2E 1.442695040888963407359924681001892137426645954152985934135449406931109219L
+#define EIGEN_LN2   0.693147180559945309417232121458176568075500134360255254120680009493393621L
+
+namespace Eigen {
+
+// On WINCE, std::abs is defined for int only, so let's defined our own overloads:
+// This issue has been confirmed with MSVC 2008 only, but the issue might exist for more recent versions too.
+#if EIGEN_OS_WINCE && EIGEN_COMP_MSVC && EIGEN_COMP_MSVC<=1500
+long        abs(long        x) { return (labs(x));  }
+double      abs(double      x) { return (fabs(x));  }
+float       abs(float       x) { return (fabsf(x)); }
+long double abs(long double x) { return (fabsl(x)); }
+#endif
+
+namespace internal {
+
+/** \internal \class global_math_functions_filtering_base
+  *
+  * What it does:
+  * Defines a typedef 'type' as follows:
+  * - if type T has a member typedef Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl, then
+  *   global_math_functions_filtering_base<T>::type is a typedef for it.
+  * - otherwise, global_math_functions_filtering_base<T>::type is a typedef for T.
+  *
+  * How it's used:
+  * To allow to defined the global math functions (like sin...) in certain cases, like the Array expressions.
+  * When you do sin(array1+array2), the object array1+array2 has a complicated expression type, all what you want to know
+  * is that it inherits ArrayBase. So we implement a partial specialization of sin_impl for ArrayBase<Derived>.
+  * So we must make sure to use sin_impl<ArrayBase<Derived> > and not sin_impl<Derived>, otherwise our partial specialization
+  * won't be used. How does sin know that? That's exactly what global_math_functions_filtering_base tells it.
+  *
+  * How it's implemented:
+  * SFINAE in the style of enable_if. Highly susceptible of breaking compilers. With GCC, it sure does work, but if you replace
+  * the typename dummy by an integer template parameter, it doesn't work anymore!
+  */
+
+template<typename T, typename dummy = void>
+struct global_math_functions_filtering_base
+{
+  typedef T type;
+};
+
+template<typename T> struct always_void { typedef void type; };
+
+template<typename T>
+struct global_math_functions_filtering_base
+  <T,
+   typename always_void<typename T::Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl>::type
+  >
+{
+  typedef typename T::Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl type;
+};
+
+#define EIGEN_MATHFUNC_IMPL(func, scalar) Eigen::internal::func##_impl<typename Eigen::internal::global_math_functions_filtering_base<scalar>::type>
+#define EIGEN_MATHFUNC_RETVAL(func, scalar) typename Eigen::internal::func##_retval<typename Eigen::internal::global_math_functions_filtering_base<scalar>::type>::type
+
+/****************************************************************************
+* Implementation of real                                                 *
+****************************************************************************/
+
+template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct real_default_impl
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
+  {
+    return x;
+  }
+};
+
+template<typename Scalar>
+struct real_default_impl<Scalar,true>
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
+  {
+    using std::real;
+    return real(x);
+  }
+};
+
+template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
+
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+template<typename T>
+struct real_impl<std::complex<T> >
+{
+  typedef T RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline T run(const std::complex<T>& x)
+  {
+    return x.real();
+  }
+};
+#endif
+
+template<typename Scalar>
+struct real_retval
+{
+  typedef typename NumTraits<Scalar>::Real type;
+};
+
+/****************************************************************************
+* Implementation of imag                                                 *
+****************************************************************************/
+
+template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct imag_default_impl
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar&)
+  {
+    return RealScalar(0);
+  }
+};
+
+template<typename Scalar>
+struct imag_default_impl<Scalar,true>
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
+  {
+    using std::imag;
+    return imag(x);
+  }
+};
+
+template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
+
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+template<typename T>
+struct imag_impl<std::complex<T> >
+{
+  typedef T RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline T run(const std::complex<T>& x)
+  {
+    return x.imag();
+  }
+};
+#endif
+
+template<typename Scalar>
+struct imag_retval
+{
+  typedef typename NumTraits<Scalar>::Real type;
+};
+
+/****************************************************************************
+* Implementation of real_ref                                             *
+****************************************************************************/
+
+template<typename Scalar>
+struct real_ref_impl
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar& run(Scalar& x)
+  {
+    return reinterpret_cast<RealScalar*>(&x)[0];
+  }
+  EIGEN_DEVICE_FUNC
+  static inline const RealScalar& run(const Scalar& x)
+  {
+    return reinterpret_cast<const RealScalar*>(&x)[0];
+  }
+};
+
+template<typename Scalar>
+struct real_ref_retval
+{
+  typedef typename NumTraits<Scalar>::Real & type;
+};
+
+/****************************************************************************
+* Implementation of imag_ref                                             *
+****************************************************************************/
+
+template<typename Scalar, bool IsComplex>
+struct imag_ref_default_impl
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar& run(Scalar& x)
+  {
+    return reinterpret_cast<RealScalar*>(&x)[1];
+  }
+  EIGEN_DEVICE_FUNC
+  static inline const RealScalar& run(const Scalar& x)
+  {
+    return reinterpret_cast<RealScalar*>(&x)[1];
+  }
+};
+
+template<typename Scalar>
+struct imag_ref_default_impl<Scalar, false>
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline Scalar run(Scalar&)
+  {
+    return Scalar(0);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline const Scalar run(const Scalar&)
+  {
+    return Scalar(0);
+  }
+};
+
+template<typename Scalar>
+struct imag_ref_impl : imag_ref_default_impl<Scalar, NumTraits<Scalar>::IsComplex> {};
+
+template<typename Scalar>
+struct imag_ref_retval
+{
+  typedef typename NumTraits<Scalar>::Real & type;
+};
+
+/****************************************************************************
+* Implementation of conj                                                 *
+****************************************************************************/
+
+template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct conj_default_impl
+{
+  EIGEN_DEVICE_FUNC
+  static inline Scalar run(const Scalar& x)
+  {
+    return x;
+  }
+};
+
+template<typename Scalar>
+struct conj_default_impl<Scalar,true>
+{
+  EIGEN_DEVICE_FUNC
+  static inline Scalar run(const Scalar& x)
+  {
+    using std::conj;
+    return conj(x);
+  }
+};
+
+template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct conj_impl : conj_default_impl<Scalar, IsComplex> {};
+
+template<typename Scalar>
+struct conj_retval
+{
+  typedef Scalar type;
+};
+
+/****************************************************************************
+* Implementation of abs2                                                 *
+****************************************************************************/
+
+template<typename Scalar,bool IsComplex>
+struct abs2_impl_default
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
+  {
+    return x*x;
+  }
+};
+
+template<typename Scalar>
+struct abs2_impl_default<Scalar, true> // IsComplex
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
+  {
+    return x.real()*x.real() + x.imag()*x.imag();
+  }
+};
+
+template<typename Scalar>
+struct abs2_impl
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
+  {
+    return abs2_impl_default<Scalar,NumTraits<Scalar>::IsComplex>::run(x);
+  }
+};
+
+template<typename Scalar>
+struct abs2_retval
+{
+  typedef typename NumTraits<Scalar>::Real type;
+};
+
+/****************************************************************************
+* Implementation of sqrt/rsqrt                                             *
+****************************************************************************/
+
+template<typename Scalar>
+struct sqrt_impl
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_ALWAYS_INLINE Scalar run(const Scalar& x)
+  {
+    EIGEN_USING_STD(sqrt);
+    return sqrt(x);
+  }
+};
+
+// Complex sqrt defined in MathFunctionsImpl.h.
+template<typename T> EIGEN_DEVICE_FUNC std::complex<T> complex_sqrt(const std::complex<T>& a_x);
+
+// Custom implementation is faster than `std::sqrt`, works on
+// GPU, and correctly handles special cases (unlike MSVC).
+template<typename T>
+struct sqrt_impl<std::complex<T> >
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_ALWAYS_INLINE std::complex<T> run(const std::complex<T>& x)
+  {
+    return complex_sqrt<T>(x);
+  }
+};
+
+template<typename Scalar>
+struct sqrt_retval
+{
+  typedef Scalar type;
+};
+
+// Default implementation relies on numext::sqrt, at bottom of file.
+template<typename T>
+struct rsqrt_impl;
+
+// Complex rsqrt defined in MathFunctionsImpl.h.
+template<typename T> EIGEN_DEVICE_FUNC std::complex<T> complex_rsqrt(const std::complex<T>& a_x);
+
+template<typename T>
+struct rsqrt_impl<std::complex<T> >
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_ALWAYS_INLINE std::complex<T> run(const std::complex<T>& x)
+  {
+    return complex_rsqrt<T>(x);
+  }
+};
+
+template<typename Scalar>
+struct rsqrt_retval
+{
+  typedef Scalar type;
+};
+
+/****************************************************************************
+* Implementation of norm1                                                *
+****************************************************************************/
+
+template<typename Scalar, bool IsComplex>
+struct norm1_default_impl;
+
+template<typename Scalar>
+struct norm1_default_impl<Scalar,true>
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
+  {
+    EIGEN_USING_STD(abs);
+    return abs(x.real()) + abs(x.imag());
+  }
+};
+
+template<typename Scalar>
+struct norm1_default_impl<Scalar, false>
+{
+  EIGEN_DEVICE_FUNC
+  static inline Scalar run(const Scalar& x)
+  {
+    EIGEN_USING_STD(abs);
+    return abs(x);
+  }
+};
+
+template<typename Scalar>
+struct norm1_impl : norm1_default_impl<Scalar, NumTraits<Scalar>::IsComplex> {};
+
+template<typename Scalar>
+struct norm1_retval
+{
+  typedef typename NumTraits<Scalar>::Real type;
+};
+
+/****************************************************************************
+* Implementation of hypot                                                *
+****************************************************************************/
+
+template<typename Scalar> struct hypot_impl;
+
+template<typename Scalar>
+struct hypot_retval
+{
+  typedef typename NumTraits<Scalar>::Real type;
+};
+
+/****************************************************************************
+* Implementation of cast                                                 *
+****************************************************************************/
+
+template<typename OldType, typename NewType, typename EnableIf = void>
+struct cast_impl
+{
+  EIGEN_DEVICE_FUNC
+  static inline NewType run(const OldType& x)
+  {
+    return static_cast<NewType>(x);
+  }
+};
+
+// Casting from S -> Complex<T> leads to an implicit conversion from S to T,
+// generating warnings on clang.  Here we explicitly cast the real component.
+template<typename OldType, typename NewType>
+struct cast_impl<OldType, NewType,
+  typename internal::enable_if<
+    !NumTraits<OldType>::IsComplex && NumTraits<NewType>::IsComplex
+  >::type>
+{
+  EIGEN_DEVICE_FUNC
+  static inline NewType run(const OldType& x)
+  {
+    typedef typename NumTraits<NewType>::Real NewReal;
+    return static_cast<NewType>(static_cast<NewReal>(x));
+  }
+};
+
+// here, for once, we're plainly returning NewType: we don't want cast to do weird things.
+
+template<typename OldType, typename NewType>
+EIGEN_DEVICE_FUNC
+inline NewType cast(const OldType& x)
+{
+  return cast_impl<OldType, NewType>::run(x);
+}
+
+/****************************************************************************
+* Implementation of round                                                   *
+****************************************************************************/
+
+template<typename Scalar>
+struct round_impl
+{
+  EIGEN_DEVICE_FUNC
+  static inline Scalar run(const Scalar& x)
+  {
+    EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
+#if EIGEN_HAS_CXX11_MATH
+    EIGEN_USING_STD(round);
+#endif
+    return Scalar(round(x));
+  }
+};
+
+#if !EIGEN_HAS_CXX11_MATH
+#if EIGEN_HAS_C99_MATH
+// Use ::roundf for float.
+template<>
+struct round_impl<float> {
+  EIGEN_DEVICE_FUNC
+  static inline float run(const float& x)
+  {
+    return ::roundf(x);
+  }
+};
+#else
+template<typename Scalar>
+struct round_using_floor_ceil_impl
+{
+  EIGEN_DEVICE_FUNC
+  static inline Scalar run(const Scalar& x)
+  {
+    EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
+    // Without C99 round/roundf, resort to floor/ceil.
+    EIGEN_USING_STD(floor);
+    EIGEN_USING_STD(ceil);
+    // If not enough precision to resolve a decimal at all, return the input.
+    // Otherwise, adding 0.5 can trigger an increment by 1.
+    const Scalar limit = Scalar(1ull << (NumTraits<Scalar>::digits() - 1));
+    if (x >= limit || x <= -limit) {
+      return x;
+    }
+    return (x > Scalar(0)) ? Scalar(floor(x + Scalar(0.5))) : Scalar(ceil(x - Scalar(0.5)));
+  }
+};
+
+template<>
+struct round_impl<float> : round_using_floor_ceil_impl<float> {};
+
+template<>
+struct round_impl<double> : round_using_floor_ceil_impl<double> {};
+#endif // EIGEN_HAS_C99_MATH
+#endif // !EIGEN_HAS_CXX11_MATH
+
+template<typename Scalar>
+struct round_retval
+{
+  typedef Scalar type;
+};
+
+/****************************************************************************
+* Implementation of rint                                                    *
+****************************************************************************/
+
+template<typename Scalar>
+struct rint_impl {
+  EIGEN_DEVICE_FUNC
+  static inline Scalar run(const Scalar& x)
+  {
+    EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
+#if EIGEN_HAS_CXX11_MATH
+      EIGEN_USING_STD(rint);
+#endif
+    return rint(x);
+  }
+};
+
+#if !EIGEN_HAS_CXX11_MATH
+template<>
+struct rint_impl<double> {
+  EIGEN_DEVICE_FUNC
+  static inline double run(const double& x)
+  {
+    return ::rint(x);
+  }
+};
+template<>
+struct rint_impl<float> {
+  EIGEN_DEVICE_FUNC
+  static inline float run(const float& x)
+  {
+    return ::rintf(x);
+  }
+};
+#endif
+
+template<typename Scalar>
+struct rint_retval
+{
+  typedef Scalar type;
+};
+
+/****************************************************************************
+* Implementation of arg                                                     *
+****************************************************************************/
+
+// Visual Studio 2017 has a bug where arg(float) returns 0 for negative inputs.
+// This seems to be fixed in VS 2019.
+#if EIGEN_HAS_CXX11_MATH && (!EIGEN_COMP_MSVC || EIGEN_COMP_MSVC >= 1920)
+// std::arg is only defined for types of std::complex, or integer types or float/double/long double
+template<typename Scalar,
+          bool HasStdImpl = NumTraits<Scalar>::IsComplex || is_integral<Scalar>::value
+                            || is_same<Scalar, float>::value || is_same<Scalar, double>::value
+                            || is_same<Scalar, long double>::value >
+struct arg_default_impl;
+
+template<typename Scalar>
+struct arg_default_impl<Scalar, true> {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
+  {
+    #if defined(EIGEN_HIP_DEVICE_COMPILE)
+    // HIP does not seem to have a native device side implementation for the math routine "arg"
+    using std::arg;
+    #else
+    EIGEN_USING_STD(arg);
+    #endif
+    return static_cast<RealScalar>(arg(x));
+  }
+};
+
+// Must be non-complex floating-point type (e.g. half/bfloat16).
+template<typename Scalar>
+struct arg_default_impl<Scalar, false> {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
+  {
+    return (x < Scalar(0)) ? RealScalar(EIGEN_PI) : RealScalar(0);
+  }
+};
+#else
+template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct arg_default_impl
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
+  {
+    return (x < RealScalar(0)) ? RealScalar(EIGEN_PI) : RealScalar(0);
+  }
+};
+
+template<typename Scalar>
+struct arg_default_impl<Scalar,true>
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
+  {
+    EIGEN_USING_STD(arg);
+    return arg(x);
+  }
+};
+#endif
+template<typename Scalar> struct arg_impl : arg_default_impl<Scalar> {};
+
+template<typename Scalar>
+struct arg_retval
+{
+  typedef typename NumTraits<Scalar>::Real type;
+};
+
+/****************************************************************************
+* Implementation of expm1                                                   *
+****************************************************************************/
+
+// This implementation is based on GSL Math's expm1.
+namespace std_fallback {
+  // fallback expm1 implementation in case there is no expm1(Scalar) function in namespace of Scalar,
+  // or that there is no suitable std::expm1 function available. Implementation
+  // attributed to Kahan. See: http://www.plunk.org/~hatch/rightway.php.
+  template<typename Scalar>
+  EIGEN_DEVICE_FUNC inline Scalar expm1(const Scalar& x) {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    EIGEN_USING_STD(exp);
+    Scalar u = exp(x);
+    if (numext::equal_strict(u, Scalar(1))) {
+      return x;
+    }
+    Scalar um1 = u - RealScalar(1);
+    if (numext::equal_strict(um1, Scalar(-1))) {
+      return RealScalar(-1);
+    }
+
+    EIGEN_USING_STD(log);
+    Scalar logu = log(u);
+    return numext::equal_strict(u, logu) ? u : (u - RealScalar(1)) * x / logu;
+  }
+}
+
+template<typename Scalar>
+struct expm1_impl {
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
+  {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+    #if EIGEN_HAS_CXX11_MATH
+    using std::expm1;
+    #else
+    using std_fallback::expm1;
+    #endif
+    return expm1(x);
+  }
+};
+
+template<typename Scalar>
+struct expm1_retval
+{
+  typedef Scalar type;
+};
+
+/****************************************************************************
+* Implementation of log                                                     *
+****************************************************************************/
+
+// Complex log defined in MathFunctionsImpl.h.
+template<typename T> EIGEN_DEVICE_FUNC std::complex<T> complex_log(const std::complex<T>& z);
+
+template<typename Scalar>
+struct log_impl {
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
+  {
+    EIGEN_USING_STD(log);
+    return static_cast<Scalar>(log(x));
+  }
+};
+
+template<typename Scalar>
+struct log_impl<std::complex<Scalar> > {
+  EIGEN_DEVICE_FUNC static inline std::complex<Scalar> run(const std::complex<Scalar>& z)
+  {
+    return complex_log(z);
+  }
+};
+
+/****************************************************************************
+* Implementation of log1p                                                   *
+****************************************************************************/
+
+namespace std_fallback {
+  // fallback log1p implementation in case there is no log1p(Scalar) function in namespace of Scalar,
+  // or that there is no suitable std::log1p function available
+  template<typename Scalar>
+  EIGEN_DEVICE_FUNC inline Scalar log1p(const Scalar& x) {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    EIGEN_USING_STD(log);
+    Scalar x1p = RealScalar(1) + x;
+    Scalar log_1p = log_impl<Scalar>::run(x1p);
+    const bool is_small = numext::equal_strict(x1p, Scalar(1));
+    const bool is_inf = numext::equal_strict(x1p, log_1p);
+    return (is_small || is_inf) ? x : x * (log_1p / (x1p - RealScalar(1)));
+  }
+}
+
+template<typename Scalar>
+struct log1p_impl {
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
+  {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+    #if EIGEN_HAS_CXX11_MATH
+    using std::log1p;
+    #else
+    using std_fallback::log1p;
+    #endif
+    return log1p(x);
+  }
+};
+
+// Specialization for complex types that are not supported by std::log1p.
+template <typename RealScalar>
+struct log1p_impl<std::complex<RealScalar> > {
+  EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
+      const std::complex<RealScalar>& x) {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
+    return std_fallback::log1p(x);
+  }
+};
+
+template<typename Scalar>
+struct log1p_retval
+{
+  typedef Scalar type;
+};
+
+/****************************************************************************
+* Implementation of pow                                                  *
+****************************************************************************/
+
+template<typename ScalarX,typename ScalarY, bool IsInteger = NumTraits<ScalarX>::IsInteger&&NumTraits<ScalarY>::IsInteger>
+struct pow_impl
+{
+  //typedef Scalar retval;
+  typedef typename ScalarBinaryOpTraits<ScalarX,ScalarY,internal::scalar_pow_op<ScalarX,ScalarY> >::ReturnType result_type;
+  static EIGEN_DEVICE_FUNC inline result_type run(const ScalarX& x, const ScalarY& y)
+  {
+    EIGEN_USING_STD(pow);
+    return pow(x, y);
+  }
+};
+
+template<typename ScalarX,typename ScalarY>
+struct pow_impl<ScalarX,ScalarY, true>
+{
+  typedef ScalarX result_type;
+  static EIGEN_DEVICE_FUNC inline ScalarX run(ScalarX x, ScalarY y)
+  {
+    ScalarX res(1);
+    eigen_assert(!NumTraits<ScalarY>::IsSigned || y >= 0);
+    if(y & 1) res *= x;
+    y >>= 1;
+    while(y)
+    {
+      x *= x;
+      if(y&1) res *= x;
+      y >>= 1;
+    }
+    return res;
+  }
+};
+
+/****************************************************************************
+* Implementation of random                                               *
+****************************************************************************/
+
+template<typename Scalar,
+         bool IsComplex,
+         bool IsInteger>
+struct random_default_impl {};
+
+template<typename Scalar>
+struct random_impl : random_default_impl<Scalar, NumTraits<Scalar>::IsComplex, NumTraits<Scalar>::IsInteger> {};
+
+template<typename Scalar>
+struct random_retval
+{
+  typedef Scalar type;
+};
+
+template<typename Scalar> inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random(const Scalar& x, const Scalar& y);
+template<typename Scalar> inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random();
+
+template<typename Scalar>
+struct random_default_impl<Scalar, false, false>
+{
+  static inline Scalar run(const Scalar& x, const Scalar& y)
+  {
+    return x + (y-x) * Scalar(std::rand()) / Scalar(RAND_MAX);
+  }
+  static inline Scalar run()
+  {
+    return run(Scalar(NumTraits<Scalar>::IsSigned ? -1 : 0), Scalar(1));
+  }
+};
+
+enum {
+  meta_floor_log2_terminate,
+  meta_floor_log2_move_up,
+  meta_floor_log2_move_down,
+  meta_floor_log2_bogus
+};
+
+template<unsigned int n, int lower, int upper> struct meta_floor_log2_selector
+{
+  enum { middle = (lower + upper) / 2,
+         value = (upper <= lower + 1) ? int(meta_floor_log2_terminate)
+               : (n < (1 << middle)) ? int(meta_floor_log2_move_down)
+               : (n==0) ? int(meta_floor_log2_bogus)
+               : int(meta_floor_log2_move_up)
+  };
+};
+
+template<unsigned int n,
+         int lower = 0,
+         int upper = sizeof(unsigned int) * CHAR_BIT - 1,
+         int selector = meta_floor_log2_selector<n, lower, upper>::value>
+struct meta_floor_log2 {};
+
+template<unsigned int n, int lower, int upper>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_move_down>
+{
+  enum { value = meta_floor_log2<n, lower, meta_floor_log2_selector<n, lower, upper>::middle>::value };
+};
+
+template<unsigned int n, int lower, int upper>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_move_up>
+{
+  enum { value = meta_floor_log2<n, meta_floor_log2_selector<n, lower, upper>::middle, upper>::value };
+};
+
+template<unsigned int n, int lower, int upper>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_terminate>
+{
+  enum { value = (n >= ((unsigned int)(1) << (lower+1))) ? lower+1 : lower };
+};
+
+template<unsigned int n, int lower, int upper>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_bogus>
+{
+  // no value, error at compile time
+};
+
+template<typename Scalar>
+struct random_default_impl<Scalar, false, true>
+{
+  static inline Scalar run(const Scalar& x, const Scalar& y)
+  {
+    if (y <= x)
+      return x;
+    // ScalarU is the unsigned counterpart of Scalar, possibly Scalar itself.
+    typedef typename make_unsigned<Scalar>::type ScalarU;
+    // ScalarX is the widest of ScalarU and unsigned int.
+    // We'll deal only with ScalarX and unsigned int below thus avoiding signed
+    // types and arithmetic and signed overflows (which are undefined behavior).
+    typedef typename conditional<(ScalarU(-1) > unsigned(-1)), ScalarU, unsigned>::type ScalarX;
+    // The following difference doesn't overflow, provided our integer types are two's
+    // complement and have the same number of padding bits in signed and unsigned variants.
+    // This is the case in most modern implementations of C++.
+    ScalarX range = ScalarX(y) - ScalarX(x);
+    ScalarX offset = 0;
+    ScalarX divisor = 1;
+    ScalarX multiplier = 1;
+    const unsigned rand_max = RAND_MAX;
+    if (range <= rand_max) divisor = (rand_max + 1) / (range + 1);
+    else                   multiplier = 1 + range / (rand_max + 1);
+    // Rejection sampling.
+    do {
+      offset = (unsigned(std::rand()) * multiplier) / divisor;
+    } while (offset > range);
+    return Scalar(ScalarX(x) + offset);
+  }
+
+  static inline Scalar run()
+  {
+#ifdef EIGEN_MAKING_DOCS
+    return run(Scalar(NumTraits<Scalar>::IsSigned ? -10 : 0), Scalar(10));
+#else
+    enum { rand_bits = meta_floor_log2<(unsigned int)(RAND_MAX)+1>::value,
+           scalar_bits = sizeof(Scalar) * CHAR_BIT,
+           shift = EIGEN_PLAIN_ENUM_MAX(0, int(rand_bits) - int(scalar_bits)),
+           offset = NumTraits<Scalar>::IsSigned ? (1 << (EIGEN_PLAIN_ENUM_MIN(rand_bits,scalar_bits)-1)) : 0
+    };
+    return Scalar((std::rand() >> shift) - offset);
+#endif
+  }
+};
+
+template<typename Scalar>
+struct random_default_impl<Scalar, true, false>
+{
+  static inline Scalar run(const Scalar& x, const Scalar& y)
+  {
+    return Scalar(random(x.real(), y.real()),
+                  random(x.imag(), y.imag()));
+  }
+  static inline Scalar run()
+  {
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    return Scalar(random<RealScalar>(), random<RealScalar>());
+  }
+};
+
+template<typename Scalar>
+inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random(const Scalar& x, const Scalar& y)
+{
+  return EIGEN_MATHFUNC_IMPL(random, Scalar)::run(x, y);
+}
+
+template<typename Scalar>
+inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random()
+{
+  return EIGEN_MATHFUNC_IMPL(random, Scalar)::run();
+}
+
+// Implementation of is* functions
+
+// std::is* do not work with fast-math and gcc, std::is* are available on MSVC 2013 and newer, as well as in clang.
+#if (EIGEN_HAS_CXX11_MATH && !(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || (EIGEN_COMP_MSVC>=1800) || (EIGEN_COMP_CLANG)
+#define EIGEN_USE_STD_FPCLASSIFY 1
+#else
+#define EIGEN_USE_STD_FPCLASSIFY 0
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<internal::is_integral<T>::value,bool>::type
+isnan_impl(const T&) { return false; }
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<internal::is_integral<T>::value,bool>::type
+isinf_impl(const T&) { return false; }
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<internal::is_integral<T>::value,bool>::type
+isfinite_impl(const T&) { return true; }
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
+isfinite_impl(const T& x)
+{
+  #if defined(EIGEN_GPU_COMPILE_PHASE)
+    return (::isfinite)(x);
+  #elif EIGEN_USE_STD_FPCLASSIFY
+    using std::isfinite;
+    return isfinite EIGEN_NOT_A_MACRO (x);
+  #else
+    return x<=NumTraits<T>::highest() && x>=NumTraits<T>::lowest();
+  #endif
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
+isinf_impl(const T& x)
+{
+  #if defined(EIGEN_GPU_COMPILE_PHASE)
+    return (::isinf)(x);
+  #elif EIGEN_USE_STD_FPCLASSIFY
+    using std::isinf;
+    return isinf EIGEN_NOT_A_MACRO (x);
+  #else
+    return x>NumTraits<T>::highest() || x<NumTraits<T>::lowest();
+  #endif
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
+isnan_impl(const T& x)
+{
+  #if defined(EIGEN_GPU_COMPILE_PHASE)
+    return (::isnan)(x);
+  #elif EIGEN_USE_STD_FPCLASSIFY
+    using std::isnan;
+    return isnan EIGEN_NOT_A_MACRO (x);
+  #else
+    return x != x;
+  #endif
+}
+
+#if (!EIGEN_USE_STD_FPCLASSIFY)
+
+#if EIGEN_COMP_MSVC
+
+template<typename T> EIGEN_DEVICE_FUNC bool isinf_msvc_helper(T x)
+{
+  return _fpclass(x)==_FPCLASS_NINF || _fpclass(x)==_FPCLASS_PINF;
+}
+
+//MSVC defines a _isnan builtin function, but for double only
+EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) { return _isnan(x)!=0; }
+EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x)      { return _isnan(x)!=0; }
+EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x)       { return _isnan(x)!=0; }
+
+EIGEN_DEVICE_FUNC inline bool isinf_impl(const long double& x) { return isinf_msvc_helper(x); }
+EIGEN_DEVICE_FUNC inline bool isinf_impl(const double& x)      { return isinf_msvc_helper(x); }
+EIGEN_DEVICE_FUNC inline bool isinf_impl(const float& x)       { return isinf_msvc_helper(x); }
+
+#elif (defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ && EIGEN_COMP_GNUC)
+
+#if EIGEN_GNUC_AT_LEAST(5,0)
+  #define EIGEN_TMP_NOOPT_ATTRIB EIGEN_DEVICE_FUNC inline __attribute__((optimize("no-finite-math-only")))
+#else
+  // NOTE the inline qualifier and noinline attribute are both needed: the former is to avoid linking issue (duplicate symbol),
+  //      while the second prevent too aggressive optimizations in fast-math mode:
+  #define EIGEN_TMP_NOOPT_ATTRIB EIGEN_DEVICE_FUNC inline __attribute__((noinline,optimize("no-finite-math-only")))
+#endif
+
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const long double& x) { return __builtin_isnan(x); }
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const double& x)      { return __builtin_isnan(x); }
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const float& x)       { return __builtin_isnan(x); }
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const double& x)      { return __builtin_isinf(x); }
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const float& x)       { return __builtin_isinf(x); }
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const long double& x) { return __builtin_isinf(x); }
+
+#undef EIGEN_TMP_NOOPT_ATTRIB
+
+#endif
+
+#endif
+
+// The following overload are defined at the end of this file
+template<typename T> EIGEN_DEVICE_FUNC bool isfinite_impl(const std::complex<T>& x);
+template<typename T> EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex<T>& x);
+template<typename T> EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex<T>& x);
+
+template<typename T> T generic_fast_tanh_float(const T& a_x);
+} // end namespace internal
+
+/****************************************************************************
+* Generic math functions                                                    *
+****************************************************************************/
+
+namespace numext {
+
+#if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
+template<typename T>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
+{
+  EIGEN_USING_STD(min)
+  return min EIGEN_NOT_A_MACRO (x,y);
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
+{
+  EIGEN_USING_STD(max)
+  return max EIGEN_NOT_A_MACRO (x,y);
+}
+#else
+template<typename T>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
+{
+  return y < x ? y : x;
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
+{
+  return fminf(x, y);
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y)
+{
+  return fmin(x, y);
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
+{
+#if defined(EIGEN_HIPCC)
+  // no "fminl" on HIP yet
+  return (x < y) ? x : y;
+#else
+  return fminl(x, y);
+#endif
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
+{
+  return x < y ? y : x;
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
+{
+  return fmaxf(x, y);
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
+{
+  return fmax(x, y);
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
+{
+#if defined(EIGEN_HIPCC)
+  // no "fmaxl" on HIP yet
+  return (x > y) ? x : y;
+#else
+  return fmaxl(x, y);
+#endif
+}
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+
+
+#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_char)   \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_short)  \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_int)    \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
+#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_char)   \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_short)  \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_int)    \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
+#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar)  \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uint)   \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
+#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar)  \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uint)   \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
+#define SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC)
+#define SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC)
+#define SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC,cl::sycl::cl_double)
+#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC,cl::sycl::cl_double)
+#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(NAME, FUNC, RET_TYPE) \
+  SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_float) \
+  SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_double)
+
+#define SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
+template<>                                               \
+  EIGEN_DEVICE_FUNC                                      \
+  EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE& x) { \
+    return cl::sycl::FUNC(x);                            \
+  }
+
+#define SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, TYPE) \
+  SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, TYPE, TYPE)
+
+#define SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE1, ARG_TYPE2) \
+  template<>                                                                  \
+  EIGEN_DEVICE_FUNC                                                           \
+  EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE1& x, const ARG_TYPE2& y) { \
+    return cl::sycl::FUNC(x, y);                                              \
+  }
+
+#define SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
+  SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE, ARG_TYPE)
+
+#define SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, TYPE) \
+  SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, TYPE, TYPE)
+
+SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(mini, min)
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(mini, fmin)
+SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(maxi, max)
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(maxi, fmax)
+
+#endif
+
+
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(real, Scalar) real(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(real, Scalar)::run(x);
+}
+
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+inline typename internal::add_const_on_value_type< EIGEN_MATHFUNC_RETVAL(real_ref, Scalar) >::type real_ref(const Scalar& x)
+{
+  return internal::real_ref_impl<Scalar>::run(x);
+}
+
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(real_ref, Scalar) real_ref(Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(real_ref, Scalar)::run(x);
+}
+
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(imag, Scalar) imag(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(imag, Scalar)::run(x);
+}
+
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(arg, Scalar) arg(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(arg, Scalar)::run(x);
+}
+
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+inline typename internal::add_const_on_value_type< EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar) >::type imag_ref(const Scalar& x)
+{
+  return internal::imag_ref_impl<Scalar>::run(x);
+}
+
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar) imag_ref(Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(imag_ref, Scalar)::run(x);
+}
+
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(conj, Scalar) conj(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(conj, Scalar)::run(x);
+}
+
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
+}
+
+EIGEN_DEVICE_FUNC
+inline bool abs2(bool x) { return x; }
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE T absdiff(const T& x, const T& y)
+{
+  return x > y ? x - y : y - x;
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE float absdiff(const float& x, const float& y)
+{
+  return fabsf(x - y);
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE double absdiff(const double& x, const double& y)
+{
+  return fabs(x - y);
+}
+
+#if !defined(EIGEN_GPUCC)
+// HIP and CUDA do not support long double.
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE long double absdiff(const long double& x, const long double& y) {
+  return fabsl(x - y);
+}
+#endif
+
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(norm1, Scalar)::run(x);
+}
+
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar& y)
+{
+  return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+  SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(hypot, hypot)
+#endif
+
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log1p, log1p)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float log1p(const float &x) { return ::log1pf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double log1p(const double &x) { return ::log1p(x); }
+#endif
+
+template<typename ScalarX,typename ScalarY>
+EIGEN_DEVICE_FUNC
+inline typename internal::pow_impl<ScalarX,ScalarY>::result_type pow(const ScalarX& x, const ScalarY& y)
+{
+  return internal::pow_impl<ScalarX,ScalarY>::run(x, y);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(pow, pow)
+#endif
+
+template<typename T> EIGEN_DEVICE_FUNC bool (isnan)   (const T &x) { return internal::isnan_impl(x); }
+template<typename T> EIGEN_DEVICE_FUNC bool (isinf)   (const T &x) { return internal::isinf_impl(x); }
+template<typename T> EIGEN_DEVICE_FUNC bool (isfinite)(const T &x) { return internal::isfinite_impl(x); }
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isnan, isnan, bool)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isinf, isinf, bool)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isfinite, isfinite, bool)
+#endif
+
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(rint, Scalar) rint(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(rint, Scalar)::run(x);
+}
+
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(round, Scalar)::run(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(round, round)
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+T (floor)(const T& x)
+{
+  EIGEN_USING_STD(floor)
+  return floor(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(floor, floor)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float floor(const float &x) { return ::floorf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double floor(const double &x) { return ::floor(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+T (ceil)(const T& x)
+{
+  EIGEN_USING_STD(ceil);
+  return ceil(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(ceil, ceil)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float ceil(const float &x) { return ::ceilf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double ceil(const double &x) { return ::ceil(x); }
+#endif
+
+
+/** Log base 2 for 32 bits positive integers.
+  * Conveniently returns 0 for x==0. */
+inline int log2(int x)
+{
+  eigen_assert(x>=0);
+  unsigned int v(x);
+  static const int table[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+  v |= v >> 1;
+  v |= v >> 2;
+  v |= v >> 4;
+  v |= v >> 8;
+  v |= v >> 16;
+  return table[(v * 0x07C4ACDDU) >> 27];
+}
+
+/** \returns the square root of \a x.
+  *
+  * It is essentially equivalent to
+  * \code using std::sqrt; return sqrt(x); \endcode
+  * but slightly faster for float/double and some compilers (e.g., gcc), thanks to
+  * specializations when SSE is enabled.
+  *
+  * It's usage is justified in performance critical functions, like norm/normalize.
+  */
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE EIGEN_MATHFUNC_RETVAL(sqrt, Scalar) sqrt(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(sqrt, Scalar)::run(x);
+}
+
+// Boolean specialization, avoids implicit float to bool conversion (-Wimplicit-conversion-floating-point-to-bool).
+template<>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC
+bool sqrt<bool>(const bool &x) { return x; }
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt)
+#endif
+
+/** \returns the reciprocal square root of \a x. **/
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T rsqrt(const T& x)
+{
+  return internal::rsqrt_impl<T>::run(x);
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T log(const T &x) {
+  return internal::log_impl<T>::run(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log, log)
+#endif
+
+
+#if defined(EIGEN_GPUCC)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float log(const float &x) { return ::logf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double log(const double &x) { return ::log(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+typename internal::enable_if<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex,typename NumTraits<T>::Real>::type
+abs(const T &x) {
+  EIGEN_USING_STD(abs);
+  return abs(x);
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+typename internal::enable_if<!(NumTraits<T>::IsSigned || NumTraits<T>::IsComplex),typename NumTraits<T>::Real>::type
+abs(const T &x) {
+  return x;
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(abs, abs)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(abs, fabs)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float abs(const float &x) { return ::fabsf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double abs(const double &x) { return ::fabs(x); }
+
+template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float abs(const std::complex<float>& x) {
+  return ::hypotf(x.real(), x.imag());
+}
+
+template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double abs(const std::complex<double>& x) {
+  return ::hypot(x.real(), x.imag());
+}
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T exp(const T &x) {
+  EIGEN_USING_STD(exp);
+  return exp(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(exp, exp)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float exp(const float &x) { return ::expf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double exp(const double &x) { return ::exp(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+std::complex<float> exp(const std::complex<float>& x) {
+  float com = ::expf(x.real());
+  float res_real = com * ::cosf(x.imag());
+  float res_imag = com * ::sinf(x.imag());
+  return std::complex<float>(res_real, res_imag);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+std::complex<double> exp(const std::complex<double>& x) {
+  double com = ::exp(x.real());
+  double res_real = com * ::cos(x.imag());
+  double res_imag = com * ::sin(x.imag());
+  return std::complex<double>(res_real, res_imag);
+}
+#endif
+
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(expm1, Scalar) expm1(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(expm1, Scalar)::run(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(expm1, expm1)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float expm1(const float &x) { return ::expm1f(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double expm1(const double &x) { return ::expm1(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T cos(const T &x) {
+  EIGEN_USING_STD(cos);
+  return cos(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cos,cos)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float cos(const float &x) { return ::cosf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double cos(const double &x) { return ::cos(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T sin(const T &x) {
+  EIGEN_USING_STD(sin);
+  return sin(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sin, sin)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float sin(const float &x) { return ::sinf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double sin(const double &x) { return ::sin(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T tan(const T &x) {
+  EIGEN_USING_STD(tan);
+  return tan(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tan, tan)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float tan(const float &x) { return ::tanf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double tan(const double &x) { return ::tan(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T acos(const T &x) {
+  EIGEN_USING_STD(acos);
+  return acos(x);
+}
+
+#if EIGEN_HAS_CXX11_MATH
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T acosh(const T &x) {
+  EIGEN_USING_STD(acosh);
+  return static_cast<T>(acosh(x));
+}
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acos, acos)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acosh, acosh)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float acos(const float &x) { return ::acosf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double acos(const double &x) { return ::acos(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T asin(const T &x) {
+  EIGEN_USING_STD(asin);
+  return asin(x);
+}
+
+#if EIGEN_HAS_CXX11_MATH
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T asinh(const T &x) {
+  EIGEN_USING_STD(asinh);
+  return static_cast<T>(asinh(x));
+}
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asin, asin)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asinh, asinh)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float asin(const float &x) { return ::asinf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double asin(const double &x) { return ::asin(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T atan(const T &x) {
+  EIGEN_USING_STD(atan);
+  return static_cast<T>(atan(x));
+}
+
+#if EIGEN_HAS_CXX11_MATH
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T atanh(const T &x) {
+  EIGEN_USING_STD(atanh);
+  return static_cast<T>(atanh(x));
+}
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atan, atan)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atanh, atanh)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float atan(const float &x) { return ::atanf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double atan(const double &x) { return ::atan(x); }
+#endif
+
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T cosh(const T &x) {
+  EIGEN_USING_STD(cosh);
+  return static_cast<T>(cosh(x));
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cosh, cosh)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float cosh(const float &x) { return ::coshf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double cosh(const double &x) { return ::cosh(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T sinh(const T &x) {
+  EIGEN_USING_STD(sinh);
+  return static_cast<T>(sinh(x));
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sinh, sinh)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float sinh(const float &x) { return ::sinhf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double sinh(const double &x) { return ::sinh(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T tanh(const T &x) {
+  EIGEN_USING_STD(tanh);
+  return tanh(x);
+}
+
+#if (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH && !defined(SYCL_DEVICE_ONLY)
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float tanh(float x) { return internal::generic_fast_tanh_float(x); }
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tanh, tanh)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float tanh(const float &x) { return ::tanhf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double tanh(const double &x) { return ::tanh(x); }
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T fmod(const T& a, const T& b) {
+  EIGEN_USING_STD(fmod);
+  return fmod(a, b);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(fmod, fmod)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float fmod(const float& a, const float& b) {
+  return ::fmodf(a, b);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double fmod(const double& a, const double& b) {
+  return ::fmod(a, b);
+}
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY
+#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY
+#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY
+#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY
+#undef SYCL_SPECIALIZE_INTEGER_TYPES_BINARY
+#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY
+#undef SYCL_SPECIALIZE_FLOATING_TYPES_BINARY
+#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY
+#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE
+#undef SYCL_SPECIALIZE_GEN_UNARY_FUNC
+#undef SYCL_SPECIALIZE_UNARY_FUNC
+#undef SYCL_SPECIALIZE_GEN1_BINARY_FUNC
+#undef SYCL_SPECIALIZE_GEN2_BINARY_FUNC
+#undef SYCL_SPECIALIZE_BINARY_FUNC
+#endif
+
+} // end namespace numext
+
+namespace internal {
+
+template<typename T>
+EIGEN_DEVICE_FUNC bool isfinite_impl(const std::complex<T>& x)
+{
+  return (numext::isfinite)(numext::real(x)) && (numext::isfinite)(numext::imag(x));
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex<T>& x)
+{
+  return (numext::isnan)(numext::real(x)) || (numext::isnan)(numext::imag(x));
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex<T>& x)
+{
+  return ((numext::isinf)(numext::real(x)) || (numext::isinf)(numext::imag(x))) && (!(numext::isnan)(x));
+}
+
+/****************************************************************************
+* Implementation of fuzzy comparisons                                       *
+****************************************************************************/
+
+template<typename Scalar,
+         bool IsComplex,
+         bool IsInteger>
+struct scalar_fuzzy_default_impl {};
+
+template<typename Scalar>
+struct scalar_fuzzy_default_impl<Scalar, false, false>
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
+  static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec)
+  {
+    return numext::abs(x) <= numext::abs(y) * prec;
+  }
+  EIGEN_DEVICE_FUNC
+  static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
+  {
+    return numext::abs(x - y) <= numext::mini(numext::abs(x), numext::abs(y)) * prec;
+  }
+  EIGEN_DEVICE_FUNC
+  static inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, const RealScalar& prec)
+  {
+    return x <= y || isApprox(x, y, prec);
+  }
+};
+
+template<typename Scalar>
+struct scalar_fuzzy_default_impl<Scalar, false, true>
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
+  static inline bool isMuchSmallerThan(const Scalar& x, const Scalar&, const RealScalar&)
+  {
+    return x == Scalar(0);
+  }
+  EIGEN_DEVICE_FUNC
+  static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar&)
+  {
+    return x == y;
+  }
+  EIGEN_DEVICE_FUNC
+  static inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, const RealScalar&)
+  {
+    return x <= y;
+  }
+};
+
+template<typename Scalar>
+struct scalar_fuzzy_default_impl<Scalar, true, false>
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
+  static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec)
+  {
+    return numext::abs2(x) <= numext::abs2(y) * prec * prec;
+  }
+  EIGEN_DEVICE_FUNC
+  static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
+  {
+    return numext::abs2(x - y) <= numext::mini(numext::abs2(x), numext::abs2(y)) * prec * prec;
+  }
+};
+
+template<typename Scalar>
+struct scalar_fuzzy_impl : scalar_fuzzy_default_impl<Scalar, NumTraits<Scalar>::IsComplex, NumTraits<Scalar>::IsInteger> {};
+
+template<typename Scalar, typename OtherScalar> EIGEN_DEVICE_FUNC
+inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y,
+                              const typename NumTraits<Scalar>::Real &precision = NumTraits<Scalar>::dummy_precision())
+{
+  return scalar_fuzzy_impl<Scalar>::template isMuchSmallerThan<OtherScalar>(x, y, precision);
+}
+
+template<typename Scalar> EIGEN_DEVICE_FUNC
+inline bool isApprox(const Scalar& x, const Scalar& y,
+                     const typename NumTraits<Scalar>::Real &precision = NumTraits<Scalar>::dummy_precision())
+{
+  return scalar_fuzzy_impl<Scalar>::isApprox(x, y, precision);
+}
+
+template<typename Scalar> EIGEN_DEVICE_FUNC
+inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y,
+                               const typename NumTraits<Scalar>::Real &precision = NumTraits<Scalar>::dummy_precision())
+{
+  return scalar_fuzzy_impl<Scalar>::isApproxOrLessThan(x, y, precision);
+}
+
+/******************************************
+***  The special case of the  bool type ***
+******************************************/
+
+template<> struct random_impl<bool>
+{
+  static inline bool run()
+  {
+    return random<int>(0,1)==0 ? false : true;
+  }
+
+  static inline bool run(const bool& a, const bool& b)
+  {
+    return random<int>(a, b)==0 ? false : true;
+  }
+};
+
+template<> struct scalar_fuzzy_impl<bool>
+{
+  typedef bool RealScalar;
+
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
+  static inline bool isMuchSmallerThan(const bool& x, const bool&, const bool&)
+  {
+    return !x;
+  }
+
+  EIGEN_DEVICE_FUNC
+  static inline bool isApprox(bool x, bool y, bool)
+  {
+    return x == y;
+  }
+
+  EIGEN_DEVICE_FUNC
+  static inline bool isApproxOrLessThan(const bool& x, const bool& y, const bool&)
+  {
+    return (!x) || y;
+  }
+
+};
+
+} // end namespace internal
+
+// Default implementations that rely on other numext implementations
+namespace internal {
+
+// Specialization for complex types that are not supported by std::expm1.
+template <typename RealScalar>
+struct expm1_impl<std::complex<RealScalar> > {
+  EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
+      const std::complex<RealScalar>& x) {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
+    RealScalar xr = x.real();
+    RealScalar xi = x.imag();
+    // expm1(z) = exp(z) - 1
+    //          = exp(x +  i * y) - 1
+    //          = exp(x) * (cos(y) + i * sin(y)) - 1
+    //          = exp(x) * cos(y) - 1 + i * exp(x) * sin(y)
+    // Imag(expm1(z)) = exp(x) * sin(y)
+    // Real(expm1(z)) = exp(x) * cos(y) - 1
+    //          = exp(x) * cos(y) - 1.
+    //          = expm1(x) + exp(x) * (cos(y) - 1)
+    //          = expm1(x) + exp(x) * (2 * sin(y / 2) ** 2)
+    RealScalar erm1 = numext::expm1<RealScalar>(xr);
+    RealScalar er = erm1 + RealScalar(1.);
+    RealScalar sin2 = numext::sin(xi / RealScalar(2.));
+    sin2 = sin2 * sin2;
+    RealScalar s = numext::sin(xi);
+    RealScalar real_part = erm1 - RealScalar(2.) * er * sin2;
+    return std::complex<RealScalar>(real_part, er * s);
+  }
+};
+
+template<typename T>
+struct rsqrt_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_ALWAYS_INLINE T run(const T& x) {
+    return T(1)/numext::sqrt(x);
+  }
+};
+
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+template<typename T>
+struct conj_impl<std::complex<T>, true>
+{
+  EIGEN_DEVICE_FUNC
+  static inline std::complex<T> run(const std::complex<T>& x)
+  {
+    return std::complex<T>(numext::real(x), -numext::imag(x));
+  }
+};
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATHFUNCTIONS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/MathFunctionsImpl.h b/src/3rdparty/eigen/Eigen/src/Core/MathFunctionsImpl.h
new file mode 100644
index 000000000..4eaaaa784
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/MathFunctionsImpl.h
@@ -0,0 +1,200 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATHFUNCTIONSIMPL_H
+#define EIGEN_MATHFUNCTIONSIMPL_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal \returns the hyperbolic tan of \a a (coeff-wise)
+    Doesn't do anything fancy, just a 13/6-degree rational interpolant which
+    is accurate up to a couple of ulps in the (approximate) range [-8, 8],
+    outside of which tanh(x) = +/-1 in single precision. The input is clamped
+    to the range [-c, c]. The value c is chosen as the smallest value where
+    the approximation evaluates to exactly 1. In the reange [-0.0004, 0.0004]
+    the approxmation tanh(x) ~= x is used for better accuracy as x tends to zero.
+
+    This implementation works on both scalars and packets.
+*/
+template<typename T>
+T generic_fast_tanh_float(const T& a_x)
+{
+  // Clamp the inputs to the range [-c, c]
+#ifdef EIGEN_VECTORIZE_FMA
+  const T plus_clamp = pset1<T>(7.99881172180175781f);
+  const T minus_clamp = pset1<T>(-7.99881172180175781f);
+#else
+  const T plus_clamp = pset1<T>(7.90531110763549805f);
+  const T minus_clamp = pset1<T>(-7.90531110763549805f);
+#endif
+  const T tiny = pset1<T>(0.0004f);
+  const T x = pmax(pmin(a_x, plus_clamp), minus_clamp);
+  const T tiny_mask = pcmp_lt(pabs(a_x), tiny);
+  // The monomial coefficients of the numerator polynomial (odd).
+  const T alpha_1 = pset1<T>(4.89352455891786e-03f);
+  const T alpha_3 = pset1<T>(6.37261928875436e-04f);
+  const T alpha_5 = pset1<T>(1.48572235717979e-05f);
+  const T alpha_7 = pset1<T>(5.12229709037114e-08f);
+  const T alpha_9 = pset1<T>(-8.60467152213735e-11f);
+  const T alpha_11 = pset1<T>(2.00018790482477e-13f);
+  const T alpha_13 = pset1<T>(-2.76076847742355e-16f);
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const T beta_0 = pset1<T>(4.89352518554385e-03f);
+  const T beta_2 = pset1<T>(2.26843463243900e-03f);
+  const T beta_4 = pset1<T>(1.18534705686654e-04f);
+  const T beta_6 = pset1<T>(1.19825839466702e-06f);
+
+  // Since the polynomials are odd/even, we need x^2.
+  const T x2 = pmul(x, x);
+
+  // Evaluate the numerator polynomial p.
+  T p = pmadd(x2, alpha_13, alpha_11);
+  p = pmadd(x2, p, alpha_9);
+  p = pmadd(x2, p, alpha_7);
+  p = pmadd(x2, p, alpha_5);
+  p = pmadd(x2, p, alpha_3);
+  p = pmadd(x2, p, alpha_1);
+  p = pmul(x, p);
+
+  // Evaluate the denominator polynomial q.
+  T q = pmadd(x2, beta_6, beta_4);
+  q = pmadd(x2, q, beta_2);
+  q = pmadd(x2, q, beta_0);
+
+  // Divide the numerator by the denominator.
+  return pselect(tiny_mask, x, pdiv(p, q));
+}
+
+template<typename RealScalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y)
+{
+  // IEEE IEC 6059 special cases.
+  if ((numext::isinf)(x) || (numext::isinf)(y))
+    return NumTraits<RealScalar>::infinity();
+  if ((numext::isnan)(x) || (numext::isnan)(y))
+    return NumTraits<RealScalar>::quiet_NaN();
+    
+  EIGEN_USING_STD(sqrt);
+  RealScalar p, qp;
+  p = numext::maxi(x,y);
+  if(p==RealScalar(0)) return RealScalar(0);
+  qp = numext::mini(y,x) / p;
+  return p * sqrt(RealScalar(1) + qp*qp);
+}
+
+template<typename Scalar>
+struct hypot_impl
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static EIGEN_DEVICE_FUNC
+  inline RealScalar run(const Scalar& x, const Scalar& y)
+  {
+    EIGEN_USING_STD(abs);
+    return positive_real_hypot<RealScalar>(abs(x), abs(y));
+  }
+};
+
+// Generic complex sqrt implementation that correctly handles corner cases
+// according to https://en.cppreference.com/w/cpp/numeric/complex/sqrt
+template<typename T>
+EIGEN_DEVICE_FUNC std::complex<T> complex_sqrt(const std::complex<T>& z) {
+  // Computes the principal sqrt of the input.
+  //
+  // For a complex square root of the number x + i*y. We want to find real
+  // numbers u and v such that
+  //    (u + i*v)^2 = x + i*y  <=>
+  //    u^2 - v^2 + i*2*u*v = x + i*v.
+  // By equating the real and imaginary parts we get:
+  //    u^2 - v^2 = x
+  //    2*u*v = y.
+  //
+  // For x >= 0, this has the numerically stable solution
+  //    u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
+  //    v = y / (2 * u)
+  // and for x < 0,
+  //    v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2)))
+  //    u = y / (2 * v)
+  //
+  // Letting w = sqrt(0.5 * (|x| + |z|)),
+  //   if x == 0: u = w, v = sign(y) * w
+  //   if x > 0:  u = w, v = y / (2 * w)
+  //   if x < 0:  u = |y| / (2 * w), v = sign(y) * w
+
+  const T x = numext::real(z);
+  const T y = numext::imag(z);
+  const T zero = T(0);
+  const T w = numext::sqrt(T(0.5) * (numext::abs(x) + numext::hypot(x, y)));
+
+  return
+    (numext::isinf)(y) ? std::complex<T>(NumTraits<T>::infinity(), y)
+      : x == zero ? std::complex<T>(w, y < zero ? -w : w)
+      : x > zero ? std::complex<T>(w, y / (2 * w))
+      : std::complex<T>(numext::abs(y) / (2 * w), y < zero ? -w : w );
+}
+
+// Generic complex rsqrt implementation.
+template<typename T>
+EIGEN_DEVICE_FUNC std::complex<T> complex_rsqrt(const std::complex<T>& z) {
+  // Computes the principal reciprocal sqrt of the input.
+  //
+  // For a complex reciprocal square root of the number z = x + i*y. We want to
+  // find real numbers u and v such that
+  //    (u + i*v)^2 = 1 / (x + i*y)  <=>
+  //    u^2 - v^2 + i*2*u*v = x/|z|^2 - i*v/|z|^2.
+  // By equating the real and imaginary parts we get:
+  //    u^2 - v^2 = x/|z|^2
+  //    2*u*v = y/|z|^2.
+  //
+  // For x >= 0, this has the numerically stable solution
+  //    u = sqrt(0.5 * (x + |z|)) / |z|
+  //    v = -y / (2 * u * |z|)
+  // and for x < 0,
+  //    v = -sign(y) * sqrt(0.5 * (-x + |z|)) / |z|
+  //    u = -y / (2 * v * |z|)
+  //
+  // Letting w = sqrt(0.5 * (|x| + |z|)),
+  //   if x == 0: u = w / |z|, v = -sign(y) * w / |z|
+  //   if x > 0:  u = w / |z|, v = -y / (2 * w * |z|)
+  //   if x < 0:  u = |y| / (2 * w * |z|), v = -sign(y) * w / |z|
+
+  const T x = numext::real(z);
+  const T y = numext::imag(z);
+  const T zero = T(0);
+
+  const T abs_z = numext::hypot(x, y);
+  const T w = numext::sqrt(T(0.5) * (numext::abs(x) + abs_z));
+  const T woz = w / abs_z;
+  // Corner cases consistent with 1/sqrt(z) on gcc/clang.
+  return
+    abs_z == zero ? std::complex<T>(NumTraits<T>::infinity(), NumTraits<T>::quiet_NaN())
+      : ((numext::isinf)(x) || (numext::isinf)(y)) ? std::complex<T>(zero, zero)
+      : x == zero ? std::complex<T>(woz, y < zero ? woz : -woz)
+      : x > zero ? std::complex<T>(woz, -y / (2 * w * abs_z))
+      : std::complex<T>(numext::abs(y) / (2 * w * abs_z), y < zero ? woz : -woz );
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC std::complex<T> complex_log(const std::complex<T>& z) {
+  // Computes complex log.
+  T a = numext::abs(z);
+  EIGEN_USING_STD(atan2);
+  T b = atan2(z.imag(), z.real());
+  return std::complex<T>(numext::log(a), b);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATHFUNCTIONSIMPL_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/Matrix.h b/src/3rdparty/eigen/Eigen/src/Core/Matrix.h
new file mode 100644
index 000000000..f0e59a911
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/Matrix.h
@@ -0,0 +1,565 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_H
+#define EIGEN_MATRIX_H
+
+namespace Eigen {
+
+namespace internal {
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+{
+private:
+  enum { size = internal::size_at_compile_time<_Rows,_Cols>::ret };
+  typedef typename find_best_packet<_Scalar,size>::type PacketScalar;
+  enum {
+      row_major_bit = _Options&RowMajor ? RowMajorBit : 0,
+      is_dynamic_size_storage = _MaxRows==Dynamic || _MaxCols==Dynamic,
+      max_size = is_dynamic_size_storage ? Dynamic : _MaxRows*_MaxCols,
+      default_alignment = compute_default_alignment<_Scalar,max_size>::value,
+      actual_alignment = ((_Options&DontAlign)==0) ? default_alignment : 0,
+      required_alignment = unpacket_traits<PacketScalar>::alignment,
+      packet_access_bit = (packet_traits<_Scalar>::Vectorizable && (EIGEN_UNALIGNED_VECTORIZE || (actual_alignment>=required_alignment))) ? PacketAccessBit : 0
+    };
+
+public:
+  typedef _Scalar Scalar;
+  typedef Dense StorageKind;
+  typedef Eigen::Index StorageIndex;
+  typedef MatrixXpr XprKind;
+  enum {
+    RowsAtCompileTime = _Rows,
+    ColsAtCompileTime = _Cols,
+    MaxRowsAtCompileTime = _MaxRows,
+    MaxColsAtCompileTime = _MaxCols,
+    Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret,
+    Options = _Options,
+    InnerStrideAtCompileTime = 1,
+    OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime,
+
+    // FIXME, the following flag in only used to define NeedsToAlign in PlainObjectBase
+    EvaluatorFlags = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit,
+    Alignment = actual_alignment
+  };
+};
+}
+
+/** \class Matrix
+  * \ingroup Core_Module
+  *
+  * \brief The matrix class, also used for vectors and row-vectors
+  *
+  * The %Matrix class is the work-horse for all \em dense (\ref dense "note") matrices and vectors within Eigen.
+  * Vectors are matrices with one column, and row-vectors are matrices with one row.
+  *
+  * The %Matrix class encompasses \em both fixed-size and dynamic-size objects (\ref fixedsize "note").
+  *
+  * The first three template parameters are required:
+  * \tparam _Scalar Numeric type, e.g. float, double, int or std::complex<float>.
+  *                 User defined scalar types are supported as well (see \ref user_defined_scalars "here").
+  * \tparam _Rows Number of rows, or \b Dynamic
+  * \tparam _Cols Number of columns, or \b Dynamic
+  *
+  * The remaining template parameters are optional -- in most cases you don't have to worry about them.
+  * \tparam _Options A combination of either \b #RowMajor or \b #ColMajor, and of either
+  *                 \b #AutoAlign or \b #DontAlign.
+  *                 The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter controls alignment, which is required
+  *                 for vectorization. It defaults to aligning matrices except for fixed sizes that aren't a multiple of the packet size.
+  * \tparam _MaxRows Maximum number of rows. Defaults to \a _Rows (\ref maxrows "note").
+  * \tparam _MaxCols Maximum number of columns. Defaults to \a _Cols (\ref maxrows "note").
+  *
+  * Eigen provides a number of typedefs covering the usual cases. Here are some examples:
+  *
+  * \li \c Matrix2d is a 2x2 square matrix of doubles (\c Matrix<double, 2, 2>)
+  * \li \c Vector4f is a vector of 4 floats (\c Matrix<float, 4, 1>)
+  * \li \c RowVector3i is a row-vector of 3 ints (\c Matrix<int, 1, 3>)
+  *
+  * \li \c MatrixXf is a dynamic-size matrix of floats (\c Matrix<float, Dynamic, Dynamic>)
+  * \li \c VectorXf is a dynamic-size vector of floats (\c Matrix<float, Dynamic, 1>)
+  *
+  * \li \c Matrix2Xf is a partially fixed-size (dynamic-size) matrix of floats (\c Matrix<float, 2, Dynamic>)
+  * \li \c MatrixX3d is a partially dynamic-size (fixed-size) matrix of double (\c Matrix<double, Dynamic, 3>)
+  *
+  * See \link matrixtypedefs this page \endlink for a complete list of predefined \em %Matrix and \em Vector typedefs.
+  *
+  * You can access elements of vectors and matrices using normal subscripting:
+  *
+  * \code
+  * Eigen::VectorXd v(10);
+  * v[0] = 0.1;
+  * v[1] = 0.2;
+  * v(0) = 0.3;
+  * v(1) = 0.4;
+  *
+  * Eigen::MatrixXi m(10, 10);
+  * m(0, 1) = 1;
+  * m(0, 2) = 2;
+  * m(0, 3) = 3;
+  * \endcode
+  *
+  * This class can be extended with the help of the plugin mechanism described on the page
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_MATRIX_PLUGIN.
+  *
+  * <i><b>Some notes:</b></i>
+  *
+  * <dl>
+  * <dt><b>\anchor dense Dense versus sparse:</b></dt>
+  * <dd>This %Matrix class handles dense, not sparse matrices and vectors. For sparse matrices and vectors, see the Sparse module.
+  *
+  * Dense matrices and vectors are plain usual arrays of coefficients. All the coefficients are stored, in an ordinary contiguous array.
+  * This is unlike Sparse matrices and vectors where the coefficients are stored as a list of nonzero coefficients.</dd>
+  *
+  * <dt><b>\anchor fixedsize Fixed-size versus dynamic-size:</b></dt>
+  * <dd>Fixed-size means that the numbers of rows and columns are known are compile-time. In this case, Eigen allocates the array
+  * of coefficients as a fixed-size array, as a class member. This makes sense for very small matrices, typically up to 4x4, sometimes up
+  * to 16x16. Larger matrices should be declared as dynamic-size even if one happens to know their size at compile-time.
+  *
+  * Dynamic-size means that the numbers of rows or columns are not necessarily known at compile-time. In this case they are runtime
+  * variables, and the array of coefficients is allocated dynamically on the heap.
+  *
+  * Note that \em dense matrices, be they Fixed-size or Dynamic-size, <em>do not</em> expand dynamically in the sense of a std::map.
+  * If you want this behavior, see the Sparse module.</dd>
+  *
+  * <dt><b>\anchor maxrows _MaxRows and _MaxCols:</b></dt>
+  * <dd>In most cases, one just leaves these parameters to the default values.
+  * These parameters mean the maximum size of rows and columns that the matrix may have. They are useful in cases
+  * when the exact numbers of rows and columns are not known are compile-time, but it is known at compile-time that they cannot
+  * exceed a certain value. This happens when taking dynamic-size blocks inside fixed-size matrices: in this case _MaxRows and _MaxCols
+  * are the dimensions of the original matrix, while _Rows and _Cols are Dynamic.</dd>
+  * </dl>
+  *
+  * <i><b>ABI and storage layout</b></i>
+  *
+  * The table below summarizes the ABI of some possible Matrix instances which is fixed thorough the lifetime of Eigen 3.
+  * <table  class="manual">
+  * <tr><th>Matrix type</th><th>Equivalent C structure</th></tr>
+  * <tr><td>\code Matrix<T,Dynamic,Dynamic> \endcode</td><td>\code
+  * struct {
+  *   T *data;                  // with (size_t(data)%EIGEN_MAX_ALIGN_BYTES)==0
+  *   Eigen::Index rows, cols;
+  *  };
+  * \endcode</td></tr>
+  * <tr class="alt"><td>\code
+  * Matrix<T,Dynamic,1>
+  * Matrix<T,1,Dynamic> \endcode</td><td>\code
+  * struct {
+  *   T *data;                  // with (size_t(data)%EIGEN_MAX_ALIGN_BYTES)==0
+  *   Eigen::Index size;
+  *  };
+  * \endcode</td></tr>
+  * <tr><td>\code Matrix<T,Rows,Cols> \endcode</td><td>\code
+  * struct {
+  *   T data[Rows*Cols];        // with (size_t(data)%A(Rows*Cols*sizeof(T)))==0
+  *  };
+  * \endcode</td></tr>
+  * <tr class="alt"><td>\code Matrix<T,Dynamic,Dynamic,0,MaxRows,MaxCols> \endcode</td><td>\code
+  * struct {
+  *   T data[MaxRows*MaxCols];  // with (size_t(data)%A(MaxRows*MaxCols*sizeof(T)))==0
+  *   Eigen::Index rows, cols;
+  *  };
+  * \endcode</td></tr>
+  * </table>
+  * Note that in this table Rows, Cols, MaxRows and MaxCols are all positive integers. A(S) is defined to the largest possible power-of-two
+  * smaller to EIGEN_MAX_STATIC_ALIGN_BYTES.
+  *
+  * \see MatrixBase for the majority of the API methods for matrices, \ref TopicClassHierarchy,
+  * \ref TopicStorageOrders
+  */
+
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+class Matrix
+  : public PlainObjectBase<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+{
+  public:
+
+    /** \brief Base class typedef.
+      * \sa PlainObjectBase
+      */
+    typedef PlainObjectBase<Matrix> Base;
+
+    enum { Options = _Options };
+
+    EIGEN_DENSE_PUBLIC_INTERFACE(Matrix)
+
+    typedef typename Base::PlainObject PlainObject;
+
+    using Base::base;
+    using Base::coeffRef;
+
+    /**
+      * \brief Assigns matrices to each other.
+      *
+      * \note This is a special case of the templated operator=. Its purpose is
+      * to prevent a default operator= from hiding the templated operator=.
+      *
+      * \callgraph
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Matrix& operator=(const Matrix& other)
+    {
+      return Base::_set(other);
+    }
+
+    /** \internal
+      * \brief Copies the value of the expression \a other into \c *this with automatic resizing.
+      *
+      * *this might be resized to match the dimensions of \a other. If *this was a null matrix (not already initialized),
+      * it will be initialized.
+      *
+      * Note that copying a row-vector into a vector (and conversely) is allowed.
+      * The resizing, if any, is then done in the appropriate way so that row-vectors
+      * remain row-vectors and vectors remain vectors.
+      */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Matrix& operator=(const DenseBase<OtherDerived>& other)
+    {
+      return Base::_set(other);
+    }
+
+    /* Here, doxygen failed to copy the brief information when using \copydoc */
+
+    /**
+      * \brief Copies the generic expression \a other into *this.
+      * \copydetails DenseBase::operator=(const EigenBase<OtherDerived> &other)
+      */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Matrix& operator=(const EigenBase<OtherDerived> &other)
+    {
+      return Base::operator=(other);
+    }
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Matrix& operator=(const ReturnByValue<OtherDerived>& func)
+    {
+      return Base::operator=(func);
+    }
+
+    /** \brief Default constructor.
+      *
+      * For fixed-size matrices, does nothing.
+      *
+      * For dynamic-size matrices, creates an empty matrix of size 0. Does not allocate any array. Such a matrix
+      * is called a null matrix. This constructor is the unique way to create null matrices: resizing
+      * a matrix to 0 is not supported.
+      *
+      * \sa resize(Index,Index)
+      */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Matrix() : Base()
+    {
+      Base::_check_template_params();
+      EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+    }
+
+    // FIXME is it still needed
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    explicit Matrix(internal::constructor_without_unaligned_array_assert)
+      : Base(internal::constructor_without_unaligned_array_assert())
+    { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }
+
+#if EIGEN_HAS_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Matrix(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
+      : Base(std::move(other))
+    {
+      Base::_check_template_params();
+    }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Matrix& operator=(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
+    {
+      Base::operator=(std::move(other));
+      return *this;
+    }
+#endif
+
+#if EIGEN_HAS_CXX11
+    /** \copydoc PlainObjectBase(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&... args)
+     *
+     * Example: \include Matrix_variadic_ctor_cxx11.cpp
+     * Output: \verbinclude Matrix_variadic_ctor_cxx11.out
+     *
+     * \sa Matrix(const std::initializer_list<std::initializer_list<Scalar>>&)
+     */
+    template <typename... ArgTypes>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2,  const Scalar& a3, const ArgTypes&... args)
+      : Base(a0, a1, a2, a3, args...) {}
+
+    /** \brief Constructs a Matrix and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11
+      *
+      * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
+      *
+      * Example: \include Matrix_initializer_list_23_cxx11.cpp
+      * Output: \verbinclude Matrix_initializer_list_23_cxx11.out
+      *
+      * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered.
+      *
+      * In the case of a compile-time column vector, implicit transposition from a single row is allowed.
+      * Therefore <code>VectorXd{{1,2,3,4,5}}</code> is legal and the more verbose syntax
+      * <code>RowVectorXd{{1},{2},{3},{4},{5}}</code> can be avoided:
+      *
+      * Example: \include Matrix_initializer_list_vector_cxx11.cpp
+      * Output: \verbinclude Matrix_initializer_list_vector_cxx11.out
+      *
+      * In the case of fixed-sized matrices, the initializer list sizes must exactly match the matrix sizes,
+      * and implicit transposition is allowed for compile-time vectors only.
+      *
+      * \sa Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2,  const Scalar& a3, const ArgTypes&... args)
+      */
+    EIGEN_DEVICE_FUNC
+    explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list<std::initializer_list<Scalar>>& list) : Base(list) {}
+#endif // end EIGEN_HAS_CXX11
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+
+    // This constructor is for both 1x1 matrices and dynamic vectors
+    template<typename T>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    explicit Matrix(const T& x)
+    {
+      Base::_check_template_params();
+      Base::template _init1<T>(x);
+    }
+
+    template<typename T0, typename T1>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Matrix(const T0& x, const T1& y)
+    {
+      Base::_check_template_params();
+      Base::template _init2<T0,T1>(x, y);
+    }
+
+
+#else
+    /** \brief Constructs a fixed-sized matrix initialized with coefficients starting at \a data */
+    EIGEN_DEVICE_FUNC
+    explicit Matrix(const Scalar *data);
+
+    /** \brief Constructs a vector or row-vector with given dimension. \only_for_vectors
+      *
+      * This is useful for dynamic-size vectors. For fixed-size vectors,
+      * it is redundant to pass these parameters, so one should use the default constructor
+      * Matrix() instead.
+      *
+      * \warning This constructor is disabled for fixed-size \c 1x1 matrices. For instance,
+      * calling Matrix<double,1,1>(1) will call the initialization constructor: Matrix(const Scalar&).
+      * For fixed-size \c 1x1 matrices it is therefore recommended to use the default
+      * constructor Matrix() instead, especially when using one of the non standard
+      * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
+      */
+    EIGEN_STRONG_INLINE explicit Matrix(Index dim);
+    /** \brief Constructs an initialized 1x1 matrix with the given coefficient
+      * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...) */
+    Matrix(const Scalar& x);
+    /** \brief Constructs an uninitialized matrix with \a rows rows and \a cols columns.
+      *
+      * This is useful for dynamic-size matrices. For fixed-size matrices,
+      * it is redundant to pass these parameters, so one should use the default constructor
+      * Matrix() instead.
+      *
+      * \warning This constructor is disabled for fixed-size \c 1x2 and \c 2x1 vectors. For instance,
+      * calling Matrix2f(2,1) will call the initialization constructor: Matrix(const Scalar& x, const Scalar& y).
+      * For fixed-size \c 1x2 or \c 2x1 vectors it is therefore recommended to use the default
+      * constructor Matrix() instead, especially when using one of the non standard
+      * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
+      */
+    EIGEN_DEVICE_FUNC
+    Matrix(Index rows, Index cols);
+
+    /** \brief Constructs an initialized 2D vector with given coefficients
+      * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...) */
+    Matrix(const Scalar& x, const Scalar& y);
+    #endif  // end EIGEN_PARSED_BY_DOXYGEN
+
+    /** \brief Constructs an initialized 3D vector with given coefficients
+      * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...)
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z)
+    {
+      Base::_check_template_params();
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Matrix, 3)
+      m_storage.data()[0] = x;
+      m_storage.data()[1] = y;
+      m_storage.data()[2] = z;
+    }
+    /** \brief Constructs an initialized 4D vector with given coefficients
+      * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...)
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w)
+    {
+      Base::_check_template_params();
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Matrix, 4)
+      m_storage.data()[0] = x;
+      m_storage.data()[1] = y;
+      m_storage.data()[2] = z;
+      m_storage.data()[3] = w;
+    }
+
+
+    /** \brief Copy constructor */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Matrix(const Matrix& other) : Base(other)
+    { }
+
+    /** \brief Copy constructor for generic expressions.
+      * \sa MatrixBase::operator=(const EigenBase<OtherDerived>&)
+      */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Matrix(const EigenBase<OtherDerived> &other)
+      : Base(other.derived())
+    { }
+
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT { return 1; }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return this->innerSize(); }
+
+    /////////// Geometry module ///////////
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    explicit Matrix(const RotationBase<OtherDerived,ColsAtCompileTime>& r);
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    Matrix& operator=(const RotationBase<OtherDerived,ColsAtCompileTime>& r);
+
+    // allow to extend Matrix outside Eigen
+    #ifdef EIGEN_MATRIX_PLUGIN
+    #include EIGEN_MATRIX_PLUGIN
+    #endif
+
+  protected:
+    template <typename Derived, typename OtherDerived, bool IsVector>
+    friend struct internal::conservative_resize_like_impl;
+
+    using Base::m_storage;
+};
+
+/** \defgroup matrixtypedefs Global matrix typedefs
+  *
+  * \ingroup Core_Module
+  *
+  * %Eigen defines several typedef shortcuts for most common matrix and vector types.
+  *
+  * The general patterns are the following:
+  *
+  * \c MatrixSizeType where \c Size can be \c 2,\c 3,\c 4 for fixed size square matrices or \c X for dynamic size,
+  * and where \c Type can be \c i for integer, \c f for float, \c d for double, \c cf for complex float, \c cd
+  * for complex double.
+  *
+  * For example, \c Matrix3d is a fixed-size 3x3 matrix type of doubles, and \c MatrixXf is a dynamic-size matrix of floats.
+  *
+  * There are also \c VectorSizeType and \c RowVectorSizeType which are self-explanatory. For example, \c Vector4cf is
+  * a fixed-size vector of 4 complex floats.
+  *
+  * With \cpp11, template alias are also defined for common sizes.
+  * They follow the same pattern as above except that the scalar type suffix is replaced by a
+  * template parameter, i.e.:
+  *   - `MatrixSize<Type>` where `Size` can be \c 2,\c 3,\c 4 for fixed size square matrices or \c X for dynamic size.
+  *   - `MatrixXSize<Type>` and `MatrixSizeX<Type>` where `Size` can be \c 2,\c 3,\c 4 for hybrid dynamic/fixed matrices.
+  *   - `VectorSize<Type>` and `RowVectorSize<Type>` for column and row vectors.
+  *
+  * With \cpp11, you can also use fully generic column and row vector types: `Vector<Type,Size>` and `RowVector<Type,Size>`.
+  *
+  * \sa class Matrix
+  */
+
+#define EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, Size, SizeSuffix)   \
+/** \ingroup matrixtypedefs */                                    \
+typedef Matrix<Type, Size, Size> Matrix##SizeSuffix##TypeSuffix;  \
+/** \ingroup matrixtypedefs */                                    \
+typedef Matrix<Type, Size, 1>    Vector##SizeSuffix##TypeSuffix;  \
+/** \ingroup matrixtypedefs */                                    \
+typedef Matrix<Type, 1, Size>    RowVector##SizeSuffix##TypeSuffix;
+
+#define EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, Size)         \
+/** \ingroup matrixtypedefs */                                    \
+typedef Matrix<Type, Size, Dynamic> Matrix##Size##X##TypeSuffix;  \
+/** \ingroup matrixtypedefs */                                    \
+typedef Matrix<Type, Dynamic, Size> Matrix##X##Size##TypeSuffix;
+
+#define EIGEN_MAKE_TYPEDEFS_ALL_SIZES(Type, TypeSuffix) \
+EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 2, 2) \
+EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 3, 3) \
+EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 4, 4) \
+EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, Dynamic, X) \
+EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, 2) \
+EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, 3) \
+EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, 4)
+
+EIGEN_MAKE_TYPEDEFS_ALL_SIZES(int,                  i)
+EIGEN_MAKE_TYPEDEFS_ALL_SIZES(float,                f)
+EIGEN_MAKE_TYPEDEFS_ALL_SIZES(double,               d)
+EIGEN_MAKE_TYPEDEFS_ALL_SIZES(std::complex<float>,  cf)
+EIGEN_MAKE_TYPEDEFS_ALL_SIZES(std::complex<double>, cd)
+
+#undef EIGEN_MAKE_TYPEDEFS_ALL_SIZES
+#undef EIGEN_MAKE_TYPEDEFS
+#undef EIGEN_MAKE_FIXED_TYPEDEFS
+
+#if EIGEN_HAS_CXX11
+
+#define EIGEN_MAKE_TYPEDEFS(Size, SizeSuffix)                     \
+/** \ingroup matrixtypedefs */                                    \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Matrix##SizeSuffix = Matrix<Type, Size, Size>;              \
+/** \ingroup matrixtypedefs */                                    \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Vector##SizeSuffix = Matrix<Type, Size, 1>;                 \
+/** \ingroup matrixtypedefs */                                    \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using RowVector##SizeSuffix = Matrix<Type, 1, Size>;
+
+#define EIGEN_MAKE_FIXED_TYPEDEFS(Size)                           \
+/** \ingroup matrixtypedefs */                                    \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Matrix##Size##X = Matrix<Type, Size, Dynamic>;              \
+/** \ingroup matrixtypedefs */                                    \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Matrix##X##Size = Matrix<Type, Dynamic, Size>;
+
+EIGEN_MAKE_TYPEDEFS(2, 2)
+EIGEN_MAKE_TYPEDEFS(3, 3)
+EIGEN_MAKE_TYPEDEFS(4, 4)
+EIGEN_MAKE_TYPEDEFS(Dynamic, X)
+EIGEN_MAKE_FIXED_TYPEDEFS(2)
+EIGEN_MAKE_FIXED_TYPEDEFS(3)
+EIGEN_MAKE_FIXED_TYPEDEFS(4)
+
+/** \ingroup matrixtypedefs
+  * \brief \cpp11 */
+template <typename Type, int Size>
+using Vector = Matrix<Type, Size, 1>;
+
+/** \ingroup matrixtypedefs
+  * \brief \cpp11 */
+template <typename Type, int Size>
+using RowVector = Matrix<Type, 1, Size>;
+
+#undef EIGEN_MAKE_TYPEDEFS
+#undef EIGEN_MAKE_FIXED_TYPEDEFS
+
+#endif // EIGEN_HAS_CXX11
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATRIX_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/MatrixBase.h b/src/3rdparty/eigen/Eigen/src/Core/MatrixBase.h
new file mode 100644
index 000000000..45c3a596e
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/MatrixBase.h
@@ -0,0 +1,547 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIXBASE_H
+#define EIGEN_MATRIXBASE_H
+
+namespace Eigen {
+
+/** \class MatrixBase
+  * \ingroup Core_Module
+  *
+  * \brief Base class for all dense matrices, vectors, and expressions
+  *
+  * This class is the base that is inherited by all matrix, vector, and related expression
+  * types. Most of the Eigen API is contained in this class, and its base classes. Other important
+  * classes for the Eigen API are Matrix, and VectorwiseOp.
+  *
+  * Note that some methods are defined in other modules such as the \ref LU_Module LU module
+  * for all functions related to matrix inversions.
+  *
+  * \tparam Derived is the derived type, e.g. a matrix type, or an expression, etc.
+  *
+  * When writing a function taking Eigen objects as argument, if you want your function
+  * to take as argument any matrix, vector, or expression, just let it take a
+  * MatrixBase argument. As an example, here is a function printFirstRow which, given
+  * a matrix, vector, or expression \a x, prints the first row of \a x.
+  *
+  * \code
+    template<typename Derived>
+    void printFirstRow(const Eigen::MatrixBase<Derived>& x)
+    {
+      cout << x.row(0) << endl;
+    }
+  * \endcode
+  *
+  * This class can be extended with the help of the plugin mechanism described on the page
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_MATRIXBASE_PLUGIN.
+  *
+  * \sa \blank \ref TopicClassHierarchy
+  */
+template<typename Derived> class MatrixBase
+  : public DenseBase<Derived>
+{
+  public:
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    typedef MatrixBase StorageBaseType;
+    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+    typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    typedef DenseBase<Derived> Base;
+    using Base::RowsAtCompileTime;
+    using Base::ColsAtCompileTime;
+    using Base::SizeAtCompileTime;
+    using Base::MaxRowsAtCompileTime;
+    using Base::MaxColsAtCompileTime;
+    using Base::MaxSizeAtCompileTime;
+    using Base::IsVectorAtCompileTime;
+    using Base::Flags;
+
+    using Base::derived;
+    using Base::const_cast_derived;
+    using Base::rows;
+    using Base::cols;
+    using Base::size;
+    using Base::coeff;
+    using Base::coeffRef;
+    using Base::lazyAssign;
+    using Base::eval;
+    using Base::operator-;
+    using Base::operator+=;
+    using Base::operator-=;
+    using Base::operator*=;
+    using Base::operator/=;
+
+    typedef typename Base::CoeffReturnType CoeffReturnType;
+    typedef typename Base::ConstTransposeReturnType ConstTransposeReturnType;
+    typedef typename Base::RowXpr RowXpr;
+    typedef typename Base::ColXpr ColXpr;
+#endif // not EIGEN_PARSED_BY_DOXYGEN
+
+
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** type of the equivalent square matrix */
+    typedef Matrix<Scalar,EIGEN_SIZE_MAX(RowsAtCompileTime,ColsAtCompileTime),
+                          EIGEN_SIZE_MAX(RowsAtCompileTime,ColsAtCompileTime)> SquareMatrixType;
+#endif // not EIGEN_PARSED_BY_DOXYGEN
+
+    /** \returns the size of the main diagonal, which is min(rows(),cols()).
+      * \sa rows(), cols(), SizeAtCompileTime. */
+    EIGEN_DEVICE_FUNC
+    inline Index diagonalSize() const { return (numext::mini)(rows(),cols()); }
+
+    typedef typename Base::PlainObject PlainObject;
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** \internal Represents a matrix with all coefficients equal to one another*/
+    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,PlainObject> ConstantReturnType;
+    /** \internal the return type of MatrixBase::adjoint() */
+    typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
+                        CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, ConstTransposeReturnType>,
+                        ConstTransposeReturnType
+                     >::type AdjointReturnType;
+    /** \internal Return type of eigenvalues() */
+    typedef Matrix<std::complex<RealScalar>, internal::traits<Derived>::ColsAtCompileTime, 1, ColMajor> EigenvaluesReturnType;
+    /** \internal the return type of identity */
+    typedef CwiseNullaryOp<internal::scalar_identity_op<Scalar>,PlainObject> IdentityReturnType;
+    /** \internal the return type of unit vectors */
+    typedef Block<const CwiseNullaryOp<internal::scalar_identity_op<Scalar>, SquareMatrixType>,
+                  internal::traits<Derived>::RowsAtCompileTime,
+                  internal::traits<Derived>::ColsAtCompileTime> BasisReturnType;
+#endif // not EIGEN_PARSED_BY_DOXYGEN
+
+#define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::MatrixBase
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
+#   include "../plugins/CommonCwiseBinaryOps.h"
+#   include "../plugins/MatrixCwiseUnaryOps.h"
+#   include "../plugins/MatrixCwiseBinaryOps.h"
+#   ifdef EIGEN_MATRIXBASE_PLUGIN
+#     include EIGEN_MATRIXBASE_PLUGIN
+#   endif
+#undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
+
+    /** Special case of the template operator=, in order to prevent the compiler
+      * from generating a default operator= (issue hit with g++ 4.1)
+      */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator=(const MatrixBase& other);
+
+    // We cannot inherit here via Base::operator= since it is causing
+    // trouble with MSVC.
+
+    template <typename OtherDerived>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator=(const DenseBase<OtherDerived>& other);
+
+    template <typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    Derived& operator=(const EigenBase<OtherDerived>& other);
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    Derived& operator=(const ReturnByValue<OtherDerived>& other);
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator+=(const MatrixBase<OtherDerived>& other);
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator-=(const MatrixBase<OtherDerived>& other);
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    const Product<Derived,OtherDerived>
+    operator*(const MatrixBase<OtherDerived> &other) const;
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    const Product<Derived,OtherDerived,LazyProduct>
+    lazyProduct(const MatrixBase<OtherDerived> &other) const;
+
+    template<typename OtherDerived>
+    Derived& operator*=(const EigenBase<OtherDerived>& other);
+
+    template<typename OtherDerived>
+    void applyOnTheLeft(const EigenBase<OtherDerived>& other);
+
+    template<typename OtherDerived>
+    void applyOnTheRight(const EigenBase<OtherDerived>& other);
+
+    template<typename DiagonalDerived>
+    EIGEN_DEVICE_FUNC
+    const Product<Derived, DiagonalDerived, LazyProduct>
+    operator*(const DiagonalBase<DiagonalDerived> &diagonal) const;
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
+    dot(const MatrixBase<OtherDerived>& other) const;
+
+    EIGEN_DEVICE_FUNC RealScalar squaredNorm() const;
+    EIGEN_DEVICE_FUNC RealScalar norm() const;
+    RealScalar stableNorm() const;
+    RealScalar blueNorm() const;
+    RealScalar hypotNorm() const;
+    EIGEN_DEVICE_FUNC const PlainObject normalized() const;
+    EIGEN_DEVICE_FUNC const PlainObject stableNormalized() const;
+    EIGEN_DEVICE_FUNC void normalize();
+    EIGEN_DEVICE_FUNC void stableNormalize();
+
+    EIGEN_DEVICE_FUNC const AdjointReturnType adjoint() const;
+    EIGEN_DEVICE_FUNC void adjointInPlace();
+
+    typedef Diagonal<Derived> DiagonalReturnType;
+    EIGEN_DEVICE_FUNC
+    DiagonalReturnType diagonal();
+
+    typedef typename internal::add_const<Diagonal<const Derived> >::type ConstDiagonalReturnType;
+    EIGEN_DEVICE_FUNC
+    ConstDiagonalReturnType diagonal() const;
+
+    template<int Index> struct DiagonalIndexReturnType { typedef Diagonal<Derived,Index> Type; };
+    template<int Index> struct ConstDiagonalIndexReturnType { typedef const Diagonal<const Derived,Index> Type; };
+
+    template<int Index>
+    EIGEN_DEVICE_FUNC
+    typename DiagonalIndexReturnType<Index>::Type diagonal();
+
+    template<int Index>
+    EIGEN_DEVICE_FUNC
+    typename ConstDiagonalIndexReturnType<Index>::Type diagonal() const;
+
+    typedef Diagonal<Derived,DynamicIndex> DiagonalDynamicIndexReturnType;
+    typedef typename internal::add_const<Diagonal<const Derived,DynamicIndex> >::type ConstDiagonalDynamicIndexReturnType;
+
+    EIGEN_DEVICE_FUNC
+    DiagonalDynamicIndexReturnType diagonal(Index index);
+    EIGEN_DEVICE_FUNC
+    ConstDiagonalDynamicIndexReturnType diagonal(Index index) const;
+
+    template<unsigned int Mode> struct TriangularViewReturnType { typedef TriangularView<Derived, Mode> Type; };
+    template<unsigned int Mode> struct ConstTriangularViewReturnType { typedef const TriangularView<const Derived, Mode> Type; };
+
+    template<unsigned int Mode>
+    EIGEN_DEVICE_FUNC
+    typename TriangularViewReturnType<Mode>::Type triangularView();
+    template<unsigned int Mode>
+    EIGEN_DEVICE_FUNC
+    typename ConstTriangularViewReturnType<Mode>::Type triangularView() const;
+
+    template<unsigned int UpLo> struct SelfAdjointViewReturnType { typedef SelfAdjointView<Derived, UpLo> Type; };
+    template<unsigned int UpLo> struct ConstSelfAdjointViewReturnType { typedef const SelfAdjointView<const Derived, UpLo> Type; };
+
+    template<unsigned int UpLo>
+    EIGEN_DEVICE_FUNC
+    typename SelfAdjointViewReturnType<UpLo>::Type selfadjointView();
+    template<unsigned int UpLo>
+    EIGEN_DEVICE_FUNC
+    typename ConstSelfAdjointViewReturnType<UpLo>::Type selfadjointView() const;
+
+    const SparseView<Derived> sparseView(const Scalar& m_reference = Scalar(0),
+                                         const typename NumTraits<Scalar>::Real& m_epsilon = NumTraits<Scalar>::dummy_precision()) const;
+    EIGEN_DEVICE_FUNC static const IdentityReturnType Identity();
+    EIGEN_DEVICE_FUNC static const IdentityReturnType Identity(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC static const BasisReturnType Unit(Index size, Index i);
+    EIGEN_DEVICE_FUNC static const BasisReturnType Unit(Index i);
+    EIGEN_DEVICE_FUNC static const BasisReturnType UnitX();
+    EIGEN_DEVICE_FUNC static const BasisReturnType UnitY();
+    EIGEN_DEVICE_FUNC static const BasisReturnType UnitZ();
+    EIGEN_DEVICE_FUNC static const BasisReturnType UnitW();
+
+    EIGEN_DEVICE_FUNC
+    const DiagonalWrapper<const Derived> asDiagonal() const;
+    const PermutationWrapper<const Derived> asPermutation() const;
+
+    EIGEN_DEVICE_FUNC
+    Derived& setIdentity();
+    EIGEN_DEVICE_FUNC
+    Derived& setIdentity(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setUnit(Index i);
+    EIGEN_DEVICE_FUNC Derived& setUnit(Index newSize, Index i);
+
+    bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    bool isDiagonal(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+
+    bool isUpperTriangular(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    bool isLowerTriangular(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+
+    template<typename OtherDerived>
+    bool isOrthogonal(const MatrixBase<OtherDerived>& other,
+                      const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    bool isUnitary(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+
+    /** \returns true if each coefficients of \c *this and \a other are all exactly equal.
+      * \warning When using floating point scalar values you probably should rather use a
+      *          fuzzy comparison such as isApprox()
+      * \sa isApprox(), operator!= */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC inline bool operator==(const MatrixBase<OtherDerived>& other) const
+    { return cwiseEqual(other).all(); }
+
+    /** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other.
+      * \warning When using floating point scalar values you probably should rather use a
+      *          fuzzy comparison such as isApprox()
+      * \sa isApprox(), operator== */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase<OtherDerived>& other) const
+    { return cwiseNotEqual(other).any(); }
+
+    NoAlias<Derived,Eigen::MatrixBase > EIGEN_DEVICE_FUNC noalias();
+
+    // TODO forceAlignedAccess is temporarily disabled
+    // Need to find a nicer workaround.
+    inline const Derived& forceAlignedAccess() const { return derived(); }
+    inline Derived& forceAlignedAccess() { return derived(); }
+    template<bool Enable> inline const Derived& forceAlignedAccessIf() const { return derived(); }
+    template<bool Enable> inline Derived& forceAlignedAccessIf() { return derived(); }
+
+    EIGEN_DEVICE_FUNC Scalar trace() const;
+
+    template<int p> EIGEN_DEVICE_FUNC RealScalar lpNorm() const;
+
+    EIGEN_DEVICE_FUNC MatrixBase<Derived>& matrix() { return *this; }
+    EIGEN_DEVICE_FUNC const MatrixBase<Derived>& matrix() const { return *this; }
+
+    /** \returns an \link Eigen::ArrayBase Array \endlink expression of this matrix
+      * \sa ArrayBase::matrix() */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ArrayWrapper<Derived> array() { return ArrayWrapper<Derived>(derived()); }
+    /** \returns a const \link Eigen::ArrayBase Array \endlink expression of this matrix
+      * \sa ArrayBase::matrix() */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const ArrayWrapper<const Derived> array() const { return ArrayWrapper<const Derived>(derived()); }
+
+/////////// LU module ///////////
+
+    inline const FullPivLU<PlainObject> fullPivLu() const;
+    inline const PartialPivLU<PlainObject> partialPivLu() const;
+
+    inline const PartialPivLU<PlainObject> lu() const;
+
+    EIGEN_DEVICE_FUNC
+    inline const Inverse<Derived> inverse() const;
+
+    template<typename ResultType>
+    inline void computeInverseAndDetWithCheck(
+      ResultType& inverse,
+      typename ResultType::Scalar& determinant,
+      bool& invertible,
+      const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
+    ) const;
+
+    template<typename ResultType>
+    inline void computeInverseWithCheck(
+      ResultType& inverse,
+      bool& invertible,
+      const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
+    ) const;
+
+    EIGEN_DEVICE_FUNC
+    Scalar determinant() const;
+
+/////////// Cholesky module ///////////
+
+    inline const LLT<PlainObject>  llt() const;
+    inline const LDLT<PlainObject> ldlt() const;
+
+/////////// QR module ///////////
+
+    inline const HouseholderQR<PlainObject> householderQr() const;
+    inline const ColPivHouseholderQR<PlainObject> colPivHouseholderQr() const;
+    inline const FullPivHouseholderQR<PlainObject> fullPivHouseholderQr() const;
+    inline const CompleteOrthogonalDecomposition<PlainObject> completeOrthogonalDecomposition() const;
+
+/////////// Eigenvalues module ///////////
+
+    inline EigenvaluesReturnType eigenvalues() const;
+    inline RealScalar operatorNorm() const;
+
+/////////// SVD module ///////////
+
+    inline JacobiSVD<PlainObject> jacobiSvd(unsigned int computationOptions = 0) const;
+    inline BDCSVD<PlainObject>    bdcSvd(unsigned int computationOptions = 0) const;
+
+/////////// Geometry module ///////////
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /// \internal helper struct to form the return type of the cross product
+    template<typename OtherDerived> struct cross_product_return_type {
+      typedef typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType Scalar;
+      typedef Matrix<Scalar,MatrixBase::RowsAtCompileTime,MatrixBase::ColsAtCompileTime> type;
+    };
+    #endif // EIGEN_PARSED_BY_DOXYGEN
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    inline typename cross_product_return_type<OtherDerived>::type
+#else
+    inline PlainObject
+#endif
+    cross(const MatrixBase<OtherDerived>& other) const;
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    inline PlainObject cross3(const MatrixBase<OtherDerived>& other) const;
+
+    EIGEN_DEVICE_FUNC
+    inline PlainObject unitOrthogonal(void) const;
+
+    EIGEN_DEVICE_FUNC
+    inline Matrix<Scalar,3,1> eulerAngles(Index a0, Index a1, Index a2) const;
+
+    // put this as separate enum value to work around possible GCC 4.3 bug (?)
+    enum { HomogeneousReturnTypeDirection = ColsAtCompileTime==1&&RowsAtCompileTime==1 ? ((internal::traits<Derived>::Flags&RowMajorBit)==RowMajorBit ? Horizontal : Vertical)
+                                          : ColsAtCompileTime==1 ? Vertical : Horizontal };
+    typedef Homogeneous<Derived, HomogeneousReturnTypeDirection> HomogeneousReturnType;
+    EIGEN_DEVICE_FUNC
+    inline HomogeneousReturnType homogeneous() const;
+
+    enum {
+      SizeMinusOne = SizeAtCompileTime==Dynamic ? Dynamic : SizeAtCompileTime-1
+    };
+    typedef Block<const Derived,
+                  internal::traits<Derived>::ColsAtCompileTime==1 ? SizeMinusOne : 1,
+                  internal::traits<Derived>::ColsAtCompileTime==1 ? 1 : SizeMinusOne> ConstStartMinusOne;
+    typedef EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(ConstStartMinusOne,Scalar,quotient) HNormalizedReturnType;
+    EIGEN_DEVICE_FUNC
+    inline const HNormalizedReturnType hnormalized() const;
+
+////////// Householder module ///////////
+
+    EIGEN_DEVICE_FUNC
+    void makeHouseholderInPlace(Scalar& tau, RealScalar& beta);
+    template<typename EssentialPart>
+    EIGEN_DEVICE_FUNC
+    void makeHouseholder(EssentialPart& essential,
+                         Scalar& tau, RealScalar& beta) const;
+    template<typename EssentialPart>
+    EIGEN_DEVICE_FUNC
+    void applyHouseholderOnTheLeft(const EssentialPart& essential,
+                                   const Scalar& tau,
+                                   Scalar* workspace);
+    template<typename EssentialPart>
+    EIGEN_DEVICE_FUNC
+    void applyHouseholderOnTheRight(const EssentialPart& essential,
+                                    const Scalar& tau,
+                                    Scalar* workspace);
+
+///////// Jacobi module /////////
+
+    template<typename OtherScalar>
+    EIGEN_DEVICE_FUNC
+    void applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j);
+    template<typename OtherScalar>
+    EIGEN_DEVICE_FUNC
+    void applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j);
+
+///////// SparseCore module /////////
+
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE const typename SparseMatrixBase<OtherDerived>::template CwiseProductDenseReturnType<Derived>::Type
+    cwiseProduct(const SparseMatrixBase<OtherDerived> &other) const
+    {
+      return other.cwiseProduct(derived());
+    }
+
+///////// MatrixFunctions module /////////
+
+    typedef typename internal::stem_function<Scalar>::type StemFunction;
+#define EIGEN_MATRIX_FUNCTION(ReturnType, Name, Description) \
+    /** \returns an expression of the matrix Description of \c *this. \brief This function requires the <a href="unsupported/group__MatrixFunctions__Module.html"> unsupported MatrixFunctions module</a>. To compute the coefficient-wise Description use ArrayBase::##Name . */ \
+    const ReturnType<Derived> Name() const;
+#define EIGEN_MATRIX_FUNCTION_1(ReturnType, Name, Description, Argument) \
+    /** \returns an expression of the matrix Description of \c *this. \brief This function requires the <a href="unsupported/group__MatrixFunctions__Module.html"> unsupported MatrixFunctions module</a>. To compute the coefficient-wise Description use ArrayBase::##Name . */ \
+    const ReturnType<Derived> Name(Argument) const;
+
+    EIGEN_MATRIX_FUNCTION(MatrixExponentialReturnValue, exp, exponential)
+    /** \brief Helper function for the <a href="unsupported/group__MatrixFunctions__Module.html"> unsupported MatrixFunctions module</a>.*/
+    const MatrixFunctionReturnValue<Derived> matrixFunction(StemFunction f) const;
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cosh, hyperbolic cosine)
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sinh, hyperbolic sine)
+#if EIGEN_HAS_CXX11_MATH
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, atanh, inverse hyperbolic cosine)
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, acosh, inverse hyperbolic cosine)
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, asinh, inverse hyperbolic sine)
+#endif
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cos, cosine)
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sin, sine)
+    EIGEN_MATRIX_FUNCTION(MatrixSquareRootReturnValue, sqrt, square root)
+    EIGEN_MATRIX_FUNCTION(MatrixLogarithmReturnValue, log, logarithm)
+    EIGEN_MATRIX_FUNCTION_1(MatrixPowerReturnValue,        pow, power to \c p, const RealScalar& p)
+    EIGEN_MATRIX_FUNCTION_1(MatrixComplexPowerReturnValue, pow, power to \c p, const std::complex<RealScalar>& p)
+
+  protected:
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(MatrixBase)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MatrixBase)
+
+  private:
+    EIGEN_DEVICE_FUNC explicit MatrixBase(int);
+    EIGEN_DEVICE_FUNC MatrixBase(int,int);
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC explicit MatrixBase(const MatrixBase<OtherDerived>&);
+  protected:
+    // mixing arrays and matrices is not legal
+    template<typename OtherDerived> Derived& operator+=(const ArrayBase<OtherDerived>& )
+    {EIGEN_STATIC_ASSERT(std::ptrdiff_t(sizeof(typename OtherDerived::Scalar))==-1,YOU_CANNOT_MIX_ARRAYS_AND_MATRICES); return *this;}
+    // mixing arrays and matrices is not legal
+    template<typename OtherDerived> Derived& operator-=(const ArrayBase<OtherDerived>& )
+    {EIGEN_STATIC_ASSERT(std::ptrdiff_t(sizeof(typename OtherDerived::Scalar))==-1,YOU_CANNOT_MIX_ARRAYS_AND_MATRICES); return *this;}
+};
+
+
+/***************************************************************************
+* Implementation of matrix base methods
+***************************************************************************/
+
+/** replaces \c *this by \c *this * \a other.
+  *
+  * \returns a reference to \c *this
+  *
+  * Example: \include MatrixBase_applyOnTheRight.cpp
+  * Output: \verbinclude MatrixBase_applyOnTheRight.out
+  */
+template<typename Derived>
+template<typename OtherDerived>
+inline Derived&
+MatrixBase<Derived>::operator*=(const EigenBase<OtherDerived> &other)
+{
+  other.derived().applyThisOnTheRight(derived());
+  return derived();
+}
+
+/** replaces \c *this by \c *this * \a other. It is equivalent to MatrixBase::operator*=().
+  *
+  * Example: \include MatrixBase_applyOnTheRight.cpp
+  * Output: \verbinclude MatrixBase_applyOnTheRight.out
+  */
+template<typename Derived>
+template<typename OtherDerived>
+inline void MatrixBase<Derived>::applyOnTheRight(const EigenBase<OtherDerived> &other)
+{
+  other.derived().applyThisOnTheRight(derived());
+}
+
+/** replaces \c *this by \a other * \c *this.
+  *
+  * Example: \include MatrixBase_applyOnTheLeft.cpp
+  * Output: \verbinclude MatrixBase_applyOnTheLeft.out
+  */
+template<typename Derived>
+template<typename OtherDerived>
+inline void MatrixBase<Derived>::applyOnTheLeft(const EigenBase<OtherDerived> &other)
+{
+  other.derived().applyThisOnTheLeft(derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATRIXBASE_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/NestByValue.h b/src/3rdparty/eigen/Eigen/src/Core/NestByValue.h
new file mode 100644
index 000000000..b4275768a
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/NestByValue.h
@@ -0,0 +1,85 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NESTBYVALUE_H
+#define EIGEN_NESTBYVALUE_H
+
+namespace Eigen {
+
+namespace internal {
+template<typename ExpressionType>
+struct traits<NestByValue<ExpressionType> > : public traits<ExpressionType>
+{
+  enum {
+    Flags = traits<ExpressionType>::Flags & ~NestByRefBit
+  };
+};
+}
+
+/** \class NestByValue
+  * \ingroup Core_Module
+  *
+  * \brief Expression which must be nested by value
+  *
+  * \tparam ExpressionType the type of the object of which we are requiring nesting-by-value
+  *
+  * This class is the return type of MatrixBase::nestByValue()
+  * and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::nestByValue()
+  */
+template<typename ExpressionType> class NestByValue
+  : public internal::dense_xpr_base< NestByValue<ExpressionType> >::type
+{
+  public:
+
+    typedef typename internal::dense_xpr_base<NestByValue>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(NestByValue)
+
+    EIGEN_DEVICE_FUNC explicit inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {}
+
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
+
+    EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
+
+    EIGEN_DEVICE_FUNC const ExpressionType& nestedExpression() const { return m_expression; }
+
+  protected:
+    const ExpressionType m_expression;
+};
+
+/** \returns an expression of the temporary version of *this.
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC inline const NestByValue<Derived>
+DenseBase<Derived>::nestByValue() const
+{
+  return NestByValue<Derived>(derived());
+}
+
+namespace internal {
+
+// Evaluator of Solve -> eval into a temporary
+template<typename ArgType>
+struct evaluator<NestByValue<ArgType> >
+  : public evaluator<ArgType>
+{
+  typedef evaluator<ArgType> Base;
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const NestByValue<ArgType>& xpr)
+    : Base(xpr.nestedExpression())
+  {}
+};
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_NESTBYVALUE_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/NoAlias.h b/src/3rdparty/eigen/Eigen/src/Core/NoAlias.h
new file mode 100644
index 000000000..570283d90
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/NoAlias.h
@@ -0,0 +1,109 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NOALIAS_H
+#define EIGEN_NOALIAS_H
+
+namespace Eigen {
+
+/** \class NoAlias
+  * \ingroup Core_Module
+  *
+  * \brief Pseudo expression providing an operator = assuming no aliasing
+  *
+  * \tparam ExpressionType the type of the object on which to do the lazy assignment
+  *
+  * This class represents an expression with special assignment operators
+  * assuming no aliasing between the target expression and the source expression.
+  * More precisely it alloas to bypass the EvalBeforeAssignBit flag of the source expression.
+  * It is the return type of MatrixBase::noalias()
+  * and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::noalias()
+  */
+template<typename ExpressionType, template <typename> class StorageBase>
+class NoAlias
+{
+  public:
+    typedef typename ExpressionType::Scalar Scalar;
+    
+    EIGEN_DEVICE_FUNC
+    explicit NoAlias(ExpressionType& expression) : m_expression(expression) {}
+    
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE ExpressionType& operator=(const StorageBase<OtherDerived>& other)
+    {
+      call_assignment_no_alias(m_expression, other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
+      return m_expression;
+    }
+    
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE ExpressionType& operator+=(const StorageBase<OtherDerived>& other)
+    {
+      call_assignment_no_alias(m_expression, other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
+      return m_expression;
+    }
+    
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE ExpressionType& operator-=(const StorageBase<OtherDerived>& other)
+    {
+      call_assignment_no_alias(m_expression, other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
+      return m_expression;
+    }
+
+    EIGEN_DEVICE_FUNC
+    ExpressionType& expression() const
+    {
+      return m_expression;
+    }
+
+  protected:
+    ExpressionType& m_expression;
+};
+
+/** \returns a pseudo expression of \c *this with an operator= assuming
+  * no aliasing between \c *this and the source expression.
+  *
+  * More precisely, noalias() allows to bypass the EvalBeforeAssignBit flag.
+  * Currently, even though several expressions may alias, only product
+  * expressions have this flag. Therefore, noalias() is only useful when
+  * the source expression contains a matrix product.
+  *
+  * Here are some examples where noalias is useful:
+  * \code
+  * D.noalias()  = A * B;
+  * D.noalias() += A.transpose() * B;
+  * D.noalias() -= 2 * A * B.adjoint();
+  * \endcode
+  *
+  * On the other hand the following example will lead to a \b wrong result:
+  * \code
+  * A.noalias() = A * B;
+  * \endcode
+  * because the result matrix A is also an operand of the matrix product. Therefore,
+  * there is no alternative than evaluating A * B in a temporary, that is the default
+  * behavior when you write:
+  * \code
+  * A = A * B;
+  * \endcode
+  *
+  * \sa class NoAlias
+  */
+template<typename Derived>
+NoAlias<Derived,MatrixBase> EIGEN_DEVICE_FUNC MatrixBase<Derived>::noalias()
+{
+  return NoAlias<Derived, Eigen::MatrixBase >(derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_NOALIAS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/NumTraits.h b/src/3rdparty/eigen/Eigen/src/Core/NumTraits.h
new file mode 100644
index 000000000..72eac5a93
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/NumTraits.h
@@ -0,0 +1,335 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NUMTRAITS_H
+#define EIGEN_NUMTRAITS_H
+
+namespace Eigen {
+
+namespace internal {
+
+// default implementation of digits10(), based on numeric_limits if specialized,
+// 0 for integer types, and log10(epsilon()) otherwise.
+template< typename T,
+          bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
+          bool is_integer = NumTraits<T>::IsInteger>
+struct default_digits10_impl
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static int run() { return std::numeric_limits<T>::digits10; }
+};
+
+template<typename T>
+struct default_digits10_impl<T,false,false> // Floating point
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static int run() {
+    using std::log10;
+    using std::ceil;
+    typedef typename NumTraits<T>::Real Real;
+    return int(ceil(-log10(NumTraits<Real>::epsilon())));
+  }
+};
+
+template<typename T>
+struct default_digits10_impl<T,false,true> // Integer
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static int run() { return 0; }
+};
+
+
+// default implementation of digits(), based on numeric_limits if specialized,
+// 0 for integer types, and log2(epsilon()) otherwise.
+template< typename T,
+          bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
+          bool is_integer = NumTraits<T>::IsInteger>
+struct default_digits_impl
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static int run() { return std::numeric_limits<T>::digits; }
+};
+
+template<typename T>
+struct default_digits_impl<T,false,false> // Floating point
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static int run() {
+    using std::log;
+    using std::ceil;
+    typedef typename NumTraits<T>::Real Real;
+    return int(ceil(-log(NumTraits<Real>::epsilon())/log(static_cast<Real>(2))));
+  }
+};
+
+template<typename T>
+struct default_digits_impl<T,false,true> // Integer
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static int run() { return 0; }
+};
+
+} // end namespace internal
+
+namespace numext {
+/** \internal bit-wise cast without changing the underlying bit representation. */
+
+// TODO: Replace by std::bit_cast (available in C++20)
+template <typename Tgt, typename Src>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src& src) {
+#if EIGEN_HAS_TYPE_TRAITS
+  // The behaviour of memcpy is not specified for non-trivially copyable types
+  EIGEN_STATIC_ASSERT(std::is_trivially_copyable<Src>::value, THIS_TYPE_IS_NOT_SUPPORTED);
+  EIGEN_STATIC_ASSERT(std::is_trivially_copyable<Tgt>::value && std::is_default_constructible<Tgt>::value,
+                      THIS_TYPE_IS_NOT_SUPPORTED);
+#endif
+
+  EIGEN_STATIC_ASSERT(sizeof(Src) == sizeof(Tgt), THIS_TYPE_IS_NOT_SUPPORTED);
+  Tgt tgt;
+  EIGEN_USING_STD(memcpy)
+  memcpy(&tgt, &src, sizeof(Tgt));
+  return tgt;
+}
+}  // namespace numext
+
+/** \class NumTraits
+  * \ingroup Core_Module
+  *
+  * \brief Holds information about the various numeric (i.e. scalar) types allowed by Eigen.
+  *
+  * \tparam T the numeric type at hand
+  *
+  * This class stores enums, typedefs and static methods giving information about a numeric type.
+  *
+  * The provided data consists of:
+  * \li A typedef \c Real, giving the "real part" type of \a T. If \a T is already real,
+  *     then \c Real is just a typedef to \a T. If \a T is \c std::complex<U> then \c Real
+  *     is a typedef to \a U.
+  * \li A typedef \c NonInteger, giving the type that should be used for operations producing non-integral values,
+  *     such as quotients, square roots, etc. If \a T is a floating-point type, then this typedef just gives
+  *     \a T again. Note however that many Eigen functions such as internal::sqrt simply refuse to
+  *     take integers. Outside of a few cases, Eigen doesn't do automatic type promotion. Thus, this typedef is
+  *     only intended as a helper for code that needs to explicitly promote types.
+  * \li A typedef \c Literal giving the type to use for numeric literals such as "2" or "0.5". For instance, for \c std::complex<U>, Literal is defined as \c U.
+  *     Of course, this type must be fully compatible with \a T. In doubt, just use \a T here.
+  * \li A typedef \a Nested giving the type to use to nest a value inside of the expression tree. If you don't know what
+  *     this means, just use \a T here.
+  * \li An enum value \a IsComplex. It is equal to 1 if \a T is a \c std::complex
+  *     type, and to 0 otherwise.
+  * \li An enum value \a IsInteger. It is equal to \c 1 if \a T is an integer type such as \c int,
+  *     and to \c 0 otherwise.
+  * \li Enum values ReadCost, AddCost and MulCost representing a rough estimate of the number of CPU cycles needed
+  *     to by move / add / mul instructions respectively, assuming the data is already stored in CPU registers.
+  *     Stay vague here. No need to do architecture-specific stuff. If you don't know what this means, just use \c Eigen::HugeCost.
+  * \li An enum value \a IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned.
+  * \li An enum value \a RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must
+  *     be called, and to 0 if it is safe not to call it. Default is 0 if \a T is an arithmetic type, and 1 otherwise.
+  * \li An epsilon() function which, unlike <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/epsilon">std::numeric_limits::epsilon()</a>,
+  *     it returns a \a Real instead of a \a T.
+  * \li A dummy_precision() function returning a weak epsilon value. It is mainly used as a default
+  *     value by the fuzzy comparison operators.
+  * \li highest() and lowest() functions returning the highest and lowest possible values respectively.
+  * \li digits() function returning the number of radix digits (non-sign digits for integers, mantissa for floating-point). This is
+  *     the analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits">std::numeric_limits<T>::digits</a>
+  *     which is used as the default implementation if specialized.
+  * \li digits10() function returning the number of decimal digits that can be represented without change. This is
+  *     the analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits10">std::numeric_limits<T>::digits10</a>
+  *     which is used as the default implementation if specialized.
+  * \li min_exponent() and max_exponent() functions returning the highest and lowest possible values, respectively,
+  *     such that the radix raised to the power exponent-1 is a normalized floating-point number.  These are equivalent to
+  *     <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/min_exponent">std::numeric_limits<T>::min_exponent</a>/
+  *     <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/max_exponent">std::numeric_limits<T>::max_exponent</a>.
+  * \li infinity() function returning a representation of positive infinity, if available.
+  * \li quiet_NaN function returning a non-signaling "not-a-number", if available.
+  */
+
+template<typename T> struct GenericNumTraits
+{
+  enum {
+    IsInteger = std::numeric_limits<T>::is_integer,
+    IsSigned = std::numeric_limits<T>::is_signed,
+    IsComplex = 0,
+    RequireInitialization = internal::is_arithmetic<T>::value ? 0 : 1,
+    ReadCost = 1,
+    AddCost = 1,
+    MulCost = 1
+  };
+
+  typedef T Real;
+  typedef typename internal::conditional<
+                     IsInteger,
+                     typename internal::conditional<sizeof(T)<=2, float, double>::type,
+                     T
+                   >::type NonInteger;
+  typedef T Nested;
+  typedef T Literal;
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline Real epsilon()
+  {
+    return numext::numeric_limits<T>::epsilon();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline int digits10()
+  {
+    return internal::default_digits10_impl<T>::run();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline int digits()
+  {
+    return internal::default_digits_impl<T>::run();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline int min_exponent()
+  {
+    return numext::numeric_limits<T>::min_exponent;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline int max_exponent()
+  {
+    return numext::numeric_limits<T>::max_exponent;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline Real dummy_precision()
+  {
+    // make sure to override this for floating-point types
+    return Real(0);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline T highest() {
+    return (numext::numeric_limits<T>::max)();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline T lowest()  {
+    return IsInteger ? (numext::numeric_limits<T>::min)()
+                     : static_cast<T>(-(numext::numeric_limits<T>::max)());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline T infinity() {
+    return numext::numeric_limits<T>::infinity();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline T quiet_NaN() {
+    return numext::numeric_limits<T>::quiet_NaN();
+  }
+};
+
+template<typename T> struct NumTraits : GenericNumTraits<T>
+{};
+
+template<> struct NumTraits<float>
+  : GenericNumTraits<float>
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline float dummy_precision() { return 1e-5f; }
+};
+
+template<> struct NumTraits<double> : GenericNumTraits<double>
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline double dummy_precision() { return 1e-12; }
+};
+
+template<> struct NumTraits<long double>
+  : GenericNumTraits<long double>
+{
+  EIGEN_CONSTEXPR
+  static inline long double dummy_precision() { return 1e-15l; }
+};
+
+template<typename _Real> struct NumTraits<std::complex<_Real> >
+  : GenericNumTraits<std::complex<_Real> >
+{
+  typedef _Real Real;
+  typedef typename NumTraits<_Real>::Literal Literal;
+  enum {
+    IsComplex = 1,
+    RequireInitialization = NumTraits<_Real>::RequireInitialization,
+    ReadCost = 2 * NumTraits<_Real>::ReadCost,
+    AddCost = 2 * NumTraits<Real>::AddCost,
+    MulCost = 4 * NumTraits<Real>::MulCost + 2 * NumTraits<Real>::AddCost
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline Real epsilon() { return NumTraits<Real>::epsilon(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline Real dummy_precision() { return NumTraits<Real>::dummy_precision(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline int digits10() { return NumTraits<Real>::digits10(); }
+};
+
+template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
+{
+  typedef Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> ArrayType;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Array<RealScalar, Rows, Cols, Options, MaxRows, MaxCols> Real;
+  typedef typename NumTraits<Scalar>::NonInteger NonIntegerScalar;
+  typedef Array<NonIntegerScalar, Rows, Cols, Options, MaxRows, MaxCols> NonInteger;
+  typedef ArrayType & Nested;
+  typedef typename NumTraits<Scalar>::Literal Literal;
+
+  enum {
+    IsComplex = NumTraits<Scalar>::IsComplex,
+    IsInteger = NumTraits<Scalar>::IsInteger,
+    IsSigned  = NumTraits<Scalar>::IsSigned,
+    RequireInitialization = 1,
+    ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits<Scalar>::ReadCost),
+    AddCost  = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits<Scalar>::AddCost),
+    MulCost  = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits<Scalar>::MulCost)
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }
+
+  EIGEN_CONSTEXPR
+  static inline int digits10() { return NumTraits<Scalar>::digits10(); }
+};
+
+template<> struct NumTraits<std::string>
+  : GenericNumTraits<std::string>
+{
+  enum {
+    RequireInitialization = 1,
+    ReadCost = HugeCost,
+    AddCost  = HugeCost,
+    MulCost  = HugeCost
+  };
+
+  EIGEN_CONSTEXPR
+  static inline int digits10() { return 0; }
+
+private:
+  static inline std::string epsilon();
+  static inline std::string dummy_precision();
+  static inline std::string lowest();
+  static inline std::string highest();
+  static inline std::string infinity();
+  static inline std::string quiet_NaN();
+};
+
+// Empty specialization for void to allow template specialization based on NumTraits<T>::Real with T==void and SFINAE.
+template<> struct NumTraits<void> {};
+
+template<> struct NumTraits<bool> : GenericNumTraits<bool> {};
+
+} // end namespace Eigen
+
+#endif // EIGEN_NUMTRAITS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/PartialReduxEvaluator.h b/src/3rdparty/eigen/Eigen/src/Core/PartialReduxEvaluator.h
new file mode 100644
index 000000000..29abf35b9
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/PartialReduxEvaluator.h
@@ -0,0 +1,232 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PARTIALREDUX_H
+#define EIGEN_PARTIALREDUX_H
+
+namespace Eigen { 
+
+namespace internal {
+
+
+/***************************************************************************
+*
+* This file provides evaluators for partial reductions.
+* There are two modes:
+*
+*  - scalar path: simply calls the respective function on the column or row.
+*    -> nothing special here, all the tricky part is handled by the return
+*       types of VectorwiseOp's members. They embed the functor calling the
+*       respective DenseBase's member function.
+*
+*  - vectorized path: implements a packet-wise reductions followed by
+*    some (optional) processing of the outcome, e.g., division by n for mean.
+*
+* For the vectorized path let's observe that the packet-size and outer-unrolling
+* are both decided by the assignement logic. So all we have to do is to decide
+* on the inner unrolling.
+*
+* For the unrolling, we can reuse "internal::redux_vec_unroller" from Redux.h,
+* but be need to be careful to specify correct increment.
+*
+***************************************************************************/
+
+
+/* logic deciding a strategy for unrolling of vectorized paths */
+template<typename Func, typename Evaluator>
+struct packetwise_redux_traits
+{
+  enum {
+    OuterSize = int(Evaluator::IsRowMajor) ? Evaluator::RowsAtCompileTime : Evaluator::ColsAtCompileTime,
+    Cost = OuterSize == Dynamic ? HugeCost
+         : OuterSize * Evaluator::CoeffReadCost + (OuterSize-1) * functor_traits<Func>::Cost,
+    Unrolling = Cost <= EIGEN_UNROLLING_LIMIT ? CompleteUnrolling : NoUnrolling
+  };
+
+};
+
+/* Value to be returned when size==0 , by default let's return 0 */
+template<typename PacketType,typename Func>
+EIGEN_DEVICE_FUNC
+PacketType packetwise_redux_empty_value(const Func& ) { return pset1<PacketType>(0); }
+
+/* For products the default is 1 */
+template<typename PacketType,typename Scalar>
+EIGEN_DEVICE_FUNC
+PacketType packetwise_redux_empty_value(const scalar_product_op<Scalar,Scalar>& ) { return pset1<PacketType>(1); }
+
+/* Perform the actual reduction */
+template<typename Func, typename Evaluator,
+         int Unrolling = packetwise_redux_traits<Func, Evaluator>::Unrolling
+>
+struct packetwise_redux_impl;
+
+/* Perform the actual reduction with unrolling */
+template<typename Func, typename Evaluator>
+struct packetwise_redux_impl<Func, Evaluator, CompleteUnrolling>
+{
+  typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
+  typedef typename Evaluator::Scalar Scalar;
+
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  PacketType run(const Evaluator &eval, const Func& func, Index /*size*/)
+  {
+    return redux_vec_unroller<Func, Evaluator, 0, packetwise_redux_traits<Func, Evaluator>::OuterSize>::template run<PacketType>(eval,func);
+  }
+};
+
+/* Add a specialization of redux_vec_unroller for size==0 at compiletime.
+ * This specialization is not required for general reductions, which is
+ * why it is defined here.
+ */
+template<typename Func, typename Evaluator, int Start>
+struct redux_vec_unroller<Func, Evaluator, Start, 0>
+{
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE PacketType run(const Evaluator &, const Func& f)
+  {
+    return packetwise_redux_empty_value<PacketType>(f);
+  }
+};
+
+/* Perform the actual reduction for dynamic sizes */
+template<typename Func, typename Evaluator>
+struct packetwise_redux_impl<Func, Evaluator, NoUnrolling>
+{
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
+
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC
+  static PacketType run(const Evaluator &eval, const Func& func, Index size)
+  {
+    if(size==0)
+      return packetwise_redux_empty_value<PacketType>(func);
+    
+    const Index size4 = (size-1)&(~3);
+    PacketType p = eval.template packetByOuterInner<Unaligned,PacketType>(0,0);
+    Index i = 1;
+    // This loop is optimized for instruction pipelining:
+    // - each iteration generates two independent instructions
+    // - thanks to branch prediction and out-of-order execution we have independent instructions across loops
+    for(; i<size4; i+=4)
+      p = func.packetOp(p,
+            func.packetOp(
+              func.packetOp(eval.template packetByOuterInner<Unaligned,PacketType>(i+0,0),eval.template packetByOuterInner<Unaligned,PacketType>(i+1,0)),
+              func.packetOp(eval.template packetByOuterInner<Unaligned,PacketType>(i+2,0),eval.template packetByOuterInner<Unaligned,PacketType>(i+3,0))));
+    for(; i<size; ++i)
+      p = func.packetOp(p, eval.template packetByOuterInner<Unaligned,PacketType>(i,0));
+    return p;
+  }
+};
+
+template< typename ArgType, typename MemberOp, int Direction>
+struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
+  : evaluator_base<PartialReduxExpr<ArgType, MemberOp, Direction> >
+{
+  typedef PartialReduxExpr<ArgType, MemberOp, Direction> XprType;
+  typedef typename internal::nested_eval<ArgType,1>::type ArgTypeNested;
+  typedef typename internal::add_const_on_value_type<ArgTypeNested>::type ConstArgTypeNested;
+  typedef typename internal::remove_all<ArgTypeNested>::type ArgTypeNestedCleaned;
+  typedef typename ArgType::Scalar InputScalar;
+  typedef typename XprType::Scalar Scalar;
+  enum {
+    TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) :  int(ArgType::ColsAtCompileTime)
+  };
+  typedef typename MemberOp::template Cost<int(TraversalSize)> CostOpType;
+  enum {
+    CoeffReadCost = TraversalSize==Dynamic ? HugeCost
+                  : TraversalSize==0 ? 1
+                  : int(TraversalSize) * int(evaluator<ArgType>::CoeffReadCost) + int(CostOpType::value),
+    
+    _ArgFlags = evaluator<ArgType>::Flags,
+
+    _Vectorizable =  bool(int(_ArgFlags)&PacketAccessBit)
+                  && bool(MemberOp::Vectorizable)
+                  && (Direction==int(Vertical) ? bool(_ArgFlags&RowMajorBit) : (_ArgFlags&RowMajorBit)==0)
+                  && (TraversalSize!=0),
+                  
+    Flags = (traits<XprType>::Flags&RowMajorBit)
+          | (evaluator<ArgType>::Flags&(HereditaryBits&(~RowMajorBit)))
+          | (_Vectorizable ? PacketAccessBit : 0)
+          | LinearAccessBit,
+    
+    Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized
+  };
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr)
+    : m_arg(xpr.nestedExpression()), m_functor(xpr.functor())
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? HugeCost : (TraversalSize==0 ? 1 : int(CostOpType::value)));
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const Scalar coeff(Index i, Index j) const
+  {
+    return coeff(Direction==Vertical ? j : i);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const Scalar coeff(Index index) const
+  {
+    return m_functor(m_arg.template subVector<DirectionType(Direction)>(index));
+  }
+
+  template<int LoadMode,typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  PacketType packet(Index i, Index j) const
+  {
+    return packet<LoadMode,PacketType>(Direction==Vertical ? j : i);
+  }
+  
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+  PacketType packet(Index idx) const
+  {
+    enum { PacketSize = internal::unpacket_traits<PacketType>::size };
+    typedef Block<const ArgTypeNestedCleaned,
+                  Direction==Vertical ? int(ArgType::RowsAtCompileTime) : int(PacketSize),
+                  Direction==Vertical ? int(PacketSize) : int(ArgType::ColsAtCompileTime),
+                  true /* InnerPanel */> PanelType;
+    
+    PanelType panel(m_arg,
+                    Direction==Vertical ? 0 : idx,
+                    Direction==Vertical ? idx : 0,
+                    Direction==Vertical ? m_arg.rows() : Index(PacketSize),
+                    Direction==Vertical ? Index(PacketSize) : m_arg.cols());
+
+    // FIXME
+    // See bug 1612, currently if PacketSize==1 (i.e. complex<double> with 128bits registers) then the storage-order of panel get reversed
+    // and methods like packetByOuterInner do not make sense anymore in this context.
+    // So let's just by pass "vectorization" in this case:
+    if(PacketSize==1)
+      return internal::pset1<PacketType>(coeff(idx));
+    
+    typedef typename internal::redux_evaluator<PanelType> PanelEvaluator;
+    PanelEvaluator panel_eval(panel);
+    typedef typename MemberOp::BinaryOp BinaryOp;
+    PacketType p = internal::packetwise_redux_impl<BinaryOp,PanelEvaluator>::template run<PacketType>(panel_eval,m_functor.binaryFunc(),m_arg.outerSize());
+    return p;
+  }
+
+protected:
+  ConstArgTypeNested m_arg;
+  const MemberOp m_functor;
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PARTIALREDUX_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/PermutationMatrix.h b/src/3rdparty/eigen/Eigen/src/Core/PermutationMatrix.h
new file mode 100644
index 000000000..69401bf41
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/PermutationMatrix.h
@@ -0,0 +1,605 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2009-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PERMUTATIONMATRIX_H
+#define EIGEN_PERMUTATIONMATRIX_H
+
+namespace Eigen { 
+
+namespace internal {
+
+enum PermPermProduct_t {PermPermProduct};
+
+} // end namespace internal
+
+/** \class PermutationBase
+  * \ingroup Core_Module
+  *
+  * \brief Base class for permutations
+  *
+  * \tparam Derived the derived class
+  *
+  * This class is the base class for all expressions representing a permutation matrix,
+  * internally stored as a vector of integers.
+  * The convention followed here is that if \f$ \sigma \f$ is a permutation, the corresponding permutation matrix
+  * \f$ P_\sigma \f$ is such that if \f$ (e_1,\ldots,e_p) \f$ is the canonical basis, we have:
+  *  \f[ P_\sigma(e_i) = e_{\sigma(i)}. \f]
+  * This convention ensures that for any two permutations \f$ \sigma, \tau \f$, we have:
+  *  \f[ P_{\sigma\circ\tau} = P_\sigma P_\tau. \f]
+  *
+  * Permutation matrices are square and invertible.
+  *
+  * Notice that in addition to the member functions and operators listed here, there also are non-member
+  * operator* to multiply any kind of permutation object with any kind of matrix expression (MatrixBase)
+  * on either side.
+  *
+  * \sa class PermutationMatrix, class PermutationWrapper
+  */
+template<typename Derived>
+class PermutationBase : public EigenBase<Derived>
+{
+    typedef internal::traits<Derived> Traits;
+    typedef EigenBase<Derived> Base;
+  public:
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    typedef typename Traits::IndicesType IndicesType;
+    enum {
+      Flags = Traits::Flags,
+      RowsAtCompileTime = Traits::RowsAtCompileTime,
+      ColsAtCompileTime = Traits::ColsAtCompileTime,
+      MaxRowsAtCompileTime = Traits::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = Traits::MaxColsAtCompileTime
+    };
+    typedef typename Traits::StorageIndex StorageIndex;
+    typedef Matrix<StorageIndex,RowsAtCompileTime,ColsAtCompileTime,0,MaxRowsAtCompileTime,MaxColsAtCompileTime>
+            DenseMatrixType;
+    typedef PermutationMatrix<IndicesType::SizeAtCompileTime,IndicesType::MaxSizeAtCompileTime,StorageIndex>
+            PlainPermutationType;
+    typedef PlainPermutationType PlainObject;
+    using Base::derived;
+    typedef Inverse<Derived> InverseReturnType;
+    typedef void Scalar;
+    #endif
+
+    /** Copies the other permutation into *this */
+    template<typename OtherDerived>
+    Derived& operator=(const PermutationBase<OtherDerived>& other)
+    {
+      indices() = other.indices();
+      return derived();
+    }
+
+    /** Assignment from the Transpositions \a tr */
+    template<typename OtherDerived>
+    Derived& operator=(const TranspositionsBase<OtherDerived>& tr)
+    {
+      setIdentity(tr.size());
+      for(Index k=size()-1; k>=0; --k)
+        applyTranspositionOnTheRight(k,tr.coeff(k));
+      return derived();
+    }
+
+    /** \returns the number of rows */
+    inline EIGEN_DEVICE_FUNC Index rows() const { return Index(indices().size()); }
+
+    /** \returns the number of columns */
+    inline EIGEN_DEVICE_FUNC Index cols() const { return Index(indices().size()); }
+
+    /** \returns the size of a side of the respective square matrix, i.e., the number of indices */
+    inline EIGEN_DEVICE_FUNC Index size() const { return Index(indices().size()); }
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename DenseDerived>
+    void evalTo(MatrixBase<DenseDerived>& other) const
+    {
+      other.setZero();
+      for (Index i=0; i<rows(); ++i)
+        other.coeffRef(indices().coeff(i),i) = typename DenseDerived::Scalar(1);
+    }
+    #endif
+
+    /** \returns a Matrix object initialized from this permutation matrix. Notice that it
+      * is inefficient to return this Matrix object by value. For efficiency, favor using
+      * the Matrix constructor taking EigenBase objects.
+      */
+    DenseMatrixType toDenseMatrix() const
+    {
+      return derived();
+    }
+
+    /** const version of indices(). */
+    const IndicesType& indices() const { return derived().indices(); }
+    /** \returns a reference to the stored array representing the permutation. */
+    IndicesType& indices() { return derived().indices(); }
+
+    /** Resizes to given size.
+      */
+    inline void resize(Index newSize)
+    {
+      indices().resize(newSize);
+    }
+
+    /** Sets *this to be the identity permutation matrix */
+    void setIdentity()
+    {
+      StorageIndex n = StorageIndex(size());
+      for(StorageIndex i = 0; i < n; ++i)
+        indices().coeffRef(i) = i;
+    }
+
+    /** Sets *this to be the identity permutation matrix of given size.
+      */
+    void setIdentity(Index newSize)
+    {
+      resize(newSize);
+      setIdentity();
+    }
+
+    /** Multiplies *this by the transposition \f$(ij)\f$ on the left.
+      *
+      * \returns a reference to *this.
+      *
+      * \warning This is much slower than applyTranspositionOnTheRight(Index,Index):
+      * this has linear complexity and requires a lot of branching.
+      *
+      * \sa applyTranspositionOnTheRight(Index,Index)
+      */
+    Derived& applyTranspositionOnTheLeft(Index i, Index j)
+    {
+      eigen_assert(i>=0 && j>=0 && i<size() && j<size());
+      for(Index k = 0; k < size(); ++k)
+      {
+        if(indices().coeff(k) == i) indices().coeffRef(k) = StorageIndex(j);
+        else if(indices().coeff(k) == j) indices().coeffRef(k) = StorageIndex(i);
+      }
+      return derived();
+    }
+
+    /** Multiplies *this by the transposition \f$(ij)\f$ on the right.
+      *
+      * \returns a reference to *this.
+      *
+      * This is a fast operation, it only consists in swapping two indices.
+      *
+      * \sa applyTranspositionOnTheLeft(Index,Index)
+      */
+    Derived& applyTranspositionOnTheRight(Index i, Index j)
+    {
+      eigen_assert(i>=0 && j>=0 && i<size() && j<size());
+      std::swap(indices().coeffRef(i), indices().coeffRef(j));
+      return derived();
+    }
+
+    /** \returns the inverse permutation matrix.
+      *
+      * \note \blank \note_try_to_help_rvo
+      */
+    inline InverseReturnType inverse() const
+    { return InverseReturnType(derived()); }
+    /** \returns the tranpose permutation matrix.
+      *
+      * \note \blank \note_try_to_help_rvo
+      */
+    inline InverseReturnType transpose() const
+    { return InverseReturnType(derived()); }
+
+    /**** multiplication helpers to hopefully get RVO ****/
+
+  
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  protected:
+    template<typename OtherDerived>
+    void assignTranspose(const PermutationBase<OtherDerived>& other)
+    {
+      for (Index i=0; i<rows();++i) indices().coeffRef(other.indices().coeff(i)) = i;
+    }
+    template<typename Lhs,typename Rhs>
+    void assignProduct(const Lhs& lhs, const Rhs& rhs)
+    {
+      eigen_assert(lhs.cols() == rhs.rows());
+      for (Index i=0; i<rows();++i) indices().coeffRef(i) = lhs.indices().coeff(rhs.indices().coeff(i));
+    }
+#endif
+
+  public:
+
+    /** \returns the product permutation matrix.
+      *
+      * \note \blank \note_try_to_help_rvo
+      */
+    template<typename Other>
+    inline PlainPermutationType operator*(const PermutationBase<Other>& other) const
+    { return PlainPermutationType(internal::PermPermProduct, derived(), other.derived()); }
+
+    /** \returns the product of a permutation with another inverse permutation.
+      *
+      * \note \blank \note_try_to_help_rvo
+      */
+    template<typename Other>
+    inline PlainPermutationType operator*(const InverseImpl<Other,PermutationStorage>& other) const
+    { return PlainPermutationType(internal::PermPermProduct, *this, other.eval()); }
+
+    /** \returns the product of an inverse permutation with another permutation.
+      *
+      * \note \blank \note_try_to_help_rvo
+      */
+    template<typename Other> friend
+    inline PlainPermutationType operator*(const InverseImpl<Other, PermutationStorage>& other, const PermutationBase& perm)
+    { return PlainPermutationType(internal::PermPermProduct, other.eval(), perm); }
+    
+    /** \returns the determinant of the permutation matrix, which is either 1 or -1 depending on the parity of the permutation.
+      *
+      * This function is O(\c n) procedure allocating a buffer of \c n booleans.
+      */
+    Index determinant() const
+    {
+      Index res = 1;
+      Index n = size();
+      Matrix<bool,RowsAtCompileTime,1,0,MaxRowsAtCompileTime> mask(n);
+      mask.fill(false);
+      Index r = 0;
+      while(r < n)
+      {
+        // search for the next seed
+        while(r<n && mask[r]) r++;
+        if(r>=n)
+          break;
+        // we got one, let's follow it until we are back to the seed
+        Index k0 = r++;
+        mask.coeffRef(k0) = true;
+        for(Index k=indices().coeff(k0); k!=k0; k=indices().coeff(k))
+        {
+          mask.coeffRef(k) = true;
+          res = -res;
+        }
+      }
+      return res;
+    }
+
+  protected:
+
+};
+
+namespace internal {
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
+struct traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex> >
+ : traits<Matrix<_StorageIndex,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
+{
+  typedef PermutationStorage StorageKind;
+  typedef Matrix<_StorageIndex, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
+  typedef _StorageIndex StorageIndex;
+  typedef void Scalar;
+};
+}
+
+/** \class PermutationMatrix
+  * \ingroup Core_Module
+  *
+  * \brief Permutation matrix
+  *
+  * \tparam SizeAtCompileTime the number of rows/cols, or Dynamic
+  * \tparam MaxSizeAtCompileTime the maximum number of rows/cols, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it.
+  * \tparam _StorageIndex the integer type of the indices
+  *
+  * This class represents a permutation matrix, internally stored as a vector of integers.
+  *
+  * \sa class PermutationBase, class PermutationWrapper, class DiagonalMatrix
+  */
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
+class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex> >
+{
+    typedef PermutationBase<PermutationMatrix> Base;
+    typedef internal::traits<PermutationMatrix> Traits;
+  public:
+
+    typedef const PermutationMatrix& Nested;
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    typedef typename Traits::IndicesType IndicesType;
+    typedef typename Traits::StorageIndex StorageIndex;
+    #endif
+
+    inline PermutationMatrix()
+    {}
+
+    /** Constructs an uninitialized permutation matrix of given size.
+      */
+    explicit inline PermutationMatrix(Index size) : m_indices(size)
+    {
+      eigen_internal_assert(size <= NumTraits<StorageIndex>::highest());
+    }
+
+    /** Copy constructor. */
+    template<typename OtherDerived>
+    inline PermutationMatrix(const PermutationBase<OtherDerived>& other)
+      : m_indices(other.indices()) {}
+
+    /** Generic constructor from expression of the indices. The indices
+      * array has the meaning that the permutations sends each integer i to indices[i].
+      *
+      * \warning It is your responsibility to check that the indices array that you passes actually
+      * describes a permutation, i.e., each value between 0 and n-1 occurs exactly once, where n is the
+      * array's size.
+      */
+    template<typename Other>
+    explicit inline PermutationMatrix(const MatrixBase<Other>& indices) : m_indices(indices)
+    {}
+
+    /** Convert the Transpositions \a tr to a permutation matrix */
+    template<typename Other>
+    explicit PermutationMatrix(const TranspositionsBase<Other>& tr)
+      : m_indices(tr.size())
+    {
+      *this = tr;
+    }
+
+    /** Copies the other permutation into *this */
+    template<typename Other>
+    PermutationMatrix& operator=(const PermutationBase<Other>& other)
+    {
+      m_indices = other.indices();
+      return *this;
+    }
+
+    /** Assignment from the Transpositions \a tr */
+    template<typename Other>
+    PermutationMatrix& operator=(const TranspositionsBase<Other>& tr)
+    {
+      return Base::operator=(tr.derived());
+    }
+
+    /** const version of indices(). */
+    const IndicesType& indices() const { return m_indices; }
+    /** \returns a reference to the stored array representing the permutation. */
+    IndicesType& indices() { return m_indices; }
+
+
+    /**** multiplication helpers to hopefully get RVO ****/
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename Other>
+    PermutationMatrix(const InverseImpl<Other,PermutationStorage>& other)
+      : m_indices(other.derived().nestedExpression().size())
+    {
+      eigen_internal_assert(m_indices.size() <= NumTraits<StorageIndex>::highest());
+      StorageIndex end = StorageIndex(m_indices.size());
+      for (StorageIndex i=0; i<end;++i)
+        m_indices.coeffRef(other.derived().nestedExpression().indices().coeff(i)) = i;
+    }
+    template<typename Lhs,typename Rhs>
+    PermutationMatrix(internal::PermPermProduct_t, const Lhs& lhs, const Rhs& rhs)
+      : m_indices(lhs.indices().size())
+    {
+      Base::assignProduct(lhs,rhs);
+    }
+#endif
+
+  protected:
+
+    IndicesType m_indices;
+};
+
+
+namespace internal {
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex, int _PacketAccess>
+struct traits<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex>,_PacketAccess> >
+ : traits<Matrix<_StorageIndex,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
+{
+  typedef PermutationStorage StorageKind;
+  typedef Map<const Matrix<_StorageIndex, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1>, _PacketAccess> IndicesType;
+  typedef _StorageIndex StorageIndex;
+  typedef void Scalar;
+};
+}
+
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex, int _PacketAccess>
+class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex>,_PacketAccess>
+  : public PermutationBase<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex>,_PacketAccess> >
+{
+    typedef PermutationBase<Map> Base;
+    typedef internal::traits<Map> Traits;
+  public:
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    typedef typename Traits::IndicesType IndicesType;
+    typedef typename IndicesType::Scalar StorageIndex;
+    #endif
+
+    inline Map(const StorageIndex* indicesPtr)
+      : m_indices(indicesPtr)
+    {}
+
+    inline Map(const StorageIndex* indicesPtr, Index size)
+      : m_indices(indicesPtr,size)
+    {}
+
+    /** Copies the other permutation into *this */
+    template<typename Other>
+    Map& operator=(const PermutationBase<Other>& other)
+    { return Base::operator=(other.derived()); }
+
+    /** Assignment from the Transpositions \a tr */
+    template<typename Other>
+    Map& operator=(const TranspositionsBase<Other>& tr)
+    { return Base::operator=(tr.derived()); }
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** This is a special case of the templated operator=. Its purpose is to
+      * prevent a default operator= from hiding the templated operator=.
+      */
+    Map& operator=(const Map& other)
+    {
+      m_indices = other.m_indices;
+      return *this;
+    }
+    #endif
+
+    /** const version of indices(). */
+    const IndicesType& indices() const { return m_indices; }
+    /** \returns a reference to the stored array representing the permutation. */
+    IndicesType& indices() { return m_indices; }
+
+  protected:
+
+    IndicesType m_indices;
+};
+
+template<typename _IndicesType> class TranspositionsWrapper;
+namespace internal {
+template<typename _IndicesType>
+struct traits<PermutationWrapper<_IndicesType> >
+{
+  typedef PermutationStorage StorageKind;
+  typedef void Scalar;
+  typedef typename _IndicesType::Scalar StorageIndex;
+  typedef _IndicesType IndicesType;
+  enum {
+    RowsAtCompileTime = _IndicesType::SizeAtCompileTime,
+    ColsAtCompileTime = _IndicesType::SizeAtCompileTime,
+    MaxRowsAtCompileTime = IndicesType::MaxSizeAtCompileTime,
+    MaxColsAtCompileTime = IndicesType::MaxSizeAtCompileTime,
+    Flags = 0
+  };
+};
+}
+
+/** \class PermutationWrapper
+  * \ingroup Core_Module
+  *
+  * \brief Class to view a vector of integers as a permutation matrix
+  *
+  * \tparam _IndicesType the type of the vector of integer (can be any compatible expression)
+  *
+  * This class allows to view any vector expression of integers as a permutation matrix.
+  *
+  * \sa class PermutationBase, class PermutationMatrix
+  */
+template<typename _IndicesType>
+class PermutationWrapper : public PermutationBase<PermutationWrapper<_IndicesType> >
+{
+    typedef PermutationBase<PermutationWrapper> Base;
+    typedef internal::traits<PermutationWrapper> Traits;
+  public:
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    typedef typename Traits::IndicesType IndicesType;
+    #endif
+
+    inline PermutationWrapper(const IndicesType& indices)
+      : m_indices(indices)
+    {}
+
+    /** const version of indices(). */
+    const typename internal::remove_all<typename IndicesType::Nested>::type&
+    indices() const { return m_indices; }
+
+  protected:
+
+    typename IndicesType::Nested m_indices;
+};
+
+
+/** \returns the matrix with the permutation applied to the columns.
+  */
+template<typename MatrixDerived, typename PermutationDerived>
+EIGEN_DEVICE_FUNC
+const Product<MatrixDerived, PermutationDerived, AliasFreeProduct>
+operator*(const MatrixBase<MatrixDerived> &matrix,
+          const PermutationBase<PermutationDerived>& permutation)
+{
+  return Product<MatrixDerived, PermutationDerived, AliasFreeProduct>
+            (matrix.derived(), permutation.derived());
+}
+
+/** \returns the matrix with the permutation applied to the rows.
+  */
+template<typename PermutationDerived, typename MatrixDerived>
+EIGEN_DEVICE_FUNC
+const Product<PermutationDerived, MatrixDerived, AliasFreeProduct>
+operator*(const PermutationBase<PermutationDerived> &permutation,
+          const MatrixBase<MatrixDerived>& matrix)
+{
+  return Product<PermutationDerived, MatrixDerived, AliasFreeProduct>
+            (permutation.derived(), matrix.derived());
+}
+
+
+template<typename PermutationType>
+class InverseImpl<PermutationType, PermutationStorage>
+  : public EigenBase<Inverse<PermutationType> >
+{
+    typedef typename PermutationType::PlainPermutationType PlainPermutationType;
+    typedef internal::traits<PermutationType> PermTraits;
+  protected:
+    InverseImpl() {}
+  public:
+    typedef Inverse<PermutationType> InverseType;
+    using EigenBase<Inverse<PermutationType> >::derived;
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    typedef typename PermutationType::DenseMatrixType DenseMatrixType;
+    enum {
+      RowsAtCompileTime = PermTraits::RowsAtCompileTime,
+      ColsAtCompileTime = PermTraits::ColsAtCompileTime,
+      MaxRowsAtCompileTime = PermTraits::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = PermTraits::MaxColsAtCompileTime
+    };
+    #endif
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename DenseDerived>
+    void evalTo(MatrixBase<DenseDerived>& other) const
+    {
+      other.setZero();
+      for (Index i=0; i<derived().rows();++i)
+        other.coeffRef(i, derived().nestedExpression().indices().coeff(i)) = typename DenseDerived::Scalar(1);
+    }
+    #endif
+
+    /** \return the equivalent permutation matrix */
+    PlainPermutationType eval() const { return derived(); }
+
+    DenseMatrixType toDenseMatrix() const { return derived(); }
+
+    /** \returns the matrix with the inverse permutation applied to the columns.
+      */
+    template<typename OtherDerived> friend
+    const Product<OtherDerived, InverseType, AliasFreeProduct>
+    operator*(const MatrixBase<OtherDerived>& matrix, const InverseType& trPerm)
+    {
+      return Product<OtherDerived, InverseType, AliasFreeProduct>(matrix.derived(), trPerm.derived());
+    }
+
+    /** \returns the matrix with the inverse permutation applied to the rows.
+      */
+    template<typename OtherDerived>
+    const Product<InverseType, OtherDerived, AliasFreeProduct>
+    operator*(const MatrixBase<OtherDerived>& matrix) const
+    {
+      return Product<InverseType, OtherDerived, AliasFreeProduct>(derived(), matrix.derived());
+    }
+};
+
+template<typename Derived>
+const PermutationWrapper<const Derived> MatrixBase<Derived>::asPermutation() const
+{
+  return derived();
+}
+
+namespace internal {
+
+template<> struct AssignmentKind<DenseShape,PermutationShape> { typedef EigenBase2EigenBase Kind; };
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PERMUTATIONMATRIX_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/PlainObjectBase.h b/src/3rdparty/eigen/Eigen/src/Core/PlainObjectBase.h
new file mode 100644
index 000000000..e2ddbd1d5
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/PlainObjectBase.h
@@ -0,0 +1,1128 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_DENSESTORAGEBASE_H
+#define EIGEN_DENSESTORAGEBASE_H
+
+#if defined(EIGEN_INITIALIZE_MATRICES_BY_ZERO)
+# define EIGEN_INITIALIZE_COEFFS
+# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(Index i=0;i<base().size();++i) coeffRef(i)=Scalar(0);
+#elif defined(EIGEN_INITIALIZE_MATRICES_BY_NAN)
+# define EIGEN_INITIALIZE_COEFFS
+# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(Index i=0;i<base().size();++i) coeffRef(i)=std::numeric_limits<Scalar>::quiet_NaN();
+#else
+# undef EIGEN_INITIALIZE_COEFFS
+# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+#endif
+
+namespace Eigen {
+
+namespace internal {
+
+template<int MaxSizeAtCompileTime> struct check_rows_cols_for_overflow {
+  template<typename Index>
+  EIGEN_DEVICE_FUNC
+  static EIGEN_ALWAYS_INLINE void run(Index, Index)
+  {
+  }
+};
+
+template<> struct check_rows_cols_for_overflow<Dynamic> {
+  template<typename Index>
+  EIGEN_DEVICE_FUNC
+  static EIGEN_ALWAYS_INLINE void run(Index rows, Index cols)
+  {
+    // http://hg.mozilla.org/mozilla-central/file/6c8a909977d3/xpcom/ds/CheckedInt.h#l242
+    // we assume Index is signed
+    Index max_index = (std::size_t(1) << (8 * sizeof(Index) - 1)) - 1; // assume Index is signed
+    bool error = (rows == 0 || cols == 0) ? false
+               : (rows > max_index / cols);
+    if (error)
+      throw_std_bad_alloc();
+  }
+};
+
+template <typename Derived,
+          typename OtherDerived = Derived,
+          bool IsVector = bool(Derived::IsVectorAtCompileTime) && bool(OtherDerived::IsVectorAtCompileTime)>
+struct conservative_resize_like_impl;
+
+template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers> struct matrix_swap_impl;
+
+} // end namespace internal
+
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+namespace doxygen {
+
+// This is a workaround to doxygen not being able to understand the inheritance logic
+// when it is hidden by the dense_xpr_base helper struct.
+// Moreover, doxygen fails to include members that are not documented in the declaration body of
+// MatrixBase if we inherits MatrixBase<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >,
+// this is why we simply inherits MatrixBase, though this does not make sense.
+
+/** This class is just a workaround for Doxygen and it does not not actually exist. */
+template<typename Derived> struct dense_xpr_base_dispatcher;
+/** This class is just a workaround for Doxygen and it does not not actually exist. */
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct dense_xpr_base_dispatcher<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+    : public MatrixBase {};
+/** This class is just a workaround for Doxygen and it does not not actually exist. */
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct dense_xpr_base_dispatcher<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
+    : public ArrayBase {};
+
+} // namespace doxygen
+
+/** \class PlainObjectBase
+  * \ingroup Core_Module
+  * \brief %Dense storage base class for matrices and arrays.
+  *
+  * This class can be extended with the help of the plugin mechanism described on the page
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_PLAINOBJECTBASE_PLUGIN.
+  *
+  * \tparam Derived is the derived type, e.g., a Matrix or Array
+  *
+  * \sa \ref TopicClassHierarchy
+  */
+template<typename Derived>
+class PlainObjectBase : public doxygen::dense_xpr_base_dispatcher<Derived>
+#else
+template<typename Derived>
+class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
+#endif
+{
+  public:
+    enum { Options = internal::traits<Derived>::Options };
+    typedef typename internal::dense_xpr_base<Derived>::type Base;
+
+    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+
+    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef Derived DenseType;
+
+    using Base::RowsAtCompileTime;
+    using Base::ColsAtCompileTime;
+    using Base::SizeAtCompileTime;
+    using Base::MaxRowsAtCompileTime;
+    using Base::MaxColsAtCompileTime;
+    using Base::MaxSizeAtCompileTime;
+    using Base::IsVectorAtCompileTime;
+    using Base::Flags;
+
+    typedef Eigen::Map<Derived, Unaligned>  MapType;
+    typedef const Eigen::Map<const Derived, Unaligned> ConstMapType;
+    typedef Eigen::Map<Derived, AlignedMax> AlignedMapType;
+    typedef const Eigen::Map<const Derived, AlignedMax> ConstAlignedMapType;
+    template<typename StrideType> struct StridedMapType { typedef Eigen::Map<Derived, Unaligned, StrideType> type; };
+    template<typename StrideType> struct StridedConstMapType { typedef Eigen::Map<const Derived, Unaligned, StrideType> type; };
+    template<typename StrideType> struct StridedAlignedMapType { typedef Eigen::Map<Derived, AlignedMax, StrideType> type; };
+    template<typename StrideType> struct StridedConstAlignedMapType { typedef Eigen::Map<const Derived, AlignedMax, StrideType> type; };
+
+  protected:
+    DenseStorage<Scalar, Base::MaxSizeAtCompileTime, Base::RowsAtCompileTime, Base::ColsAtCompileTime, Options> m_storage;
+
+  public:
+    enum { NeedsToAlign = (SizeAtCompileTime != Dynamic) && (internal::traits<Derived>::Alignment>0) };
+    EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
+
+    EIGEN_DEVICE_FUNC
+    Base& base() { return *static_cast<Base*>(this); }
+    EIGEN_DEVICE_FUNC
+    const Base& base() const { return *static_cast<const Base*>(this); }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_storage.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_storage.cols(); }
+
+    /** This is an overloaded version of DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index,Index) const
+      * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
+      *
+      * See DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index) const for details. */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& coeff(Index rowId, Index colId) const
+    {
+      if(Flags & RowMajorBit)
+        return m_storage.data()[colId + rowId * m_storage.cols()];
+      else // column-major
+        return m_storage.data()[rowId + colId * m_storage.rows()];
+    }
+
+    /** This is an overloaded version of DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index) const
+      * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
+      *
+      * See DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index) const for details. */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
+    {
+      return m_storage.data()[index];
+    }
+
+    /** This is an overloaded version of DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index,Index) const
+      * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
+      *
+      * See DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index,Index) const for details. */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& coeffRef(Index rowId, Index colId)
+    {
+      if(Flags & RowMajorBit)
+        return m_storage.data()[colId + rowId * m_storage.cols()];
+      else // column-major
+        return m_storage.data()[rowId + colId * m_storage.rows()];
+    }
+
+    /** This is an overloaded version of DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index) const
+      * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
+      *
+      * See DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index) const for details. */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
+    {
+      return m_storage.data()[index];
+    }
+
+    /** This is the const version of coeffRef(Index,Index) which is thus synonym of coeff(Index,Index).
+      * It is provided for convenience. */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& coeffRef(Index rowId, Index colId) const
+    {
+      if(Flags & RowMajorBit)
+        return m_storage.data()[colId + rowId * m_storage.cols()];
+      else // column-major
+        return m_storage.data()[rowId + colId * m_storage.rows()];
+    }
+
+    /** This is the const version of coeffRef(Index) which is thus synonym of coeff(Index).
+      * It is provided for convenience. */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& coeffRef(Index index) const
+    {
+      return m_storage.data()[index];
+    }
+
+    /** \internal */
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
+    {
+      return internal::ploadt<PacketScalar, LoadMode>
+               (m_storage.data() + (Flags & RowMajorBit
+                                   ? colId + rowId * m_storage.cols()
+                                   : rowId + colId * m_storage.rows()));
+    }
+
+    /** \internal */
+    template<int LoadMode>
+    EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
+    {
+      return internal::ploadt<PacketScalar, LoadMode>(m_storage.data() + index);
+    }
+
+    /** \internal */
+    template<int StoreMode>
+    EIGEN_STRONG_INLINE void writePacket(Index rowId, Index colId, const PacketScalar& val)
+    {
+      internal::pstoret<Scalar, PacketScalar, StoreMode>
+              (m_storage.data() + (Flags & RowMajorBit
+                                   ? colId + rowId * m_storage.cols()
+                                   : rowId + colId * m_storage.rows()), val);
+    }
+
+    /** \internal */
+    template<int StoreMode>
+    EIGEN_STRONG_INLINE void writePacket(Index index, const PacketScalar& val)
+    {
+      internal::pstoret<Scalar, PacketScalar, StoreMode>(m_storage.data() + index, val);
+    }
+
+    /** \returns a const pointer to the data array of this matrix */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const
+    { return m_storage.data(); }
+
+    /** \returns a pointer to the data array of this matrix */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data()
+    { return m_storage.data(); }
+
+    /** Resizes \c *this to a \a rows x \a cols matrix.
+      *
+      * This method is intended for dynamic-size matrices, although it is legal to call it on any
+      * matrix as long as fixed dimensions are left unchanged. If you only want to change the number
+      * of rows and/or of columns, you can use resize(NoChange_t, Index), resize(Index, NoChange_t).
+      *
+      * If the current number of coefficients of \c *this exactly matches the
+      * product \a rows * \a cols, then no memory allocation is performed and
+      * the current values are left unchanged. In all other cases, including
+      * shrinking, the data is reallocated and all previous values are lost.
+      *
+      * Example: \include Matrix_resize_int_int.cpp
+      * Output: \verbinclude Matrix_resize_int_int.out
+      *
+      * \sa resize(Index) for vectors, resize(NoChange_t, Index), resize(Index, NoChange_t)
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void resize(Index rows, Index cols)
+    {
+      eigen_assert(   EIGEN_IMPLIES(RowsAtCompileTime!=Dynamic,rows==RowsAtCompileTime)
+                   && EIGEN_IMPLIES(ColsAtCompileTime!=Dynamic,cols==ColsAtCompileTime)
+                   && EIGEN_IMPLIES(RowsAtCompileTime==Dynamic && MaxRowsAtCompileTime!=Dynamic,rows<=MaxRowsAtCompileTime)
+                   && EIGEN_IMPLIES(ColsAtCompileTime==Dynamic && MaxColsAtCompileTime!=Dynamic,cols<=MaxColsAtCompileTime)
+                   && rows>=0 && cols>=0 && "Invalid sizes when resizing a matrix or array.");
+      internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(rows, cols);
+      #ifdef EIGEN_INITIALIZE_COEFFS
+        Index size = rows*cols;
+        bool size_changed = size != this->size();
+        m_storage.resize(size, rows, cols);
+        if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+      #else
+        m_storage.resize(rows*cols, rows, cols);
+      #endif
+    }
+
+    /** Resizes \c *this to a vector of length \a size
+      *
+      * \only_for_vectors. This method does not work for
+      * partially dynamic matrices when the static dimension is anything other
+      * than 1. For example it will not work with Matrix<double, 2, Dynamic>.
+      *
+      * Example: \include Matrix_resize_int.cpp
+      * Output: \verbinclude Matrix_resize_int.out
+      *
+      * \sa resize(Index,Index), resize(NoChange_t, Index), resize(Index, NoChange_t)
+      */
+    EIGEN_DEVICE_FUNC
+    inline void resize(Index size)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(PlainObjectBase)
+      eigen_assert(((SizeAtCompileTime == Dynamic && (MaxSizeAtCompileTime==Dynamic || size<=MaxSizeAtCompileTime)) || SizeAtCompileTime == size) && size>=0);
+      #ifdef EIGEN_INITIALIZE_COEFFS
+        bool size_changed = size != this->size();
+      #endif
+      if(RowsAtCompileTime == 1)
+        m_storage.resize(size, 1, size);
+      else
+        m_storage.resize(size, size, 1);
+      #ifdef EIGEN_INITIALIZE_COEFFS
+        if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+      #endif
+    }
+
+    /** Resizes the matrix, changing only the number of columns. For the parameter of type NoChange_t, just pass the special value \c NoChange
+      * as in the example below.
+      *
+      * Example: \include Matrix_resize_NoChange_int.cpp
+      * Output: \verbinclude Matrix_resize_NoChange_int.out
+      *
+      * \sa resize(Index,Index)
+      */
+    EIGEN_DEVICE_FUNC
+    inline void resize(NoChange_t, Index cols)
+    {
+      resize(rows(), cols);
+    }
+
+    /** Resizes the matrix, changing only the number of rows. For the parameter of type NoChange_t, just pass the special value \c NoChange
+      * as in the example below.
+      *
+      * Example: \include Matrix_resize_int_NoChange.cpp
+      * Output: \verbinclude Matrix_resize_int_NoChange.out
+      *
+      * \sa resize(Index,Index)
+      */
+    EIGEN_DEVICE_FUNC
+    inline void resize(Index rows, NoChange_t)
+    {
+      resize(rows, cols());
+    }
+
+    /** Resizes \c *this to have the same dimensions as \a other.
+      * Takes care of doing all the checking that's needed.
+      *
+      * Note that copying a row-vector into a vector (and conversely) is allowed.
+      * The resizing, if any, is then done in the appropriate way so that row-vectors
+      * remain row-vectors and vectors remain vectors.
+      */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void resizeLike(const EigenBase<OtherDerived>& _other)
+    {
+      const OtherDerived& other = _other.derived();
+      internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(other.rows(), other.cols());
+      const Index othersize = other.rows()*other.cols();
+      if(RowsAtCompileTime == 1)
+      {
+        eigen_assert(other.rows() == 1 || other.cols() == 1);
+        resize(1, othersize);
+      }
+      else if(ColsAtCompileTime == 1)
+      {
+        eigen_assert(other.rows() == 1 || other.cols() == 1);
+        resize(othersize, 1);
+      }
+      else resize(other.rows(), other.cols());
+    }
+
+    /** Resizes the matrix to \a rows x \a cols while leaving old values untouched.
+      *
+      * The method is intended for matrices of dynamic size. If you only want to change the number
+      * of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or
+      * conservativeResize(Index, NoChange_t).
+      *
+      * Matrices are resized relative to the top-left element. In case values need to be
+      * appended to the matrix they will be uninitialized.
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void conservativeResize(Index rows, Index cols)
+    {
+      internal::conservative_resize_like_impl<Derived>::run(*this, rows, cols);
+    }
+
+    /** Resizes the matrix to \a rows x \a cols while leaving old values untouched.
+      *
+      * As opposed to conservativeResize(Index rows, Index cols), this version leaves
+      * the number of columns unchanged.
+      *
+      * In case the matrix is growing, new rows will be uninitialized.
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void conservativeResize(Index rows, NoChange_t)
+    {
+      // Note: see the comment in conservativeResize(Index,Index)
+      conservativeResize(rows, cols());
+    }
+
+    /** Resizes the matrix to \a rows x \a cols while leaving old values untouched.
+      *
+      * As opposed to conservativeResize(Index rows, Index cols), this version leaves
+      * the number of rows unchanged.
+      *
+      * In case the matrix is growing, new columns will be uninitialized.
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void conservativeResize(NoChange_t, Index cols)
+    {
+      // Note: see the comment in conservativeResize(Index,Index)
+      conservativeResize(rows(), cols);
+    }
+
+    /** Resizes the vector to \a size while retaining old values.
+      *
+      * \only_for_vectors. This method does not work for
+      * partially dynamic matrices when the static dimension is anything other
+      * than 1. For example it will not work with Matrix<double, 2, Dynamic>.
+      *
+      * When values are appended, they will be uninitialized.
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void conservativeResize(Index size)
+    {
+      internal::conservative_resize_like_impl<Derived>::run(*this, size);
+    }
+
+    /** Resizes the matrix to \a rows x \a cols of \c other, while leaving old values untouched.
+      *
+      * The method is intended for matrices of dynamic size. If you only want to change the number
+      * of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or
+      * conservativeResize(Index, NoChange_t).
+      *
+      * Matrices are resized relative to the top-left element. In case values need to be
+      * appended to the matrix they will copied from \c other.
+      */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void conservativeResizeLike(const DenseBase<OtherDerived>& other)
+    {
+      internal::conservative_resize_like_impl<Derived,OtherDerived>::run(*this, other);
+    }
+
+    /** This is a special case of the templated operator=. Its purpose is to
+      * prevent a default operator= from hiding the templated operator=.
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Derived& operator=(const PlainObjectBase& other)
+    {
+      return _set(other);
+    }
+
+    /** \sa MatrixBase::lazyAssign() */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Derived& lazyAssign(const DenseBase<OtherDerived>& other)
+    {
+      _resize_to_match(other);
+      return Base::lazyAssign(other.derived());
+    }
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Derived& operator=(const ReturnByValue<OtherDerived>& func)
+    {
+      resize(func.rows(), func.cols());
+      return Base::operator=(func);
+    }
+
+    // Prevent user from trying to instantiate PlainObjectBase objects
+    // by making all its constructor protected. See bug 1074.
+  protected:
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE PlainObjectBase() : m_storage()
+    {
+//       _check_template_params();
+//       EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+    }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    // FIXME is it still needed ?
+    /** \internal */
+    EIGEN_DEVICE_FUNC
+    explicit PlainObjectBase(internal::constructor_without_unaligned_array_assert)
+      : m_storage(internal::constructor_without_unaligned_array_assert())
+    {
+//       _check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+    }
+#endif
+
+#if EIGEN_HAS_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
+    PlainObjectBase(PlainObjectBase&& other) EIGEN_NOEXCEPT
+      : m_storage( std::move(other.m_storage) )
+    {
+    }
+
+    EIGEN_DEVICE_FUNC
+    PlainObjectBase& operator=(PlainObjectBase&& other) EIGEN_NOEXCEPT
+    {
+      _check_template_params();
+      m_storage = std::move(other.m_storage);
+      return *this;
+    }
+#endif
+
+    /** Copy constructor */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE PlainObjectBase(const PlainObjectBase& other)
+      : Base(), m_storage(other.m_storage) { }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE PlainObjectBase(Index size, Index rows, Index cols)
+      : m_storage(size, rows, cols)
+    {
+//       _check_template_params();
+//       EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+    }
+
+    #if EIGEN_HAS_CXX11
+    /** \brief Construct a row of column vector with fixed size from an arbitrary number of coefficients. \cpp11
+      *
+      * \only_for_vectors
+      *
+      * This constructor is for 1D array or vectors with more than 4 coefficients.
+      * There exists C++98 analogue constructors for fixed-size array/vector having 1, 2, 3, or 4 coefficients.
+      *
+      * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this
+      * constructor must match the the fixed number of rows (resp. columns) of \c *this.
+      */
+    template <typename... ArgTypes>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2,  const Scalar& a3, const ArgTypes&... args)
+      : m_storage()
+    {
+      _check_template_params();
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, sizeof...(args) + 4);
+      m_storage.data()[0] = a0;
+      m_storage.data()[1] = a1;
+      m_storage.data()[2] = a2;
+      m_storage.data()[3] = a3;
+      Index i = 4;
+      auto x = {(m_storage.data()[i++] = args, 0)...};
+      static_cast<void>(x);
+    }
+
+    /** \brief Constructs a Matrix or Array and initializes it by elements given by an initializer list of initializer
+      * lists \cpp11
+      */
+    EIGEN_DEVICE_FUNC
+    explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list<std::initializer_list<Scalar>>& list)
+      : m_storage()
+    {
+      _check_template_params();
+
+      size_t list_size = 0;
+      if (list.begin() != list.end()) {
+        list_size = list.begin()->size();
+      }
+
+      // This is to allow syntax like VectorXi {{1, 2, 3, 4}}
+      if (ColsAtCompileTime == 1 && list.size() == 1) {
+        eigen_assert(list_size == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic);
+        resize(list_size, ColsAtCompileTime);
+        std::copy(list.begin()->begin(), list.begin()->end(), m_storage.data());
+      } else {
+        eigen_assert(list.size() == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic);
+        eigen_assert(list_size == static_cast<size_t>(ColsAtCompileTime) || ColsAtCompileTime == Dynamic);
+        resize(list.size(), list_size);
+
+        Index row_index = 0;
+        for (const std::initializer_list<Scalar>& row : list) {
+          eigen_assert(list_size == row.size());
+          Index col_index = 0;
+          for (const Scalar& e : row) {
+            coeffRef(row_index, col_index) = e;
+            ++col_index;
+          }
+          ++row_index;
+        }
+      }
+    }
+    #endif  // end EIGEN_HAS_CXX11
+
+    /** \sa PlainObjectBase::operator=(const EigenBase<OtherDerived>&) */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE PlainObjectBase(const DenseBase<OtherDerived> &other)
+      : m_storage()
+    {
+      _check_template_params();
+      resizeLike(other);
+      _set_noalias(other);
+    }
+
+    /** \sa PlainObjectBase::operator=(const EigenBase<OtherDerived>&) */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE PlainObjectBase(const EigenBase<OtherDerived> &other)
+      : m_storage()
+    {
+      _check_template_params();
+      resizeLike(other);
+      *this = other.derived();
+    }
+    /** \brief Copy constructor with in-place evaluation */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE PlainObjectBase(const ReturnByValue<OtherDerived>& other)
+    {
+      _check_template_params();
+      // FIXME this does not automatically transpose vectors if necessary
+      resize(other.rows(), other.cols());
+      other.evalTo(this->derived());
+    }
+
+  public:
+
+    /** \brief Copies the generic expression \a other into *this.
+      * \copydetails DenseBase::operator=(const EigenBase<OtherDerived> &other)
+      */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Derived& operator=(const EigenBase<OtherDerived> &other)
+    {
+      _resize_to_match(other);
+      Base::operator=(other.derived());
+      return this->derived();
+    }
+
+    /** \name Map
+      * These are convenience functions returning Map objects. The Map() static functions return unaligned Map objects,
+      * while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned
+      * \a data pointers.
+      *
+      * Here is an example using strides:
+      * \include Matrix_Map_stride.cpp
+      * Output: \verbinclude Matrix_Map_stride.out
+      *
+      * \see class Map
+      */
+    //@{
+    static inline ConstMapType Map(const Scalar* data)
+    { return ConstMapType(data); }
+    static inline MapType Map(Scalar* data)
+    { return MapType(data); }
+    static inline ConstMapType Map(const Scalar* data, Index size)
+    { return ConstMapType(data, size); }
+    static inline MapType Map(Scalar* data, Index size)
+    { return MapType(data, size); }
+    static inline ConstMapType Map(const Scalar* data, Index rows, Index cols)
+    { return ConstMapType(data, rows, cols); }
+    static inline MapType Map(Scalar* data, Index rows, Index cols)
+    { return MapType(data, rows, cols); }
+
+    static inline ConstAlignedMapType MapAligned(const Scalar* data)
+    { return ConstAlignedMapType(data); }
+    static inline AlignedMapType MapAligned(Scalar* data)
+    { return AlignedMapType(data); }
+    static inline ConstAlignedMapType MapAligned(const Scalar* data, Index size)
+    { return ConstAlignedMapType(data, size); }
+    static inline AlignedMapType MapAligned(Scalar* data, Index size)
+    { return AlignedMapType(data, size); }
+    static inline ConstAlignedMapType MapAligned(const Scalar* data, Index rows, Index cols)
+    { return ConstAlignedMapType(data, rows, cols); }
+    static inline AlignedMapType MapAligned(Scalar* data, Index rows, Index cols)
+    { return AlignedMapType(data, rows, cols); }
+
+    template<int Outer, int Inner>
+    static inline typename StridedConstMapType<Stride<Outer, Inner> >::type Map(const Scalar* data, const Stride<Outer, Inner>& stride)
+    { return typename StridedConstMapType<Stride<Outer, Inner> >::type(data, stride); }
+    template<int Outer, int Inner>
+    static inline typename StridedMapType<Stride<Outer, Inner> >::type Map(Scalar* data, const Stride<Outer, Inner>& stride)
+    { return typename StridedMapType<Stride<Outer, Inner> >::type(data, stride); }
+    template<int Outer, int Inner>
+    static inline typename StridedConstMapType<Stride<Outer, Inner> >::type Map(const Scalar* data, Index size, const Stride<Outer, Inner>& stride)
+    { return typename StridedConstMapType<Stride<Outer, Inner> >::type(data, size, stride); }
+    template<int Outer, int Inner>
+    static inline typename StridedMapType<Stride<Outer, Inner> >::type Map(Scalar* data, Index size, const Stride<Outer, Inner>& stride)
+    { return typename StridedMapType<Stride<Outer, Inner> >::type(data, size, stride); }
+    template<int Outer, int Inner>
+    static inline typename StridedConstMapType<Stride<Outer, Inner> >::type Map(const Scalar* data, Index rows, Index cols, const Stride<Outer, Inner>& stride)
+    { return typename StridedConstMapType<Stride<Outer, Inner> >::type(data, rows, cols, stride); }
+    template<int Outer, int Inner>
+    static inline typename StridedMapType<Stride<Outer, Inner> >::type Map(Scalar* data, Index rows, Index cols, const Stride<Outer, Inner>& stride)
+    { return typename StridedMapType<Stride<Outer, Inner> >::type(data, rows, cols, stride); }
+
+    template<int Outer, int Inner>
+    static inline typename StridedConstAlignedMapType<Stride<Outer, Inner> >::type MapAligned(const Scalar* data, const Stride<Outer, Inner>& stride)
+    { return typename StridedConstAlignedMapType<Stride<Outer, Inner> >::type(data, stride); }
+    template<int Outer, int Inner>
+    static inline typename StridedAlignedMapType<Stride<Outer, Inner> >::type MapAligned(Scalar* data, const Stride<Outer, Inner>& stride)
+    { return typename StridedAlignedMapType<Stride<Outer, Inner> >::type(data, stride); }
+    template<int Outer, int Inner>
+    static inline typename StridedConstAlignedMapType<Stride<Outer, Inner> >::type MapAligned(const Scalar* data, Index size, const Stride<Outer, Inner>& stride)
+    { return typename StridedConstAlignedMapType<Stride<Outer, Inner> >::type(data, size, stride); }
+    template<int Outer, int Inner>
+    static inline typename StridedAlignedMapType<Stride<Outer, Inner> >::type MapAligned(Scalar* data, Index size, const Stride<Outer, Inner>& stride)
+    { return typename StridedAlignedMapType<Stride<Outer, Inner> >::type(data, size, stride); }
+    template<int Outer, int Inner>
+    static inline typename StridedConstAlignedMapType<Stride<Outer, Inner> >::type MapAligned(const Scalar* data, Index rows, Index cols, const Stride<Outer, Inner>& stride)
+    { return typename StridedConstAlignedMapType<Stride<Outer, Inner> >::type(data, rows, cols, stride); }
+    template<int Outer, int Inner>
+    static inline typename StridedAlignedMapType<Stride<Outer, Inner> >::type MapAligned(Scalar* data, Index rows, Index cols, const Stride<Outer, Inner>& stride)
+    { return typename StridedAlignedMapType<Stride<Outer, Inner> >::type(data, rows, cols, stride); }
+    //@}
+
+    using Base::setConstant;
+    EIGEN_DEVICE_FUNC Derived& setConstant(Index size, const Scalar& val);
+    EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, Index cols, const Scalar& val);
+    EIGEN_DEVICE_FUNC Derived& setConstant(NoChange_t, Index cols, const Scalar& val);
+    EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, NoChange_t, const Scalar& val);
+
+    using Base::setZero;
+    EIGEN_DEVICE_FUNC Derived& setZero(Index size);
+    EIGEN_DEVICE_FUNC Derived& setZero(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setZero(NoChange_t, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setZero(Index rows, NoChange_t);
+
+    using Base::setOnes;
+    EIGEN_DEVICE_FUNC Derived& setOnes(Index size);
+    EIGEN_DEVICE_FUNC Derived& setOnes(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setOnes(NoChange_t, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setOnes(Index rows, NoChange_t);
+
+    using Base::setRandom;
+    Derived& setRandom(Index size);
+    Derived& setRandom(Index rows, Index cols);
+    Derived& setRandom(NoChange_t, Index cols);
+    Derived& setRandom(Index rows, NoChange_t);
+
+    #ifdef EIGEN_PLAINOBJECTBASE_PLUGIN
+    #include EIGEN_PLAINOBJECTBASE_PLUGIN
+    #endif
+
+  protected:
+    /** \internal Resizes *this in preparation for assigning \a other to it.
+      * Takes care of doing all the checking that's needed.
+      *
+      * Note that copying a row-vector into a vector (and conversely) is allowed.
+      * The resizing, if any, is then done in the appropriate way so that row-vectors
+      * remain row-vectors and vectors remain vectors.
+      */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _resize_to_match(const EigenBase<OtherDerived>& other)
+    {
+      #ifdef EIGEN_NO_AUTOMATIC_RESIZING
+      eigen_assert((this->size()==0 || (IsVectorAtCompileTime ? (this->size() == other.size())
+                 : (rows() == other.rows() && cols() == other.cols())))
+        && "Size mismatch. Automatic resizing is disabled because EIGEN_NO_AUTOMATIC_RESIZING is defined");
+      EIGEN_ONLY_USED_FOR_DEBUG(other);
+      #else
+      resizeLike(other);
+      #endif
+    }
+
+    /**
+      * \brief Copies the value of the expression \a other into \c *this with automatic resizing.
+      *
+      * *this might be resized to match the dimensions of \a other. If *this was a null matrix (not already initialized),
+      * it will be initialized.
+      *
+      * Note that copying a row-vector into a vector (and conversely) is allowed.
+      * The resizing, if any, is then done in the appropriate way so that row-vectors
+      * remain row-vectors and vectors remain vectors.
+      *
+      * \sa operator=(const MatrixBase<OtherDerived>&), _set_noalias()
+      *
+      * \internal
+      */
+    // aliasing is dealt once in internal::call_assignment
+    // so at this stage we have to assume aliasing... and resising has to be done later.
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Derived& _set(const DenseBase<OtherDerived>& other)
+    {
+      internal::call_assignment(this->derived(), other.derived());
+      return this->derived();
+    }
+
+    /** \internal Like _set() but additionally makes the assumption that no aliasing effect can happen (which
+      * is the case when creating a new matrix) so one can enforce lazy evaluation.
+      *
+      * \sa operator=(const MatrixBase<OtherDerived>&), _set()
+      */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Derived& _set_noalias(const DenseBase<OtherDerived>& other)
+    {
+      // I don't think we need this resize call since the lazyAssign will anyways resize
+      // and lazyAssign will be called by the assign selector.
+      //_resize_to_match(other);
+      // the 'false' below means to enforce lazy evaluation. We don't use lazyAssign() because
+      // it wouldn't allow to copy a row-vector into a column-vector.
+      internal::call_assignment_no_alias(this->derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
+      return this->derived();
+    }
+
+    template<typename T0, typename T1>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init2(Index rows, Index cols, typename internal::enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0)
+    {
+      const bool t0_is_integer_alike = internal::is_valid_index_type<T0>::value;
+      const bool t1_is_integer_alike = internal::is_valid_index_type<T1>::value;
+      EIGEN_STATIC_ASSERT(t0_is_integer_alike &&
+                          t1_is_integer_alike,
+                          FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
+      resize(rows,cols);
+    }
+
+    template<typename T0, typename T1>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init2(const T0& val0, const T1& val1, typename internal::enable_if<Base::SizeAtCompileTime==2,T0>::type* = 0)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2)
+      m_storage.data()[0] = Scalar(val0);
+      m_storage.data()[1] = Scalar(val1);
+    }
+
+    template<typename T0, typename T1>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init2(const Index& val0, const Index& val1,
+                                    typename internal::enable_if<    (!internal::is_same<Index,Scalar>::value)
+                                                                  && (internal::is_same<T0,Index>::value)
+                                                                  && (internal::is_same<T1,Index>::value)
+                                                                  && Base::SizeAtCompileTime==2,T1>::type* = 0)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2)
+      m_storage.data()[0] = Scalar(val0);
+      m_storage.data()[1] = Scalar(val1);
+    }
+
+    // The argument is convertible to the Index type and we either have a non 1x1 Matrix, or a dynamic-sized Array,
+    // then the argument is meant to be the size of the object.
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(Index size, typename internal::enable_if<    (Base::SizeAtCompileTime!=1 || !internal::is_convertible<T, Scalar>::value)
+                                                                              && ((!internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value || Base::SizeAtCompileTime==Dynamic)),T>::type* = 0)
+    {
+      // NOTE MSVC 2008 complains if we directly put bool(NumTraits<T>::IsInteger) as the EIGEN_STATIC_ASSERT argument.
+      const bool is_integer_alike = internal::is_valid_index_type<T>::value;
+      EIGEN_UNUSED_VARIABLE(is_integer_alike);
+      EIGEN_STATIC_ASSERT(is_integer_alike,
+                          FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
+      resize(size);
+    }
+
+    // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitly converted)
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if<Base::SizeAtCompileTime==1 && internal::is_convertible<T, Scalar>::value,T>::type* = 0)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1)
+      m_storage.data()[0] = val0;
+    }
+
+    // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type match the index type)
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Index& val0,
+                                    typename internal::enable_if<    (!internal::is_same<Index,Scalar>::value)
+                                                                  && (internal::is_same<Index,T>::value)
+                                                                  && Base::SizeAtCompileTime==1
+                                                                  && internal::is_convertible<T, Scalar>::value,T*>::type* = 0)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1)
+      m_storage.data()[0] = Scalar(val0);
+    }
+
+    // Initialize a fixed size matrix from a pointer to raw data
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Scalar* data){
+      this->_set_noalias(ConstMapType(data));
+    }
+
+    // Initialize an arbitrary matrix from a dense expression
+    template<typename T, typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const DenseBase<OtherDerived>& other){
+      this->_set_noalias(other);
+    }
+
+    // Initialize an arbitrary matrix from an object convertible to the Derived type.
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Derived& other){
+      this->_set_noalias(other);
+    }
+
+    // Initialize an arbitrary matrix from a generic Eigen expression
+    template<typename T, typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const EigenBase<OtherDerived>& other){
+      this->derived() = other;
+    }
+
+    template<typename T, typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const ReturnByValue<OtherDerived>& other)
+    {
+      resize(other.rows(), other.cols());
+      other.evalTo(this->derived());
+    }
+
+    template<typename T, typename OtherDerived, int ColsAtCompileTime>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const RotationBase<OtherDerived,ColsAtCompileTime>& r)
+    {
+      this->derived() = r;
+    }
+
+    // For fixed-size Array<Scalar,...>
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Scalar& val0,
+                                    typename internal::enable_if<    Base::SizeAtCompileTime!=Dynamic
+                                                                  && Base::SizeAtCompileTime!=1
+                                                                  && internal::is_convertible<T, Scalar>::value
+                                                                  && internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value,T>::type* = 0)
+    {
+      Base::setConstant(val0);
+    }
+
+    // For fixed-size Array<Index,...>
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Index& val0,
+                                    typename internal::enable_if<    (!internal::is_same<Index,Scalar>::value)
+                                                                  && (internal::is_same<Index,T>::value)
+                                                                  && Base::SizeAtCompileTime!=Dynamic
+                                                                  && Base::SizeAtCompileTime!=1
+                                                                  && internal::is_convertible<T, Scalar>::value
+                                                                  && internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value,T*>::type* = 0)
+    {
+      Base::setConstant(val0);
+    }
+
+    template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
+    friend struct internal::matrix_swap_impl;
+
+  public:
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** \internal
+      * \brief Override DenseBase::swap() since for dynamic-sized matrices
+      * of same type it is enough to swap the data pointers.
+      */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    void swap(DenseBase<OtherDerived> & other)
+    {
+      enum { SwapPointers = internal::is_same<Derived, OtherDerived>::value && Base::SizeAtCompileTime==Dynamic };
+      internal::matrix_swap_impl<Derived, OtherDerived, bool(SwapPointers)>::run(this->derived(), other.derived());
+    }
+
+    /** \internal
+      * \brief const version forwarded to DenseBase::swap
+      */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    void swap(DenseBase<OtherDerived> const & other)
+    { Base::swap(other.derived()); }
+
+    EIGEN_DEVICE_FUNC
+    static EIGEN_STRONG_INLINE void _check_template_params()
+    {
+      EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (int(Options)&RowMajor)==RowMajor)
+                        && EIGEN_IMPLIES(MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1, (int(Options)&RowMajor)==0)
+                        && ((RowsAtCompileTime == Dynamic) || (RowsAtCompileTime >= 0))
+                        && ((ColsAtCompileTime == Dynamic) || (ColsAtCompileTime >= 0))
+                        && ((MaxRowsAtCompileTime == Dynamic) || (MaxRowsAtCompileTime >= 0))
+                        && ((MaxColsAtCompileTime == Dynamic) || (MaxColsAtCompileTime >= 0))
+                        && (MaxRowsAtCompileTime == RowsAtCompileTime || RowsAtCompileTime==Dynamic)
+                        && (MaxColsAtCompileTime == ColsAtCompileTime || ColsAtCompileTime==Dynamic)
+                        && (Options & (DontAlign|RowMajor)) == Options),
+        INVALID_MATRIX_TEMPLATE_PARAMETERS)
+    }
+
+    enum { IsPlainObjectBase = 1 };
+#endif
+  public:
+    // These apparently need to be down here for nvcc+icc to prevent duplicate
+    // Map symbol.
+    template<typename PlainObjectType, int MapOptions, typename StrideType> friend class Eigen::Map;
+    friend class Eigen::Map<Derived, Unaligned>;
+    friend class Eigen::Map<const Derived, Unaligned>;
+#if EIGEN_MAX_ALIGN_BYTES>0
+    // for EIGEN_MAX_ALIGN_BYTES==0, AlignedMax==Unaligned, and many compilers generate warnings for friend-ing a class twice.
+    friend class Eigen::Map<Derived, AlignedMax>;
+    friend class Eigen::Map<const Derived, AlignedMax>;
+#endif
+};
+
+namespace internal {
+
+template <typename Derived, typename OtherDerived, bool IsVector>
+struct conservative_resize_like_impl
+{
+  #if EIGEN_HAS_TYPE_TRAITS
+  static const bool IsRelocatable = std::is_trivially_copyable<typename Derived::Scalar>::value;
+  #else
+  static const bool IsRelocatable = !NumTraits<typename Derived::Scalar>::RequireInitialization;
+  #endif
+  static void run(DenseBase<Derived>& _this, Index rows, Index cols)
+  {
+    if (_this.rows() == rows && _this.cols() == cols) return;
+    EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
+
+    if ( IsRelocatable
+          && (( Derived::IsRowMajor && _this.cols() == cols) ||  // row-major and we change only the number of rows
+              (!Derived::IsRowMajor && _this.rows() == rows) ))  // column-major and we change only the number of columns
+    {
+      internal::check_rows_cols_for_overflow<Derived::MaxSizeAtCompileTime>::run(rows, cols);
+      _this.derived().m_storage.conservativeResize(rows*cols,rows,cols);
+    }
+    else
+    {
+      // The storage order does not allow us to use reallocation.
+      Derived tmp(rows,cols);
+      const Index common_rows = numext::mini(rows, _this.rows());
+      const Index common_cols = numext::mini(cols, _this.cols());
+      tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols);
+      _this.derived().swap(tmp);
+    }
+  }
+
+  static void run(DenseBase<Derived>& _this, const DenseBase<OtherDerived>& other)
+  {
+    if (_this.rows() == other.rows() && _this.cols() == other.cols()) return;
+
+    // Note: Here is space for improvement. Basically, for conservativeResize(Index,Index),
+    // neither RowsAtCompileTime or ColsAtCompileTime must be Dynamic. If only one of the
+    // dimensions is dynamic, one could use either conservativeResize(Index rows, NoChange_t) or
+    // conservativeResize(NoChange_t, Index cols). For these methods new static asserts like
+    // EIGEN_STATIC_ASSERT_DYNAMIC_ROWS and EIGEN_STATIC_ASSERT_DYNAMIC_COLS would be good.
+    EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
+    EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(OtherDerived)
+
+    if ( IsRelocatable &&
+          (( Derived::IsRowMajor && _this.cols() == other.cols()) ||  // row-major and we change only the number of rows
+           (!Derived::IsRowMajor && _this.rows() == other.rows()) ))  // column-major and we change only the number of columns
+    {
+      const Index new_rows = other.rows() - _this.rows();
+      const Index new_cols = other.cols() - _this.cols();
+      _this.derived().m_storage.conservativeResize(other.size(),other.rows(),other.cols());
+      if (new_rows>0)
+        _this.bottomRightCorner(new_rows, other.cols()) = other.bottomRows(new_rows);
+      else if (new_cols>0)
+        _this.bottomRightCorner(other.rows(), new_cols) = other.rightCols(new_cols);
+    }
+    else
+    {
+      // The storage order does not allow us to use reallocation.
+      Derived tmp(other);
+      const Index common_rows = numext::mini(tmp.rows(), _this.rows());
+      const Index common_cols = numext::mini(tmp.cols(), _this.cols());
+      tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols);
+      _this.derived().swap(tmp);
+    }
+  }
+};
+
+// Here, the specialization for vectors inherits from the general matrix case
+// to allow calling .conservativeResize(rows,cols) on vectors.
+template <typename Derived, typename OtherDerived>
+struct conservative_resize_like_impl<Derived,OtherDerived,true>
+  : conservative_resize_like_impl<Derived,OtherDerived,false>
+{
+  typedef conservative_resize_like_impl<Derived,OtherDerived,false> Base;
+  using Base::run;
+  using Base::IsRelocatable;
+
+  static void run(DenseBase<Derived>& _this, Index size)
+  {
+    const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : size;
+    const Index new_cols = Derived::RowsAtCompileTime==1 ? size : 1;
+    if(IsRelocatable)
+      _this.derived().m_storage.conservativeResize(size,new_rows,new_cols);
+    else
+      Base::run(_this.derived(), new_rows, new_cols);
+  }
+
+  static void run(DenseBase<Derived>& _this, const DenseBase<OtherDerived>& other)
+  {
+    if (_this.rows() == other.rows() && _this.cols() == other.cols()) return;
+
+    const Index num_new_elements = other.size() - _this.size();
+
+    const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows();
+    const Index new_cols = Derived::RowsAtCompileTime==1 ? other.cols() : 1;
+    if(IsRelocatable)
+      _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols);
+    else
+      Base::run(_this.derived(), new_rows, new_cols);
+
+    if (num_new_elements > 0)
+      _this.tail(num_new_elements) = other.tail(num_new_elements);
+  }
+};
+
+template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
+struct matrix_swap_impl
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(MatrixTypeA& a, MatrixTypeB& b)
+  {
+    a.base().swap(b);
+  }
+};
+
+template<typename MatrixTypeA, typename MatrixTypeB>
+struct matrix_swap_impl<MatrixTypeA, MatrixTypeB, true>
+{
+  EIGEN_DEVICE_FUNC
+  static inline void run(MatrixTypeA& a, MatrixTypeB& b)
+  {
+    static_cast<typename MatrixTypeA::Base&>(a).m_storage.swap(static_cast<typename MatrixTypeB::Base&>(b).m_storage);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_DENSESTORAGEBASE_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/Product.h b/src/3rdparty/eigen/Eigen/src/Core/Product.h
new file mode 100644
index 000000000..70a6c1063
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/Product.h
@@ -0,0 +1,191 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PRODUCT_H
+#define EIGEN_PRODUCT_H
+
+namespace Eigen {
+
+template<typename Lhs, typename Rhs, int Option, typename StorageKind> class ProductImpl;
+
+namespace internal {
+
+template<typename Lhs, typename Rhs, int Option>
+struct traits<Product<Lhs, Rhs, Option> >
+{
+  typedef typename remove_all<Lhs>::type LhsCleaned;
+  typedef typename remove_all<Rhs>::type RhsCleaned;
+  typedef traits<LhsCleaned> LhsTraits;
+  typedef traits<RhsCleaned> RhsTraits;
+
+  typedef MatrixXpr XprKind;
+
+  typedef typename ScalarBinaryOpTraits<typename traits<LhsCleaned>::Scalar, typename traits<RhsCleaned>::Scalar>::ReturnType Scalar;
+  typedef typename product_promote_storage_type<typename LhsTraits::StorageKind,
+                                                typename RhsTraits::StorageKind,
+                                                internal::product_type<Lhs,Rhs>::ret>::ret StorageKind;
+  typedef typename promote_index_type<typename LhsTraits::StorageIndex,
+                                      typename RhsTraits::StorageIndex>::type StorageIndex;
+
+  enum {
+    RowsAtCompileTime    = LhsTraits::RowsAtCompileTime,
+    ColsAtCompileTime    = RhsTraits::ColsAtCompileTime,
+    MaxRowsAtCompileTime = LhsTraits::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = RhsTraits::MaxColsAtCompileTime,
+
+    // FIXME: only needed by GeneralMatrixMatrixTriangular
+    InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsTraits::ColsAtCompileTime, RhsTraits::RowsAtCompileTime),
+
+    // The storage order is somewhat arbitrary here. The correct one will be determined through the evaluator.
+    Flags = (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1) ? RowMajorBit
+          : (MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1) ? 0
+          : (   ((LhsTraits::Flags&NoPreferredStorageOrderBit) && (RhsTraits::Flags&RowMajorBit))
+             || ((RhsTraits::Flags&NoPreferredStorageOrderBit) && (LhsTraits::Flags&RowMajorBit)) ) ? RowMajorBit
+          : NoPreferredStorageOrderBit
+  };
+};
+
+} // end namespace internal
+
+/** \class Product
+  * \ingroup Core_Module
+  *
+  * \brief Expression of the product of two arbitrary matrices or vectors
+  *
+  * \tparam _Lhs the type of the left-hand side expression
+  * \tparam _Rhs the type of the right-hand side expression
+  *
+  * This class represents an expression of the product of two arbitrary matrices.
+  *
+  * The other template parameters are:
+  * \tparam Option     can be DefaultProduct, AliasFreeProduct, or LazyProduct
+  *
+  */
+template<typename _Lhs, typename _Rhs, int Option>
+class Product : public ProductImpl<_Lhs,_Rhs,Option,
+                                   typename internal::product_promote_storage_type<typename internal::traits<_Lhs>::StorageKind,
+                                                                                   typename internal::traits<_Rhs>::StorageKind,
+                                                                                   internal::product_type<_Lhs,_Rhs>::ret>::ret>
+{
+  public:
+
+    typedef _Lhs Lhs;
+    typedef _Rhs Rhs;
+
+    typedef typename ProductImpl<
+        Lhs, Rhs, Option,
+        typename internal::product_promote_storage_type<typename internal::traits<Lhs>::StorageKind,
+                                                        typename internal::traits<Rhs>::StorageKind,
+                                                        internal::product_type<Lhs,Rhs>::ret>::ret>::Base Base;
+    EIGEN_GENERIC_PUBLIC_INTERFACE(Product)
+
+    typedef typename internal::ref_selector<Lhs>::type LhsNested;
+    typedef typename internal::ref_selector<Rhs>::type RhsNested;
+    typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
+    typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs)
+    {
+      eigen_assert(lhs.cols() == rhs.rows()
+        && "invalid matrix product"
+        && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const LhsNestedCleaned& lhs() const { return m_lhs; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const RhsNestedCleaned& rhs() const { return m_rhs; }
+
+  protected:
+
+    LhsNested m_lhs;
+    RhsNested m_rhs;
+};
+
+namespace internal {
+
+template<typename Lhs, typename Rhs, int Option, int ProductTag = internal::product_type<Lhs,Rhs>::ret>
+class dense_product_base
+ : public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
+{};
+
+/** Conversion to scalar for inner-products */
+template<typename Lhs, typename Rhs, int Option>
+class dense_product_base<Lhs, Rhs, Option, InnerProduct>
+ : public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
+{
+  typedef Product<Lhs,Rhs,Option> ProductXpr;
+  typedef typename internal::dense_xpr_base<ProductXpr>::type Base;
+public:
+  using Base::derived;
+  typedef typename Base::Scalar Scalar;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator const Scalar() const
+  {
+    return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
+  }
+};
+
+} // namespace internal
+
+// Generic API dispatcher
+template<typename Lhs, typename Rhs, int Option, typename StorageKind>
+class ProductImpl : public internal::generic_xpr_base<Product<Lhs,Rhs,Option>, MatrixXpr, StorageKind>::type
+{
+  public:
+    typedef typename internal::generic_xpr_base<Product<Lhs,Rhs,Option>, MatrixXpr, StorageKind>::type Base;
+};
+
+template<typename Lhs, typename Rhs, int Option>
+class ProductImpl<Lhs,Rhs,Option,Dense>
+  : public internal::dense_product_base<Lhs,Rhs,Option>
+{
+    typedef Product<Lhs, Rhs, Option> Derived;
+
+  public:
+
+    typedef typename internal::dense_product_base<Lhs, Rhs, Option> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
+  protected:
+    enum {
+      IsOneByOne = (RowsAtCompileTime == 1 || RowsAtCompileTime == Dynamic) &&
+                   (ColsAtCompileTime == 1 || ColsAtCompileTime == Dynamic),
+      EnableCoeff = IsOneByOne || Option==LazyProduct
+    };
+
+  public:
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const
+    {
+      EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
+      eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
+
+      return internal::evaluator<Derived>(derived()).coeff(row,col);
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index i) const
+    {
+      EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
+      eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
+
+      return internal::evaluator<Derived>(derived()).coeff(i);
+    }
+
+
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_PRODUCT_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/ProductEvaluators.h b/src/3rdparty/eigen/Eigen/src/Core/ProductEvaluators.h
new file mode 100644
index 000000000..8cf294b28
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/ProductEvaluators.h
@@ -0,0 +1,1179 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_PRODUCTEVALUATORS_H
+#define EIGEN_PRODUCTEVALUATORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal
+  * Evaluator of a product expression.
+  * Since products require special treatments to handle all possible cases,
+  * we simply defer the evaluation logic to a product_evaluator class
+  * which offers more partial specialization possibilities.
+  *
+  * \sa class product_evaluator
+  */
+template<typename Lhs, typename Rhs, int Options>
+struct evaluator<Product<Lhs, Rhs, Options> >
+ : public product_evaluator<Product<Lhs, Rhs, Options> >
+{
+  typedef Product<Lhs, Rhs, Options> XprType;
+  typedef product_evaluator<XprType> Base;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {}
+};
+
+// Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B"
+// TODO we should apply that rule only if that's really helpful
+template<typename Lhs, typename Rhs, typename Scalar1, typename Scalar2, typename Plain1>
+struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+                                               const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
+                                               const Product<Lhs, Rhs, DefaultProduct> > >
+{
+  static const bool value = true;
+};
+template<typename Lhs, typename Rhs, typename Scalar1, typename Scalar2, typename Plain1>
+struct evaluator<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+                               const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
+                               const Product<Lhs, Rhs, DefaultProduct> > >
+ : public evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> >
+{
+  typedef CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+                               const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
+                               const Product<Lhs, Rhs, DefaultProduct> > XprType;
+  typedef evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> > Base;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
+    : Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs())
+  {}
+};
+
+
+template<typename Lhs, typename Rhs, int DiagIndex>
+struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> >
+ : public evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> >
+{
+  typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType;
+  typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> > Base;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
+    : Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
+        Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()),
+        xpr.index() ))
+  {}
+};
+
+
+// Helper class to perform a matrix product with the destination at hand.
+// Depending on the sizes of the factors, there are different evaluation strategies
+// as controlled by internal::product_type.
+template< typename Lhs, typename Rhs,
+          typename LhsShape = typename evaluator_traits<Lhs>::Shape,
+          typename RhsShape = typename evaluator_traits<Rhs>::Shape,
+          int ProductType = internal::product_type<Lhs,Rhs>::value>
+struct generic_product_impl;
+
+template<typename Lhs, typename Rhs>
+struct evaluator_assume_aliasing<Product<Lhs, Rhs, DefaultProduct> > {
+  static const bool value = true;
+};
+
+// This is the default evaluator implementation for products:
+// It creates a temporary and call generic_product_impl
+template<typename Lhs, typename Rhs, int Options, int ProductTag, typename LhsShape, typename RhsShape>
+struct product_evaluator<Product<Lhs, Rhs, Options>, ProductTag, LhsShape, RhsShape>
+  : public evaluator<typename Product<Lhs, Rhs, Options>::PlainObject>
+{
+  typedef Product<Lhs, Rhs, Options> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+  enum {
+    Flags = Base::Flags | EvalBeforeNestingBit
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit product_evaluator(const XprType& xpr)
+    : m_result(xpr.rows(), xpr.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+
+// FIXME shall we handle nested_eval here?,
+// if so, then we must take care at removing the call to nested_eval in the specializations (e.g., in permutation_matrix_product, transposition_matrix_product, etc.)
+//     typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
+//     typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
+//     typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
+//     typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
+//
+//     const LhsNested lhs(xpr.lhs());
+//     const RhsNested rhs(xpr.rhs());
+//
+//     generic_product_impl<LhsNestedCleaned, RhsNestedCleaned>::evalTo(m_result, lhs, rhs);
+
+    generic_product_impl<Lhs, Rhs, LhsShape, RhsShape, ProductTag>::evalTo(m_result, xpr.lhs(), xpr.rhs());
+  }
+
+protected:
+  PlainObject m_result;
+};
+
+// The following three shortcuts are enabled only if the scalar types match exactly.
+// TODO: we could enable them for different scalar types when the product is not vectorized.
+
+// Dense = Product
+template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scalar,Scalar>, Dense2Dense,
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
+{
+  typedef Product<Lhs,Rhs,Options> SrcXprType;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+    // FIXME shall we handle nested_eval here?
+    generic_product_impl<Lhs, Rhs>::evalTo(dst, src.lhs(), src.rhs());
+  }
+};
+
+// Dense += Product
+template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<Scalar,Scalar>, Dense2Dense,
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
+{
+  typedef Product<Lhs,Rhs,Options> SrcXprType;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,Scalar> &)
+  {
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    // FIXME shall we handle nested_eval here?
+    generic_product_impl<Lhs, Rhs>::addTo(dst, src.lhs(), src.rhs());
+  }
+};
+
+// Dense -= Product
+template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<Scalar,Scalar>, Dense2Dense,
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
+{
+  typedef Product<Lhs,Rhs,Options> SrcXprType;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,Scalar> &)
+  {
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    // FIXME shall we handle nested_eval here?
+    generic_product_impl<Lhs, Rhs>::subTo(dst, src.lhs(), src.rhs());
+  }
+};
+
+
+// Dense ?= scalar * Product
+// TODO we should apply that rule if that's really helpful
+// for instance, this is not good for inner products
+template< typename DstXprType, typename Lhs, typename Rhs, typename AssignFunc, typename Scalar, typename ScalarBis, typename Plain>
+struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>, const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
+                                           const Product<Lhs,Rhs,DefaultProduct> >, AssignFunc, Dense2Dense>
+{
+  typedef CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>,
+                        const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
+                        const Product<Lhs,Rhs,DefaultProduct> > SrcXprType;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func)
+  {
+    call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func);
+  }
+};
+
+//----------------------------------------
+// Catch "Dense ?= xpr + Product<>" expression to save one temporary
+// FIXME we could probably enable these rules for any product, i.e., not only Dense and DefaultProduct
+
+template<typename OtherXpr, typename Lhs, typename Rhs>
+struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_sum_op<typename OtherXpr::Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, const OtherXpr,
+                                               const Product<Lhs,Rhs,DefaultProduct> >, DenseShape > {
+  static const bool value = true;
+};
+
+template<typename OtherXpr, typename Lhs, typename Rhs>
+struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_difference_op<typename OtherXpr::Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, const OtherXpr,
+                                               const Product<Lhs,Rhs,DefaultProduct> >, DenseShape > {
+  static const bool value = true;
+};
+
+template<typename DstXprType, typename OtherXpr, typename ProductType, typename Func1, typename Func2>
+struct assignment_from_xpr_op_product
+{
+  template<typename SrcXprType, typename InitialFunc>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/)
+  {
+    call_assignment_no_alias(dst, src.lhs(), Func1());
+    call_assignment_no_alias(dst, src.rhs(), Func2());
+  }
+};
+
+#define EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(ASSIGN_OP,BINOP,ASSIGN_OP2) \
+  template< typename DstXprType, typename OtherXpr, typename Lhs, typename Rhs, typename DstScalar, typename SrcScalar, typename OtherScalar,typename ProdScalar> \
+  struct Assignment<DstXprType, CwiseBinaryOp<internal::BINOP<OtherScalar,ProdScalar>, const OtherXpr, \
+                                            const Product<Lhs,Rhs,DefaultProduct> >, internal::ASSIGN_OP<DstScalar,SrcScalar>, Dense2Dense> \
+    : assignment_from_xpr_op_product<DstXprType, OtherXpr, Product<Lhs,Rhs,DefaultProduct>, internal::ASSIGN_OP<DstScalar,OtherScalar>, internal::ASSIGN_OP2<DstScalar,ProdScalar> > \
+  {}
+
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(assign_op,    scalar_sum_op,add_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(add_assign_op,scalar_sum_op,add_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(sub_assign_op,scalar_sum_op,sub_assign_op);
+
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(assign_op,    scalar_difference_op,sub_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(add_assign_op,scalar_difference_op,sub_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(sub_assign_op,scalar_difference_op,add_assign_op);
+
+//----------------------------------------
+
+template<typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
+{
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
+  }
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum();
+  }
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  { dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); }
+};
+
+
+/***********************************************************************
+*  Implementation of outer dense * dense vector product
+***********************************************************************/
+
+// Column major result
+template<typename Dst, typename Lhs, typename Rhs, typename Func>
+void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
+{
+  evaluator<Rhs> rhsEval(rhs);
+  ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs);
+  // FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored
+  // FIXME not very good if rhs is real and lhs complex while alpha is real too
+  const Index cols = dst.cols();
+  for (Index j=0; j<cols; ++j)
+    func(dst.col(j), rhsEval.coeff(Index(0),j) * actual_lhs);
+}
+
+// Row major result
+template<typename Dst, typename Lhs, typename Rhs, typename Func>
+void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
+{
+  evaluator<Lhs> lhsEval(lhs);
+  ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs);
+  // FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored
+  // FIXME not very good if lhs is real and rhs complex while alpha is real too
+  const Index rows = dst.rows();
+  for (Index i=0; i<rows; ++i)
+    func(dst.row(i), lhsEval.coeff(i,Index(0)) * actual_rhs);
+}
+
+template<typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,OuterProduct>
+{
+  template<typename T> struct is_row_major : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {};
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+
+  // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
+  struct set  { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived()  = src; } };
+  struct add  { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
+  struct sub  { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
+  struct adds {
+    Scalar m_scale;
+    explicit adds(const Scalar& s) : m_scale(s) {}
+    template<typename Dst, typename Src> void EIGEN_DEVICE_FUNC operator()(const Dst& dst, const Src& src) const {
+      dst.const_cast_derived() += m_scale * src;
+    }
+  };
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
+  }
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
+  }
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
+  }
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
+  }
+
+};
+
+
+// This base class provides default implementations for evalTo, addTo, subTo, in terms of scaleAndAddTo
+template<typename Lhs, typename Rhs, typename Derived>
+struct generic_product_impl_base
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  { dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); }
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  { scaleAndAddTo(dst,lhs, rhs, Scalar(1)); }
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  { scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); }
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  { Derived::scaleAndAddTo(dst,lhs,rhs,alpha); }
+
+};
+
+template<typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct>
+  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct> >
+{
+  typedef typename nested_eval<Lhs,1>::type LhsNested;
+  typedef typename nested_eval<Rhs,1>::type RhsNested;
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  enum { Side = Lhs::IsVectorAtCompileTime ? OnTheLeft : OnTheRight };
+  typedef typename internal::remove_all<typename internal::conditional<int(Side)==OnTheRight,LhsNested,RhsNested>::type>::type MatrixType;
+
+  template<typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    // Fallback to inner product if both the lhs and rhs is a runtime vector.
+    if (lhs.rows() == 1 && rhs.cols() == 1) {
+      dst.coeffRef(0,0) += alpha * lhs.row(0).conjugate().dot(rhs.col(0));
+      return;
+    }
+    LhsNested actual_lhs(lhs);
+    RhsNested actual_rhs(rhs);
+    internal::gemv_dense_selector<Side,
+                            (int(MatrixType::Flags)&RowMajorBit) ? RowMajor : ColMajor,
+                            bool(internal::blas_traits<MatrixType>::HasUsableDirectAccess)
+                           >::run(actual_lhs, actual_rhs, dst, alpha);
+  }
+};
+
+template<typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    // Same as: dst.noalias() = lhs.lazyProduct(rhs);
+    // but easier on the compiler side
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
+  }
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    // dst.noalias() += lhs.lazyProduct(rhs);
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<typename Dst::Scalar,Scalar>());
+  }
+
+  template<typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    // dst.noalias() -= lhs.lazyProduct(rhs);
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
+  }
+
+  // This is a special evaluation path called from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
+  // This variant tries to extract scalar multiples from both the LHS and RHS and factor them out. For instance:
+  //   dst {,+,-}= (s1*A)*(B*s2)
+  // will be rewritten as:
+  //   dst {,+,-}= (s1*s2) * (A.lazyProduct(B))
+  // There are at least four benefits of doing so:
+  //  1 - huge performance gain for heap-allocated matrix types as it save costly allocations.
+  //  2 - it is faster than simply by-passing the heap allocation through stack allocation.
+  //  3 - it makes this fallback consistent with the heavy GEMM routine.
+  //  4 - it fully by-passes huge stack allocation attempts when multiplying huge fixed-size matrices.
+  //      (see https://stackoverflow.com/questions/54738495)
+  // For small fixed sizes matrices, howver, the gains are less obvious, it is sometimes x2 faster, but sometimes x3 slower,
+  // and the behavior depends also a lot on the compiler... This is why this re-writting strategy is currently
+  // enabled only when falling back from the main GEMM.
+  template<typename Dst, typename Func>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void eval_dynamic(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Func &func)
+  {
+    enum {
+      HasScalarFactor = blas_traits<Lhs>::HasScalarFactor || blas_traits<Rhs>::HasScalarFactor,
+      ConjLhs = blas_traits<Lhs>::NeedToConjugate,
+      ConjRhs = blas_traits<Rhs>::NeedToConjugate
+    };
+    // FIXME: in c++11 this should be auto, and extractScalarFactor should also return auto
+    //        this is important for real*complex_mat
+    Scalar actualAlpha = combine_scalar_factors<Scalar>(lhs, rhs);
+
+    eval_dynamic_impl(dst,
+                      blas_traits<Lhs>::extract(lhs).template conjugateIf<ConjLhs>(),
+                      blas_traits<Rhs>::extract(rhs).template conjugateIf<ConjRhs>(),
+                      func,
+                      actualAlpha,
+                      typename conditional<HasScalarFactor,true_type,false_type>::type());
+  }
+
+protected:
+
+  template<typename Dst, typename LhsT, typename RhsT, typename Func, typename Scalar>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar&  s /* == 1 */, false_type)
+  {
+    EIGEN_UNUSED_VARIABLE(s);
+    eigen_internal_assert(s==Scalar(1));
+    call_restricted_packet_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
+  }
+
+  template<typename Dst, typename LhsT, typename RhsT, typename Func, typename Scalar>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar& s, true_type)
+  {
+    call_restricted_packet_assignment_no_alias(dst, s * lhs.lazyProduct(rhs), func);
+  }
+};
+
+// This specialization enforces the use of a coefficient-based evaluation strategy
+template<typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,LazyCoeffBasedProductMode>
+  : generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> {};
+
+// Case 2: Evaluate coeff by coeff
+//
+// This is mostly taken from CoeffBasedProduct.h
+// The main difference is that we add an extra argument to the etor_product_*_impl::run() function
+// for the inner dimension of the product, because evaluator object do not know their size.
+
+template<int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
+struct etor_product_coeff_impl;
+
+template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl;
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, DenseShape>
+    : evaluator_base<Product<Lhs, Rhs, LazyProduct> >
+{
+  typedef Product<Lhs, Rhs, LazyProduct> XprType;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit product_evaluator(const XprType& xpr)
+    : m_lhs(xpr.lhs()),
+      m_rhs(xpr.rhs()),
+      m_lhsImpl(m_lhs),     // FIXME the creation of the evaluator objects should result in a no-op, but check that!
+      m_rhsImpl(m_rhs),     //       Moreover, they are only useful for the packet path, so we could completely disable them when not needed,
+                            //       or perhaps declare them on the fly on the packet method... We have experiment to check what's best.
+      m_innerDim(xpr.lhs().cols())
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::AddCost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+#if 0
+    std::cerr << "LhsOuterStrideBytes=  " << LhsOuterStrideBytes << "\n";
+    std::cerr << "RhsOuterStrideBytes=  " << RhsOuterStrideBytes << "\n";
+    std::cerr << "LhsAlignment=         " << LhsAlignment << "\n";
+    std::cerr << "RhsAlignment=         " << RhsAlignment << "\n";
+    std::cerr << "CanVectorizeLhs=      " << CanVectorizeLhs << "\n";
+    std::cerr << "CanVectorizeRhs=      " << CanVectorizeRhs << "\n";
+    std::cerr << "CanVectorizeInner=    " << CanVectorizeInner << "\n";
+    std::cerr << "EvalToRowMajor=       " << EvalToRowMajor << "\n";
+    std::cerr << "Alignment=            " << Alignment << "\n";
+    std::cerr << "Flags=                " << Flags << "\n";
+#endif
+  }
+
+  // Everything below here is taken from CoeffBasedProduct.h
+
+  typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
+  typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
+
+  typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
+  typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
+
+  typedef evaluator<LhsNestedCleaned> LhsEtorType;
+  typedef evaluator<RhsNestedCleaned> RhsEtorType;
+
+  enum {
+    RowsAtCompileTime = LhsNestedCleaned::RowsAtCompileTime,
+    ColsAtCompileTime = RhsNestedCleaned::ColsAtCompileTime,
+    InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsNestedCleaned::ColsAtCompileTime, RhsNestedCleaned::RowsAtCompileTime),
+    MaxRowsAtCompileTime = LhsNestedCleaned::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime
+  };
+
+  typedef typename find_best_packet<Scalar,RowsAtCompileTime>::type LhsVecPacketType;
+  typedef typename find_best_packet<Scalar,ColsAtCompileTime>::type RhsVecPacketType;
+
+  enum {
+
+    LhsCoeffReadCost = LhsEtorType::CoeffReadCost,
+    RhsCoeffReadCost = RhsEtorType::CoeffReadCost,
+    CoeffReadCost = InnerSize==0 ? NumTraits<Scalar>::ReadCost
+                  : InnerSize == Dynamic ? HugeCost
+                    : InnerSize * (NumTraits<Scalar>::MulCost + int(LhsCoeffReadCost) + int(RhsCoeffReadCost))
+                    + (InnerSize - 1) * NumTraits<Scalar>::AddCost,
+
+    Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
+
+    LhsFlags = LhsEtorType::Flags,
+    RhsFlags = RhsEtorType::Flags,
+
+    LhsRowMajor = LhsFlags & RowMajorBit,
+    RhsRowMajor = RhsFlags & RowMajorBit,
+
+    LhsVecPacketSize = unpacket_traits<LhsVecPacketType>::size,
+    RhsVecPacketSize = unpacket_traits<RhsVecPacketType>::size,
+
+    // Here, we don't care about alignment larger than the usable packet size.
+    LhsAlignment = EIGEN_PLAIN_ENUM_MIN(LhsEtorType::Alignment,LhsVecPacketSize*int(sizeof(typename LhsNestedCleaned::Scalar))),
+    RhsAlignment = EIGEN_PLAIN_ENUM_MIN(RhsEtorType::Alignment,RhsVecPacketSize*int(sizeof(typename RhsNestedCleaned::Scalar))),
+
+    SameType = is_same<typename LhsNestedCleaned::Scalar,typename RhsNestedCleaned::Scalar>::value,
+
+    CanVectorizeRhs = bool(RhsRowMajor) && (RhsFlags & PacketAccessBit) && (ColsAtCompileTime!=1),
+    CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit) && (RowsAtCompileTime!=1),
+
+    EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
+                    : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
+                    : (bool(RhsRowMajor) && !CanVectorizeLhs),
+
+    Flags = ((int(LhsFlags) | int(RhsFlags)) & HereditaryBits & ~RowMajorBit)
+          | (EvalToRowMajor ? RowMajorBit : 0)
+          // TODO enable vectorization for mixed types
+          | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0)
+          | (XprType::IsVectorAtCompileTime ? LinearAccessBit : 0),
+
+    LhsOuterStrideBytes = int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)),
+    RhsOuterStrideBytes = int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)),
+
+    Alignment = bool(CanVectorizeLhs) ? (LhsOuterStrideBytes<=0 || (int(LhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,LhsAlignment))!=0 ? 0 : LhsAlignment)
+              : bool(CanVectorizeRhs) ? (RhsOuterStrideBytes<=0 || (int(RhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,RhsAlignment))!=0 ? 0 : RhsAlignment)
+              : 0,
+
+    /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
+     * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
+     * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
+     * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
+     */
+    CanVectorizeInner =    SameType
+                        && LhsRowMajor
+                        && (!RhsRowMajor)
+                        && (int(LhsFlags) & int(RhsFlags) & ActualPacketAccessBit)
+                        && (int(InnerSize) % packet_traits<Scalar>::size == 0)
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index row, Index col) const
+  {
+    return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum();
+  }
+
+  /* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
+   * which is why we don't set the LinearAccessBit.
+   * TODO: this seems possible when the result is a vector
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const CoeffReturnType coeff(Index index) const
+  {
+    const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index;
+    const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? index : 0;
+    return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum();
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const PacketType packet(Index row, Index col) const
+  {
+    PacketType res;
+    typedef etor_product_packet_impl<bool(int(Flags)&RowMajorBit) ? RowMajor : ColMajor,
+                                     Unroll ? int(InnerSize) : Dynamic,
+                                     LhsEtorType, RhsEtorType, PacketType, LoadMode> PacketImpl;
+    PacketImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res);
+    return res;
+  }
+
+  template<int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const PacketType packet(Index index) const
+  {
+    const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index;
+    const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? index : 0;
+    return packet<LoadMode,PacketType>(row,col);
+  }
+
+protected:
+  typename internal::add_const_on_value_type<LhsNested>::type m_lhs;
+  typename internal::add_const_on_value_type<RhsNested>::type m_rhs;
+
+  LhsEtorType m_lhsImpl;
+  RhsEtorType m_rhsImpl;
+
+  // TODO: Get rid of m_innerDim if known at compile time
+  Index m_innerDim;
+};
+
+template<typename Lhs, typename Rhs>
+struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, LazyCoeffBasedProductMode, DenseShape, DenseShape>
+  : product_evaluator<Product<Lhs, Rhs, LazyProduct>, CoeffBasedProductMode, DenseShape, DenseShape>
+{
+  typedef Product<Lhs, Rhs, DefaultProduct> XprType;
+  typedef Product<Lhs, Rhs, LazyProduct> BaseProduct;
+  typedef product_evaluator<BaseProduct, CoeffBasedProductMode, DenseShape, DenseShape> Base;
+  enum {
+    Flags = Base::Flags | EvalBeforeNestingBit
+  };
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit product_evaluator(const XprType& xpr)
+    : Base(BaseProduct(xpr.lhs(),xpr.rhs()))
+  {}
+};
+
+/****************************************
+*** Coeff based product, Packet path  ***
+****************************************/
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
+  {
+    etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
+    res =  pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex-1))), rhs.template packet<LoadMode,Packet>(Index(UnrollingIndex-1), col), res);
+  }
+};
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
+  {
+    etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
+    res =  pmadd(lhs.template packet<LoadMode,Packet>(row, Index(UnrollingIndex-1)), pset1<Packet>(rhs.coeff(Index(UnrollingIndex-1), col)), res);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
+  {
+    res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))),rhs.template packet<LoadMode,Packet>(Index(0), col));
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
+  {
+    res = pmul(lhs.template packet<LoadMode,Packet>(row, Index(0)), pset1<Packet>(rhs.coeff(Index(0), col)));
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
+  {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
+  {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
+  {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+    for(Index i = 0; i < innerDim; ++i)
+      res =  pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode,Packet>(i, col), res);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
+  {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+    for(Index i = 0; i < innerDim; ++i)
+      res =  pmadd(lhs.template packet<LoadMode,Packet>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
+  }
+};
+
+
+/***************************************************************************
+* Triangular products
+***************************************************************************/
+template<int Mode, bool LhsIsTriangular,
+         typename Lhs, bool LhsIsVector,
+         typename Rhs, bool RhsIsVector>
+struct triangular_product_impl;
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs,Rhs,TriangularShape,DenseShape,ProductTag>
+  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,TriangularShape,DenseShape,ProductTag> >
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    triangular_product_impl<Lhs::Mode,true,typename Lhs::MatrixType,false,Rhs, Rhs::ColsAtCompileTime==1>
+        ::run(dst, lhs.nestedExpression(), rhs, alpha);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs,Rhs,DenseShape,TriangularShape,ProductTag>
+: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,TriangularShape,ProductTag> >
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    triangular_product_impl<Rhs::Mode,false,Lhs,Lhs::RowsAtCompileTime==1, typename Rhs::MatrixType, false>::run(dst, lhs, rhs.nestedExpression(), alpha);
+  }
+};
+
+
+/***************************************************************************
+* SelfAdjoint products
+***************************************************************************/
+template <typename Lhs, int LhsMode, bool LhsIsVector,
+          typename Rhs, int RhsMode, bool RhsIsVector>
+struct selfadjoint_product_impl;
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag>
+  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag> >
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+
+  template<typename Dest>
+  static EIGEN_DEVICE_FUNC
+  void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    selfadjoint_product_impl<typename Lhs::MatrixType,Lhs::Mode,false,Rhs,0,Rhs::IsVectorAtCompileTime>::run(dst, lhs.nestedExpression(), rhs, alpha);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs,Rhs,DenseShape,SelfAdjointShape,ProductTag>
+: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,SelfAdjointShape,ProductTag> >
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    selfadjoint_product_impl<Lhs,0,Lhs::IsVectorAtCompileTime,typename Rhs::MatrixType,Rhs::Mode,false>::run(dst, lhs, rhs.nestedExpression(), alpha);
+  }
+};
+
+
+/***************************************************************************
+* Diagonal products
+***************************************************************************/
+
+template<typename MatrixType, typename DiagonalType, typename Derived, int ProductOrder>
+struct diagonal_product_evaluator_base
+  : evaluator_base<Derived>
+{
+   typedef typename ScalarBinaryOpTraits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
+public:
+  enum {
+    CoeffReadCost = int(NumTraits<Scalar>::MulCost) + int(evaluator<MatrixType>::CoeffReadCost) + int(evaluator<DiagonalType>::CoeffReadCost),
+
+    MatrixFlags = evaluator<MatrixType>::Flags,
+    DiagFlags = evaluator<DiagonalType>::Flags,
+
+    _StorageOrder = (Derived::MaxRowsAtCompileTime==1 && Derived::MaxColsAtCompileTime!=1) ? RowMajor
+                  : (Derived::MaxColsAtCompileTime==1 && Derived::MaxRowsAtCompileTime!=1) ? ColMajor
+                  : MatrixFlags & RowMajorBit ? RowMajor : ColMajor,
+    _SameStorageOrder = _StorageOrder == (MatrixFlags & RowMajorBit ? RowMajor : ColMajor),
+
+    _ScalarAccessOnDiag =  !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft)
+                           ||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)),
+    _SameTypes = is_same<typename MatrixType::Scalar, typename DiagonalType::Scalar>::value,
+    // FIXME currently we need same types, but in the future the next rule should be the one
+    //_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagFlags)&PacketAccessBit))),
+    _Vectorizable =   bool(int(MatrixFlags)&PacketAccessBit)
+                  &&  _SameTypes
+                  && (_SameStorageOrder || (MatrixFlags&LinearAccessBit)==LinearAccessBit)
+                  && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
+    _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
+    Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
+    Alignment = evaluator<MatrixType>::Alignment,
+
+    AsScalarProduct =     (DiagonalType::SizeAtCompileTime==1)
+                      ||  (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft)
+                      ||  (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight)
+  };
+
+  EIGEN_DEVICE_FUNC diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
+    : m_diagImpl(diag), m_matImpl(mat)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
+  {
+    if(AsScalarProduct)
+      return m_diagImpl.coeff(0) * m_matImpl.coeff(idx);
+    else
+      return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
+  }
+
+protected:
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::true_type) const
+  {
+    return internal::pmul(m_matImpl.template packet<LoadMode,PacketType>(row, col),
+                          internal::pset1<PacketType>(m_diagImpl.coeff(id)));
+  }
+
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::false_type) const
+  {
+    enum {
+      InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
+      DiagonalPacketLoadMode = EIGEN_PLAIN_ENUM_MIN(LoadMode,((InnerSize%16) == 0) ? int(Aligned16) : int(evaluator<DiagonalType>::Alignment)) // FIXME hardcoded 16!!
+    };
+    return internal::pmul(m_matImpl.template packet<LoadMode,PacketType>(row, col),
+                          m_diagImpl.template packet<DiagonalPacketLoadMode,PacketType>(id));
+  }
+
+  evaluator<DiagonalType> m_diagImpl;
+  evaluator<MatrixType>   m_matImpl;
+};
+
+// diagonal * dense
+template<typename Lhs, typename Rhs, int ProductKind, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalShape, DenseShape>
+  : diagonal_product_evaluator_base<Rhs, typename Lhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheLeft>
+{
+  typedef diagonal_product_evaluator_base<Rhs, typename Lhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheLeft> Base;
+  using Base::m_diagImpl;
+  using Base::m_matImpl;
+  using Base::coeff;
+  typedef typename Base::Scalar Scalar;
+
+  typedef Product<Lhs, Rhs, ProductKind> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef typename Lhs::DiagonalVectorType DiagonalType;
+
+
+  enum { StorageOrder = Base::_StorageOrder };
+
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+    : Base(xpr.rhs(), xpr.lhs().diagonal())
+  {
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
+  {
+    return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
+  }
+
+#ifndef EIGEN_GPUCC
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
+  {
+    // FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case.
+    // See also similar calls below.
+    return this->template packet_impl<LoadMode,PacketType>(row,col, row,
+                                 typename internal::conditional<int(StorageOrder)==RowMajor, internal::true_type, internal::false_type>::type());
+  }
+
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index idx) const
+  {
+    return packet<LoadMode,PacketType>(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
+  }
+#endif
+};
+
+// dense * diagonal
+template<typename Lhs, typename Rhs, int ProductKind, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape, DiagonalShape>
+  : diagonal_product_evaluator_base<Lhs, typename Rhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheRight>
+{
+  typedef diagonal_product_evaluator_base<Lhs, typename Rhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheRight> Base;
+  using Base::m_diagImpl;
+  using Base::m_matImpl;
+  using Base::coeff;
+  typedef typename Base::Scalar Scalar;
+
+  typedef Product<Lhs, Rhs, ProductKind> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+
+  enum { StorageOrder = Base::_StorageOrder };
+
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+    : Base(xpr.lhs(), xpr.rhs().diagonal())
+  {
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
+  {
+    return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
+  }
+
+#ifndef EIGEN_GPUCC
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
+  {
+    return this->template packet_impl<LoadMode,PacketType>(row,col, col,
+                                 typename internal::conditional<int(StorageOrder)==ColMajor, internal::true_type, internal::false_type>::type());
+  }
+
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index idx) const
+  {
+    return packet<LoadMode,PacketType>(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
+  }
+#endif
+};
+
+/***************************************************************************
+* Products with permutation matrices
+***************************************************************************/
+
+/** \internal
+  * \class permutation_matrix_product
+  * Internal helper class implementing the product between a permutation matrix and a matrix.
+  * This class is specialized for DenseShape below and for SparseShape in SparseCore/SparsePermutation.h
+  */
+template<typename ExpressionType, int Side, bool Transposed, typename ExpressionShape>
+struct permutation_matrix_product;
+
+template<typename ExpressionType, int Side, bool Transposed>
+struct permutation_matrix_product<ExpressionType, Side, Transposed, DenseShape>
+{
+    typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
+    typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
+
+    template<typename Dest, typename PermutationType>
+    static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr)
+    {
+      MatrixType mat(xpr);
+      const Index n = Side==OnTheLeft ? mat.rows() : mat.cols();
+      // FIXME we need an is_same for expression that is not sensitive to constness. For instance
+      // is_same_xpr<Block<const Matrix>, Block<Matrix> >::value should be true.
+      //if(is_same<MatrixTypeCleaned,Dest>::value && extract_data(dst) == extract_data(mat))
+      if(is_same_dense(dst, mat))
+      {
+        // apply the permutation inplace
+        Matrix<bool,PermutationType::RowsAtCompileTime,1,0,PermutationType::MaxRowsAtCompileTime> mask(perm.size());
+        mask.fill(false);
+        Index r = 0;
+        while(r < perm.size())
+        {
+          // search for the next seed
+          while(r<perm.size() && mask[r]) r++;
+          if(r>=perm.size())
+            break;
+          // we got one, let's follow it until we are back to the seed
+          Index k0 = r++;
+          Index kPrev = k0;
+          mask.coeffRef(k0) = true;
+          for(Index k=perm.indices().coeff(k0); k!=k0; k=perm.indices().coeff(k))
+          {
+                  Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>(dst, k)
+            .swap(Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
+                       (dst,((Side==OnTheLeft) ^ Transposed) ? k0 : kPrev));
+
+            mask.coeffRef(k) = true;
+            kPrev = k;
+          }
+        }
+      }
+      else
+      {
+        for(Index i = 0; i < n; ++i)
+        {
+          Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
+               (dst, ((Side==OnTheLeft) ^ Transposed) ? perm.indices().coeff(i) : i)
+
+          =
+
+          Block<const MatrixTypeCleaned,Side==OnTheLeft ? 1 : MatrixTypeCleaned::RowsAtCompileTime,Side==OnTheRight ? 1 : MatrixTypeCleaned::ColsAtCompileTime>
+               (mat, ((Side==OnTheRight) ^ Transposed) ? perm.indices().coeff(i) : i);
+        }
+      }
+    }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, PermutationShape, MatrixShape, ProductTag>
+{
+  template<typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    permutation_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, MatrixShape, PermutationShape, ProductTag>
+{
+  template<typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    permutation_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Inverse<Lhs>, Rhs, PermutationShape, MatrixShape, ProductTag>
+{
+  template<typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Inverse<Lhs>& lhs, const Rhs& rhs)
+  {
+    permutation_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Inverse<Rhs>, MatrixShape, PermutationShape, ProductTag>
+{
+  template<typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Inverse<Rhs>& rhs)
+  {
+    permutation_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
+  }
+};
+
+
+/***************************************************************************
+* Products with transpositions matrices
+***************************************************************************/
+
+// FIXME could we unify Transpositions and Permutation into a single "shape"??
+
+/** \internal
+  * \class transposition_matrix_product
+  * Internal helper class implementing the product between a permutation matrix and a matrix.
+  */
+template<typename ExpressionType, int Side, bool Transposed, typename ExpressionShape>
+struct transposition_matrix_product
+{
+  typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
+  typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
+
+  template<typename Dest, typename TranspositionType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Dest& dst, const TranspositionType& tr, const ExpressionType& xpr)
+  {
+    MatrixType mat(xpr);
+    typedef typename TranspositionType::StorageIndex StorageIndex;
+    const Index size = tr.size();
+    StorageIndex j = 0;
+
+    if(!is_same_dense(dst,mat))
+      dst = mat;
+
+    for(Index k=(Transposed?size-1:0) ; Transposed?k>=0:k<size ; Transposed?--k:++k)
+      if(Index(j=tr.coeff(k))!=k)
+      {
+        if(Side==OnTheLeft)        dst.row(k).swap(dst.row(j));
+        else if(Side==OnTheRight)  dst.col(k).swap(dst.col(j));
+      }
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, TranspositionsShape, MatrixShape, ProductTag>
+{
+  template<typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    transposition_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, MatrixShape, TranspositionsShape, ProductTag>
+{
+  template<typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    transposition_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
+  }
+};
+
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Transpose<Lhs>, Rhs, TranspositionsShape, MatrixShape, ProductTag>
+{
+  template<typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Transpose<Lhs>& lhs, const Rhs& rhs)
+  {
+    transposition_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Transpose<Rhs>, MatrixShape, TranspositionsShape, ProductTag>
+{
+  template<typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Transpose<Rhs>& rhs)
+  {
+    transposition_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PRODUCT_EVALUATORS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/Random.h b/src/3rdparty/eigen/Eigen/src/Core/Random.h
new file mode 100644
index 000000000..dab2ac8e9
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/Random.h
@@ -0,0 +1,218 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_RANDOM_H
+#define EIGEN_RANDOM_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename Scalar> struct scalar_random_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_random_op)
+  inline const Scalar operator() () const { return random<Scalar>(); }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_random_op<Scalar> >
+{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false, IsRepeatable = false }; };
+
+} // end namespace internal
+
+/** \returns a random matrix expression
+  *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  * 
+  * The parameters \a rows and \a cols are the number of rows and of columns of
+  * the returned matrix. Must be compatible with this MatrixBase type.
+  *
+  * \not_reentrant
+  * 
+  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
+  * it is redundant to pass \a rows and \a cols as arguments, so Random() should be used
+  * instead.
+  * 
+  *
+  * Example: \include MatrixBase_random_int_int.cpp
+  * Output: \verbinclude MatrixBase_random_int_int.out
+  *
+  * This expression has the "evaluate before nesting" flag so that it will be evaluated into
+  * a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
+  * behavior with expressions involving random matrices.
+  * 
+  * See DenseBase::NullaryExpr(Index, const CustomNullaryOp&) for an example using C++11 random generators.
+  *
+  * \sa DenseBase::setRandom(), DenseBase::Random(Index), DenseBase::Random()
+  */
+template<typename Derived>
+inline const typename DenseBase<Derived>::RandomReturnType
+DenseBase<Derived>::Random(Index rows, Index cols)
+{
+  return NullaryExpr(rows, cols, internal::scalar_random_op<Scalar>());
+}
+
+/** \returns a random vector expression
+  *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  *
+  * The parameter \a size is the size of the returned vector.
+  * Must be compatible with this MatrixBase type.
+  *
+  * \only_for_vectors
+  * \not_reentrant
+  *
+  * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
+  * it is redundant to pass \a size as argument, so Random() should be used
+  * instead.
+  *
+  * Example: \include MatrixBase_random_int.cpp
+  * Output: \verbinclude MatrixBase_random_int.out
+  *
+  * This expression has the "evaluate before nesting" flag so that it will be evaluated into
+  * a temporary vector whenever it is nested in a larger expression. This prevents unexpected
+  * behavior with expressions involving random matrices.
+  *
+  * \sa DenseBase::setRandom(), DenseBase::Random(Index,Index), DenseBase::Random()
+  */
+template<typename Derived>
+inline const typename DenseBase<Derived>::RandomReturnType
+DenseBase<Derived>::Random(Index size)
+{
+  return NullaryExpr(size, internal::scalar_random_op<Scalar>());
+}
+
+/** \returns a fixed-size random matrix or vector expression
+  *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  * 
+  * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
+  * need to use the variants taking size arguments.
+  *
+  * Example: \include MatrixBase_random.cpp
+  * Output: \verbinclude MatrixBase_random.out
+  *
+  * This expression has the "evaluate before nesting" flag so that it will be evaluated into
+  * a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
+  * behavior with expressions involving random matrices.
+  * 
+  * \not_reentrant
+  *
+  * \sa DenseBase::setRandom(), DenseBase::Random(Index,Index), DenseBase::Random(Index)
+  */
+template<typename Derived>
+inline const typename DenseBase<Derived>::RandomReturnType
+DenseBase<Derived>::Random()
+{
+  return NullaryExpr(RowsAtCompileTime, ColsAtCompileTime, internal::scalar_random_op<Scalar>());
+}
+
+/** Sets all coefficients in this expression to random values.
+  *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  * 
+  * \not_reentrant
+  * 
+  * Example: \include MatrixBase_setRandom.cpp
+  * Output: \verbinclude MatrixBase_setRandom.out
+  *
+  * \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index)
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC inline Derived& DenseBase<Derived>::setRandom()
+{
+  return *this = Random(rows(), cols());
+}
+
+/** Resizes to the given \a newSize, and sets all coefficients in this expression to random values.
+  *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  * 
+  * \only_for_vectors
+  * \not_reentrant
+  *
+  * Example: \include Matrix_setRandom_int.cpp
+  * Output: \verbinclude Matrix_setRandom_int.out
+  *
+  * \sa DenseBase::setRandom(), setRandom(Index,Index), class CwiseNullaryOp, DenseBase::Random()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setRandom(Index newSize)
+{
+  resize(newSize);
+  return setRandom();
+}
+
+/** Resizes to the given size, and sets all coefficients in this expression to random values.
+  *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  *
+  * \not_reentrant
+  * 
+  * \param rows the new number of rows
+  * \param cols the new number of columns
+  *
+  * Example: \include Matrix_setRandom_int_int.cpp
+  * Output: \verbinclude Matrix_setRandom_int_int.out
+  *
+  * \sa DenseBase::setRandom(), setRandom(Index), class CwiseNullaryOp, DenseBase::Random()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setRandom(Index rows, Index cols)
+{
+  resize(rows, cols);
+  return setRandom();
+}
+
+/** Resizes to the given size, changing only the number of columns, and sets all
+  * coefficients in this expression to random values. For the parameter of type
+  * NoChange_t, just pass the special value \c NoChange.
+  *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  *
+  * \not_reentrant
+  *
+  * \sa DenseBase::setRandom(), setRandom(Index), setRandom(Index, NoChange_t), class CwiseNullaryOp, DenseBase::Random()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setRandom(NoChange_t, Index cols)
+{
+  return setRandom(rows(), cols);
+}
+
+/** Resizes to the given size, changing only the number of rows, and sets all
+  * coefficients in this expression to random values. For the parameter of type
+  * NoChange_t, just pass the special value \c NoChange.
+  *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  *
+  * \not_reentrant
+  *
+  * \sa DenseBase::setRandom(), setRandom(Index), setRandom(NoChange_t, Index), class CwiseNullaryOp, DenseBase::Random()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setRandom(Index rows, NoChange_t)
+{
+  return setRandom(rows, cols());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_RANDOM_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/Redux.h b/src/3rdparty/eigen/Eigen/src/Core/Redux.h
new file mode 100644
index 000000000..b6790d110
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/Redux.h
@@ -0,0 +1,515 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_REDUX_H
+#define EIGEN_REDUX_H
+
+namespace Eigen { 
+
+namespace internal {
+
+// TODO
+//  * implement other kind of vectorization
+//  * factorize code
+
+/***************************************************************************
+* Part 1 : the logic deciding a strategy for vectorization and unrolling
+***************************************************************************/
+
+template<typename Func, typename Evaluator>
+struct redux_traits
+{
+public:
+    typedef typename find_best_packet<typename Evaluator::Scalar,Evaluator::SizeAtCompileTime>::type PacketType;
+  enum {
+    PacketSize = unpacket_traits<PacketType>::size,
+    InnerMaxSize = int(Evaluator::IsRowMajor)
+                 ? Evaluator::MaxColsAtCompileTime
+                 : Evaluator::MaxRowsAtCompileTime,
+    OuterMaxSize = int(Evaluator::IsRowMajor)
+                 ? Evaluator::MaxRowsAtCompileTime
+                 : Evaluator::MaxColsAtCompileTime,
+    SliceVectorizedWork = int(InnerMaxSize)==Dynamic ? Dynamic
+                        : int(OuterMaxSize)==Dynamic ? (int(InnerMaxSize)>=int(PacketSize) ? Dynamic : 0)
+                        : (int(InnerMaxSize)/int(PacketSize)) * int(OuterMaxSize)
+  };
+
+  enum {
+    MightVectorize = (int(Evaluator::Flags)&ActualPacketAccessBit)
+                  && (functor_traits<Func>::PacketAccess),
+    MayLinearVectorize = bool(MightVectorize) && (int(Evaluator::Flags)&LinearAccessBit),
+    MaySliceVectorize  = bool(MightVectorize) && (int(SliceVectorizedWork)==Dynamic || int(SliceVectorizedWork)>=3)
+  };
+
+public:
+  enum {
+    Traversal = int(MayLinearVectorize) ? int(LinearVectorizedTraversal)
+              : int(MaySliceVectorize)  ? int(SliceVectorizedTraversal)
+                                        : int(DefaultTraversal)
+  };
+
+public:
+  enum {
+    Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost
+         : int(Evaluator::SizeAtCompileTime) * int(Evaluator::CoeffReadCost) + (Evaluator::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
+    UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
+  };
+
+public:
+  enum {
+    Unrolling = Cost <= UnrollingLimit ? CompleteUnrolling : NoUnrolling
+  };
+  
+#ifdef EIGEN_DEBUG_ASSIGN
+  static void debug()
+  {
+    std::cerr << "Xpr: " << typeid(typename Evaluator::XprType).name() << std::endl;
+    std::cerr.setf(std::ios::hex, std::ios::basefield);
+    EIGEN_DEBUG_VAR(Evaluator::Flags)
+    std::cerr.unsetf(std::ios::hex);
+    EIGEN_DEBUG_VAR(InnerMaxSize)
+    EIGEN_DEBUG_VAR(OuterMaxSize)
+    EIGEN_DEBUG_VAR(SliceVectorizedWork)
+    EIGEN_DEBUG_VAR(PacketSize)
+    EIGEN_DEBUG_VAR(MightVectorize)
+    EIGEN_DEBUG_VAR(MayLinearVectorize)
+    EIGEN_DEBUG_VAR(MaySliceVectorize)
+    std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl;
+    EIGEN_DEBUG_VAR(UnrollingLimit)
+    std::cerr << "Unrolling" << " = " << Unrolling << " (" << demangle_unrolling(Unrolling) << ")" << std::endl;
+    std::cerr << std::endl;
+  }
+#endif
+};
+
+/***************************************************************************
+* Part 2 : unrollers
+***************************************************************************/
+
+/*** no vectorization ***/
+
+template<typename Func, typename Evaluator, int Start, int Length>
+struct redux_novec_unroller
+{
+  enum {
+    HalfLength = Length/2
+  };
+
+  typedef typename Evaluator::Scalar Scalar;
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func& func)
+  {
+    return func(redux_novec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
+                redux_novec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func));
+  }
+};
+
+template<typename Func, typename Evaluator, int Start>
+struct redux_novec_unroller<Func, Evaluator, Start, 1>
+{
+  enum {
+    outer = Start / Evaluator::InnerSizeAtCompileTime,
+    inner = Start % Evaluator::InnerSizeAtCompileTime
+  };
+
+  typedef typename Evaluator::Scalar Scalar;
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func&)
+  {
+    return eval.coeffByOuterInner(outer, inner);
+  }
+};
+
+// This is actually dead code and will never be called. It is required
+// to prevent false warnings regarding failed inlining though
+// for 0 length run() will never be called at all.
+template<typename Func, typename Evaluator, int Start>
+struct redux_novec_unroller<Func, Evaluator, Start, 0>
+{
+  typedef typename Evaluator::Scalar Scalar;
+  EIGEN_DEVICE_FUNC 
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); }
+};
+
+/*** vectorization ***/
+
+template<typename Func, typename Evaluator, int Start, int Length>
+struct redux_vec_unroller
+{
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE PacketType run(const Evaluator &eval, const Func& func)
+  {
+    enum {
+      PacketSize = unpacket_traits<PacketType>::size,
+      HalfLength = Length/2
+    };
+
+    return func.packetOp(
+            redux_vec_unroller<Func, Evaluator, Start, HalfLength>::template run<PacketType>(eval,func),
+            redux_vec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::template run<PacketType>(eval,func) );
+  }
+};
+
+template<typename Func, typename Evaluator, int Start>
+struct redux_vec_unroller<Func, Evaluator, Start, 1>
+{
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE PacketType run(const Evaluator &eval, const Func&)
+  {
+    enum {
+      PacketSize = unpacket_traits<PacketType>::size,
+      index = Start * PacketSize,
+      outer = index / int(Evaluator::InnerSizeAtCompileTime),
+      inner = index % int(Evaluator::InnerSizeAtCompileTime),
+      alignment = Evaluator::Alignment
+    };
+    return eval.template packetByOuterInner<alignment,PacketType>(outer, inner);
+  }
+};
+
+/***************************************************************************
+* Part 3 : implementation of all cases
+***************************************************************************/
+
+template<typename Func, typename Evaluator,
+         int Traversal = redux_traits<Func, Evaluator>::Traversal,
+         int Unrolling = redux_traits<Func, Evaluator>::Unrolling
+>
+struct redux_impl;
+
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>
+{
+  typedef typename Evaluator::Scalar Scalar;
+
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
+  {
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
+    Scalar res;
+    res = eval.coeffByOuterInner(0, 0);
+    for(Index i = 1; i < xpr.innerSize(); ++i)
+      res = func(res, eval.coeffByOuterInner(0, i));
+    for(Index i = 1; i < xpr.outerSize(); ++i)
+      for(Index j = 0; j < xpr.innerSize(); ++j)
+        res = func(res, eval.coeffByOuterInner(i, j));
+    return res;
+  }
+};
+
+template<typename Func, typename Evaluator>
+struct redux_impl<Func,Evaluator, DefaultTraversal, CompleteUnrolling>
+  : redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime>
+{
+  typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
+  typedef typename Evaluator::Scalar Scalar;
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  Scalar run(const Evaluator &eval, const Func& func, const XprType& /*xpr*/)
+  {
+    return Base::run(eval,func);
+  }
+};
+
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, NoUnrolling>
+{
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
+
+  template<typename XprType>
+  static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
+  {
+    const Index size = xpr.size();
+    
+    const Index packetSize = redux_traits<Func, Evaluator>::PacketSize;
+    const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
+    enum {
+      alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
+      alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment)
+    };
+    const Index alignedStart = internal::first_default_aligned(xpr);
+    const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
+    const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
+    const Index alignedEnd2 = alignedStart + alignedSize2;
+    const Index alignedEnd  = alignedStart + alignedSize;
+    Scalar res;
+    if(alignedSize)
+    {
+      PacketScalar packet_res0 = eval.template packet<alignment,PacketScalar>(alignedStart);
+      if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop
+      {
+        PacketScalar packet_res1 = eval.template packet<alignment,PacketScalar>(alignedStart+packetSize);
+        for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize)
+        {
+          packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(index));
+          packet_res1 = func.packetOp(packet_res1, eval.template packet<alignment,PacketScalar>(index+packetSize));
+        }
+
+        packet_res0 = func.packetOp(packet_res0,packet_res1);
+        if(alignedEnd>alignedEnd2)
+          packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(alignedEnd2));
+      }
+      res = func.predux(packet_res0);
+
+      for(Index index = 0; index < alignedStart; ++index)
+        res = func(res,eval.coeff(index));
+
+      for(Index index = alignedEnd; index < size; ++index)
+        res = func(res,eval.coeff(index));
+    }
+    else // too small to vectorize anything.
+         // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
+    {
+      res = eval.coeff(0);
+      for(Index index = 1; index < size; ++index)
+        res = func(res,eval.coeff(index));
+    }
+
+    return res;
+  }
+};
+
+// NOTE: for SliceVectorizedTraversal we simply bypass unrolling
+template<typename Func, typename Evaluator, int Unrolling>
+struct redux_impl<Func, Evaluator, SliceVectorizedTraversal, Unrolling>
+{
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;
+
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
+  {
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
+    const Index innerSize = xpr.innerSize();
+    const Index outerSize = xpr.outerSize();
+    enum {
+      packetSize = redux_traits<Func, Evaluator>::PacketSize
+    };
+    const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize;
+    Scalar res;
+    if(packetedInnerSize)
+    {
+      PacketType packet_res = eval.template packet<Unaligned,PacketType>(0,0);
+      for(Index j=0; j<outerSize; ++j)
+        for(Index i=(j==0?packetSize:0); i<packetedInnerSize; i+=Index(packetSize))
+          packet_res = func.packetOp(packet_res, eval.template packetByOuterInner<Unaligned,PacketType>(j,i));
+
+      res = func.predux(packet_res);
+      for(Index j=0; j<outerSize; ++j)
+        for(Index i=packetedInnerSize; i<innerSize; ++i)
+          res = func(res, eval.coeffByOuterInner(j,i));
+    }
+    else // too small to vectorize anything.
+         // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
+    {
+      res = redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>::run(eval, func, xpr);
+    }
+
+    return res;
+  }
+};
+
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, CompleteUnrolling>
+{
+  typedef typename Evaluator::Scalar Scalar;
+
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;
+  enum {
+    PacketSize = redux_traits<Func, Evaluator>::PacketSize,
+    Size = Evaluator::SizeAtCompileTime,
+    VectorizedSize = (int(Size) / int(PacketSize)) * int(PacketSize)
+  };
+
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  Scalar run(const Evaluator &eval, const Func& func, const XprType &xpr)
+  {
+    EIGEN_ONLY_USED_FOR_DEBUG(xpr)
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
+    if (VectorizedSize > 0) {
+      Scalar res = func.predux(redux_vec_unroller<Func, Evaluator, 0, Size / PacketSize>::template run<PacketType>(eval,func));
+      if (VectorizedSize != Size)
+        res = func(res,redux_novec_unroller<Func, Evaluator, VectorizedSize, Size-VectorizedSize>::run(eval,func));
+      return res;
+    }
+    else {
+      return redux_novec_unroller<Func, Evaluator, 0, Size>::run(eval,func);
+    }
+  }
+};
+
+// evaluator adaptor
+template<typename _XprType>
+class redux_evaluator : public internal::evaluator<_XprType>
+{
+  typedef internal::evaluator<_XprType> Base;
+public:
+  typedef _XprType XprType;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit redux_evaluator(const XprType &xpr) : Base(xpr) {}
+  
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename XprType::PacketScalar PacketScalar;
+  
+  enum {
+    MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
+    // TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator
+    Flags = Base::Flags & ~DirectAccessBit,
+    IsRowMajor = XprType::IsRowMajor,
+    SizeAtCompileTime = XprType::SizeAtCompileTime,
+    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime
+  };
+  
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
+  { return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+  
+  template<int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  PacketType packetByOuterInner(Index outer, Index inner) const
+  { return Base::template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+  
+};
+
+} // end namespace internal
+
+/***************************************************************************
+* Part 4 : public API
+***************************************************************************/
+
+
+/** \returns the result of a full redux operation on the whole matrix or vector using \a func
+  *
+  * The template parameter \a BinaryOp is the type of the functor \a func which must be
+  * an associative operator. Both current C++98 and C++11 functor styles are handled.
+  *
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
+  *
+  * \sa DenseBase::sum(), DenseBase::minCoeff(), DenseBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise()
+  */
+template<typename Derived>
+template<typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+DenseBase<Derived>::redux(const Func& func) const
+{
+  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
+  typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
+  ThisEvaluator thisEval(derived());
+
+  // The initial expression is passed to the reducer as an additional argument instead of
+  // passing it as a member of redux_evaluator to help  
+  return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func, derived());
+}
+
+/** \returns the minimum of all coefficients of \c *this.
+  * In case \c *this contains NaN, NaNPropagation determines the behavior:
+  *   NaNPropagation == PropagateFast : undefined
+  *   NaNPropagation == PropagateNaN : result is NaN
+  *   NaNPropagation == PropagateNumbers : result is minimum of elements that are not NaN
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
+  */
+template<typename Derived>
+template<int NaNPropagation>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+DenseBase<Derived>::minCoeff() const
+{
+  return derived().redux(Eigen::internal::scalar_min_op<Scalar,Scalar, NaNPropagation>());
+}
+
+/** \returns the maximum of all coefficients of \c *this. 
+  * In case \c *this contains NaN, NaNPropagation determines the behavior:
+  *   NaNPropagation == PropagateFast : undefined
+  *   NaNPropagation == PropagateNaN : result is NaN
+  *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
+  */
+template<typename Derived>
+template<int NaNPropagation>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+DenseBase<Derived>::maxCoeff() const
+{
+  return derived().redux(Eigen::internal::scalar_max_op<Scalar,Scalar, NaNPropagation>());
+}
+
+/** \returns the sum of all coefficients of \c *this
+  *
+  * If \c *this is empty, then the value 0 is returned.
+  *
+  * \sa trace(), prod(), mean()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+DenseBase<Derived>::sum() const
+{
+  if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
+    return Scalar(0);
+  return derived().redux(Eigen::internal::scalar_sum_op<Scalar,Scalar>());
+}
+
+/** \returns the mean of all coefficients of *this
+*
+* \sa trace(), prod(), sum()
+*/
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+DenseBase<Derived>::mean() const
+{
+#ifdef __INTEL_COMPILER
+  #pragma warning push
+  #pragma warning ( disable : 2259 )
+#endif
+  return Scalar(derived().redux(Eigen::internal::scalar_sum_op<Scalar,Scalar>())) / Scalar(this->size());
+#ifdef __INTEL_COMPILER
+  #pragma warning pop
+#endif
+}
+
+/** \returns the product of all coefficients of *this
+  *
+  * Example: \include MatrixBase_prod.cpp
+  * Output: \verbinclude MatrixBase_prod.out
+  *
+  * \sa sum(), mean(), trace()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+DenseBase<Derived>::prod() const
+{
+  if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
+    return Scalar(1);
+  return derived().redux(Eigen::internal::scalar_product_op<Scalar>());
+}
+
+/** \returns the trace of \c *this, i.e. the sum of the coefficients on the main diagonal.
+  *
+  * \c *this can be any matrix, not necessarily square.
+  *
+  * \sa diagonal(), sum()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+MatrixBase<Derived>::trace() const
+{
+  return derived().diagonal().sum();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_REDUX_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/Ref.h b/src/3rdparty/eigen/Eigen/src/Core/Ref.h
new file mode 100644
index 000000000..c2a37eadb
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/Ref.h
@@ -0,0 +1,381 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_REF_H
+#define EIGEN_REF_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename _PlainObjectType, int _Options, typename _StrideType>
+struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
+  : public traits<Map<_PlainObjectType, _Options, _StrideType> >
+{
+  typedef _PlainObjectType PlainObjectType;
+  typedef _StrideType StrideType;
+  enum {
+    Options = _Options,
+    Flags = traits<Map<_PlainObjectType, _Options, _StrideType> >::Flags | NestByRefBit,
+    Alignment = traits<Map<_PlainObjectType, _Options, _StrideType> >::Alignment
+  };
+
+  template<typename Derived> struct match {
+    enum {
+      IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime,
+      HasDirectAccess = internal::has_direct_access<Derived>::ret,
+      StorageOrderMatch = IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
+      InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic)
+                      || int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime)
+                      || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
+      OuterStrideMatch = IsVectorAtCompileTime
+                      || int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
+      // NOTE, this indirection of evaluator<Derived>::Alignment is needed
+      // to workaround a very strange bug in MSVC related to the instantiation
+      // of has_*ary_operator in evaluator<CwiseNullaryOp>.
+      // This line is surprisingly very sensitive. For instance, simply adding parenthesis
+      // as "DerivedAlignment = (int(evaluator<Derived>::Alignment))," will make MSVC fail...
+      DerivedAlignment = int(evaluator<Derived>::Alignment),
+      AlignmentMatch = (int(traits<PlainObjectType>::Alignment)==int(Unaligned)) || (DerivedAlignment >= int(Alignment)), // FIXME the first condition is not very clear, it should be replaced by the required alignment
+      ScalarTypeMatch = internal::is_same<typename PlainObjectType::Scalar, typename Derived::Scalar>::value,
+      MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch && ScalarTypeMatch
+    };
+    typedef typename internal::conditional<MatchAtCompileTime,internal::true_type,internal::false_type>::type type;
+  };
+
+};
+
+template<typename Derived>
+struct traits<RefBase<Derived> > : public traits<Derived> {};
+
+}
+
+template<typename Derived> class RefBase
+ : public MapBase<Derived>
+{
+  typedef typename internal::traits<Derived>::PlainObjectType PlainObjectType;
+  typedef typename internal::traits<Derived>::StrideType StrideType;
+
+public:
+
+  typedef MapBase<Derived> Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(RefBase)
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const
+  {
+    return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const
+  {
+    return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
+         : IsVectorAtCompileTime ? this->size()
+         : int(Flags)&RowMajorBit ? this->cols()
+         : this->rows();
+  }
+
+  EIGEN_DEVICE_FUNC RefBase()
+    : Base(0,RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime),
+      // Stride<> does not allow default ctor for Dynamic strides, so let' initialize it with dummy values:
+      m_stride(StrideType::OuterStrideAtCompileTime==Dynamic?0:StrideType::OuterStrideAtCompileTime,
+               StrideType::InnerStrideAtCompileTime==Dynamic?0:StrideType::InnerStrideAtCompileTime)
+  {}
+
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(RefBase)
+
+protected:
+
+  typedef Stride<StrideType::OuterStrideAtCompileTime,StrideType::InnerStrideAtCompileTime> StrideBase;
+
+  // Resolves inner stride if default 0.
+  static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index resolveInnerStride(Index inner) {
+    return inner == 0 ? 1 : inner;
+  }
+
+  // Resolves outer stride if default 0.
+  static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index resolveOuterStride(Index inner, Index outer, Index rows, Index cols, bool isVectorAtCompileTime, bool isRowMajor) {
+    return outer == 0 ? isVectorAtCompileTime ? inner * rows * cols : isRowMajor ? inner * cols : inner * rows : outer;
+  }
+
+  // Returns true if construction is valid, false if there is a stride mismatch,
+  // and fails if there is a size mismatch.
+  template<typename Expression>
+  EIGEN_DEVICE_FUNC bool construct(Expression& expr)
+  {
+    // Check matrix sizes.  If this is a compile-time vector, we do allow
+    // implicitly transposing.
+    EIGEN_STATIC_ASSERT(
+      EIGEN_PREDICATE_SAME_MATRIX_SIZE(PlainObjectType, Expression)
+      // If it is a vector, the transpose sizes might match.
+      || ( PlainObjectType::IsVectorAtCompileTime
+            && ((int(PlainObjectType::RowsAtCompileTime)==Eigen::Dynamic
+              || int(Expression::ColsAtCompileTime)==Eigen::Dynamic
+              || int(PlainObjectType::RowsAtCompileTime)==int(Expression::ColsAtCompileTime))
+            &&  (int(PlainObjectType::ColsAtCompileTime)==Eigen::Dynamic
+              || int(Expression::RowsAtCompileTime)==Eigen::Dynamic
+              || int(PlainObjectType::ColsAtCompileTime)==int(Expression::RowsAtCompileTime)))),
+      YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES
+    )
+
+    // Determine runtime rows and columns.
+    Index rows = expr.rows();
+    Index cols = expr.cols();
+    if(PlainObjectType::RowsAtCompileTime==1)
+    {
+      eigen_assert(expr.rows()==1 || expr.cols()==1);
+      rows = 1;
+      cols = expr.size();
+    }
+    else if(PlainObjectType::ColsAtCompileTime==1)
+    {
+      eigen_assert(expr.rows()==1 || expr.cols()==1);
+      rows = expr.size();
+      cols = 1;
+    }
+    // Verify that the sizes are valid.
+    eigen_assert(
+      (PlainObjectType::RowsAtCompileTime == Dynamic) || (PlainObjectType::RowsAtCompileTime == rows));
+    eigen_assert(
+      (PlainObjectType::ColsAtCompileTime == Dynamic) || (PlainObjectType::ColsAtCompileTime == cols));
+
+
+    // If this is a vector, we might be transposing, which means that stride should swap.
+    const bool transpose = PlainObjectType::IsVectorAtCompileTime && (rows != expr.rows());
+    // If the storage format differs, we also need to swap the stride.
+    const bool row_major = ((PlainObjectType::Flags)&RowMajorBit) != 0;
+    const bool expr_row_major = (Expression::Flags&RowMajorBit) != 0;
+    const bool storage_differs =  (row_major != expr_row_major);
+
+    const bool swap_stride = (transpose != storage_differs);
+
+    // Determine expr's actual strides, resolving any defaults if zero.
+    const Index expr_inner_actual = resolveInnerStride(expr.innerStride());
+    const Index expr_outer_actual = resolveOuterStride(expr_inner_actual,
+                                                       expr.outerStride(),
+                                                       expr.rows(),
+                                                       expr.cols(),
+                                                       Expression::IsVectorAtCompileTime != 0,
+                                                       expr_row_major);
+
+    // If this is a column-major row vector or row-major column vector, the inner-stride
+    // is arbitrary, so set it to either the compile-time inner stride or 1.
+    const bool row_vector = (rows == 1);
+    const bool col_vector = (cols == 1);
+    const Index inner_stride =
+        ( (!row_major && row_vector) || (row_major && col_vector) ) ?
+            ( StrideType::InnerStrideAtCompileTime > 0 ? Index(StrideType::InnerStrideAtCompileTime) : 1)
+            : swap_stride ? expr_outer_actual : expr_inner_actual;
+
+    // If this is a column-major column vector or row-major row vector, the outer-stride
+    // is arbitrary, so set it to either the compile-time outer stride or vector size.
+    const Index outer_stride =
+      ( (!row_major && col_vector) || (row_major && row_vector) ) ?
+          ( StrideType::OuterStrideAtCompileTime > 0 ? Index(StrideType::OuterStrideAtCompileTime) : rows * cols * inner_stride)
+          : swap_stride ? expr_inner_actual : expr_outer_actual;
+
+    // Check if given inner/outer strides are compatible with compile-time strides.
+    const bool inner_valid = (StrideType::InnerStrideAtCompileTime == Dynamic)
+        || (resolveInnerStride(Index(StrideType::InnerStrideAtCompileTime)) == inner_stride);
+    if (!inner_valid) {
+      return false;
+    }
+
+    const bool outer_valid = (StrideType::OuterStrideAtCompileTime == Dynamic)
+        || (resolveOuterStride(
+              inner_stride,
+              Index(StrideType::OuterStrideAtCompileTime),
+              rows, cols, PlainObjectType::IsVectorAtCompileTime != 0,
+              row_major)
+            == outer_stride);
+    if (!outer_valid) {
+      return false;
+    }
+
+    ::new (static_cast<Base*>(this)) Base(expr.data(), rows, cols);
+    ::new (&m_stride) StrideBase(
+      (StrideType::OuterStrideAtCompileTime == 0) ? 0 : outer_stride,
+      (StrideType::InnerStrideAtCompileTime == 0) ? 0 : inner_stride );
+    return true;
+  }
+
+  StrideBase m_stride;
+};
+
+/** \class Ref
+  * \ingroup Core_Module
+  *
+  * \brief A matrix or vector expression mapping an existing expression
+  *
+  * \tparam PlainObjectType the equivalent matrix type of the mapped data
+  * \tparam Options specifies the pointer alignment in bytes. It can be: \c #Aligned128, , \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned.
+  *                 The default is \c #Unaligned.
+  * \tparam StrideType optionally specifies strides. By default, Ref implies a contiguous storage along the inner dimension (inner stride==1),
+  *                   but accepts a variable outer stride (leading dimension).
+  *                   This can be overridden by specifying strides.
+  *                   The type passed here must be a specialization of the Stride template, see examples below.
+  *
+  * This class provides a way to write non-template functions taking Eigen objects as parameters while limiting the number of copies.
+  * A Ref<> object can represent either a const expression or a l-value:
+  * \code
+  * // in-out argument:
+  * void foo1(Ref<VectorXf> x);
+  *
+  * // read-only const argument:
+  * void foo2(const Ref<const VectorXf>& x);
+  * \endcode
+  *
+  * In the in-out case, the input argument must satisfy the constraints of the actual Ref<> type, otherwise a compilation issue will be triggered.
+  * By default, a Ref<VectorXf> can reference any dense vector expression of float having a contiguous memory layout.
+  * Likewise, a Ref<MatrixXf> can reference any column-major dense matrix expression of float whose column's elements are contiguously stored with
+  * the possibility to have a constant space in-between each column, i.e. the inner stride must be equal to 1, but the outer stride (or leading dimension)
+  * can be greater than the number of rows.
+  *
+  * In the const case, if the input expression does not match the above requirement, then it is evaluated into a temporary before being passed to the function.
+  * Here are some examples:
+  * \code
+  * MatrixXf A;
+  * VectorXf a;
+  * foo1(a.head());             // OK
+  * foo1(A.col());              // OK
+  * foo1(A.row());              // Compilation error because here innerstride!=1
+  * foo2(A.row());              // Compilation error because A.row() is a 1xN object while foo2 is expecting a Nx1 object
+  * foo2(A.row().transpose());  // The row is copied into a contiguous temporary
+  * foo2(2*a);                  // The expression is evaluated into a temporary
+  * foo2(A.col().segment(2,4)); // No temporary
+  * \endcode
+  *
+  * The range of inputs that can be referenced without temporary can be enlarged using the last two template parameters.
+  * Here is an example accepting an innerstride!=1:
+  * \code
+  * // in-out argument:
+  * void foo3(Ref<VectorXf,0,InnerStride<> > x);
+  * foo3(A.row());              // OK
+  * \endcode
+  * The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to exploit vectorization, and will involve more
+  * expensive address computations even if the input is contiguously stored in memory. To overcome this issue, one might propose to overload internally calling a
+  * template function, e.g.:
+  * \code
+  * // in the .h:
+  * void foo(const Ref<MatrixXf>& A);
+  * void foo(const Ref<MatrixXf,0,Stride<> >& A);
+  *
+  * // in the .cpp:
+  * template<typename TypeOfA> void foo_impl(const TypeOfA& A) {
+  *     ... // crazy code goes here
+  * }
+  * void foo(const Ref<MatrixXf>& A) { foo_impl(A); }
+  * void foo(const Ref<MatrixXf,0,Stride<> >& A) { foo_impl(A); }
+  * \endcode
+  *
+  * See also the following stackoverflow questions for further references:
+  *  - <a href="http://stackoverflow.com/questions/21132538/correct-usage-of-the-eigenref-class">Correct usage of the Eigen::Ref<> class</a>
+  *
+  * \sa PlainObjectBase::Map(), \ref TopicStorageOrders
+  */
+template<typename PlainObjectType, int Options, typename StrideType> class Ref
+  : public RefBase<Ref<PlainObjectType, Options, StrideType> >
+{
+  private:
+    typedef internal::traits<Ref> Traits;
+    template<typename Derived>
+    EIGEN_DEVICE_FUNC inline Ref(const PlainObjectBase<Derived>& expr,
+                                 typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0);
+  public:
+
+    typedef RefBase<Ref> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Ref)
+
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename Derived>
+    EIGEN_DEVICE_FUNC inline Ref(PlainObjectBase<Derived>& expr,
+                                 typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
+    {
+      EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      // Construction must pass since we will not create temprary storage in the non-const case.
+      const bool success = Base::construct(expr.derived());
+      EIGEN_UNUSED_VARIABLE(success)
+      eigen_assert(success);
+    }
+    template<typename Derived>
+    EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr,
+                                 typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
+    #else
+    /** Implicit constructor from any dense expression */
+    template<typename Derived>
+    inline Ref(DenseBase<Derived>& expr)
+    #endif
+    {
+      EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+      EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      EIGEN_STATIC_ASSERT(!Derived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+      // Construction must pass since we will not create temporary storage in the non-const case.
+      const bool success = Base::construct(expr.const_cast_derived());
+      EIGEN_UNUSED_VARIABLE(success)
+      eigen_assert(success);
+    }
+
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Ref)
+
+};
+
+// this is the const ref version
+template<typename TPlainObjectType, int Options, typename StrideType> class Ref<const TPlainObjectType, Options, StrideType>
+  : public RefBase<Ref<const TPlainObjectType, Options, StrideType> >
+{
+    typedef internal::traits<Ref> Traits;
+  public:
+
+    typedef RefBase<Ref> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Ref)
+
+    template<typename Derived>
+    EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr,
+                                 typename internal::enable_if<bool(Traits::template match<Derived>::ScalarTypeMatch),Derived>::type* = 0)
+    {
+//      std::cout << match_helper<Derived>::HasDirectAccess << "," << match_helper<Derived>::OuterStrideMatch << "," << match_helper<Derived>::InnerStrideMatch << "\n";
+//      std::cout << int(StrideType::OuterStrideAtCompileTime) << " - " << int(Derived::OuterStrideAtCompileTime) << "\n";
+//      std::cout << int(StrideType::InnerStrideAtCompileTime) << " - " << int(Derived::InnerStrideAtCompileTime) << "\n";
+      construct(expr.derived(), typename Traits::template match<Derived>::type());
+    }
+
+    EIGEN_DEVICE_FUNC inline Ref(const Ref& other) : Base(other) {
+      // copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy
+    }
+
+    template<typename OtherRef>
+    EIGEN_DEVICE_FUNC inline Ref(const RefBase<OtherRef>& other) {
+      construct(other.derived(), typename Traits::template match<OtherRef>::type());
+    }
+
+  protected:
+
+    template<typename Expression>
+    EIGEN_DEVICE_FUNC void construct(const Expression& expr,internal::true_type)
+    {
+      // Check if we can use the underlying expr's storage directly, otherwise call the copy version.
+      if (!Base::construct(expr)) {
+        construct(expr, internal::false_type());
+      }
+    }
+
+    template<typename Expression>
+    EIGEN_DEVICE_FUNC void construct(const Expression& expr, internal::false_type)
+    {
+      internal::call_assignment_no_alias(m_object,expr,internal::assign_op<Scalar,Scalar>());
+      Base::construct(m_object);
+    }
+
+  protected:
+    TPlainObjectType m_object;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_REF_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/Replicate.h b/src/3rdparty/eigen/Eigen/src/Core/Replicate.h
new file mode 100644
index 000000000..ab5be7e64
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/Replicate.h
@@ -0,0 +1,142 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_REPLICATE_H
+#define EIGEN_REPLICATE_H
+
+namespace Eigen {
+
+namespace internal {
+template<typename MatrixType,int RowFactor,int ColFactor>
+struct traits<Replicate<MatrixType,RowFactor,ColFactor> >
+ : traits<MatrixType>
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename traits<MatrixType>::StorageKind StorageKind;
+  typedef typename traits<MatrixType>::XprKind XprKind;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
+  typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
+  enum {
+    RowsAtCompileTime = RowFactor==Dynamic || int(MatrixType::RowsAtCompileTime)==Dynamic
+                      ? Dynamic
+                      : RowFactor * MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = ColFactor==Dynamic || int(MatrixType::ColsAtCompileTime)==Dynamic
+                      ? Dynamic
+                      : ColFactor * MatrixType::ColsAtCompileTime,
+   //FIXME we don't propagate the max sizes !!!
+    MaxRowsAtCompileTime = RowsAtCompileTime,
+    MaxColsAtCompileTime = ColsAtCompileTime,
+    IsRowMajor = MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1 ? 1
+               : MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1 ? 0
+               : (MatrixType::Flags & RowMajorBit) ? 1 : 0,
+
+    // FIXME enable DirectAccess with negative strides?
+    Flags = IsRowMajor ? RowMajorBit : 0
+  };
+};
+}
+
+/**
+  * \class Replicate
+  * \ingroup Core_Module
+  *
+  * \brief Expression of the multiple replication of a matrix or vector
+  *
+  * \tparam MatrixType the type of the object we are replicating
+  * \tparam RowFactor number of repetitions at compile time along the vertical direction, can be Dynamic.
+  * \tparam ColFactor number of repetitions at compile time along the horizontal direction, can be Dynamic.
+  *
+  * This class represents an expression of the multiple replication of a matrix or vector.
+  * It is the return type of DenseBase::replicate() and most of the time
+  * this is the only way it is used.
+  *
+  * \sa DenseBase::replicate()
+  */
+template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
+  : public internal::dense_xpr_base< Replicate<MatrixType,RowFactor,ColFactor> >::type
+{
+    typedef typename internal::traits<Replicate>::MatrixTypeNested MatrixTypeNested;
+    typedef typename internal::traits<Replicate>::_MatrixTypeNested _MatrixTypeNested;
+  public:
+
+    typedef typename internal::dense_xpr_base<Replicate>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Replicate)
+    typedef typename internal::remove_all<MatrixType>::type NestedExpression;
+
+    template<typename OriginalMatrixType>
+    EIGEN_DEVICE_FUNC
+    inline explicit Replicate(const OriginalMatrixType& matrix)
+      : m_matrix(matrix), m_rowFactor(RowFactor), m_colFactor(ColFactor)
+    {
+      EIGEN_STATIC_ASSERT((internal::is_same<typename internal::remove_const<MatrixType>::type,OriginalMatrixType>::value),
+                          THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
+      eigen_assert(RowFactor!=Dynamic && ColFactor!=Dynamic);
+    }
+
+    template<typename OriginalMatrixType>
+    EIGEN_DEVICE_FUNC
+    inline Replicate(const OriginalMatrixType& matrix, Index rowFactor, Index colFactor)
+      : m_matrix(matrix), m_rowFactor(rowFactor), m_colFactor(colFactor)
+    {
+      EIGEN_STATIC_ASSERT((internal::is_same<typename internal::remove_const<MatrixType>::type,OriginalMatrixType>::value),
+                          THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const { return m_matrix.rows() * m_rowFactor.value(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const { return m_matrix.cols() * m_colFactor.value(); }
+
+    EIGEN_DEVICE_FUNC
+    const _MatrixTypeNested& nestedExpression() const
+    {
+      return m_matrix;
+    }
+
+  protected:
+    MatrixTypeNested m_matrix;
+    const internal::variable_if_dynamic<Index, RowFactor> m_rowFactor;
+    const internal::variable_if_dynamic<Index, ColFactor> m_colFactor;
+};
+
+/**
+  * \return an expression of the replication of \c *this
+  *
+  * Example: \include MatrixBase_replicate.cpp
+  * Output: \verbinclude MatrixBase_replicate.out
+  *
+  * \sa VectorwiseOp::replicate(), DenseBase::replicate(Index,Index), class Replicate
+  */
+template<typename Derived>
+template<int RowFactor, int ColFactor>
+EIGEN_DEVICE_FUNC const Replicate<Derived,RowFactor,ColFactor>
+DenseBase<Derived>::replicate() const
+{
+  return Replicate<Derived,RowFactor,ColFactor>(derived());
+}
+
+/**
+  * \return an expression of the replication of each column (or row) of \c *this
+  *
+  * Example: \include DirectionWise_replicate_int.cpp
+  * Output: \verbinclude DirectionWise_replicate_int.out
+  *
+  * \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate
+  */
+template<typename ExpressionType, int Direction>
+EIGEN_DEVICE_FUNC const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
+VectorwiseOp<ExpressionType,Direction>::replicate(Index factor) const
+{
+  return typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
+          (_expression(),Direction==Vertical?factor:1,Direction==Horizontal?factor:1);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_REPLICATE_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/Reshaped.h b/src/3rdparty/eigen/Eigen/src/Core/Reshaped.h
new file mode 100644
index 000000000..52de73b6f
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/Reshaped.h
@@ -0,0 +1,454 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2014 yoco <peter.xiau@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_RESHAPED_H
+#define EIGEN_RESHAPED_H
+
+namespace Eigen {
+
+/** \class Reshaped
+  * \ingroup Core_Module
+  *
+  * \brief Expression of a fixed-size or dynamic-size reshape
+  *
+  * \tparam XprType the type of the expression in which we are taking a reshape
+  * \tparam Rows the number of rows of the reshape we are taking at compile time (optional)
+  * \tparam Cols the number of columns of the reshape we are taking at compile time (optional)
+  * \tparam Order can be ColMajor or RowMajor, default is ColMajor.
+  *
+  * This class represents an expression of either a fixed-size or dynamic-size reshape.
+  * It is the return type of DenseBase::reshaped(NRowsType,NColsType) and
+  * most of the time this is the only way it is used.
+  *
+  * However, in C++98, if you want to directly maniputate reshaped expressions,
+  * for instance if you want to write a function returning such an expression, you
+  * will need to use this class. In C++11, it is advised to use the \em auto
+  * keyword for such use cases.
+  *
+  * Here is an example illustrating the dynamic case:
+  * \include class_Reshaped.cpp
+  * Output: \verbinclude class_Reshaped.out
+  *
+  * Here is an example illustrating the fixed-size case:
+  * \include class_FixedReshaped.cpp
+  * Output: \verbinclude class_FixedReshaped.out
+  *
+  * \sa DenseBase::reshaped(NRowsType,NColsType)
+  */
+
+namespace internal {
+
+template<typename XprType, int Rows, int Cols, int Order>
+struct traits<Reshaped<XprType, Rows, Cols, Order> > : traits<XprType>
+{
+  typedef typename traits<XprType>::Scalar Scalar;
+  typedef typename traits<XprType>::StorageKind StorageKind;
+  typedef typename traits<XprType>::XprKind XprKind;
+  enum{
+    MatrixRows = traits<XprType>::RowsAtCompileTime,
+    MatrixCols = traits<XprType>::ColsAtCompileTime,
+    RowsAtCompileTime = Rows,
+    ColsAtCompileTime = Cols,
+    MaxRowsAtCompileTime = Rows,
+    MaxColsAtCompileTime = Cols,
+    XpxStorageOrder = ((int(traits<XprType>::Flags) & RowMajorBit) == RowMajorBit) ? RowMajor : ColMajor,
+    ReshapedStorageOrder = (RowsAtCompileTime == 1 && ColsAtCompileTime != 1) ? RowMajor
+                         : (ColsAtCompileTime == 1 && RowsAtCompileTime != 1) ? ColMajor
+                         : XpxStorageOrder,
+    HasSameStorageOrderAsXprType = (ReshapedStorageOrder == XpxStorageOrder),
+    InnerSize = (ReshapedStorageOrder==int(RowMajor)) ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
+    InnerStrideAtCompileTime = HasSameStorageOrderAsXprType
+                             ? int(inner_stride_at_compile_time<XprType>::ret)
+                             : Dynamic,
+    OuterStrideAtCompileTime = Dynamic,
+
+    HasDirectAccess = internal::has_direct_access<XprType>::ret
+                    && (Order==int(XpxStorageOrder))
+                    && ((evaluator<XprType>::Flags&LinearAccessBit)==LinearAccessBit),
+
+    MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0)
+                       && (InnerStrideAtCompileTime == 1)
+                        ? PacketAccessBit : 0,
+    //MaskAlignedBit = ((OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16) == 0)) ? AlignedBit : 0,
+    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
+    FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
+    FlagsRowMajorBit = (ReshapedStorageOrder==int(RowMajor)) ? RowMajorBit : 0,
+    FlagsDirectAccessBit = HasDirectAccess ? DirectAccessBit : 0,
+    Flags0 = traits<XprType>::Flags & ( (HereditaryBits & ~RowMajorBit) | MaskPacketAccessBit),
+
+    Flags = (Flags0 | FlagsLinearAccessBit | FlagsLvalueBit | FlagsRowMajorBit | FlagsDirectAccessBit)
+  };
+};
+
+template<typename XprType, int Rows, int Cols, int Order, bool HasDirectAccess> class ReshapedImpl_dense;
+
+} // end namespace internal
+
+template<typename XprType, int Rows, int Cols, int Order, typename StorageKind> class ReshapedImpl;
+
+template<typename XprType, int Rows, int Cols, int Order> class Reshaped
+  : public ReshapedImpl<XprType, Rows, Cols, Order, typename internal::traits<XprType>::StorageKind>
+{
+    typedef ReshapedImpl<XprType, Rows, Cols, Order, typename internal::traits<XprType>::StorageKind> Impl;
+  public:
+    //typedef typename Impl::Base Base;
+    typedef Impl Base;
+    EIGEN_GENERIC_PUBLIC_INTERFACE(Reshaped)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reshaped)
+
+    /** Fixed-size constructor
+      */
+    EIGEN_DEVICE_FUNC
+    inline Reshaped(XprType& xpr)
+      : Impl(xpr)
+    {
+      EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE)
+      eigen_assert(Rows * Cols == xpr.rows() * xpr.cols());
+    }
+
+    /** Dynamic-size constructor
+      */
+    EIGEN_DEVICE_FUNC
+    inline Reshaped(XprType& xpr,
+          Index reshapeRows, Index reshapeCols)
+      : Impl(xpr, reshapeRows, reshapeCols)
+    {
+      eigen_assert((RowsAtCompileTime==Dynamic || RowsAtCompileTime==reshapeRows)
+          && (ColsAtCompileTime==Dynamic || ColsAtCompileTime==reshapeCols));
+      eigen_assert(reshapeRows * reshapeCols == xpr.rows() * xpr.cols());
+    }
+};
+
+// The generic default implementation for dense reshape simply forward to the internal::ReshapedImpl_dense
+// that must be specialized for direct and non-direct access...
+template<typename XprType, int Rows, int Cols, int Order>
+class ReshapedImpl<XprType, Rows, Cols, Order, Dense>
+  : public internal::ReshapedImpl_dense<XprType, Rows, Cols, Order,internal::traits<Reshaped<XprType,Rows,Cols,Order> >::HasDirectAccess>
+{
+    typedef internal::ReshapedImpl_dense<XprType, Rows, Cols, Order,internal::traits<Reshaped<XprType,Rows,Cols,Order> >::HasDirectAccess> Impl;
+  public:
+    typedef Impl Base;
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl)
+    EIGEN_DEVICE_FUNC inline ReshapedImpl(XprType& xpr) : Impl(xpr) {}
+    EIGEN_DEVICE_FUNC inline ReshapedImpl(XprType& xpr, Index reshapeRows, Index reshapeCols)
+      : Impl(xpr, reshapeRows, reshapeCols) {}
+};
+
+namespace internal {
+
+/** \internal Internal implementation of dense Reshaped in the general case. */
+template<typename XprType, int Rows, int Cols, int Order>
+class ReshapedImpl_dense<XprType,Rows,Cols,Order,false>
+  : public internal::dense_xpr_base<Reshaped<XprType, Rows, Cols, Order> >::type
+{
+    typedef Reshaped<XprType, Rows, Cols, Order> ReshapedType;
+  public:
+
+    typedef typename internal::dense_xpr_base<ReshapedType>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(ReshapedType)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl_dense)
+
+    typedef typename internal::ref_selector<XprType>::non_const_type MatrixTypeNested;
+    typedef typename internal::remove_all<XprType>::type NestedExpression;
+
+    class InnerIterator;
+
+    /** Fixed-size constructor
+      */
+    EIGEN_DEVICE_FUNC
+    inline ReshapedImpl_dense(XprType& xpr)
+      : m_xpr(xpr), m_rows(Rows), m_cols(Cols)
+    {}
+
+    /** Dynamic-size constructor
+      */
+    EIGEN_DEVICE_FUNC
+    inline ReshapedImpl_dense(XprType& xpr, Index nRows, Index nCols)
+      : m_xpr(xpr), m_rows(nRows), m_cols(nCols)
+    {}
+
+    EIGEN_DEVICE_FUNC Index rows() const { return m_rows; }
+    EIGEN_DEVICE_FUNC Index cols() const { return m_cols; }
+
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
+    /** \sa MapBase::data() */
+    EIGEN_DEVICE_FUNC inline const Scalar* data() const;
+    EIGEN_DEVICE_FUNC inline Index innerStride() const;
+    EIGEN_DEVICE_FUNC inline Index outerStride() const;
+    #endif
+
+    /** \returns the nested expression */
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<XprType>::type&
+    nestedExpression() const { return m_xpr; }
+
+    /** \returns the nested expression */
+    EIGEN_DEVICE_FUNC
+    typename internal::remove_reference<XprType>::type&
+    nestedExpression() { return m_xpr; }
+
+  protected:
+
+    MatrixTypeNested m_xpr;
+    const internal::variable_if_dynamic<Index, Rows> m_rows;
+    const internal::variable_if_dynamic<Index, Cols> m_cols;
+};
+
+
+/** \internal Internal implementation of dense Reshaped in the direct access case. */
+template<typename XprType, int Rows, int Cols, int Order>
+class ReshapedImpl_dense<XprType, Rows, Cols, Order, true>
+  : public MapBase<Reshaped<XprType, Rows, Cols, Order> >
+{
+    typedef Reshaped<XprType, Rows, Cols, Order> ReshapedType;
+    typedef typename internal::ref_selector<XprType>::non_const_type XprTypeNested;
+  public:
+
+    typedef MapBase<ReshapedType> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(ReshapedType)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl_dense)
+
+    /** Fixed-size constructor
+      */
+    EIGEN_DEVICE_FUNC
+    inline ReshapedImpl_dense(XprType& xpr)
+      : Base(xpr.data()), m_xpr(xpr)
+    {}
+
+    /** Dynamic-size constructor
+      */
+    EIGEN_DEVICE_FUNC
+    inline ReshapedImpl_dense(XprType& xpr, Index nRows, Index nCols)
+      : Base(xpr.data(), nRows, nCols),
+        m_xpr(xpr)
+    {}
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const
+    {
+      return m_xpr;
+    }
+
+    EIGEN_DEVICE_FUNC
+    XprType& nestedExpression() { return m_xpr; }
+
+    /** \sa MapBase::innerStride() */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const
+    {
+      return m_xpr.innerStride();
+    }
+
+    /** \sa MapBase::outerStride() */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const
+    {
+      return ((Flags&RowMajorBit)==RowMajorBit) ? this->cols() : this->rows();
+    }
+
+  protected:
+
+    XprTypeNested m_xpr;
+};
+
+// Evaluators
+template<typename ArgType, int Rows, int Cols, int Order, bool HasDirectAccess> struct reshaped_evaluator;
+
+template<typename ArgType, int Rows, int Cols, int Order>
+struct evaluator<Reshaped<ArgType, Rows, Cols, Order> >
+  : reshaped_evaluator<ArgType, Rows, Cols, Order, traits<Reshaped<ArgType,Rows,Cols,Order> >::HasDirectAccess>
+{
+  typedef Reshaped<ArgType, Rows, Cols, Order> XprType;
+  typedef typename XprType::Scalar Scalar;
+  // TODO: should check for smaller packet types
+  typedef typename packet_traits<Scalar>::type PacketScalar;
+
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    HasDirectAccess = traits<XprType>::HasDirectAccess,
+
+//     RowsAtCompileTime = traits<XprType>::RowsAtCompileTime,
+//     ColsAtCompileTime = traits<XprType>::ColsAtCompileTime,
+//     MaxRowsAtCompileTime = traits<XprType>::MaxRowsAtCompileTime,
+//     MaxColsAtCompileTime = traits<XprType>::MaxColsAtCompileTime,
+//
+//     InnerStrideAtCompileTime = traits<XprType>::HasSameStorageOrderAsXprType
+//                              ? int(inner_stride_at_compile_time<ArgType>::ret)
+//                              : Dynamic,
+//     OuterStrideAtCompileTime = Dynamic,
+
+    FlagsLinearAccessBit = (traits<XprType>::RowsAtCompileTime == 1 || traits<XprType>::ColsAtCompileTime == 1 || HasDirectAccess) ? LinearAccessBit : 0,
+    FlagsRowMajorBit = (traits<XprType>::ReshapedStorageOrder==int(RowMajor)) ? RowMajorBit : 0,
+    FlagsDirectAccessBit =  HasDirectAccess ? DirectAccessBit : 0,
+    Flags0 = evaluator<ArgType>::Flags & (HereditaryBits & ~RowMajorBit),
+    Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit | FlagsDirectAccessBit,
+
+    PacketAlignment = unpacket_traits<PacketScalar>::alignment,
+    Alignment = evaluator<ArgType>::Alignment
+  };
+  typedef reshaped_evaluator<ArgType, Rows, Cols, Order, HasDirectAccess> reshaped_evaluator_type;
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : reshaped_evaluator_type(xpr)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+};
+
+template<typename ArgType, int Rows, int Cols, int Order>
+struct reshaped_evaluator<ArgType, Rows, Cols, Order, /* HasDirectAccess */ false>
+  : evaluator_base<Reshaped<ArgType, Rows, Cols, Order> >
+{
+  typedef Reshaped<ArgType, Rows, Cols, Order> XprType;
+
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost /* TODO + cost of index computations */,
+
+    Flags = (evaluator<ArgType>::Flags & (HereditaryBits /*| LinearAccessBit | DirectAccessBit*/)),
+
+    Alignment = 0
+  };
+
+  EIGEN_DEVICE_FUNC explicit reshaped_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_xpr(xpr)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  typedef std::pair<Index, Index> RowCol;
+
+  inline RowCol index_remap(Index rowId, Index colId) const
+  {
+    if(Order==ColMajor)
+    {
+      const Index nth_elem_idx = colId * m_xpr.rows() + rowId;
+      return RowCol(nth_elem_idx % m_xpr.nestedExpression().rows(),
+                    nth_elem_idx / m_xpr.nestedExpression().rows());
+    }
+    else
+    {
+      const Index nth_elem_idx = colId + rowId * m_xpr.cols();
+      return RowCol(nth_elem_idx / m_xpr.nestedExpression().cols(),
+                    nth_elem_idx % m_xpr.nestedExpression().cols());
+    }
+  }
+
+  EIGEN_DEVICE_FUNC
+  inline Scalar& coeffRef(Index rowId, Index colId)
+  {
+    EIGEN_STATIC_ASSERT_LVALUE(XprType)
+    const RowCol row_col = index_remap(rowId, colId);
+    return m_argImpl.coeffRef(row_col.first, row_col.second);
+  }
+
+  EIGEN_DEVICE_FUNC
+  inline const Scalar& coeffRef(Index rowId, Index colId) const
+  {
+    const RowCol row_col = index_remap(rowId, colId);
+    return m_argImpl.coeffRef(row_col.first, row_col.second);
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index rowId, Index colId) const
+  {
+    const RowCol row_col = index_remap(rowId, colId);
+    return m_argImpl.coeff(row_col.first, row_col.second);
+  }
+
+  EIGEN_DEVICE_FUNC
+  inline Scalar& coeffRef(Index index)
+  {
+    EIGEN_STATIC_ASSERT_LVALUE(XprType)
+    const RowCol row_col = index_remap(Rows == 1 ? 0 : index,
+                                       Rows == 1 ? index : 0);
+    return m_argImpl.coeffRef(row_col.first, row_col.second);
+
+  }
+
+  EIGEN_DEVICE_FUNC
+  inline const Scalar& coeffRef(Index index) const
+  {
+    const RowCol row_col = index_remap(Rows == 1 ? 0 : index,
+                                       Rows == 1 ? index : 0);
+    return m_argImpl.coeffRef(row_col.first, row_col.second);
+  }
+
+  EIGEN_DEVICE_FUNC
+  inline const CoeffReturnType coeff(Index index) const
+  {
+    const RowCol row_col = index_remap(Rows == 1 ? 0 : index,
+                                       Rows == 1 ? index : 0);
+    return m_argImpl.coeff(row_col.first, row_col.second);
+  }
+#if 0
+  EIGEN_DEVICE_FUNC
+  template<int LoadMode>
+  inline PacketScalar packet(Index rowId, Index colId) const
+  {
+    const RowCol row_col = index_remap(rowId, colId);
+    return m_argImpl.template packet<Unaligned>(row_col.first, row_col.second);
+
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC
+  inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
+  {
+    const RowCol row_col = index_remap(rowId, colId);
+    m_argImpl.const_cast_derived().template writePacket<Unaligned>
+            (row_col.first, row_col.second, val);
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC
+  inline PacketScalar packet(Index index) const
+  {
+    const RowCol row_col = index_remap(RowsAtCompileTime == 1 ? 0 : index,
+                                        RowsAtCompileTime == 1 ? index : 0);
+    return m_argImpl.template packet<Unaligned>(row_col.first, row_col.second);
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC
+  inline void writePacket(Index index, const PacketScalar& val)
+  {
+    const RowCol row_col = index_remap(RowsAtCompileTime == 1 ? 0 : index,
+                                        RowsAtCompileTime == 1 ? index : 0);
+    return m_argImpl.template packet<Unaligned>(row_col.first, row_col.second, val);
+  }
+#endif
+protected:
+
+  evaluator<ArgType> m_argImpl;
+  const XprType& m_xpr;
+
+};
+
+template<typename ArgType, int Rows, int Cols, int Order>
+struct reshaped_evaluator<ArgType, Rows, Cols, Order, /* HasDirectAccess */ true>
+: mapbase_evaluator<Reshaped<ArgType, Rows, Cols, Order>,
+                      typename Reshaped<ArgType, Rows, Cols, Order>::PlainObject>
+{
+  typedef Reshaped<ArgType, Rows, Cols, Order> XprType;
+  typedef typename XprType::Scalar Scalar;
+
+  EIGEN_DEVICE_FUNC explicit reshaped_evaluator(const XprType& xpr)
+    : mapbase_evaluator<XprType, typename XprType::PlainObject>(xpr)
+  {
+    // TODO: for the 3.4 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime
+    eigen_assert(((internal::UIntPtr(xpr.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator<XprType>::Alignment)) == 0) && "data is not aligned");
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_RESHAPED_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/ReturnByValue.h b/src/3rdparty/eigen/Eigen/src/Core/ReturnByValue.h
new file mode 100644
index 000000000..4dad13ea1
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/ReturnByValue.h
@@ -0,0 +1,119 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_RETURNBYVALUE_H
+#define EIGEN_RETURNBYVALUE_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename Derived>
+struct traits<ReturnByValue<Derived> >
+  : public traits<typename traits<Derived>::ReturnType>
+{
+  enum {
+    // We're disabling the DirectAccess because e.g. the constructor of
+    // the Block-with-DirectAccess expression requires to have a coeffRef method.
+    // Also, we don't want to have to implement the stride stuff.
+    Flags = (traits<typename traits<Derived>::ReturnType>::Flags
+             | EvalBeforeNestingBit) & ~DirectAccessBit
+  };
+};
+
+/* The ReturnByValue object doesn't even have a coeff() method.
+ * So the only way that nesting it in an expression can work, is by evaluating it into a plain matrix.
+ * So internal::nested always gives the plain return matrix type.
+ *
+ * FIXME: I don't understand why we need this specialization: isn't this taken care of by the EvalBeforeNestingBit ??
+ * Answer: EvalBeforeNestingBit should be deprecated since we have the evaluators
+ */
+template<typename Derived,int n,typename PlainObject>
+struct nested_eval<ReturnByValue<Derived>, n, PlainObject>
+{
+  typedef typename traits<Derived>::ReturnType type;
+};
+
+} // end namespace internal
+
+/** \class ReturnByValue
+  * \ingroup Core_Module
+  *
+  */
+template<typename Derived> class ReturnByValue
+  : public internal::dense_xpr_base< ReturnByValue<Derived> >::type, internal::no_assignment_operator
+{
+  public:
+    typedef typename internal::traits<Derived>::ReturnType ReturnType;
+
+    typedef typename internal::dense_xpr_base<ReturnByValue>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(ReturnByValue)
+
+    template<typename Dest>
+    EIGEN_DEVICE_FUNC
+    inline void evalTo(Dest& dst) const
+    { static_cast<const Derived*>(this)->evalTo(dst); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return static_cast<const Derived*>(this)->rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return static_cast<const Derived*>(this)->cols(); }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+#define Unusable YOU_ARE_TRYING_TO_ACCESS_A_SINGLE_COEFFICIENT_IN_A_SPECIAL_EXPRESSION_WHERE_THAT_IS_NOT_ALLOWED_BECAUSE_THAT_WOULD_BE_INEFFICIENT
+    class Unusable{
+      Unusable(const Unusable&) {}
+      Unusable& operator=(const Unusable&) {return *this;}
+    };
+    const Unusable& coeff(Index) const { return *reinterpret_cast<const Unusable*>(this); }
+    const Unusable& coeff(Index,Index) const { return *reinterpret_cast<const Unusable*>(this); }
+    Unusable& coeffRef(Index) { return *reinterpret_cast<Unusable*>(this); }
+    Unusable& coeffRef(Index,Index) { return *reinterpret_cast<Unusable*>(this); }
+#undef Unusable
+#endif
+};
+
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
+{
+  other.evalTo(derived());
+  return derived();
+}
+
+namespace internal {
+
+// Expression is evaluated in a temporary; default implementation of Assignment is bypassed so that
+// when a ReturnByValue expression is assigned, the evaluator is not constructed.
+// TODO: Finalize port to new regime; ReturnByValue should not exist in the expression world
+
+template<typename Derived>
+struct evaluator<ReturnByValue<Derived> >
+  : public evaluator<typename internal::traits<Derived>::ReturnType>
+{
+  typedef ReturnByValue<Derived> XprType;
+  typedef typename internal::traits<Derived>::ReturnType PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
+    : m_result(xpr.rows(), xpr.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    xpr.evalTo(m_result);
+  }
+
+protected:
+  PlainObject m_result;
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_RETURNBYVALUE_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/Reverse.h b/src/3rdparty/eigen/Eigen/src/Core/Reverse.h
new file mode 100644
index 000000000..28cdd76ac
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/Reverse.h
@@ -0,0 +1,217 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2009 Ricard Marxer <email@ricardmarxer.com>
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_REVERSE_H
+#define EIGEN_REVERSE_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename MatrixType, int Direction>
+struct traits<Reverse<MatrixType, Direction> >
+ : traits<MatrixType>
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename traits<MatrixType>::StorageKind StorageKind;
+  typedef typename traits<MatrixType>::XprKind XprKind;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
+  typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
+    Flags = _MatrixTypeNested::Flags & (RowMajorBit | LvalueBit)
+  };
+};
+
+template<typename PacketType, bool ReversePacket> struct reverse_packet_cond
+{
+  static inline PacketType run(const PacketType& x) { return preverse(x); }
+};
+
+template<typename PacketType> struct reverse_packet_cond<PacketType,false>
+{
+  static inline PacketType run(const PacketType& x) { return x; }
+};
+
+} // end namespace internal
+
+/** \class Reverse
+  * \ingroup Core_Module
+  *
+  * \brief Expression of the reverse of a vector or matrix
+  *
+  * \tparam MatrixType the type of the object of which we are taking the reverse
+  * \tparam Direction defines the direction of the reverse operation, can be Vertical, Horizontal, or BothDirections
+  *
+  * This class represents an expression of the reverse of a vector.
+  * It is the return type of MatrixBase::reverse() and VectorwiseOp::reverse()
+  * and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::reverse(), VectorwiseOp::reverse()
+  */
+template<typename MatrixType, int Direction> class Reverse
+  : public internal::dense_xpr_base< Reverse<MatrixType, Direction> >::type
+{
+  public:
+
+    typedef typename internal::dense_xpr_base<Reverse>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Reverse)
+    typedef typename internal::remove_all<MatrixType>::type NestedExpression;
+    using Base::IsRowMajor;
+
+  protected:
+    enum {
+      PacketSize = internal::packet_traits<Scalar>::size,
+      IsColMajor = !IsRowMajor,
+      ReverseRow = (Direction == Vertical)   || (Direction == BothDirections),
+      ReverseCol = (Direction == Horizontal) || (Direction == BothDirections),
+      OffsetRow  = ReverseRow && IsColMajor ? PacketSize : 1,
+      OffsetCol  = ReverseCol && IsRowMajor ? PacketSize : 1,
+      ReversePacket = (Direction == BothDirections)
+                    || ((Direction == Vertical)   && IsColMajor)
+                    || ((Direction == Horizontal) && IsRowMajor)
+    };
+    typedef internal::reverse_packet_cond<PacketScalar,ReversePacket> reverse_packet;
+  public:
+
+    EIGEN_DEVICE_FUNC explicit inline Reverse(const MatrixType& matrix) : m_matrix(matrix) { }
+
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reverse)
+
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+
+    EIGEN_DEVICE_FUNC inline Index innerStride() const
+    {
+      return -m_matrix.innerStride();
+    }
+
+    EIGEN_DEVICE_FUNC const typename internal::remove_all<typename MatrixType::Nested>::type&
+    nestedExpression() const
+    {
+      return m_matrix;
+    }
+
+  protected:
+    typename MatrixType::Nested m_matrix;
+};
+
+/** \returns an expression of the reverse of *this.
+  *
+  * Example: \include MatrixBase_reverse.cpp
+  * Output: \verbinclude MatrixBase_reverse.out
+  *
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ReverseReturnType
+DenseBase<Derived>::reverse()
+{
+  return ReverseReturnType(derived());
+}
+
+
+//reverse const overload moved DenseBase.h due to a CUDA compiler bug
+
+/** This is the "in place" version of reverse: it reverses \c *this.
+  *
+  * In most cases it is probably better to simply use the reversed expression
+  * of a matrix. However, when reversing the matrix data itself is really needed,
+  * then this "in-place" version is probably the right choice because it provides
+  * the following additional benefits:
+  *  - less error prone: doing the same operation with .reverse() requires special care:
+  *    \code m = m.reverse().eval(); \endcode
+  *  - this API enables reverse operations without the need for a temporary
+  *  - it allows future optimizations (cache friendliness, etc.)
+  *
+  * \sa VectorwiseOp::reverseInPlace(), reverse() */
+template<typename Derived>
+EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::reverseInPlace()
+{
+  if(cols()>rows())
+  {
+    Index half = cols()/2;
+    leftCols(half).swap(rightCols(half).reverse());
+    if((cols()%2)==1)
+    {
+      Index half2 = rows()/2;
+      col(half).head(half2).swap(col(half).tail(half2).reverse());
+    }
+  }
+  else
+  {
+    Index half = rows()/2;
+    topRows(half).swap(bottomRows(half).reverse());
+    if((rows()%2)==1)
+    {
+      Index half2 = cols()/2;
+      row(half).head(half2).swap(row(half).tail(half2).reverse());
+    }
+  }
+}
+
+namespace internal {
+
+template<int Direction>
+struct vectorwise_reverse_inplace_impl;
+
+template<>
+struct vectorwise_reverse_inplace_impl<Vertical>
+{
+  template<typename ExpressionType>
+  static void run(ExpressionType &xpr)
+  {
+    const int HalfAtCompileTime = ExpressionType::RowsAtCompileTime==Dynamic?Dynamic:ExpressionType::RowsAtCompileTime/2;
+    Index half = xpr.rows()/2;
+    xpr.topRows(fix<HalfAtCompileTime>(half))
+       .swap(xpr.bottomRows(fix<HalfAtCompileTime>(half)).colwise().reverse());
+  }
+};
+
+template<>
+struct vectorwise_reverse_inplace_impl<Horizontal>
+{
+  template<typename ExpressionType>
+  static void run(ExpressionType &xpr)
+  {
+    const int HalfAtCompileTime = ExpressionType::ColsAtCompileTime==Dynamic?Dynamic:ExpressionType::ColsAtCompileTime/2;
+    Index half = xpr.cols()/2;
+    xpr.leftCols(fix<HalfAtCompileTime>(half))
+       .swap(xpr.rightCols(fix<HalfAtCompileTime>(half)).rowwise().reverse());
+  }
+};
+
+} // end namespace internal
+
+/** This is the "in place" version of VectorwiseOp::reverse: it reverses each column or row of \c *this.
+  *
+  * In most cases it is probably better to simply use the reversed expression
+  * of a matrix. However, when reversing the matrix data itself is really needed,
+  * then this "in-place" version is probably the right choice because it provides
+  * the following additional benefits:
+  *  - less error prone: doing the same operation with .reverse() requires special care:
+  *    \code m = m.reverse().eval(); \endcode
+  *  - this API enables reverse operations without the need for a temporary
+  *
+  * \sa DenseBase::reverseInPlace(), reverse() */
+template<typename ExpressionType, int Direction>
+EIGEN_DEVICE_FUNC void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
+{
+  internal::vectorwise_reverse_inplace_impl<Direction>::run(m_matrix);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_REVERSE_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/Select.h b/src/3rdparty/eigen/Eigen/src/Core/Select.h
new file mode 100644
index 000000000..7c86bf87c
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/Select.h
@@ -0,0 +1,164 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELECT_H
+#define EIGEN_SELECT_H
+
+namespace Eigen {
+
+/** \class Select
+  * \ingroup Core_Module
+  *
+  * \brief Expression of a coefficient wise version of the C++ ternary operator ?:
+  *
+  * \param ConditionMatrixType the type of the \em condition expression which must be a boolean matrix
+  * \param ThenMatrixType the type of the \em then expression
+  * \param ElseMatrixType the type of the \em else expression
+  *
+  * This class represents an expression of a coefficient wise version of the C++ ternary operator ?:.
+  * It is the return type of DenseBase::select() and most of the time this is the only way it is used.
+  *
+  * \sa DenseBase::select(const DenseBase<ThenDerived>&, const DenseBase<ElseDerived>&) const
+  */
+
+namespace internal {
+template<typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
+struct traits<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
+ : traits<ThenMatrixType>
+{
+  typedef typename traits<ThenMatrixType>::Scalar Scalar;
+  typedef Dense StorageKind;
+  typedef typename traits<ThenMatrixType>::XprKind XprKind;
+  typedef typename ConditionMatrixType::Nested ConditionMatrixNested;
+  typedef typename ThenMatrixType::Nested ThenMatrixNested;
+  typedef typename ElseMatrixType::Nested ElseMatrixNested;
+  enum {
+    RowsAtCompileTime = ConditionMatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = ConditionMatrixType::ColsAtCompileTime,
+    MaxRowsAtCompileTime = ConditionMatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = ConditionMatrixType::MaxColsAtCompileTime,
+    Flags = (unsigned int)ThenMatrixType::Flags & ElseMatrixType::Flags & RowMajorBit
+  };
+};
+}
+
+template<typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
+class Select : public internal::dense_xpr_base< Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >::type,
+               internal::no_assignment_operator
+{
+  public:
+
+    typedef typename internal::dense_xpr_base<Select>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Select)
+
+    inline EIGEN_DEVICE_FUNC
+    Select(const ConditionMatrixType& a_conditionMatrix,
+           const ThenMatrixType& a_thenMatrix,
+           const ElseMatrixType& a_elseMatrix)
+      : m_condition(a_conditionMatrix), m_then(a_thenMatrix), m_else(a_elseMatrix)
+    {
+      eigen_assert(m_condition.rows() == m_then.rows() && m_condition.rows() == m_else.rows());
+      eigen_assert(m_condition.cols() == m_then.cols() && m_condition.cols() == m_else.cols());
+    }
+
+    inline EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_condition.rows(); }
+    inline EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_condition.cols(); }
+
+    inline EIGEN_DEVICE_FUNC
+    const Scalar coeff(Index i, Index j) const
+    {
+      if (m_condition.coeff(i,j))
+        return m_then.coeff(i,j);
+      else
+        return m_else.coeff(i,j);
+    }
+
+    inline EIGEN_DEVICE_FUNC
+    const Scalar coeff(Index i) const
+    {
+      if (m_condition.coeff(i))
+        return m_then.coeff(i);
+      else
+        return m_else.coeff(i);
+    }
+
+    inline EIGEN_DEVICE_FUNC const ConditionMatrixType& conditionMatrix() const
+    {
+      return m_condition;
+    }
+
+    inline EIGEN_DEVICE_FUNC const ThenMatrixType& thenMatrix() const
+    {
+      return m_then;
+    }
+
+    inline EIGEN_DEVICE_FUNC const ElseMatrixType& elseMatrix() const
+    {
+      return m_else;
+    }
+
+  protected:
+    typename ConditionMatrixType::Nested m_condition;
+    typename ThenMatrixType::Nested m_then;
+    typename ElseMatrixType::Nested m_else;
+};
+
+
+/** \returns a matrix where each coefficient (i,j) is equal to \a thenMatrix(i,j)
+  * if \c *this(i,j), and \a elseMatrix(i,j) otherwise.
+  *
+  * Example: \include MatrixBase_select.cpp
+  * Output: \verbinclude MatrixBase_select.out
+  *
+  * \sa class Select
+  */
+template<typename Derived>
+template<typename ThenDerived,typename ElseDerived>
+inline EIGEN_DEVICE_FUNC const Select<Derived,ThenDerived,ElseDerived>
+DenseBase<Derived>::select(const DenseBase<ThenDerived>& thenMatrix,
+                            const DenseBase<ElseDerived>& elseMatrix) const
+{
+  return Select<Derived,ThenDerived,ElseDerived>(derived(), thenMatrix.derived(), elseMatrix.derived());
+}
+
+/** Version of DenseBase::select(const DenseBase&, const DenseBase&) with
+  * the \em else expression being a scalar value.
+  *
+  * \sa DenseBase::select(const DenseBase<ThenDerived>&, const DenseBase<ElseDerived>&) const, class Select
+  */
+template<typename Derived>
+template<typename ThenDerived>
+inline EIGEN_DEVICE_FUNC const Select<Derived,ThenDerived, typename ThenDerived::ConstantReturnType>
+DenseBase<Derived>::select(const DenseBase<ThenDerived>& thenMatrix,
+                           const typename ThenDerived::Scalar& elseScalar) const
+{
+  return Select<Derived,ThenDerived,typename ThenDerived::ConstantReturnType>(
+    derived(), thenMatrix.derived(), ThenDerived::Constant(rows(),cols(),elseScalar));
+}
+
+/** Version of DenseBase::select(const DenseBase&, const DenseBase&) with
+  * the \em then expression being a scalar value.
+  *
+  * \sa DenseBase::select(const DenseBase<ThenDerived>&, const DenseBase<ElseDerived>&) const, class Select
+  */
+template<typename Derived>
+template<typename ElseDerived>
+inline EIGEN_DEVICE_FUNC const Select<Derived, typename ElseDerived::ConstantReturnType, ElseDerived >
+DenseBase<Derived>::select(const typename ElseDerived::Scalar& thenScalar,
+                           const DenseBase<ElseDerived>& elseMatrix) const
+{
+  return Select<Derived,typename ElseDerived::ConstantReturnType,ElseDerived>(
+    derived(), ElseDerived::Constant(rows(),cols(),thenScalar), elseMatrix.derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELECT_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/SelfAdjointView.h b/src/3rdparty/eigen/Eigen/src/Core/SelfAdjointView.h
new file mode 100644
index 000000000..8ce3b372a
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/SelfAdjointView.h
@@ -0,0 +1,365 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFADJOINTMATRIX_H
+#define EIGEN_SELFADJOINTMATRIX_H
+
+namespace Eigen {
+
+/** \class SelfAdjointView
+  * \ingroup Core_Module
+  *
+  *
+  * \brief Expression of a selfadjoint matrix from a triangular part of a dense matrix
+  *
+  * \param MatrixType the type of the dense matrix storing the coefficients
+  * \param TriangularPart can be either \c #Lower or \c #Upper
+  *
+  * This class is an expression of a sefladjoint matrix from a triangular part of a matrix
+  * with given dense storage of the coefficients. It is the return type of MatrixBase::selfadjointView()
+  * and most of the time this is the only way that it is used.
+  *
+  * \sa class TriangularBase, MatrixBase::selfadjointView()
+  */
+
+namespace internal {
+template<typename MatrixType, unsigned int UpLo>
+struct traits<SelfAdjointView<MatrixType, UpLo> > : traits<MatrixType>
+{
+  typedef typename ref_selector<MatrixType>::non_const_type MatrixTypeNested;
+  typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
+  typedef MatrixType ExpressionType;
+  typedef typename MatrixType::PlainObject FullMatrixType;
+  enum {
+    Mode = UpLo | SelfAdjoint,
+    FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
+    Flags =  MatrixTypeNestedCleaned::Flags & (HereditaryBits|FlagsLvalueBit)
+           & (~(PacketAccessBit | DirectAccessBit | LinearAccessBit)) // FIXME these flags should be preserved
+  };
+};
+}
+
+
+template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
+  : public TriangularBase<SelfAdjointView<_MatrixType, UpLo> >
+{
+  public:
+
+    typedef _MatrixType MatrixType;
+    typedef TriangularBase<SelfAdjointView> Base;
+    typedef typename internal::traits<SelfAdjointView>::MatrixTypeNested MatrixTypeNested;
+    typedef typename internal::traits<SelfAdjointView>::MatrixTypeNestedCleaned MatrixTypeNestedCleaned;
+    typedef MatrixTypeNestedCleaned NestedExpression;
+
+    /** \brief The type of coefficients in this matrix */
+    typedef typename internal::traits<SelfAdjointView>::Scalar Scalar;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
+    typedef SelfAdjointView<typename internal::add_const<MatrixType>::type, UpLo> ConstSelfAdjointView;
+
+    enum {
+      Mode = internal::traits<SelfAdjointView>::Mode,
+      Flags = internal::traits<SelfAdjointView>::Flags,
+      TransposeMode = ((int(Mode) & int(Upper)) ? Lower : 0) | ((int(Mode) & int(Lower)) ? Upper : 0)
+    };
+    typedef typename MatrixType::PlainObject PlainObject;
+
+    EIGEN_DEVICE_FUNC
+    explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
+    {
+      EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper,SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY);
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return m_matrix.outerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT { return m_matrix.innerStride(); }
+
+    /** \sa MatrixBase::coeff()
+      * \warning the coordinates must fit into the referenced triangular part
+      */
+    EIGEN_DEVICE_FUNC
+    inline Scalar coeff(Index row, Index col) const
+    {
+      Base::check_coordinates_internal(row, col);
+      return m_matrix.coeff(row, col);
+    }
+
+    /** \sa MatrixBase::coeffRef()
+      * \warning the coordinates must fit into the referenced triangular part
+      */
+    EIGEN_DEVICE_FUNC
+    inline Scalar& coeffRef(Index row, Index col)
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(SelfAdjointView);
+      Base::check_coordinates_internal(row, col);
+      return m_matrix.coeffRef(row, col);
+    }
+
+    /** \internal */
+    EIGEN_DEVICE_FUNC
+    const MatrixTypeNestedCleaned& _expression() const { return m_matrix; }
+
+    EIGEN_DEVICE_FUNC
+    const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
+    EIGEN_DEVICE_FUNC
+    MatrixTypeNestedCleaned& nestedExpression() { return m_matrix; }
+
+    /** Efficient triangular matrix times vector/matrix product */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    const Product<SelfAdjointView,OtherDerived>
+    operator*(const MatrixBase<OtherDerived>& rhs) const
+    {
+      return Product<SelfAdjointView,OtherDerived>(*this, rhs.derived());
+    }
+
+    /** Efficient vector/matrix times triangular matrix product */
+    template<typename OtherDerived> friend
+    EIGEN_DEVICE_FUNC
+    const Product<OtherDerived,SelfAdjointView>
+    operator*(const MatrixBase<OtherDerived>& lhs, const SelfAdjointView& rhs)
+    {
+      return Product<OtherDerived,SelfAdjointView>(lhs.derived(),rhs);
+    }
+
+    friend EIGEN_DEVICE_FUNC
+    const SelfAdjointView<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,MatrixType,product),UpLo>
+    operator*(const Scalar& s, const SelfAdjointView& mat)
+    {
+      return (s*mat.nestedExpression()).template selfadjointView<UpLo>();
+    }
+
+    /** Perform a symmetric rank 2 update of the selfadjoint matrix \c *this:
+      * \f$ this = this + \alpha u v^* + conj(\alpha) v u^* \f$
+      * \returns a reference to \c *this
+      *
+      * The vectors \a u and \c v \b must be column vectors, however they can be
+      * a adjoint expression without any overhead. Only the meaningful triangular
+      * part of the matrix is updated, the rest is left unchanged.
+      *
+      * \sa rankUpdate(const MatrixBase<DerivedU>&, Scalar)
+      */
+    template<typename DerivedU, typename DerivedV>
+    EIGEN_DEVICE_FUNC
+    SelfAdjointView& rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha = Scalar(1));
+
+    /** Perform a symmetric rank K update of the selfadjoint matrix \c *this:
+      * \f$ this = this + \alpha ( u u^* ) \f$ where \a u is a vector or matrix.
+      *
+      * \returns a reference to \c *this
+      *
+      * Note that to perform \f$ this = this + \alpha ( u^* u ) \f$ you can simply
+      * call this function with u.adjoint().
+      *
+      * \sa rankUpdate(const MatrixBase<DerivedU>&, const MatrixBase<DerivedV>&, Scalar)
+      */
+    template<typename DerivedU>
+    EIGEN_DEVICE_FUNC
+    SelfAdjointView& rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha = Scalar(1));
+
+    /** \returns an expression of a triangular view extracted from the current selfadjoint view of a given triangular part
+      *
+      * The parameter \a TriMode can have the following values: \c #Upper, \c #StrictlyUpper, \c #UnitUpper,
+      * \c #Lower, \c #StrictlyLower, \c #UnitLower.
+      *
+      * If \c TriMode references the same triangular part than \c *this, then this method simply return a \c TriangularView of the nested expression,
+      * otherwise, the nested expression is first transposed, thus returning a \c TriangularView<Transpose<MatrixType>> object.
+      *
+      * \sa MatrixBase::triangularView(), class TriangularView
+      */
+    template<unsigned int TriMode>
+    EIGEN_DEVICE_FUNC
+    typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)),
+                                   TriangularView<MatrixType,TriMode>,
+                                   TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type
+    triangularView() const
+    {
+      typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)), MatrixType&, typename MatrixType::ConstTransposeReturnType>::type tmp1(m_matrix);
+      typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)), MatrixType&, typename MatrixType::AdjointReturnType>::type tmp2(tmp1);
+      return typename internal::conditional<(TriMode&(Upper|Lower))==(UpLo&(Upper|Lower)),
+                                   TriangularView<MatrixType,TriMode>,
+                                   TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type(tmp2);
+    }
+
+    typedef SelfAdjointView<const MatrixConjugateReturnType,UpLo> ConjugateReturnType;
+    /** \sa MatrixBase::conjugate() const */
+    EIGEN_DEVICE_FUNC
+    inline const ConjugateReturnType conjugate() const
+    { return ConjugateReturnType(m_matrix.conjugate()); }
+
+    /** \returns an expression of the complex conjugate of \c *this if Cond==true,
+     *           returns \c *this otherwise.
+     */
+    template<bool Cond>
+    EIGEN_DEVICE_FUNC
+    inline typename internal::conditional<Cond,ConjugateReturnType,ConstSelfAdjointView>::type
+    conjugateIf() const
+    {
+      typedef typename internal::conditional<Cond,ConjugateReturnType,ConstSelfAdjointView>::type ReturnType;
+      return ReturnType(m_matrix.template conjugateIf<Cond>());
+    }
+
+    typedef SelfAdjointView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType;
+    /** \sa MatrixBase::adjoint() const */
+    EIGEN_DEVICE_FUNC
+    inline const AdjointReturnType adjoint() const
+    { return AdjointReturnType(m_matrix.adjoint()); }
+
+    typedef SelfAdjointView<typename MatrixType::TransposeReturnType,TransposeMode> TransposeReturnType;
+     /** \sa MatrixBase::transpose() */
+    EIGEN_DEVICE_FUNC
+    inline TransposeReturnType transpose()
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
+      typename MatrixType::TransposeReturnType tmp(m_matrix);
+      return TransposeReturnType(tmp);
+    }
+
+    typedef SelfAdjointView<const typename MatrixType::ConstTransposeReturnType,TransposeMode> ConstTransposeReturnType;
+    /** \sa MatrixBase::transpose() const */
+    EIGEN_DEVICE_FUNC
+    inline const ConstTransposeReturnType transpose() const
+    {
+      return ConstTransposeReturnType(m_matrix.transpose());
+    }
+
+    /** \returns a const expression of the main diagonal of the matrix \c *this
+      *
+      * This method simply returns the diagonal of the nested expression, thus by-passing the SelfAdjointView decorator.
+      *
+      * \sa MatrixBase::diagonal(), class Diagonal */
+    EIGEN_DEVICE_FUNC
+    typename MatrixType::ConstDiagonalReturnType diagonal() const
+    {
+      return typename MatrixType::ConstDiagonalReturnType(m_matrix);
+    }
+
+/////////// Cholesky module ///////////
+
+    const LLT<PlainObject, UpLo> llt() const;
+    const LDLT<PlainObject, UpLo> ldlt() const;
+
+/////////// Eigenvalue module ///////////
+
+    /** Real part of #Scalar */
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    /** Return type of eigenvalues() */
+    typedef Matrix<RealScalar, internal::traits<MatrixType>::ColsAtCompileTime, 1> EigenvaluesReturnType;
+
+    EIGEN_DEVICE_FUNC
+    EigenvaluesReturnType eigenvalues() const;
+    EIGEN_DEVICE_FUNC
+    RealScalar operatorNorm() const;
+
+  protected:
+    MatrixTypeNested m_matrix;
+};
+
+
+// template<typename OtherDerived, typename MatrixType, unsigned int UpLo>
+// internal::selfadjoint_matrix_product_returntype<OtherDerived,SelfAdjointView<MatrixType,UpLo> >
+// operator*(const MatrixBase<OtherDerived>& lhs, const SelfAdjointView<MatrixType,UpLo>& rhs)
+// {
+//   return internal::matrix_selfadjoint_product_returntype<OtherDerived,SelfAdjointView<MatrixType,UpLo> >(lhs.derived(),rhs);
+// }
+
+// selfadjoint to dense matrix
+
+namespace internal {
+
+// TODO currently a selfadjoint expression has the form SelfAdjointView<.,.>
+//      in the future selfadjoint-ness should be defined by the expression traits
+//      such that Transpose<SelfAdjointView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to make it work)
+template<typename MatrixType, unsigned int Mode>
+struct evaluator_traits<SelfAdjointView<MatrixType,Mode> >
+{
+  typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
+  typedef SelfAdjointShape Shape;
+};
+
+template<int UpLo, int SetOpposite, typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version>
+class triangular_dense_assignment_kernel<UpLo,SelfAdjoint,SetOpposite,DstEvaluatorTypeT,SrcEvaluatorTypeT,Functor,Version>
+  : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version>
+{
+protected:
+  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version> Base;
+  typedef typename Base::DstXprType DstXprType;
+  typedef typename Base::SrcXprType SrcXprType;
+  using Base::m_dst;
+  using Base::m_src;
+  using Base::m_functor;
+public:
+
+  typedef typename Base::DstEvaluatorType DstEvaluatorType;
+  typedef typename Base::SrcEvaluatorType SrcEvaluatorType;
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::AssignmentTraits AssignmentTraits;
+
+
+  EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
+    : Base(dst, src, func, dstExpr)
+  {}
+
+  EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
+  {
+    eigen_internal_assert(row!=col);
+    Scalar tmp = m_src.coeff(row,col);
+    m_functor.assignCoeff(m_dst.coeffRef(row,col), tmp);
+    m_functor.assignCoeff(m_dst.coeffRef(col,row), numext::conj(tmp));
+  }
+
+  EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id)
+  {
+    Base::assignCoeff(id,id);
+  }
+
+  EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index, Index)
+  { eigen_internal_assert(false && "should never be called"); }
+};
+
+} // end namespace internal
+
+/***************************************************************************
+* Implementation of MatrixBase methods
+***************************************************************************/
+
+/** This is the const version of MatrixBase::selfadjointView() */
+template<typename Derived>
+template<unsigned int UpLo>
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
+MatrixBase<Derived>::selfadjointView() const
+{
+  return typename ConstSelfAdjointViewReturnType<UpLo>::Type(derived());
+}
+
+/** \returns an expression of a symmetric/self-adjoint view extracted from the upper or lower triangular part of the current matrix
+  *
+  * The parameter \a UpLo can be either \c #Upper or \c #Lower
+  *
+  * Example: \include MatrixBase_selfadjointView.cpp
+  * Output: \verbinclude MatrixBase_selfadjointView.out
+  *
+  * \sa class SelfAdjointView
+  */
+template<typename Derived>
+template<unsigned int UpLo>
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
+MatrixBase<Derived>::selfadjointView()
+{
+  return typename SelfAdjointViewReturnType<UpLo>::Type(derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINTMATRIX_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h b/src/3rdparty/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h
new file mode 100644
index 000000000..7c89c2e23
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h
@@ -0,0 +1,47 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFCWISEBINARYOP_H
+#define EIGEN_SELFCWISEBINARYOP_H
+
+namespace Eigen { 
+
+// TODO generalize the scalar type of 'other'
+
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
+{
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar,Scalar>());
+  return derived();
+}
+
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
+{
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar,Scalar>());
+  return derived();
+}
+
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
+{
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar,Scalar>());
+  return derived();
+}
+
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
+{
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar,Scalar>());
+  return derived();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFCWISEBINARYOP_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/Solve.h b/src/3rdparty/eigen/Eigen/src/Core/Solve.h
new file mode 100644
index 000000000..23d5cb707
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/Solve.h
@@ -0,0 +1,188 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SOLVE_H
+#define EIGEN_SOLVE_H
+
+namespace Eigen {
+
+template<typename Decomposition, typename RhsType, typename StorageKind> class SolveImpl;
+
+/** \class Solve
+  * \ingroup Core_Module
+  *
+  * \brief Pseudo expression representing a solving operation
+  *
+  * \tparam Decomposition the type of the matrix or decomposition object
+  * \tparam Rhstype the type of the right-hand side
+  *
+  * This class represents an expression of A.solve(B)
+  * and most of the time this is the only way it is used.
+  *
+  */
+namespace internal {
+
+// this solve_traits class permits to determine the evaluation type with respect to storage kind (Dense vs Sparse)
+template<typename Decomposition, typename RhsType,typename StorageKind> struct solve_traits;
+
+template<typename Decomposition, typename RhsType>
+struct solve_traits<Decomposition,RhsType,Dense>
+{
+  typedef typename make_proper_matrix_type<typename RhsType::Scalar,
+                 Decomposition::ColsAtCompileTime,
+                 RhsType::ColsAtCompileTime,
+                 RhsType::PlainObject::Options,
+                 Decomposition::MaxColsAtCompileTime,
+                 RhsType::MaxColsAtCompileTime>::type PlainObject;
+};
+
+template<typename Decomposition, typename RhsType>
+struct traits<Solve<Decomposition, RhsType> >
+  : traits<typename solve_traits<Decomposition,RhsType,typename internal::traits<RhsType>::StorageKind>::PlainObject>
+{
+  typedef typename solve_traits<Decomposition,RhsType,typename internal::traits<RhsType>::StorageKind>::PlainObject PlainObject;
+  typedef typename promote_index_type<typename Decomposition::StorageIndex, typename RhsType::StorageIndex>::type StorageIndex;
+  typedef traits<PlainObject> BaseTraits;
+  enum {
+    Flags = BaseTraits::Flags & RowMajorBit,
+    CoeffReadCost = HugeCost
+  };
+};
+
+}
+
+
+template<typename Decomposition, typename RhsType>
+class Solve : public SolveImpl<Decomposition,RhsType,typename internal::traits<RhsType>::StorageKind>
+{
+public:
+  typedef typename internal::traits<Solve>::PlainObject PlainObject;
+  typedef typename internal::traits<Solve>::StorageIndex StorageIndex;
+
+  Solve(const Decomposition &dec, const RhsType &rhs)
+    : m_dec(dec), m_rhs(rhs)
+  {}
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dec.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
+
+  EIGEN_DEVICE_FUNC const Decomposition& dec() const { return m_dec; }
+  EIGEN_DEVICE_FUNC const RhsType&       rhs() const { return m_rhs; }
+
+protected:
+  const Decomposition &m_dec;
+  const RhsType       &m_rhs;
+};
+
+
+// Specialization of the Solve expression for dense results
+template<typename Decomposition, typename RhsType>
+class SolveImpl<Decomposition,RhsType,Dense>
+  : public MatrixBase<Solve<Decomposition,RhsType> >
+{
+  typedef Solve<Decomposition,RhsType> Derived;
+
+public:
+
+  typedef MatrixBase<Solve<Decomposition,RhsType> > Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
+
+private:
+
+  Scalar coeff(Index row, Index col) const;
+  Scalar coeff(Index i) const;
+};
+
+// Generic API dispatcher
+template<typename Decomposition, typename RhsType, typename StorageKind>
+class SolveImpl : public internal::generic_xpr_base<Solve<Decomposition,RhsType>, MatrixXpr, StorageKind>::type
+{
+  public:
+    typedef typename internal::generic_xpr_base<Solve<Decomposition,RhsType>, MatrixXpr, StorageKind>::type Base;
+};
+
+namespace internal {
+
+// Evaluator of Solve -> eval into a temporary
+template<typename Decomposition, typename RhsType>
+struct evaluator<Solve<Decomposition,RhsType> >
+  : public evaluator<typename Solve<Decomposition,RhsType>::PlainObject>
+{
+  typedef Solve<Decomposition,RhsType> SolveType;
+  typedef typename SolveType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  enum { Flags = Base::Flags | EvalBeforeNestingBit };
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const SolveType& solve)
+    : m_result(solve.rows(), solve.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    solve.dec()._solve_impl(solve.rhs(), m_result);
+  }
+
+protected:
+  PlainObject m_result;
+};
+
+// Specialization for "dst = dec.solve(rhs)"
+// NOTE we need to specialize it for Dense2Dense to avoid ambiguous specialization error and a Sparse2Sparse specialization must exist somewhere
+template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
+struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
+{
+  typedef Solve<DecType,RhsType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    src.dec()._solve_impl(src.rhs(), dst);
+  }
+};
+
+// Specialization for "dst = dec.transpose().solve(rhs)"
+template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
+struct Assignment<DstXprType, Solve<Transpose<const DecType>,RhsType>, internal::assign_op<Scalar,Scalar>, Dense2Dense>
+{
+  typedef Solve<Transpose<const DecType>,RhsType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    src.dec().nestedExpression().template _solve_impl_transposed<false>(src.rhs(), dst);
+  }
+};
+
+// Specialization for "dst = dec.adjoint().solve(rhs)"
+template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
+struct Assignment<DstXprType, Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,RhsType>,
+                  internal::assign_op<Scalar,Scalar>, Dense2Dense>
+{
+  typedef Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,RhsType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    src.dec().nestedExpression().nestedExpression().template _solve_impl_transposed<true>(src.rhs(), dst);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SOLVE_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/SolveTriangular.h b/src/3rdparty/eigen/Eigen/src/Core/SolveTriangular.h
new file mode 100644
index 000000000..dfbf99523
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/SolveTriangular.h
@@ -0,0 +1,235 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SOLVETRIANGULAR_H
+#define EIGEN_SOLVETRIANGULAR_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Forward declarations:
+// The following two routines are implemented in the products/TriangularSolver*.h files
+template<typename LhsScalar, typename RhsScalar, typename Index, int Side, int Mode, bool Conjugate, int StorageOrder>
+struct triangular_solve_vector;
+
+template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherStorageOrder, int OtherInnerStride>
+struct triangular_solve_matrix;
+
+// small helper struct extracting some traits on the underlying solver operation
+template<typename Lhs, typename Rhs, int Side>
+class trsolve_traits
+{
+  private:
+    enum {
+      RhsIsVectorAtCompileTime = (Side==OnTheLeft ? Rhs::ColsAtCompileTime : Rhs::RowsAtCompileTime)==1
+    };
+  public:
+    enum {
+      Unrolling   = (RhsIsVectorAtCompileTime && Rhs::SizeAtCompileTime != Dynamic && Rhs::SizeAtCompileTime <= 8)
+                  ? CompleteUnrolling : NoUnrolling,
+      RhsVectors  = RhsIsVectorAtCompileTime ? 1 : Dynamic
+    };
+};
+
+template<typename Lhs, typename Rhs,
+  int Side, // can be OnTheLeft/OnTheRight
+  int Mode, // can be Upper/Lower | UnitDiag
+  int Unrolling = trsolve_traits<Lhs,Rhs,Side>::Unrolling,
+  int RhsVectors = trsolve_traits<Lhs,Rhs,Side>::RhsVectors
+  >
+struct triangular_solver_selector;
+
+template<typename Lhs, typename Rhs, int Side, int Mode>
+struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,1>
+{
+  typedef typename Lhs::Scalar LhsScalar;
+  typedef typename Rhs::Scalar RhsScalar;
+  typedef blas_traits<Lhs> LhsProductTraits;
+  typedef typename LhsProductTraits::ExtractType ActualLhsType;
+  typedef Map<Matrix<RhsScalar,Dynamic,1>, Aligned> MappedRhs;
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs)
+  {
+    ActualLhsType actualLhs = LhsProductTraits::extract(lhs);
+
+    // FIXME find a way to allow an inner stride if packet_traits<Scalar>::size==1
+
+    bool useRhsDirectly = Rhs::InnerStrideAtCompileTime==1 || rhs.innerStride()==1;
+
+    ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhs,rhs.size(),
+                                                  (useRhsDirectly ? rhs.data() : 0));
+
+    if(!useRhsDirectly)
+      MappedRhs(actualRhs,rhs.size()) = rhs;
+
+    triangular_solve_vector<LhsScalar, RhsScalar, Index, Side, Mode, LhsProductTraits::NeedToConjugate,
+                            (int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor>
+      ::run(actualLhs.cols(), actualLhs.data(), actualLhs.outerStride(), actualRhs);
+
+    if(!useRhsDirectly)
+      rhs = MappedRhs(actualRhs, rhs.size());
+  }
+};
+
+// the rhs is a matrix
+template<typename Lhs, typename Rhs, int Side, int Mode>
+struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
+{
+  typedef typename Rhs::Scalar Scalar;
+  typedef blas_traits<Lhs> LhsProductTraits;
+  typedef typename LhsProductTraits::DirectLinearAccessType ActualLhsType;
+
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs)
+  {
+    typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsProductTraits::extract(lhs);
+
+    const Index size = lhs.rows();
+    const Index othersize = Side==OnTheLeft? rhs.cols() : rhs.rows();
+
+    typedef internal::gemm_blocking_space<(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
+              Rhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxRowsAtCompileTime,4> BlockingType;
+
+    BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false);
+
+    triangular_solve_matrix<Scalar,Index,Side,Mode,LhsProductTraits::NeedToConjugate,(int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor,
+                               (Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor, Rhs::InnerStrideAtCompileTime>
+      ::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.innerStride(), rhs.outerStride(), blocking);
+  }
+};
+
+/***************************************************************************
+* meta-unrolling implementation
+***************************************************************************/
+
+template<typename Lhs, typename Rhs, int Mode, int LoopIndex, int Size,
+         bool Stop = LoopIndex==Size>
+struct triangular_solver_unroller;
+
+template<typename Lhs, typename Rhs, int Mode, int LoopIndex, int Size>
+struct triangular_solver_unroller<Lhs,Rhs,Mode,LoopIndex,Size,false> {
+  enum {
+    IsLower = ((Mode&Lower)==Lower),
+    DiagIndex  = IsLower ? LoopIndex : Size - LoopIndex - 1,
+    StartIndex = IsLower ? 0         : DiagIndex+1
+  };
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs)
+  {
+    if (LoopIndex>0)
+      rhs.coeffRef(DiagIndex) -= lhs.row(DiagIndex).template segment<LoopIndex>(StartIndex).transpose()
+                                .cwiseProduct(rhs.template segment<LoopIndex>(StartIndex)).sum();
+
+    if(!(Mode & UnitDiag))
+      rhs.coeffRef(DiagIndex) /= lhs.coeff(DiagIndex,DiagIndex);
+
+    triangular_solver_unroller<Lhs,Rhs,Mode,LoopIndex+1,Size>::run(lhs,rhs);
+  }
+};
+
+template<typename Lhs, typename Rhs, int Mode, int LoopIndex, int Size>
+struct triangular_solver_unroller<Lhs,Rhs,Mode,LoopIndex,Size,true> {
+  static EIGEN_DEVICE_FUNC void run(const Lhs&, Rhs&) {}
+};
+
+template<typename Lhs, typename Rhs, int Mode>
+struct triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,CompleteUnrolling,1> {
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs)
+  { triangular_solver_unroller<Lhs,Rhs,Mode,0,Rhs::SizeAtCompileTime>::run(lhs,rhs); }
+};
+
+template<typename Lhs, typename Rhs, int Mode>
+struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs)
+  {
+    Transpose<const Lhs> trLhs(lhs);
+    Transpose<Rhs> trRhs(rhs);
+
+    triangular_solver_unroller<Transpose<const Lhs>,Transpose<Rhs>,
+                              ((Mode&Upper)==Upper ? Lower : Upper) | (Mode&UnitDiag),
+                              0,Rhs::SizeAtCompileTime>::run(trLhs,trRhs);
+  }
+};
+
+} // end namespace internal
+
+/***************************************************************************
+* TriangularView methods
+***************************************************************************/
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename MatrixType, unsigned int Mode>
+template<int Side, typename OtherDerived>
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
+{
+  OtherDerived& other = _other.const_cast_derived();
+  eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );
+  eigen_assert((!(int(Mode) & int(ZeroDiag))) && bool(int(Mode) & (int(Upper) | int(Lower))));
+  // If solving for a 0x0 matrix, nothing to do, simply return.
+  if (derived().cols() == 0)
+    return;
+
+  enum { copy = (internal::traits<OtherDerived>::Flags & RowMajorBit)  && OtherDerived::IsVectorAtCompileTime && OtherDerived::SizeAtCompileTime!=1};
+  typedef typename internal::conditional<copy,
+    typename internal::plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&>::type OtherCopy;
+  OtherCopy otherCopy(other);
+
+  internal::triangular_solver_selector<MatrixType, typename internal::remove_reference<OtherCopy>::type,
+    Side, Mode>::run(derived().nestedExpression(), otherCopy);
+
+  if (copy)
+    other = otherCopy;
+}
+
+template<typename Derived, unsigned int Mode>
+template<int Side, typename Other>
+const internal::triangular_solve_retval<Side,TriangularView<Derived,Mode>,Other>
+TriangularViewImpl<Derived,Mode,Dense>::solve(const MatrixBase<Other>& other) const
+{
+  return internal::triangular_solve_retval<Side,TriangularViewType,Other>(derived(), other.derived());
+}
+#endif
+
+namespace internal {
+
+
+template<int Side, typename TriangularType, typename Rhs>
+struct traits<triangular_solve_retval<Side, TriangularType, Rhs> >
+{
+  typedef typename internal::plain_matrix_type_column_major<Rhs>::type ReturnType;
+};
+
+template<int Side, typename TriangularType, typename Rhs> struct triangular_solve_retval
+ : public ReturnByValue<triangular_solve_retval<Side, TriangularType, Rhs> >
+{
+  typedef typename remove_all<typename Rhs::Nested>::type RhsNestedCleaned;
+  typedef ReturnByValue<triangular_solve_retval> Base;
+
+  triangular_solve_retval(const TriangularType& tri, const Rhs& rhs)
+    : m_triangularMatrix(tri), m_rhs(rhs)
+  {}
+
+  inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_rhs.rows(); }
+  inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
+
+  template<typename Dest> inline void evalTo(Dest& dst) const
+  {
+    if(!is_same_dense(dst,m_rhs))
+      dst = m_rhs;
+    m_triangularMatrix.template solveInPlace<Side>(dst);
+  }
+
+  protected:
+    const TriangularType& m_triangularMatrix;
+    typename Rhs::Nested m_rhs;
+};
+
+} // namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SOLVETRIANGULAR_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/SolverBase.h b/src/3rdparty/eigen/Eigen/src/Core/SolverBase.h
new file mode 100644
index 000000000..501461042
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/SolverBase.h
@@ -0,0 +1,168 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SOLVERBASE_H
+#define EIGEN_SOLVERBASE_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename Derived>
+struct solve_assertion {
+    template<bool Transpose_, typename Rhs>
+    static void run(const Derived& solver, const Rhs& b) { solver.template _check_solve_assertion<Transpose_>(b); }
+};
+
+template<typename Derived>
+struct solve_assertion<Transpose<Derived> >
+{
+    typedef Transpose<Derived> type;
+
+    template<bool Transpose_, typename Rhs>
+    static void run(const type& transpose, const Rhs& b)
+    {
+        internal::solve_assertion<typename internal::remove_all<Derived>::type>::template run<true>(transpose.nestedExpression(), b);
+    }
+};
+
+template<typename Scalar, typename Derived>
+struct solve_assertion<CwiseUnaryOp<Eigen::internal::scalar_conjugate_op<Scalar>, const Transpose<Derived> > >
+{
+    typedef CwiseUnaryOp<Eigen::internal::scalar_conjugate_op<Scalar>, const Transpose<Derived> > type;
+
+    template<bool Transpose_, typename Rhs>
+    static void run(const type& adjoint, const Rhs& b)
+    {
+        internal::solve_assertion<typename internal::remove_all<Transpose<Derived> >::type>::template run<true>(adjoint.nestedExpression(), b);
+    }
+};
+} // end namespace internal
+
+/** \class SolverBase
+  * \brief A base class for matrix decomposition and solvers
+  *
+  * \tparam Derived the actual type of the decomposition/solver.
+  *
+  * Any matrix decomposition inheriting this base class provide the following API:
+  *
+  * \code
+  * MatrixType A, b, x;
+  * DecompositionType dec(A);
+  * x = dec.solve(b);             // solve A   * x = b
+  * x = dec.transpose().solve(b); // solve A^T * x = b
+  * x = dec.adjoint().solve(b);   // solve A'  * x = b
+  * \endcode
+  *
+  * \warning Currently, any other usage of transpose() and adjoint() are not supported and will produce compilation errors.
+  *
+  * \sa class PartialPivLU, class FullPivLU, class HouseholderQR, class ColPivHouseholderQR, class FullPivHouseholderQR, class CompleteOrthogonalDecomposition, class LLT, class LDLT, class SVDBase
+  */
+template<typename Derived>
+class SolverBase : public EigenBase<Derived>
+{
+  public:
+
+    typedef EigenBase<Derived> Base;
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef Scalar CoeffReturnType;
+
+    template<typename Derived_>
+    friend struct internal::solve_assertion;
+
+    enum {
+      RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
+      ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+      SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
+                                                          internal::traits<Derived>::ColsAtCompileTime>::ret),
+      MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
+      MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
+                                                             internal::traits<Derived>::MaxColsAtCompileTime>::ret),
+      IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1
+                           || internal::traits<Derived>::MaxColsAtCompileTime == 1,
+      NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2
+    };
+
+    /** Default constructor */
+    SolverBase()
+    {}
+
+    ~SolverBase()
+    {}
+
+    using Base::derived;
+
+    /** \returns an expression of the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      */
+    template<typename Rhs>
+    inline const Solve<Derived, Rhs>
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      internal::solve_assertion<typename internal::remove_all<Derived>::type>::template run<false>(derived(), b);
+      return Solve<Derived, Rhs>(derived(), b.derived());
+    }
+
+    /** \internal the return type of transpose() */
+    typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
+    /** \returns an expression of the transposed of the factored matrix.
+      *
+      * A typical usage is to solve for the transposed problem A^T x = b:
+      * \code x = dec.transpose().solve(b); \endcode
+      *
+      * \sa adjoint(), solve()
+      */
+    inline ConstTransposeReturnType transpose() const
+    {
+      return ConstTransposeReturnType(derived());
+    }
+
+    /** \internal the return type of adjoint() */
+    typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
+                        CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, ConstTransposeReturnType>,
+                        ConstTransposeReturnType
+                     >::type AdjointReturnType;
+    /** \returns an expression of the adjoint of the factored matrix
+      *
+      * A typical usage is to solve for the adjoint problem A' x = b:
+      * \code x = dec.adjoint().solve(b); \endcode
+      *
+      * For real scalar types, this function is equivalent to transpose().
+      *
+      * \sa transpose(), solve()
+      */
+    inline AdjointReturnType adjoint() const
+    {
+      return AdjointReturnType(derived().transpose());
+    }
+
+  protected:
+
+    template<bool Transpose_, typename Rhs>
+    void _check_solve_assertion(const Rhs& b) const {
+        EIGEN_ONLY_USED_FOR_DEBUG(b);
+        eigen_assert(derived().m_isInitialized && "Solver is not initialized.");
+        eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "SolverBase::solve(): invalid number of rows of the right hand side matrix b");
+    }
+};
+
+namespace internal {
+
+template<typename Derived>
+struct generic_xpr_base<Derived, MatrixXpr, SolverStorage>
+{
+  typedef SolverBase<Derived> type;
+
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SOLVERBASE_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/StableNorm.h b/src/3rdparty/eigen/Eigen/src/Core/StableNorm.h
new file mode 100644
index 000000000..4a3f0cca8
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/StableNorm.h
@@ -0,0 +1,251 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_STABLENORM_H
+#define EIGEN_STABLENORM_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<typename ExpressionType, typename Scalar>
+inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& scale, Scalar& invScale)
+{
+  Scalar maxCoeff = bl.cwiseAbs().maxCoeff();
+  
+  if(maxCoeff>scale)
+  {
+    ssq = ssq * numext::abs2(scale/maxCoeff);
+    Scalar tmp = Scalar(1)/maxCoeff;
+    if(tmp > NumTraits<Scalar>::highest())
+    {
+      invScale = NumTraits<Scalar>::highest();
+      scale = Scalar(1)/invScale;
+    }
+    else if(maxCoeff>NumTraits<Scalar>::highest()) // we got a INF
+    {
+      invScale = Scalar(1);
+      scale = maxCoeff;
+    }
+    else
+    {
+      scale = maxCoeff;
+      invScale = tmp;
+    }
+  }
+  else if(maxCoeff!=maxCoeff) // we got a NaN
+  {
+    scale = maxCoeff;
+  }
+  
+  // TODO if the maxCoeff is much much smaller than the current scale,
+  // then we can neglect this sub vector
+  if(scale>Scalar(0)) // if scale==0, then bl is 0 
+    ssq += (bl*invScale).squaredNorm();
+}
+
+template<typename VectorType, typename RealScalar>
+void stable_norm_impl_inner_step(const VectorType &vec, RealScalar& ssq, RealScalar& scale, RealScalar& invScale)
+{
+  typedef typename VectorType::Scalar Scalar;
+  const Index blockSize = 4096;
+  
+  typedef typename internal::nested_eval<VectorType,2>::type VectorTypeCopy;
+  typedef typename internal::remove_all<VectorTypeCopy>::type VectorTypeCopyClean;
+  const VectorTypeCopy copy(vec);
+  
+  enum {
+    CanAlign = (   (int(VectorTypeCopyClean::Flags)&DirectAccessBit)
+                || (int(internal::evaluator<VectorTypeCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
+               ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT)
+                 && (EIGEN_MAX_STATIC_ALIGN_BYTES>0) // if we cannot allocate on the stack, then let's not bother about this optimization
+  };
+  typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<VectorTypeCopyClean>::Alignment>,
+                                                   typename VectorTypeCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
+  Index n = vec.size();
+  
+  Index bi = internal::first_default_aligned(copy);
+  if (bi>0)
+    internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
+  for (; bi<n; bi+=blockSize)
+    internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
+}
+
+template<typename VectorType>
+typename VectorType::RealScalar
+stable_norm_impl(const VectorType &vec, typename enable_if<VectorType::IsVectorAtCompileTime>::type* = 0 )
+{
+  using std::sqrt;
+  using std::abs;
+
+  Index n = vec.size();
+
+  if(n==1)
+    return abs(vec.coeff(0));
+
+  typedef typename VectorType::RealScalar RealScalar;
+  RealScalar scale(0);
+  RealScalar invScale(1);
+  RealScalar ssq(0); // sum of squares
+
+  stable_norm_impl_inner_step(vec, ssq, scale, invScale);
+  
+  return scale * sqrt(ssq);
+}
+
+template<typename MatrixType>
+typename MatrixType::RealScalar
+stable_norm_impl(const MatrixType &mat, typename enable_if<!MatrixType::IsVectorAtCompileTime>::type* = 0 )
+{
+  using std::sqrt;
+
+  typedef typename MatrixType::RealScalar RealScalar;
+  RealScalar scale(0);
+  RealScalar invScale(1);
+  RealScalar ssq(0); // sum of squares
+
+  for(Index j=0; j<mat.outerSize(); ++j)
+    stable_norm_impl_inner_step(mat.innerVector(j), ssq, scale, invScale);
+  return scale * sqrt(ssq);
+}
+
+template<typename Derived>
+inline typename NumTraits<typename traits<Derived>::Scalar>::Real
+blueNorm_impl(const EigenBase<Derived>& _vec)
+{
+  typedef typename Derived::RealScalar RealScalar;  
+  using std::pow;
+  using std::sqrt;
+  using std::abs;
+
+  // This program calculates the machine-dependent constants
+  // bl, b2, slm, s2m, relerr overfl
+  // from the "basic" machine-dependent numbers
+  // nbig, ibeta, it, iemin, iemax, rbig.
+  // The following define the basic machine-dependent constants.
+  // For portability, the PORT subprograms "ilmaeh" and "rlmach"
+  // are used. For any specific computer, each of the assignment
+  // statements can be replaced
+  static const int ibeta = std::numeric_limits<RealScalar>::radix;  // base for floating-point numbers
+  static const int it    = NumTraits<RealScalar>::digits();  // number of base-beta digits in mantissa
+  static const int iemin = NumTraits<RealScalar>::min_exponent();  // minimum exponent
+  static const int iemax = NumTraits<RealScalar>::max_exponent();  // maximum exponent
+  static const RealScalar rbig   = NumTraits<RealScalar>::highest();  // largest floating-point number
+  static const RealScalar b1     = RealScalar(pow(RealScalar(ibeta),RealScalar(-((1-iemin)/2))));  // lower boundary of midrange
+  static const RealScalar b2     = RealScalar(pow(RealScalar(ibeta),RealScalar((iemax + 1 - it)/2)));  // upper boundary of midrange
+  static const RealScalar s1m    = RealScalar(pow(RealScalar(ibeta),RealScalar((2-iemin)/2)));  // scaling factor for lower range
+  static const RealScalar s2m    = RealScalar(pow(RealScalar(ibeta),RealScalar(- ((iemax+it)/2))));  // scaling factor for upper range
+  static const RealScalar eps    = RealScalar(pow(double(ibeta), 1-it));
+  static const RealScalar relerr = sqrt(eps);  // tolerance for neglecting asml
+
+  const Derived& vec(_vec.derived());
+  Index n = vec.size();
+  RealScalar ab2 = b2 / RealScalar(n);
+  RealScalar asml = RealScalar(0);
+  RealScalar amed = RealScalar(0);
+  RealScalar abig = RealScalar(0);
+
+  for(Index j=0; j<vec.outerSize(); ++j)
+  {
+    for(typename Derived::InnerIterator iter(vec, j); iter; ++iter)
+    {
+      RealScalar ax = abs(iter.value());
+      if(ax > ab2)     abig += numext::abs2(ax*s2m);
+      else if(ax < b1) asml += numext::abs2(ax*s1m);
+      else             amed += numext::abs2(ax);
+    }
+  }
+  if(amed!=amed)
+    return amed;  // we got a NaN
+  if(abig > RealScalar(0))
+  {
+    abig = sqrt(abig);
+    if(abig > rbig) // overflow, or *this contains INF values
+      return abig;  // return INF
+    if(amed > RealScalar(0))
+    {
+      abig = abig/s2m;
+      amed = sqrt(amed);
+    }
+    else
+      return abig/s2m;
+  }
+  else if(asml > RealScalar(0))
+  {
+    if (amed > RealScalar(0))
+    {
+      abig = sqrt(amed);
+      amed = sqrt(asml) / s1m;
+    }
+    else
+      return sqrt(asml)/s1m;
+  }
+  else
+    return sqrt(amed);
+  asml = numext::mini(abig, amed);
+  abig = numext::maxi(abig, amed);
+  if(asml <= abig*relerr)
+    return abig;
+  else
+    return abig * sqrt(RealScalar(1) + numext::abs2(asml/abig));
+}
+
+} // end namespace internal
+
+/** \returns the \em l2 norm of \c *this avoiding underflow and overflow.
+  * This version use a blockwise two passes algorithm:
+  *  1 - find the absolute largest coefficient \c s
+  *  2 - compute \f$ s \Vert \frac{*this}{s} \Vert \f$ in a standard way
+  *
+  * For architecture/scalar types supporting vectorization, this version
+  * is faster than blueNorm(). Otherwise the blueNorm() is much faster.
+  *
+  * \sa norm(), blueNorm(), hypotNorm()
+  */
+template<typename Derived>
+inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+MatrixBase<Derived>::stableNorm() const
+{
+  return internal::stable_norm_impl(derived());
+}
+
+/** \returns the \em l2 norm of \c *this using the Blue's algorithm.
+  * A Portable Fortran Program to Find the Euclidean Norm of a Vector,
+  * ACM TOMS, Vol 4, Issue 1, 1978.
+  *
+  * For architecture/scalar types without vectorization, this version
+  * is much faster than stableNorm(). Otherwise the stableNorm() is faster.
+  *
+  * \sa norm(), stableNorm(), hypotNorm()
+  */
+template<typename Derived>
+inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+MatrixBase<Derived>::blueNorm() const
+{
+  return internal::blueNorm_impl(*this);
+}
+
+/** \returns the \em l2 norm of \c *this avoiding undeflow and overflow.
+  * This version use a concatenation of hypot() calls, and it is very slow.
+  *
+  * \sa norm(), stableNorm()
+  */
+template<typename Derived>
+inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+MatrixBase<Derived>::hypotNorm() const
+{
+  if(size()==1)
+    return numext::abs(coeff(0,0));
+  else
+    return this->cwiseAbs().redux(internal::scalar_hypot_op<RealScalar>());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_STABLENORM_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/StlIterators.h b/src/3rdparty/eigen/Eigen/src/Core/StlIterators.h
new file mode 100644
index 000000000..09041db1d
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/StlIterators.h
@@ -0,0 +1,463 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_STLITERATORS_H
+#define EIGEN_STLITERATORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename IteratorType>
+struct indexed_based_stl_iterator_traits;
+
+template<typename  Derived>
+class indexed_based_stl_iterator_base
+{
+protected:
+  typedef indexed_based_stl_iterator_traits<Derived> traits;
+  typedef typename traits::XprType XprType;
+  typedef indexed_based_stl_iterator_base<typename traits::non_const_iterator> non_const_iterator;
+  typedef indexed_based_stl_iterator_base<typename traits::const_iterator> const_iterator;
+  typedef typename internal::conditional<internal::is_const<XprType>::value,non_const_iterator,const_iterator>::type other_iterator;
+  // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class:
+  friend class indexed_based_stl_iterator_base<typename traits::const_iterator>;
+  friend class indexed_based_stl_iterator_base<typename traits::non_const_iterator>;
+public:
+  typedef Index difference_type;
+  typedef std::random_access_iterator_tag iterator_category;
+
+  indexed_based_stl_iterator_base() EIGEN_NO_THROW : mp_xpr(0), m_index(0) {}
+  indexed_based_stl_iterator_base(XprType& xpr, Index index) EIGEN_NO_THROW : mp_xpr(&xpr), m_index(index) {}
+
+  indexed_based_stl_iterator_base(const non_const_iterator& other) EIGEN_NO_THROW
+    : mp_xpr(other.mp_xpr), m_index(other.m_index)
+  {}
+
+  indexed_based_stl_iterator_base& operator=(const non_const_iterator& other)
+  {
+    mp_xpr = other.mp_xpr;
+    m_index = other.m_index;
+    return *this;
+  }
+
+  Derived& operator++() { ++m_index; return derived(); }
+  Derived& operator--() { --m_index; return derived(); }
+
+  Derived operator++(int) { Derived prev(derived()); operator++(); return prev;}
+  Derived operator--(int) { Derived prev(derived()); operator--(); return prev;}
+
+  friend Derived operator+(const indexed_based_stl_iterator_base& a, Index b) { Derived ret(a.derived()); ret += b; return ret; }
+  friend Derived operator-(const indexed_based_stl_iterator_base& a, Index b) { Derived ret(a.derived()); ret -= b; return ret; }
+  friend Derived operator+(Index a, const indexed_based_stl_iterator_base& b) { Derived ret(b.derived()); ret += a; return ret; }
+  friend Derived operator-(Index a, const indexed_based_stl_iterator_base& b) { Derived ret(b.derived()); ret -= a; return ret; }
+  
+  Derived& operator+=(Index b) { m_index += b; return derived(); }
+  Derived& operator-=(Index b) { m_index -= b; return derived(); }
+
+  difference_type operator-(const indexed_based_stl_iterator_base& other) const
+  {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index - other.m_index;
+  }
+
+  difference_type operator-(const other_iterator& other) const
+  {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index - other.m_index;
+  }
+
+  bool operator==(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; }
+  bool operator!=(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; }
+  bool operator< (const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <  other.m_index; }
+  bool operator<=(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; }
+  bool operator> (const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >  other.m_index; }
+  bool operator>=(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; }
+
+  bool operator==(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; }
+  bool operator!=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; }
+  bool operator< (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <  other.m_index; }
+  bool operator<=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; }
+  bool operator> (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >  other.m_index; }
+  bool operator>=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; }
+
+protected:
+
+  Derived& derived() { return static_cast<Derived&>(*this); }
+  const Derived& derived() const { return static_cast<const Derived&>(*this); }
+
+  XprType *mp_xpr;
+  Index m_index;
+};
+
+template<typename  Derived>
+class indexed_based_stl_reverse_iterator_base
+{
+protected:
+  typedef indexed_based_stl_iterator_traits<Derived> traits;
+  typedef typename traits::XprType XprType;
+  typedef indexed_based_stl_reverse_iterator_base<typename traits::non_const_iterator> non_const_iterator;
+  typedef indexed_based_stl_reverse_iterator_base<typename traits::const_iterator> const_iterator;
+  typedef typename internal::conditional<internal::is_const<XprType>::value,non_const_iterator,const_iterator>::type other_iterator;
+  // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class:
+  friend class indexed_based_stl_reverse_iterator_base<typename traits::const_iterator>;
+  friend class indexed_based_stl_reverse_iterator_base<typename traits::non_const_iterator>;
+public:
+  typedef Index difference_type;
+  typedef std::random_access_iterator_tag iterator_category;
+
+  indexed_based_stl_reverse_iterator_base() : mp_xpr(0), m_index(0) {}
+  indexed_based_stl_reverse_iterator_base(XprType& xpr, Index index) : mp_xpr(&xpr), m_index(index) {}
+
+  indexed_based_stl_reverse_iterator_base(const non_const_iterator& other)
+    : mp_xpr(other.mp_xpr), m_index(other.m_index)
+  {}
+
+  indexed_based_stl_reverse_iterator_base& operator=(const non_const_iterator& other)
+  {
+    mp_xpr = other.mp_xpr;
+    m_index = other.m_index;
+    return *this;
+  }
+
+  Derived& operator++() { --m_index; return derived(); }
+  Derived& operator--() { ++m_index; return derived(); }
+
+  Derived operator++(int) { Derived prev(derived()); operator++(); return prev;}
+  Derived operator--(int) { Derived prev(derived()); operator--(); return prev;}
+
+  friend Derived operator+(const indexed_based_stl_reverse_iterator_base& a, Index b) { Derived ret(a.derived()); ret += b; return ret; }
+  friend Derived operator-(const indexed_based_stl_reverse_iterator_base& a, Index b) { Derived ret(a.derived()); ret -= b; return ret; }
+  friend Derived operator+(Index a, const indexed_based_stl_reverse_iterator_base& b) { Derived ret(b.derived()); ret += a; return ret; }
+  friend Derived operator-(Index a, const indexed_based_stl_reverse_iterator_base& b) { Derived ret(b.derived()); ret -= a; return ret; }
+  
+  Derived& operator+=(Index b) { m_index -= b; return derived(); }
+  Derived& operator-=(Index b) { m_index += b; return derived(); }
+
+  difference_type operator-(const indexed_based_stl_reverse_iterator_base& other) const
+  {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return other.m_index - m_index;
+  }
+
+  difference_type operator-(const other_iterator& other) const
+  {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return other.m_index - m_index;
+  }
+
+  bool operator==(const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; }
+  bool operator!=(const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; }
+  bool operator< (const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >  other.m_index; }
+  bool operator<=(const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; }
+  bool operator> (const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <  other.m_index; }
+  bool operator>=(const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; }
+
+  bool operator==(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; }
+  bool operator!=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; }
+  bool operator< (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >  other.m_index; }
+  bool operator<=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; }
+  bool operator> (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <  other.m_index; }
+  bool operator>=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; }
+
+protected:
+
+  Derived& derived() { return static_cast<Derived&>(*this); }
+  const Derived& derived() const { return static_cast<const Derived&>(*this); }
+
+  XprType *mp_xpr;
+  Index m_index;
+};
+
+template<typename XprType>
+class pointer_based_stl_iterator
+{
+  enum { is_lvalue  = internal::is_lvalue<XprType>::value };
+  typedef pointer_based_stl_iterator<typename internal::remove_const<XprType>::type> non_const_iterator;
+  typedef pointer_based_stl_iterator<typename internal::add_const<XprType>::type> const_iterator;
+  typedef typename internal::conditional<internal::is_const<XprType>::value,non_const_iterator,const_iterator>::type other_iterator;
+  // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class:
+  friend class pointer_based_stl_iterator<typename internal::add_const<XprType>::type>;
+  friend class pointer_based_stl_iterator<typename internal::remove_const<XprType>::type>;
+public:
+  typedef Index difference_type;
+  typedef typename XprType::Scalar value_type;
+  typedef std::random_access_iterator_tag iterator_category;
+  typedef typename internal::conditional<bool(is_lvalue), value_type*, const value_type*>::type pointer;
+  typedef typename internal::conditional<bool(is_lvalue), value_type&, const value_type&>::type reference;
+
+
+  pointer_based_stl_iterator() EIGEN_NO_THROW : m_ptr(0) {}
+  pointer_based_stl_iterator(XprType& xpr, Index index) EIGEN_NO_THROW : m_incr(xpr.innerStride())
+  {
+    m_ptr = xpr.data() + index * m_incr.value();
+  }
+
+  pointer_based_stl_iterator(const non_const_iterator& other) EIGEN_NO_THROW
+    : m_ptr(other.m_ptr), m_incr(other.m_incr)
+  {}
+
+  pointer_based_stl_iterator& operator=(const non_const_iterator& other) EIGEN_NO_THROW
+  {
+    m_ptr = other.m_ptr;
+    m_incr.setValue(other.m_incr);
+    return *this;
+  }
+
+  reference operator*()         const { return *m_ptr;   }
+  reference operator[](Index i) const { return *(m_ptr+i*m_incr.value()); }
+  pointer   operator->()        const { return m_ptr;    }
+
+  pointer_based_stl_iterator& operator++() { m_ptr += m_incr.value(); return *this; }
+  pointer_based_stl_iterator& operator--() { m_ptr -= m_incr.value(); return *this; }
+
+  pointer_based_stl_iterator operator++(int) { pointer_based_stl_iterator prev(*this); operator++(); return prev;}
+  pointer_based_stl_iterator operator--(int) { pointer_based_stl_iterator prev(*this); operator--(); return prev;}
+
+  friend pointer_based_stl_iterator operator+(const pointer_based_stl_iterator& a, Index b) { pointer_based_stl_iterator ret(a); ret += b; return ret; }
+  friend pointer_based_stl_iterator operator-(const pointer_based_stl_iterator& a, Index b) { pointer_based_stl_iterator ret(a); ret -= b; return ret; }
+  friend pointer_based_stl_iterator operator+(Index a, const pointer_based_stl_iterator& b) { pointer_based_stl_iterator ret(b); ret += a; return ret; }
+  friend pointer_based_stl_iterator operator-(Index a, const pointer_based_stl_iterator& b) { pointer_based_stl_iterator ret(b); ret -= a; return ret; }
+  
+  pointer_based_stl_iterator& operator+=(Index b) { m_ptr += b*m_incr.value(); return *this; }
+  pointer_based_stl_iterator& operator-=(Index b) { m_ptr -= b*m_incr.value(); return *this; }
+
+  difference_type operator-(const pointer_based_stl_iterator& other) const {
+    return (m_ptr - other.m_ptr)/m_incr.value();
+  }
+
+  difference_type operator-(const other_iterator& other) const {
+    return (m_ptr - other.m_ptr)/m_incr.value();
+  }
+
+  bool operator==(const pointer_based_stl_iterator& other) const { return m_ptr == other.m_ptr; }
+  bool operator!=(const pointer_based_stl_iterator& other) const { return m_ptr != other.m_ptr; }
+  bool operator< (const pointer_based_stl_iterator& other) const { return m_ptr <  other.m_ptr; }
+  bool operator<=(const pointer_based_stl_iterator& other) const { return m_ptr <= other.m_ptr; }
+  bool operator> (const pointer_based_stl_iterator& other) const { return m_ptr >  other.m_ptr; }
+  bool operator>=(const pointer_based_stl_iterator& other) const { return m_ptr >= other.m_ptr; }
+
+  bool operator==(const other_iterator& other) const { return m_ptr == other.m_ptr; }
+  bool operator!=(const other_iterator& other) const { return m_ptr != other.m_ptr; }
+  bool operator< (const other_iterator& other) const { return m_ptr <  other.m_ptr; }
+  bool operator<=(const other_iterator& other) const { return m_ptr <= other.m_ptr; }
+  bool operator> (const other_iterator& other) const { return m_ptr >  other.m_ptr; }
+  bool operator>=(const other_iterator& other) const { return m_ptr >= other.m_ptr; }
+
+protected:
+
+  pointer m_ptr;
+  internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_incr;
+};
+
+template<typename _XprType>
+struct indexed_based_stl_iterator_traits<generic_randaccess_stl_iterator<_XprType> >
+{
+  typedef _XprType XprType;
+  typedef generic_randaccess_stl_iterator<typename internal::remove_const<XprType>::type> non_const_iterator;
+  typedef generic_randaccess_stl_iterator<typename internal::add_const<XprType>::type> const_iterator;
+};
+
+template<typename XprType>
+class generic_randaccess_stl_iterator : public indexed_based_stl_iterator_base<generic_randaccess_stl_iterator<XprType> >
+{
+public:
+  typedef typename XprType::Scalar value_type;
+
+protected:
+
+  enum {
+    has_direct_access = (internal::traits<XprType>::Flags & DirectAccessBit) ? 1 : 0,
+    is_lvalue  = internal::is_lvalue<XprType>::value
+  };
+
+  typedef indexed_based_stl_iterator_base<generic_randaccess_stl_iterator> Base;
+  using Base::m_index;
+  using Base::mp_xpr;
+
+  // TODO currently const Transpose/Reshape expressions never returns const references,
+  // so lets return by value too.
+  //typedef typename internal::conditional<bool(has_direct_access), const value_type&, const value_type>::type read_only_ref_t;
+  typedef const value_type read_only_ref_t;
+
+public:
+  
+  typedef typename internal::conditional<bool(is_lvalue), value_type *, const value_type *>::type pointer;
+  typedef typename internal::conditional<bool(is_lvalue), value_type&, read_only_ref_t>::type reference;
+  
+  generic_randaccess_stl_iterator() : Base() {}
+  generic_randaccess_stl_iterator(XprType& xpr, Index index) : Base(xpr,index) {}
+  generic_randaccess_stl_iterator(const typename Base::non_const_iterator& other) : Base(other) {}
+  using Base::operator=;
+
+  reference operator*()         const { return   (*mp_xpr)(m_index);   }
+  reference operator[](Index i) const { return   (*mp_xpr)(m_index+i); }
+  pointer   operator->()        const { return &((*mp_xpr)(m_index)); }
+};
+
+template<typename _XprType, DirectionType Direction>
+struct indexed_based_stl_iterator_traits<subvector_stl_iterator<_XprType,Direction> >
+{
+  typedef _XprType XprType;
+  typedef subvector_stl_iterator<typename internal::remove_const<XprType>::type, Direction> non_const_iterator;
+  typedef subvector_stl_iterator<typename internal::add_const<XprType>::type, Direction> const_iterator;
+};
+
+template<typename XprType, DirectionType Direction>
+class subvector_stl_iterator : public indexed_based_stl_iterator_base<subvector_stl_iterator<XprType,Direction> >
+{
+protected:
+
+  enum { is_lvalue  = internal::is_lvalue<XprType>::value };
+
+  typedef indexed_based_stl_iterator_base<subvector_stl_iterator> Base;
+  using Base::m_index;
+  using Base::mp_xpr;
+
+  typedef typename internal::conditional<Direction==Vertical,typename XprType::ColXpr,typename XprType::RowXpr>::type SubVectorType;
+  typedef typename internal::conditional<Direction==Vertical,typename XprType::ConstColXpr,typename XprType::ConstRowXpr>::type ConstSubVectorType;
+
+
+public:
+  typedef typename internal::conditional<bool(is_lvalue), SubVectorType, ConstSubVectorType>::type reference;
+  typedef typename reference::PlainObject value_type;
+
+private:
+  class subvector_stl_iterator_ptr
+  {
+  public:
+      subvector_stl_iterator_ptr(const reference &subvector) : m_subvector(subvector) {}
+      reference* operator->() { return &m_subvector; }
+  private:
+      reference m_subvector;
+  };
+public:
+
+  typedef subvector_stl_iterator_ptr pointer;
+  
+  subvector_stl_iterator() : Base() {}
+  subvector_stl_iterator(XprType& xpr, Index index) : Base(xpr,index) {}
+
+  reference operator*()         const { return (*mp_xpr).template subVector<Direction>(m_index); }
+  reference operator[](Index i) const { return (*mp_xpr).template subVector<Direction>(m_index+i); }
+  pointer   operator->()        const { return (*mp_xpr).template subVector<Direction>(m_index); }
+};
+
+template<typename _XprType, DirectionType Direction>
+struct indexed_based_stl_iterator_traits<subvector_stl_reverse_iterator<_XprType,Direction> >
+{
+  typedef _XprType XprType;
+  typedef subvector_stl_reverse_iterator<typename internal::remove_const<XprType>::type, Direction> non_const_iterator;
+  typedef subvector_stl_reverse_iterator<typename internal::add_const<XprType>::type, Direction> const_iterator;
+};
+
+template<typename XprType, DirectionType Direction>
+class subvector_stl_reverse_iterator : public indexed_based_stl_reverse_iterator_base<subvector_stl_reverse_iterator<XprType,Direction> >
+{
+protected:
+
+  enum { is_lvalue  = internal::is_lvalue<XprType>::value };
+
+  typedef indexed_based_stl_reverse_iterator_base<subvector_stl_reverse_iterator> Base;
+  using Base::m_index;
+  using Base::mp_xpr;
+
+  typedef typename internal::conditional<Direction==Vertical,typename XprType::ColXpr,typename XprType::RowXpr>::type SubVectorType;
+  typedef typename internal::conditional<Direction==Vertical,typename XprType::ConstColXpr,typename XprType::ConstRowXpr>::type ConstSubVectorType;
+
+
+public:
+  typedef typename internal::conditional<bool(is_lvalue), SubVectorType, ConstSubVectorType>::type reference;
+  typedef typename reference::PlainObject value_type;
+
+private:
+  class subvector_stl_reverse_iterator_ptr
+  {
+  public:
+      subvector_stl_reverse_iterator_ptr(const reference &subvector) : m_subvector(subvector) {}
+      reference* operator->() { return &m_subvector; }
+  private:
+      reference m_subvector;
+  };
+public:
+
+  typedef subvector_stl_reverse_iterator_ptr pointer;
+  
+  subvector_stl_reverse_iterator() : Base() {}
+  subvector_stl_reverse_iterator(XprType& xpr, Index index) : Base(xpr,index) {}
+
+  reference operator*()         const { return (*mp_xpr).template subVector<Direction>(m_index); }
+  reference operator[](Index i) const { return (*mp_xpr).template subVector<Direction>(m_index+i); }
+  pointer   operator->()        const { return (*mp_xpr).template subVector<Direction>(m_index); }
+};
+
+} // namespace internal
+
+
+/** returns an iterator to the first element of the 1D vector or array
+  * \only_for_vectors
+  * \sa end(), cbegin()
+  */
+template<typename Derived>
+inline typename DenseBase<Derived>::iterator DenseBase<Derived>::begin()
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  return iterator(derived(), 0);
+}
+
+/** const version of begin() */
+template<typename Derived>
+inline typename DenseBase<Derived>::const_iterator DenseBase<Derived>::begin() const
+{
+  return cbegin();
+}
+
+/** returns a read-only const_iterator to the first element of the 1D vector or array
+  * \only_for_vectors
+  * \sa cend(), begin()
+  */
+template<typename Derived>
+inline typename DenseBase<Derived>::const_iterator DenseBase<Derived>::cbegin() const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  return const_iterator(derived(), 0);
+}
+
+/** returns an iterator to the element following the last element of the 1D vector or array
+  * \only_for_vectors
+  * \sa begin(), cend()
+  */
+template<typename Derived>
+inline typename DenseBase<Derived>::iterator DenseBase<Derived>::end()
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  return iterator(derived(), size());
+}
+
+/** const version of end() */
+template<typename Derived>
+inline typename DenseBase<Derived>::const_iterator DenseBase<Derived>::end() const
+{
+  return cend();
+}
+
+/** returns a read-only const_iterator to the element following the last element of the 1D vector or array
+  * \only_for_vectors
+  * \sa begin(), cend()
+  */
+template<typename Derived>
+inline typename DenseBase<Derived>::const_iterator DenseBase<Derived>::cend() const
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  return const_iterator(derived(), size());
+}
+
+} // namespace Eigen
+
+#endif // EIGEN_STLITERATORS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/Stride.h b/src/3rdparty/eigen/Eigen/src/Core/Stride.h
new file mode 100644
index 000000000..6494d5142
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/Stride.h
@@ -0,0 +1,116 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_STRIDE_H
+#define EIGEN_STRIDE_H
+
+namespace Eigen {
+
+/** \class Stride
+  * \ingroup Core_Module
+  *
+  * \brief Holds strides information for Map
+  *
+  * This class holds the strides information for mapping arrays with strides with class Map.
+  *
+  * It holds two values: the inner stride and the outer stride.
+  *
+  * The inner stride is the pointer increment between two consecutive entries within a given row of a
+  * row-major matrix or within a given column of a column-major matrix.
+  *
+  * The outer stride is the pointer increment between two consecutive rows of a row-major matrix or
+  * between two consecutive columns of a column-major matrix.
+  *
+  * These two values can be passed either at compile-time as template parameters, or at runtime as
+  * arguments to the constructor.
+  *
+  * Indeed, this class takes two template parameters:
+  *  \tparam _OuterStrideAtCompileTime the outer stride, or Dynamic if you want to specify it at runtime.
+  *  \tparam _InnerStrideAtCompileTime the inner stride, or Dynamic if you want to specify it at runtime.
+  *
+  * Here is an example:
+  * \include Map_general_stride.cpp
+  * Output: \verbinclude Map_general_stride.out
+  *
+  * Both strides can be negative, however, a negative stride of -1 cannot be specified at compiletime
+  * because of the ambiguity with Dynamic which is defined to -1 (historically, negative strides were
+  * not allowed).
+  *
+  * \sa class InnerStride, class OuterStride, \ref TopicStorageOrders
+  */
+template<int _OuterStrideAtCompileTime, int _InnerStrideAtCompileTime>
+class Stride
+{
+  public:
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
+    enum {
+      InnerStrideAtCompileTime = _InnerStrideAtCompileTime,
+      OuterStrideAtCompileTime = _OuterStrideAtCompileTime
+    };
+
+    /** Default constructor, for use when strides are fixed at compile time */
+    EIGEN_DEVICE_FUNC
+    Stride()
+      : m_outer(OuterStrideAtCompileTime), m_inner(InnerStrideAtCompileTime)
+    {
+      // FIXME: for Eigen 4 we should use DynamicIndex instead of Dynamic.
+      // FIXME: for Eigen 4 we should also unify this API with fix<>
+      eigen_assert(InnerStrideAtCompileTime != Dynamic && OuterStrideAtCompileTime != Dynamic);
+    }
+
+    /** Constructor allowing to pass the strides at runtime */
+    EIGEN_DEVICE_FUNC
+    Stride(Index outerStride, Index innerStride)
+      : m_outer(outerStride), m_inner(innerStride)
+    {
+    }
+
+    /** Copy constructor */
+    EIGEN_DEVICE_FUNC
+    Stride(const Stride& other)
+      : m_outer(other.outer()), m_inner(other.inner())
+    {}
+
+    /** \returns the outer stride */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outer() const { return m_outer.value(); }
+    /** \returns the inner stride */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index inner() const { return m_inner.value(); }
+
+  protected:
+    internal::variable_if_dynamic<Index, OuterStrideAtCompileTime> m_outer;
+    internal::variable_if_dynamic<Index, InnerStrideAtCompileTime> m_inner;
+};
+
+/** \brief Convenience specialization of Stride to specify only an inner stride
+  * See class Map for some examples */
+template<int Value>
+class InnerStride : public Stride<0, Value>
+{
+    typedef Stride<0, Value> Base;
+  public:
+    EIGEN_DEVICE_FUNC InnerStride() : Base() {}
+    EIGEN_DEVICE_FUNC InnerStride(Index v) : Base(0, v) {} // FIXME making this explicit could break valid code
+};
+
+/** \brief Convenience specialization of Stride to specify only an outer stride
+  * See class Map for some examples */
+template<int Value>
+class OuterStride : public Stride<Value, 0>
+{
+    typedef Stride<Value, 0> Base;
+  public:
+    EIGEN_DEVICE_FUNC OuterStride() : Base() {}
+    EIGEN_DEVICE_FUNC OuterStride(Index v) : Base(v,0) {} // FIXME making this explicit could break valid code
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_STRIDE_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/Swap.h b/src/3rdparty/eigen/Eigen/src/Core/Swap.h
new file mode 100644
index 000000000..180a4e5ad
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/Swap.h
@@ -0,0 +1,68 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SWAP_H
+#define EIGEN_SWAP_H
+
+namespace Eigen { 
+
+namespace internal {
+
+// Overload default assignPacket behavior for swapping them
+template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT>
+class generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar>, Specialized>
+ : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar>, BuiltIn>
+{
+protected:
+  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar>, BuiltIn> Base;
+  using Base::m_dst;
+  using Base::m_src;
+  using Base::m_functor;
+  
+public:
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::DstXprType DstXprType;
+  typedef swap_assign_op<Scalar> Functor;
+  
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
+    : Base(dst, src, func, dstExpr)
+  {}
+  
+  template<int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE void assignPacket(Index row, Index col)
+  {
+    PacketType tmp = m_src.template packet<LoadMode,PacketType>(row,col);
+    const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(row,col, m_dst.template packet<StoreMode,PacketType>(row,col));
+    m_dst.template writePacket<StoreMode>(row,col,tmp);
+  }
+  
+  template<int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE void assignPacket(Index index)
+  {
+    PacketType tmp = m_src.template packet<LoadMode,PacketType>(index);
+    const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(index, m_dst.template packet<StoreMode,PacketType>(index));
+    m_dst.template writePacket<StoreMode>(index,tmp);
+  }
+  
+  // TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I mean no CRTP (Gael)
+  template<int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner)
+  {
+    Index row = Base::rowIndexByOuterInner(outer, inner); 
+    Index col = Base::colIndexByOuterInner(outer, inner);
+    assignPacket<StoreMode,LoadMode,PacketType>(row, col);
+  }
+};
+
+} // namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SWAP_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/Transpose.h b/src/3rdparty/eigen/Eigen/src/Core/Transpose.h
new file mode 100644
index 000000000..2bc658f40
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/Transpose.h
@@ -0,0 +1,464 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2009-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRANSPOSE_H
+#define EIGEN_TRANSPOSE_H
+
+namespace Eigen {
+
+namespace internal {
+template<typename MatrixType>
+struct traits<Transpose<MatrixType> > : public traits<MatrixType>
+{
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
+  typedef typename remove_reference<MatrixTypeNested>::type MatrixTypeNestedPlain;
+  enum {
+    RowsAtCompileTime = MatrixType::ColsAtCompileTime,
+    ColsAtCompileTime = MatrixType::RowsAtCompileTime,
+    MaxRowsAtCompileTime = MatrixType::MaxColsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
+    Flags0 = traits<MatrixTypeNestedPlain>::Flags & ~(LvalueBit | NestByRefBit),
+    Flags1 = Flags0 | FlagsLvalueBit,
+    Flags = Flags1 ^ RowMajorBit,
+    InnerStrideAtCompileTime = inner_stride_at_compile_time<MatrixType>::ret,
+    OuterStrideAtCompileTime = outer_stride_at_compile_time<MatrixType>::ret
+  };
+};
+}
+
+template<typename MatrixType, typename StorageKind> class TransposeImpl;
+
+/** \class Transpose
+  * \ingroup Core_Module
+  *
+  * \brief Expression of the transpose of a matrix
+  *
+  * \tparam MatrixType the type of the object of which we are taking the transpose
+  *
+  * This class represents an expression of the transpose of a matrix.
+  * It is the return type of MatrixBase::transpose() and MatrixBase::adjoint()
+  * and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::transpose(), MatrixBase::adjoint()
+  */
+template<typename MatrixType> class Transpose
+  : public TransposeImpl<MatrixType,typename internal::traits<MatrixType>::StorageKind>
+{
+  public:
+
+    typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested;
+
+    typedef typename TransposeImpl<MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base;
+    EIGEN_GENERIC_PUBLIC_INTERFACE(Transpose)
+    typedef typename internal::remove_all<MatrixType>::type NestedExpression;
+
+    EIGEN_DEVICE_FUNC
+    explicit EIGEN_STRONG_INLINE Transpose(MatrixType& matrix) : m_matrix(matrix) {}
+
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose)
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+
+    /** \returns the nested expression */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const typename internal::remove_all<MatrixTypeNested>::type&
+    nestedExpression() const { return m_matrix; }
+
+    /** \returns the nested expression */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    typename internal::remove_reference<MatrixTypeNested>::type&
+    nestedExpression() { return m_matrix; }
+
+    /** \internal */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    void resize(Index nrows, Index ncols) {
+      m_matrix.resize(ncols,nrows);
+    }
+
+  protected:
+    typename internal::ref_selector<MatrixType>::non_const_type m_matrix;
+};
+
+namespace internal {
+
+template<typename MatrixType, bool HasDirectAccess = has_direct_access<MatrixType>::ret>
+struct TransposeImpl_base
+{
+  typedef typename dense_xpr_base<Transpose<MatrixType> >::type type;
+};
+
+template<typename MatrixType>
+struct TransposeImpl_base<MatrixType, false>
+{
+  typedef typename dense_xpr_base<Transpose<MatrixType> >::type type;
+};
+
+} // end namespace internal
+
+// Generic API dispatcher
+template<typename XprType, typename StorageKind>
+class TransposeImpl
+  : public internal::generic_xpr_base<Transpose<XprType> >::type
+{
+public:
+  typedef typename internal::generic_xpr_base<Transpose<XprType> >::type Base;
+};
+
+template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
+  : public internal::TransposeImpl_base<MatrixType>::type
+{
+  public:
+
+    typedef typename internal::TransposeImpl_base<MatrixType>::type Base;
+    using Base::coeffRef;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Transpose<MatrixType>)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TransposeImpl)
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Index innerStride() const { return derived().nestedExpression().innerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Index outerStride() const { return derived().nestedExpression().outerStride(); }
+
+    typedef typename internal::conditional<
+                       internal::is_lvalue<MatrixType>::value,
+                       Scalar,
+                       const Scalar
+                     >::type ScalarWithConstIfNotLvalue;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const Scalar* data() const { return derived().nestedExpression().data(); }
+
+    // FIXME: shall we keep the const version of coeffRef?
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const Scalar& coeffRef(Index rowId, Index colId) const
+    {
+      return derived().nestedExpression().coeffRef(colId, rowId);
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const Scalar& coeffRef(Index index) const
+    {
+      return derived().nestedExpression().coeffRef(index);
+    }
+  protected:
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TransposeImpl)
+};
+
+/** \returns an expression of the transpose of *this.
+  *
+  * Example: \include MatrixBase_transpose.cpp
+  * Output: \verbinclude MatrixBase_transpose.out
+  *
+  * \warning If you want to replace a matrix by its own transpose, do \b NOT do this:
+  * \code
+  * m = m.transpose(); // bug!!! caused by aliasing effect
+  * \endcode
+  * Instead, use the transposeInPlace() method:
+  * \code
+  * m.transposeInPlace();
+  * \endcode
+  * which gives Eigen good opportunities for optimization, or alternatively you can also do:
+  * \code
+  * m = m.transpose().eval();
+  * \endcode
+  *
+  * \sa transposeInPlace(), adjoint() */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Transpose<Derived>
+DenseBase<Derived>::transpose()
+{
+  return TransposeReturnType(derived());
+}
+
+/** This is the const version of transpose().
+  *
+  * Make sure you read the warning for transpose() !
+  *
+  * \sa transposeInPlace(), adjoint() */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename DenseBase<Derived>::ConstTransposeReturnType
+DenseBase<Derived>::transpose() const
+{
+  return ConstTransposeReturnType(derived());
+}
+
+/** \returns an expression of the adjoint (i.e. conjugate transpose) of *this.
+  *
+  * Example: \include MatrixBase_adjoint.cpp
+  * Output: \verbinclude MatrixBase_adjoint.out
+  *
+  * \warning If you want to replace a matrix by its own adjoint, do \b NOT do this:
+  * \code
+  * m = m.adjoint(); // bug!!! caused by aliasing effect
+  * \endcode
+  * Instead, use the adjointInPlace() method:
+  * \code
+  * m.adjointInPlace();
+  * \endcode
+  * which gives Eigen good opportunities for optimization, or alternatively you can also do:
+  * \code
+  * m = m.adjoint().eval();
+  * \endcode
+  *
+  * \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */
+template<typename Derived>
+EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::AdjointReturnType
+MatrixBase<Derived>::adjoint() const
+{
+  return AdjointReturnType(this->transpose());
+}
+
+/***************************************************************************
+* "in place" transpose implementation
+***************************************************************************/
+
+namespace internal {
+
+template<typename MatrixType,
+  bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic,
+  bool MatchPacketSize =
+        (int(MatrixType::RowsAtCompileTime) == int(internal::packet_traits<typename MatrixType::Scalar>::size))
+    &&  (internal::evaluator<MatrixType>::Flags&PacketAccessBit) >
+struct inplace_transpose_selector;
+
+template<typename MatrixType>
+struct inplace_transpose_selector<MatrixType,true,false> { // square matrix
+  static void run(MatrixType& m) {
+    m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose().template triangularView<StrictlyUpper>());
+  }
+};
+
+template<typename MatrixType>
+struct inplace_transpose_selector<MatrixType,true,true> { // PacketSize x PacketSize
+  static void run(MatrixType& m) {
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename internal::packet_traits<typename MatrixType::Scalar>::type Packet;
+    const Index PacketSize = internal::packet_traits<Scalar>::size;
+    const Index Alignment = internal::evaluator<MatrixType>::Alignment;
+    PacketBlock<Packet> A;
+    for (Index i=0; i<PacketSize; ++i)
+      A.packet[i] = m.template packetByOuterInner<Alignment>(i,0);
+    internal::ptranspose(A);
+    for (Index i=0; i<PacketSize; ++i)
+      m.template writePacket<Alignment>(m.rowIndexByOuterInner(i,0), m.colIndexByOuterInner(i,0), A.packet[i]);
+  }
+};
+
+
+template <typename MatrixType, Index Alignment>
+void BlockedInPlaceTranspose(MatrixType& m) {
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename internal::packet_traits<typename MatrixType::Scalar>::type Packet;
+  const Index PacketSize = internal::packet_traits<Scalar>::size;
+  eigen_assert(m.rows() == m.cols());
+  int row_start = 0;
+  for (; row_start + PacketSize <= m.rows(); row_start += PacketSize) {
+    for (int col_start = row_start; col_start + PacketSize <= m.cols(); col_start += PacketSize) {
+      PacketBlock<Packet> A;
+      if (row_start == col_start) {
+        for (Index i=0; i<PacketSize; ++i)
+          A.packet[i] = m.template packetByOuterInner<Alignment>(row_start + i,col_start);
+        internal::ptranspose(A);
+        for (Index i=0; i<PacketSize; ++i)
+          m.template writePacket<Alignment>(m.rowIndexByOuterInner(row_start + i, col_start), m.colIndexByOuterInner(row_start + i,col_start), A.packet[i]);
+      } else {
+        PacketBlock<Packet> B;
+        for (Index i=0; i<PacketSize; ++i) {
+          A.packet[i] = m.template packetByOuterInner<Alignment>(row_start + i,col_start);
+          B.packet[i] = m.template packetByOuterInner<Alignment>(col_start + i, row_start);
+        }
+        internal::ptranspose(A);
+        internal::ptranspose(B);
+        for (Index i=0; i<PacketSize; ++i) {
+          m.template writePacket<Alignment>(m.rowIndexByOuterInner(row_start + i, col_start), m.colIndexByOuterInner(row_start + i,col_start), B.packet[i]);
+          m.template writePacket<Alignment>(m.rowIndexByOuterInner(col_start + i, row_start), m.colIndexByOuterInner(col_start + i,row_start), A.packet[i]);
+        }
+      }
+    }
+  }
+  for (Index row = row_start; row < m.rows(); ++row) {
+    m.matrix().row(row).head(row).swap(
+        m.matrix().col(row).head(row).transpose());
+  }
+}
+
+template<typename MatrixType,bool MatchPacketSize>
+struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non square or dynamic matrix
+  static void run(MatrixType& m) {
+    typedef typename MatrixType::Scalar Scalar;
+    if (m.rows() == m.cols()) {
+      const Index PacketSize = internal::packet_traits<Scalar>::size;
+      if (!NumTraits<Scalar>::IsComplex && m.rows() >= PacketSize) {
+        if ((m.rows() % PacketSize) == 0)
+          BlockedInPlaceTranspose<MatrixType,internal::evaluator<MatrixType>::Alignment>(m);
+        else
+          BlockedInPlaceTranspose<MatrixType,Unaligned>(m);
+      }
+      else {
+        m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose().template triangularView<StrictlyUpper>());
+      }
+    } else {
+      m = m.transpose().eval();
+    }
+  }
+};
+
+
+} // end namespace internal
+
+/** This is the "in place" version of transpose(): it replaces \c *this by its own transpose.
+  * Thus, doing
+  * \code
+  * m.transposeInPlace();
+  * \endcode
+  * has the same effect on m as doing
+  * \code
+  * m = m.transpose().eval();
+  * \endcode
+  * and is faster and also safer because in the latter line of code, forgetting the eval() results
+  * in a bug caused by \ref TopicAliasing "aliasing".
+  *
+  * Notice however that this method is only useful if you want to replace a matrix by its own transpose.
+  * If you just need the transpose of a matrix, use transpose().
+  *
+  * \note if the matrix is not square, then \c *this must be a resizable matrix.
+  * This excludes (non-square) fixed-size matrices, block-expressions and maps.
+  *
+  * \sa transpose(), adjoint(), adjointInPlace() */
+template<typename Derived>
+EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::transposeInPlace()
+{
+  eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic))
+               && "transposeInPlace() called on a non-square non-resizable matrix");
+  internal::inplace_transpose_selector<Derived>::run(derived());
+}
+
+/***************************************************************************
+* "in place" adjoint implementation
+***************************************************************************/
+
+/** This is the "in place" version of adjoint(): it replaces \c *this by its own transpose.
+  * Thus, doing
+  * \code
+  * m.adjointInPlace();
+  * \endcode
+  * has the same effect on m as doing
+  * \code
+  * m = m.adjoint().eval();
+  * \endcode
+  * and is faster and also safer because in the latter line of code, forgetting the eval() results
+  * in a bug caused by aliasing.
+  *
+  * Notice however that this method is only useful if you want to replace a matrix by its own adjoint.
+  * If you just need the adjoint of a matrix, use adjoint().
+  *
+  * \note if the matrix is not square, then \c *this must be a resizable matrix.
+  * This excludes (non-square) fixed-size matrices, block-expressions and maps.
+  *
+  * \sa transpose(), adjoint(), transposeInPlace() */
+template<typename Derived>
+EIGEN_DEVICE_FUNC inline void MatrixBase<Derived>::adjointInPlace()
+{
+  derived() = adjoint().eval();
+}
+
+#ifndef EIGEN_NO_DEBUG
+
+// The following is to detect aliasing problems in most common cases.
+
+namespace internal {
+
+template<bool DestIsTransposed, typename OtherDerived>
+struct check_transpose_aliasing_compile_time_selector
+{
+  enum { ret = bool(blas_traits<OtherDerived>::IsTransposed) != DestIsTransposed };
+};
+
+template<bool DestIsTransposed, typename BinOp, typename DerivedA, typename DerivedB>
+struct check_transpose_aliasing_compile_time_selector<DestIsTransposed,CwiseBinaryOp<BinOp,DerivedA,DerivedB> >
+{
+  enum { ret =    bool(blas_traits<DerivedA>::IsTransposed) != DestIsTransposed
+               || bool(blas_traits<DerivedB>::IsTransposed) != DestIsTransposed
+  };
+};
+
+template<typename Scalar, bool DestIsTransposed, typename OtherDerived>
+struct check_transpose_aliasing_run_time_selector
+{
+  static bool run(const Scalar* dest, const OtherDerived& src)
+  {
+    return (bool(blas_traits<OtherDerived>::IsTransposed) != DestIsTransposed) && (dest!=0 && dest==(const Scalar*)extract_data(src));
+  }
+};
+
+template<typename Scalar, bool DestIsTransposed, typename BinOp, typename DerivedA, typename DerivedB>
+struct check_transpose_aliasing_run_time_selector<Scalar,DestIsTransposed,CwiseBinaryOp<BinOp,DerivedA,DerivedB> >
+{
+  static bool run(const Scalar* dest, const CwiseBinaryOp<BinOp,DerivedA,DerivedB>& src)
+  {
+    return ((blas_traits<DerivedA>::IsTransposed != DestIsTransposed) && (dest!=0 && dest==(const Scalar*)extract_data(src.lhs())))
+        || ((blas_traits<DerivedB>::IsTransposed != DestIsTransposed) && (dest!=0 && dest==(const Scalar*)extract_data(src.rhs())));
+  }
+};
+
+// the following selector, checkTransposeAliasing_impl, based on MightHaveTransposeAliasing,
+// is because when the condition controlling the assert is known at compile time, ICC emits a warning.
+// This is actually a good warning: in expressions that don't have any transposing, the condition is
+// known at compile time to be false, and using that, we can avoid generating the code of the assert again
+// and again for all these expressions that don't need it.
+
+template<typename Derived, typename OtherDerived,
+         bool MightHaveTransposeAliasing
+                 = check_transpose_aliasing_compile_time_selector
+                     <blas_traits<Derived>::IsTransposed,OtherDerived>::ret
+        >
+struct checkTransposeAliasing_impl
+{
+    static void run(const Derived& dst, const OtherDerived& other)
+    {
+        eigen_assert((!check_transpose_aliasing_run_time_selector
+                      <typename Derived::Scalar,blas_traits<Derived>::IsTransposed,OtherDerived>
+                      ::run(extract_data(dst), other))
+          && "aliasing detected during transposition, use transposeInPlace() "
+             "or evaluate the rhs into a temporary using .eval()");
+
+    }
+};
+
+template<typename Derived, typename OtherDerived>
+struct checkTransposeAliasing_impl<Derived, OtherDerived, false>
+{
+    static void run(const Derived&, const OtherDerived&)
+    {
+    }
+};
+
+template<typename Dst, typename Src>
+void check_for_aliasing(const Dst &dst, const Src &src)
+{
+  if((!Dst::IsVectorAtCompileTime) && dst.rows()>1 && dst.cols()>1)
+    internal::checkTransposeAliasing_impl<Dst, Src>::run(dst, src);
+}
+
+} // end namespace internal
+
+#endif // EIGEN_NO_DEBUG
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRANSPOSE_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/Transpositions.h b/src/3rdparty/eigen/Eigen/src/Core/Transpositions.h
new file mode 100644
index 000000000..38a7b01cb
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/Transpositions.h
@@ -0,0 +1,386 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRANSPOSITIONS_H
+#define EIGEN_TRANSPOSITIONS_H
+
+namespace Eigen {
+
+template<typename Derived>
+class TranspositionsBase
+{
+    typedef internal::traits<Derived> Traits;
+
+  public:
+
+    typedef typename Traits::IndicesType IndicesType;
+    typedef typename IndicesType::Scalar StorageIndex;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
+
+    EIGEN_DEVICE_FUNC
+    Derived& derived() { return *static_cast<Derived*>(this); }
+    EIGEN_DEVICE_FUNC
+    const Derived& derived() const { return *static_cast<const Derived*>(this); }
+
+    /** Copies the \a other transpositions into \c *this */
+    template<typename OtherDerived>
+    Derived& operator=(const TranspositionsBase<OtherDerived>& other)
+    {
+      indices() = other.indices();
+      return derived();
+    }
+
+    /** \returns the number of transpositions */
+    EIGEN_DEVICE_FUNC
+    Index size() const { return indices().size(); }
+    /** \returns the number of rows of the equivalent permutation matrix */
+    EIGEN_DEVICE_FUNC
+    Index rows() const { return indices().size(); }
+    /** \returns the number of columns of the equivalent permutation matrix */
+    EIGEN_DEVICE_FUNC
+    Index cols() const { return indices().size(); }
+
+    /** Direct access to the underlying index vector */
+    EIGEN_DEVICE_FUNC
+    inline const StorageIndex& coeff(Index i) const { return indices().coeff(i); }
+    /** Direct access to the underlying index vector */
+    inline StorageIndex& coeffRef(Index i) { return indices().coeffRef(i); }
+    /** Direct access to the underlying index vector */
+    inline const StorageIndex& operator()(Index i) const { return indices()(i); }
+    /** Direct access to the underlying index vector */
+    inline StorageIndex& operator()(Index i) { return indices()(i); }
+    /** Direct access to the underlying index vector */
+    inline const StorageIndex& operator[](Index i) const { return indices()(i); }
+    /** Direct access to the underlying index vector */
+    inline StorageIndex& operator[](Index i) { return indices()(i); }
+
+    /** const version of indices(). */
+    EIGEN_DEVICE_FUNC
+    const IndicesType& indices() const { return derived().indices(); }
+    /** \returns a reference to the stored array representing the transpositions. */
+    EIGEN_DEVICE_FUNC
+    IndicesType& indices() { return derived().indices(); }
+
+    /** Resizes to given size. */
+    inline void resize(Index newSize)
+    {
+      indices().resize(newSize);
+    }
+
+    /** Sets \c *this to represents an identity transformation */
+    void setIdentity()
+    {
+      for(StorageIndex i = 0; i < indices().size(); ++i)
+        coeffRef(i) = i;
+    }
+
+    // FIXME: do we want such methods ?
+    // might be useful when the target matrix expression is complex, e.g.:
+    // object.matrix().block(..,..,..,..) = trans * object.matrix().block(..,..,..,..);
+    /*
+    template<typename MatrixType>
+    void applyForwardToRows(MatrixType& mat) const
+    {
+      for(Index k=0 ; k<size() ; ++k)
+        if(m_indices(k)!=k)
+          mat.row(k).swap(mat.row(m_indices(k)));
+    }
+
+    template<typename MatrixType>
+    void applyBackwardToRows(MatrixType& mat) const
+    {
+      for(Index k=size()-1 ; k>=0 ; --k)
+        if(m_indices(k)!=k)
+          mat.row(k).swap(mat.row(m_indices(k)));
+    }
+    */
+
+    /** \returns the inverse transformation */
+    inline Transpose<TranspositionsBase> inverse() const
+    { return Transpose<TranspositionsBase>(derived()); }
+
+    /** \returns the tranpose transformation */
+    inline Transpose<TranspositionsBase> transpose() const
+    { return Transpose<TranspositionsBase>(derived()); }
+
+  protected:
+};
+
+namespace internal {
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
+struct traits<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
+ : traits<PermutationMatrix<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
+{
+  typedef Matrix<_StorageIndex, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
+  typedef TranspositionsStorage StorageKind;
+};
+}
+
+/** \class Transpositions
+  * \ingroup Core_Module
+  *
+  * \brief Represents a sequence of transpositions (row/column interchange)
+  *
+  * \tparam SizeAtCompileTime the number of transpositions, or Dynamic
+  * \tparam MaxSizeAtCompileTime the maximum number of transpositions, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it.
+  *
+  * This class represents a permutation transformation as a sequence of \em n transpositions
+  * \f$[T_{n-1} \ldots T_{i} \ldots T_{0}]\f$. It is internally stored as a vector of integers \c indices.
+  * Each transposition \f$ T_{i} \f$ applied on the left of a matrix (\f$ T_{i} M\f$) interchanges
+  * the rows \c i and \c indices[i] of the matrix \c M.
+  * A transposition applied on the right (e.g., \f$ M T_{i}\f$) yields a column interchange.
+  *
+  * Compared to the class PermutationMatrix, such a sequence of transpositions is what is
+  * computed during a decomposition with pivoting, and it is faster when applying the permutation in-place.
+  *
+  * To apply a sequence of transpositions to a matrix, simply use the operator * as in the following example:
+  * \code
+  * Transpositions tr;
+  * MatrixXf mat;
+  * mat = tr * mat;
+  * \endcode
+  * In this example, we detect that the matrix appears on both side, and so the transpositions
+  * are applied in-place without any temporary or extra copy.
+  *
+  * \sa class PermutationMatrix
+  */
+
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
+class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
+{
+    typedef internal::traits<Transpositions> Traits;
+  public:
+
+    typedef TranspositionsBase<Transpositions> Base;
+    typedef typename Traits::IndicesType IndicesType;
+    typedef typename IndicesType::Scalar StorageIndex;
+
+    inline Transpositions() {}
+
+    /** Copy constructor. */
+    template<typename OtherDerived>
+    inline Transpositions(const TranspositionsBase<OtherDerived>& other)
+      : m_indices(other.indices()) {}
+
+    /** Generic constructor from expression of the transposition indices. */
+    template<typename Other>
+    explicit inline Transpositions(const MatrixBase<Other>& indices) : m_indices(indices)
+    {}
+
+    /** Copies the \a other transpositions into \c *this */
+    template<typename OtherDerived>
+    Transpositions& operator=(const TranspositionsBase<OtherDerived>& other)
+    {
+      return Base::operator=(other);
+    }
+
+    /** Constructs an uninitialized permutation matrix of given size.
+      */
+    inline Transpositions(Index size) : m_indices(size)
+    {}
+
+    /** const version of indices(). */
+    EIGEN_DEVICE_FUNC
+    const IndicesType& indices() const { return m_indices; }
+    /** \returns a reference to the stored array representing the transpositions. */
+    EIGEN_DEVICE_FUNC
+    IndicesType& indices() { return m_indices; }
+
+  protected:
+
+    IndicesType m_indices;
+};
+
+
+namespace internal {
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex, int _PacketAccess>
+struct traits<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex>,_PacketAccess> >
+ : traits<PermutationMatrix<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
+{
+  typedef Map<const Matrix<_StorageIndex,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1>, _PacketAccess> IndicesType;
+  typedef _StorageIndex StorageIndex;
+  typedef TranspositionsStorage StorageKind;
+};
+}
+
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex, int PacketAccess>
+class Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex>,PacketAccess>
+ : public TranspositionsBase<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex>,PacketAccess> >
+{
+    typedef internal::traits<Map> Traits;
+  public:
+
+    typedef TranspositionsBase<Map> Base;
+    typedef typename Traits::IndicesType IndicesType;
+    typedef typename IndicesType::Scalar StorageIndex;
+
+    explicit inline Map(const StorageIndex* indicesPtr)
+      : m_indices(indicesPtr)
+    {}
+
+    inline Map(const StorageIndex* indicesPtr, Index size)
+      : m_indices(indicesPtr,size)
+    {}
+
+    /** Copies the \a other transpositions into \c *this */
+    template<typename OtherDerived>
+    Map& operator=(const TranspositionsBase<OtherDerived>& other)
+    {
+      return Base::operator=(other);
+    }
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** This is a special case of the templated operator=. Its purpose is to
+      * prevent a default operator= from hiding the templated operator=.
+      */
+    Map& operator=(const Map& other)
+    {
+      m_indices = other.m_indices;
+      return *this;
+    }
+    #endif
+
+    /** const version of indices(). */
+    EIGEN_DEVICE_FUNC
+    const IndicesType& indices() const { return m_indices; }
+
+    /** \returns a reference to the stored array representing the transpositions. */
+    EIGEN_DEVICE_FUNC
+    IndicesType& indices() { return m_indices; }
+
+  protected:
+
+    IndicesType m_indices;
+};
+
+namespace internal {
+template<typename _IndicesType>
+struct traits<TranspositionsWrapper<_IndicesType> >
+ : traits<PermutationWrapper<_IndicesType> >
+{
+  typedef TranspositionsStorage StorageKind;
+};
+}
+
+template<typename _IndicesType>
+class TranspositionsWrapper
+ : public TranspositionsBase<TranspositionsWrapper<_IndicesType> >
+{
+    typedef internal::traits<TranspositionsWrapper> Traits;
+  public:
+
+    typedef TranspositionsBase<TranspositionsWrapper> Base;
+    typedef typename Traits::IndicesType IndicesType;
+    typedef typename IndicesType::Scalar StorageIndex;
+
+    explicit inline TranspositionsWrapper(IndicesType& indices)
+      : m_indices(indices)
+    {}
+
+    /** Copies the \a other transpositions into \c *this */
+    template<typename OtherDerived>
+    TranspositionsWrapper& operator=(const TranspositionsBase<OtherDerived>& other)
+    {
+      return Base::operator=(other);
+    }
+
+    /** const version of indices(). */
+    EIGEN_DEVICE_FUNC
+    const IndicesType& indices() const { return m_indices; }
+
+    /** \returns a reference to the stored array representing the transpositions. */
+    EIGEN_DEVICE_FUNC
+    IndicesType& indices() { return m_indices; }
+
+  protected:
+
+    typename IndicesType::Nested m_indices;
+};
+
+
+
+/** \returns the \a matrix with the \a transpositions applied to the columns.
+  */
+template<typename MatrixDerived, typename TranspositionsDerived>
+EIGEN_DEVICE_FUNC
+const Product<MatrixDerived, TranspositionsDerived, AliasFreeProduct>
+operator*(const MatrixBase<MatrixDerived> &matrix,
+          const TranspositionsBase<TranspositionsDerived>& transpositions)
+{
+  return Product<MatrixDerived, TranspositionsDerived, AliasFreeProduct>
+            (matrix.derived(), transpositions.derived());
+}
+
+/** \returns the \a matrix with the \a transpositions applied to the rows.
+  */
+template<typename TranspositionsDerived, typename MatrixDerived>
+EIGEN_DEVICE_FUNC
+const Product<TranspositionsDerived, MatrixDerived, AliasFreeProduct>
+operator*(const TranspositionsBase<TranspositionsDerived> &transpositions,
+          const MatrixBase<MatrixDerived>& matrix)
+{
+  return Product<TranspositionsDerived, MatrixDerived, AliasFreeProduct>
+            (transpositions.derived(), matrix.derived());
+}
+
+// Template partial specialization for transposed/inverse transpositions
+
+namespace internal {
+
+template<typename Derived>
+struct traits<Transpose<TranspositionsBase<Derived> > >
+ : traits<Derived>
+{};
+
+} // end namespace internal
+
+template<typename TranspositionsDerived>
+class Transpose<TranspositionsBase<TranspositionsDerived> >
+{
+    typedef TranspositionsDerived TranspositionType;
+    typedef typename TranspositionType::IndicesType IndicesType;
+  public:
+
+    explicit Transpose(const TranspositionType& t) : m_transpositions(t) {}
+
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index size() const EIGEN_NOEXCEPT { return m_transpositions.size(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_transpositions.size(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_transpositions.size(); }
+
+    /** \returns the \a matrix with the inverse transpositions applied to the columns.
+      */
+    template<typename OtherDerived> friend
+    const Product<OtherDerived, Transpose, AliasFreeProduct>
+    operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trt)
+    {
+      return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt);
+    }
+
+    /** \returns the \a matrix with the inverse transpositions applied to the rows.
+      */
+    template<typename OtherDerived>
+    const Product<Transpose, OtherDerived, AliasFreeProduct>
+    operator*(const MatrixBase<OtherDerived>& matrix) const
+    {
+      return Product<Transpose, OtherDerived, AliasFreeProduct>(*this, matrix.derived());
+    }
+
+    EIGEN_DEVICE_FUNC
+    const TranspositionType& nestedExpression() const { return m_transpositions; }
+
+  protected:
+    const TranspositionType& m_transpositions;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRANSPOSITIONS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/TriangularMatrix.h b/src/3rdparty/eigen/Eigen/src/Core/TriangularMatrix.h
new file mode 100644
index 000000000..fdb8bc15a
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/TriangularMatrix.h
@@ -0,0 +1,1001 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRIANGULARMATRIX_H
+#define EIGEN_TRIANGULARMATRIX_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<int Side, typename TriangularType, typename Rhs> struct triangular_solve_retval;
+
+}
+
+/** \class TriangularBase
+  * \ingroup Core_Module
+  *
+  * \brief Base class for triangular part in a matrix
+  */
+template<typename Derived> class TriangularBase : public EigenBase<Derived>
+{
+  public:
+
+    enum {
+      Mode = internal::traits<Derived>::Mode,
+      RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
+      ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+      MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
+
+      SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
+                                                   internal::traits<Derived>::ColsAtCompileTime>::ret),
+      /**< This is equal to the number of coefficients, i.e. the number of
+          * rows times the number of columns, or to \a Dynamic if this is not
+          * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */
+
+      MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
+                                                   internal::traits<Derived>::MaxColsAtCompileTime>::ret)
+
+    };
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef typename internal::traits<Derived>::StorageKind StorageKind;
+    typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
+    typedef typename internal::traits<Derived>::FullMatrixType DenseMatrixType;
+    typedef DenseMatrixType DenseType;
+    typedef Derived const& Nested;
+
+    EIGEN_DEVICE_FUNC
+    inline TriangularBase() { eigen_assert(!((int(Mode) & int(UnitDiag)) && (int(Mode) & int(ZeroDiag)))); }
+
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return derived().cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return derived().outerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT { return derived().innerStride(); }
+
+    // dummy resize function
+    EIGEN_DEVICE_FUNC
+    void resize(Index rows, Index cols)
+    {
+      EIGEN_UNUSED_VARIABLE(rows);
+      EIGEN_UNUSED_VARIABLE(cols);
+      eigen_assert(rows==this->rows() && cols==this->cols());
+    }
+
+    EIGEN_DEVICE_FUNC
+    inline Scalar coeff(Index row, Index col) const  { return derived().coeff(row,col); }
+    EIGEN_DEVICE_FUNC
+    inline Scalar& coeffRef(Index row, Index col) { return derived().coeffRef(row,col); }
+
+    /** \see MatrixBase::copyCoeff(row,col)
+      */
+    template<typename Other>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void copyCoeff(Index row, Index col, Other& other)
+    {
+      derived().coeffRef(row, col) = other.coeff(row, col);
+    }
+
+    EIGEN_DEVICE_FUNC
+    inline Scalar operator()(Index row, Index col) const
+    {
+      check_coordinates(row, col);
+      return coeff(row,col);
+    }
+    EIGEN_DEVICE_FUNC
+    inline Scalar& operator()(Index row, Index col)
+    {
+      check_coordinates(row, col);
+      return coeffRef(row,col);
+    }
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    EIGEN_DEVICE_FUNC
+    inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
+    EIGEN_DEVICE_FUNC
+    inline Derived& derived() { return *static_cast<Derived*>(this); }
+    #endif // not EIGEN_PARSED_BY_DOXYGEN
+
+    template<typename DenseDerived>
+    EIGEN_DEVICE_FUNC
+    void evalTo(MatrixBase<DenseDerived> &other) const;
+    template<typename DenseDerived>
+    EIGEN_DEVICE_FUNC
+    void evalToLazy(MatrixBase<DenseDerived> &other) const;
+
+    EIGEN_DEVICE_FUNC
+    DenseMatrixType toDenseMatrix() const
+    {
+      DenseMatrixType res(rows(), cols());
+      evalToLazy(res);
+      return res;
+    }
+
+  protected:
+
+    void check_coordinates(Index row, Index col) const
+    {
+      EIGEN_ONLY_USED_FOR_DEBUG(row);
+      EIGEN_ONLY_USED_FOR_DEBUG(col);
+      eigen_assert(col>=0 && col<cols() && row>=0 && row<rows());
+      const int mode = int(Mode) & ~SelfAdjoint;
+      EIGEN_ONLY_USED_FOR_DEBUG(mode);
+      eigen_assert((mode==Upper && col>=row)
+                || (mode==Lower && col<=row)
+                || ((mode==StrictlyUpper || mode==UnitUpper) && col>row)
+                || ((mode==StrictlyLower || mode==UnitLower) && col<row));
+    }
+
+    #ifdef EIGEN_INTERNAL_DEBUGGING
+    void check_coordinates_internal(Index row, Index col) const
+    {
+      check_coordinates(row, col);
+    }
+    #else
+    void check_coordinates_internal(Index , Index ) const {}
+    #endif
+
+};
+
+/** \class TriangularView
+  * \ingroup Core_Module
+  *
+  * \brief Expression of a triangular part in a matrix
+  *
+  * \param MatrixType the type of the object in which we are taking the triangular part
+  * \param Mode the kind of triangular matrix expression to construct. Can be #Upper,
+  *             #Lower, #UnitUpper, #UnitLower, #StrictlyUpper, or #StrictlyLower.
+  *             This is in fact a bit field; it must have either #Upper or #Lower,
+  *             and additionally it may have #UnitDiag or #ZeroDiag or neither.
+  *
+  * This class represents a triangular part of a matrix, not necessarily square. Strictly speaking, for rectangular
+  * matrices one should speak of "trapezoid" parts. This class is the return type
+  * of MatrixBase::triangularView() and SparseMatrixBase::triangularView(), and most of the time this is the only way it is used.
+  *
+  * \sa MatrixBase::triangularView()
+  */
+namespace internal {
+template<typename MatrixType, unsigned int _Mode>
+struct traits<TriangularView<MatrixType, _Mode> > : traits<MatrixType>
+{
+  typedef typename ref_selector<MatrixType>::non_const_type MatrixTypeNested;
+  typedef typename remove_reference<MatrixTypeNested>::type MatrixTypeNestedNonRef;
+  typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
+  typedef typename MatrixType::PlainObject FullMatrixType;
+  typedef MatrixType ExpressionType;
+  enum {
+    Mode = _Mode,
+    FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
+    Flags = (MatrixTypeNestedCleaned::Flags & (HereditaryBits | FlagsLvalueBit) & (~(PacketAccessBit | DirectAccessBit | LinearAccessBit)))
+  };
+};
+}
+
+template<typename _MatrixType, unsigned int _Mode, typename StorageKind> class TriangularViewImpl;
+
+template<typename _MatrixType, unsigned int _Mode> class TriangularView
+  : public TriangularViewImpl<_MatrixType, _Mode, typename internal::traits<_MatrixType>::StorageKind >
+{
+  public:
+
+    typedef TriangularViewImpl<_MatrixType, _Mode, typename internal::traits<_MatrixType>::StorageKind > Base;
+    typedef typename internal::traits<TriangularView>::Scalar Scalar;
+    typedef _MatrixType MatrixType;
+
+  protected:
+    typedef typename internal::traits<TriangularView>::MatrixTypeNested MatrixTypeNested;
+    typedef typename internal::traits<TriangularView>::MatrixTypeNestedNonRef MatrixTypeNestedNonRef;
+
+    typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
+    typedef TriangularView<typename internal::add_const<MatrixType>::type, _Mode> ConstTriangularView;
+
+  public:
+
+    typedef typename internal::traits<TriangularView>::StorageKind StorageKind;
+    typedef typename internal::traits<TriangularView>::MatrixTypeNestedCleaned NestedExpression;
+
+    enum {
+      Mode = _Mode,
+      Flags = internal::traits<TriangularView>::Flags,
+      TransposeMode = (Mode & Upper ? Lower : 0)
+                    | (Mode & Lower ? Upper : 0)
+                    | (Mode & (UnitDiag))
+                    | (Mode & (ZeroDiag)),
+      IsVectorAtCompileTime = false
+    };
+
+    EIGEN_DEVICE_FUNC
+    explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix)
+    {}
+
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TriangularView)
+
+    /** \copydoc EigenBase::rows() */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+    /** \copydoc EigenBase::cols() */
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+
+    /** \returns a const reference to the nested expression */
+    EIGEN_DEVICE_FUNC
+    const NestedExpression& nestedExpression() const { return m_matrix; }
+
+    /** \returns a reference to the nested expression */
+    EIGEN_DEVICE_FUNC
+    NestedExpression& nestedExpression() { return m_matrix; }
+
+    typedef TriangularView<const MatrixConjugateReturnType,Mode> ConjugateReturnType;
+    /** \sa MatrixBase::conjugate() const */
+    EIGEN_DEVICE_FUNC
+    inline const ConjugateReturnType conjugate() const
+    { return ConjugateReturnType(m_matrix.conjugate()); }
+
+    /** \returns an expression of the complex conjugate of \c *this if Cond==true,
+     *           returns \c *this otherwise.
+     */
+    template<bool Cond>
+    EIGEN_DEVICE_FUNC
+    inline typename internal::conditional<Cond,ConjugateReturnType,ConstTriangularView>::type
+    conjugateIf() const
+    {
+      typedef typename internal::conditional<Cond,ConjugateReturnType,ConstTriangularView>::type ReturnType;
+      return ReturnType(m_matrix.template conjugateIf<Cond>());
+    }
+
+    typedef TriangularView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType;
+    /** \sa MatrixBase::adjoint() const */
+    EIGEN_DEVICE_FUNC
+    inline const AdjointReturnType adjoint() const
+    { return AdjointReturnType(m_matrix.adjoint()); }
+
+    typedef TriangularView<typename MatrixType::TransposeReturnType,TransposeMode> TransposeReturnType;
+     /** \sa MatrixBase::transpose() */
+    EIGEN_DEVICE_FUNC
+    inline TransposeReturnType transpose()
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
+      typename MatrixType::TransposeReturnType tmp(m_matrix);
+      return TransposeReturnType(tmp);
+    }
+
+    typedef TriangularView<const typename MatrixType::ConstTransposeReturnType,TransposeMode> ConstTransposeReturnType;
+    /** \sa MatrixBase::transpose() const */
+    EIGEN_DEVICE_FUNC
+    inline const ConstTransposeReturnType transpose() const
+    {
+      return ConstTransposeReturnType(m_matrix.transpose());
+    }
+
+    template<typename Other>
+    EIGEN_DEVICE_FUNC
+    inline const Solve<TriangularView, Other>
+    solve(const MatrixBase<Other>& other) const
+    { return Solve<TriangularView, Other>(*this, other.derived()); }
+
+  // workaround MSVC ICE
+  #if EIGEN_COMP_MSVC
+    template<int Side, typename Other>
+    EIGEN_DEVICE_FUNC
+    inline const internal::triangular_solve_retval<Side,TriangularView, Other>
+    solve(const MatrixBase<Other>& other) const
+    { return Base::template solve<Side>(other); }
+  #else
+    using Base::solve;
+  #endif
+
+    /** \returns a selfadjoint view of the referenced triangular part which must be either \c #Upper or \c #Lower.
+      *
+      * This is a shortcut for \code this->nestedExpression().selfadjointView<(*this)::Mode>() \endcode
+      * \sa MatrixBase::selfadjointView() */
+    EIGEN_DEVICE_FUNC
+    SelfAdjointView<MatrixTypeNestedNonRef,Mode> selfadjointView()
+    {
+      EIGEN_STATIC_ASSERT((Mode&(UnitDiag|ZeroDiag))==0,PROGRAMMING_ERROR);
+      return SelfAdjointView<MatrixTypeNestedNonRef,Mode>(m_matrix);
+    }
+
+    /** This is the const version of selfadjointView() */
+    EIGEN_DEVICE_FUNC
+    const SelfAdjointView<MatrixTypeNestedNonRef,Mode> selfadjointView() const
+    {
+      EIGEN_STATIC_ASSERT((Mode&(UnitDiag|ZeroDiag))==0,PROGRAMMING_ERROR);
+      return SelfAdjointView<MatrixTypeNestedNonRef,Mode>(m_matrix);
+    }
+
+
+    /** \returns the determinant of the triangular matrix
+      * \sa MatrixBase::determinant() */
+    EIGEN_DEVICE_FUNC
+    Scalar determinant() const
+    {
+      if (Mode & UnitDiag)
+        return 1;
+      else if (Mode & ZeroDiag)
+        return 0;
+      else
+        return m_matrix.diagonal().prod();
+    }
+
+  protected:
+
+    MatrixTypeNested m_matrix;
+};
+
+/** \ingroup Core_Module
+  *
+  * \brief Base class for a triangular part in a \b dense matrix
+  *
+  * This class is an abstract base class of class TriangularView, and objects of type TriangularViewImpl cannot be instantiated.
+  * It extends class TriangularView with additional methods which available for dense expressions only.
+  *
+  * \sa class TriangularView, MatrixBase::triangularView()
+  */
+template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_MatrixType,_Mode,Dense>
+  : public TriangularBase<TriangularView<_MatrixType, _Mode> >
+{
+  public:
+
+    typedef TriangularView<_MatrixType, _Mode> TriangularViewType;
+    typedef TriangularBase<TriangularViewType> Base;
+    typedef typename internal::traits<TriangularViewType>::Scalar Scalar;
+
+    typedef _MatrixType MatrixType;
+    typedef typename MatrixType::PlainObject DenseMatrixType;
+    typedef DenseMatrixType PlainObject;
+
+  public:
+    using Base::evalToLazy;
+    using Base::derived;
+
+    typedef typename internal::traits<TriangularViewType>::StorageKind StorageKind;
+
+    enum {
+      Mode = _Mode,
+      Flags = internal::traits<TriangularViewType>::Flags
+    };
+
+    /** \returns the outer-stride of the underlying dense matrix
+      * \sa DenseCoeffsBase::outerStride() */
+    EIGEN_DEVICE_FUNC
+    inline Index outerStride() const { return derived().nestedExpression().outerStride(); }
+    /** \returns the inner-stride of the underlying dense matrix
+      * \sa DenseCoeffsBase::innerStride() */
+    EIGEN_DEVICE_FUNC
+    inline Index innerStride() const { return derived().nestedExpression().innerStride(); }
+
+    /** \sa MatrixBase::operator+=() */
+    template<typename Other>
+    EIGEN_DEVICE_FUNC
+    TriangularViewType&  operator+=(const DenseBase<Other>& other) {
+      internal::call_assignment_no_alias(derived(), other.derived(), internal::add_assign_op<Scalar,typename Other::Scalar>());
+      return derived();
+    }
+    /** \sa MatrixBase::operator-=() */
+    template<typename Other>
+    EIGEN_DEVICE_FUNC
+    TriangularViewType&  operator-=(const DenseBase<Other>& other) {
+      internal::call_assignment_no_alias(derived(), other.derived(), internal::sub_assign_op<Scalar,typename Other::Scalar>());
+      return derived();
+    }
+
+    /** \sa MatrixBase::operator*=() */
+    EIGEN_DEVICE_FUNC
+    TriangularViewType&  operator*=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = derived().nestedExpression() * other; }
+    /** \sa DenseBase::operator/=() */
+    EIGEN_DEVICE_FUNC
+    TriangularViewType&  operator/=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = derived().nestedExpression() / other; }
+
+    /** \sa MatrixBase::fill() */
+    EIGEN_DEVICE_FUNC
+    void fill(const Scalar& value) { setConstant(value); }
+    /** \sa MatrixBase::setConstant() */
+    EIGEN_DEVICE_FUNC
+    TriangularViewType& setConstant(const Scalar& value)
+    { return *this = MatrixType::Constant(derived().rows(), derived().cols(), value); }
+    /** \sa MatrixBase::setZero() */
+    EIGEN_DEVICE_FUNC
+    TriangularViewType& setZero() { return setConstant(Scalar(0)); }
+    /** \sa MatrixBase::setOnes() */
+    EIGEN_DEVICE_FUNC
+    TriangularViewType& setOnes() { return setConstant(Scalar(1)); }
+
+    /** \sa MatrixBase::coeff()
+      * \warning the coordinates must fit into the referenced triangular part
+      */
+    EIGEN_DEVICE_FUNC
+    inline Scalar coeff(Index row, Index col) const
+    {
+      Base::check_coordinates_internal(row, col);
+      return derived().nestedExpression().coeff(row, col);
+    }
+
+    /** \sa MatrixBase::coeffRef()
+      * \warning the coordinates must fit into the referenced triangular part
+      */
+    EIGEN_DEVICE_FUNC
+    inline Scalar& coeffRef(Index row, Index col)
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(TriangularViewType);
+      Base::check_coordinates_internal(row, col);
+      return derived().nestedExpression().coeffRef(row, col);
+    }
+
+    /** Assigns a triangular matrix to a triangular part of a dense matrix */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    TriangularViewType& operator=(const TriangularBase<OtherDerived>& other);
+
+    /** Shortcut for\code *this = other.other.triangularView<(*this)::Mode>() \endcode */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    TriangularViewType& operator=(const MatrixBase<OtherDerived>& other);
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    EIGEN_DEVICE_FUNC
+    TriangularViewType& operator=(const TriangularViewImpl& other)
+    { return *this = other.derived().nestedExpression(); }
+
+    template<typename OtherDerived>
+    /** \deprecated */
+    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC
+    void lazyAssign(const TriangularBase<OtherDerived>& other);
+
+    template<typename OtherDerived>
+    /** \deprecated */
+    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC
+    void lazyAssign(const MatrixBase<OtherDerived>& other);
+#endif
+
+    /** Efficient triangular matrix times vector/matrix product */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    const Product<TriangularViewType,OtherDerived>
+    operator*(const MatrixBase<OtherDerived>& rhs) const
+    {
+      return Product<TriangularViewType,OtherDerived>(derived(), rhs.derived());
+    }
+
+    /** Efficient vector/matrix times triangular matrix product */
+    template<typename OtherDerived> friend
+    EIGEN_DEVICE_FUNC
+    const Product<OtherDerived,TriangularViewType>
+    operator*(const MatrixBase<OtherDerived>& lhs, const TriangularViewImpl& rhs)
+    {
+      return Product<OtherDerived,TriangularViewType>(lhs.derived(),rhs.derived());
+    }
+
+    /** \returns the product of the inverse of \c *this with \a other, \a *this being triangular.
+      *
+      * This function computes the inverse-matrix matrix product inverse(\c *this) * \a other if
+      * \a Side==OnTheLeft (the default), or the right-inverse-multiply  \a other * inverse(\c *this) if
+      * \a Side==OnTheRight.
+      *
+      * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
+      *
+      * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
+      * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
+      * is an upper (resp. lower) triangular matrix.
+      *
+      * Example: \include Triangular_solve.cpp
+      * Output: \verbinclude Triangular_solve.out
+      *
+      * This function returns an expression of the inverse-multiply and can works in-place if it is assigned
+      * to the same matrix or vector \a other.
+      *
+      * For users coming from BLAS, this function (and more specifically solveInPlace()) offer
+      * all the operations supported by the \c *TRSV and \c *TRSM BLAS routines.
+      *
+      * \sa TriangularView::solveInPlace()
+      */
+    template<int Side, typename Other>
+    inline const internal::triangular_solve_retval<Side,TriangularViewType, Other>
+    solve(const MatrixBase<Other>& other) const;
+
+    /** "in-place" version of TriangularView::solve() where the result is written in \a other
+      *
+      * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
+      * This function will const_cast it, so constness isn't honored here.
+      *
+      * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
+      *
+      * See TriangularView:solve() for the details.
+      */
+    template<int Side, typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    void solveInPlace(const MatrixBase<OtherDerived>& other) const;
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    void solveInPlace(const MatrixBase<OtherDerived>& other) const
+    { return solveInPlace<OnTheLeft>(other); }
+
+    /** Swaps the coefficients of the common triangular parts of two matrices */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+    void swap(TriangularBase<OtherDerived> &other)
+#else
+    void swap(TriangularBase<OtherDerived> const & other)
+#endif
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(OtherDerived);
+      call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op<Scalar>());
+    }
+
+    /** Shortcut for \code (*this).swap(other.triangularView<(*this)::Mode>()) \endcode */
+    template<typename OtherDerived>
+    /** \deprecated */
+    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC
+    void swap(MatrixBase<OtherDerived> const & other)
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(OtherDerived);
+      call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op<Scalar>());
+    }
+
+    template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _solve_impl(const RhsType &rhs, DstType &dst) const {
+      if(!internal::is_same_dense(dst,rhs))
+        dst = rhs;
+      this->solveInPlace(dst);
+    }
+
+    template<typename ProductType>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha, bool beta);
+  protected:
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(TriangularViewImpl)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TriangularViewImpl)
+
+};
+
+/***************************************************************************
+* Implementation of triangular evaluation/assignment
+***************************************************************************/
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+// FIXME should we keep that possibility
+template<typename MatrixType, unsigned int Mode>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
+TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDerived>& other)
+{
+  internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
+}
+
+// FIXME should we keep that possibility
+template<typename MatrixType, unsigned int Mode>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
+{
+  internal::call_assignment_no_alias(derived(), other.template triangularView<Mode>());
+}
+
+
+
+template<typename MatrixType, unsigned int Mode>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
+TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<OtherDerived>& other)
+{
+  eigen_assert(Mode == int(OtherDerived::Mode));
+  internal::call_assignment(derived(), other.derived());
+  return derived();
+}
+
+template<typename MatrixType, unsigned int Mode>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
+{
+  eigen_assert(Mode == int(OtherDerived::Mode));
+  internal::call_assignment_no_alias(derived(), other.derived());
+}
+#endif
+
+/***************************************************************************
+* Implementation of TriangularBase methods
+***************************************************************************/
+
+/** Assigns a triangular or selfadjoint matrix to a dense matrix.
+  * If the matrix is triangular, the opposite part is set to zero. */
+template<typename Derived>
+template<typename DenseDerived>
+EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
+{
+  evalToLazy(other.derived());
+}
+
+/***************************************************************************
+* Implementation of TriangularView methods
+***************************************************************************/
+
+/***************************************************************************
+* Implementation of MatrixBase methods
+***************************************************************************/
+
+/**
+  * \returns an expression of a triangular view extracted from the current matrix
+  *
+  * The parameter \a Mode can have the following values: \c #Upper, \c #StrictlyUpper, \c #UnitUpper,
+  * \c #Lower, \c #StrictlyLower, \c #UnitLower.
+  *
+  * Example: \include MatrixBase_triangularView.cpp
+  * Output: \verbinclude MatrixBase_triangularView.out
+  *
+  * \sa class TriangularView
+  */
+template<typename Derived>
+template<unsigned int Mode>
+EIGEN_DEVICE_FUNC
+typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type
+MatrixBase<Derived>::triangularView()
+{
+  return typename TriangularViewReturnType<Mode>::Type(derived());
+}
+
+/** This is the const version of MatrixBase::triangularView() */
+template<typename Derived>
+template<unsigned int Mode>
+EIGEN_DEVICE_FUNC
+typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type
+MatrixBase<Derived>::triangularView() const
+{
+  return typename ConstTriangularViewReturnType<Mode>::Type(derived());
+}
+
+/** \returns true if *this is approximately equal to an upper triangular matrix,
+  *          within the precision given by \a prec.
+  *
+  * \sa isLowerTriangular()
+  */
+template<typename Derived>
+bool MatrixBase<Derived>::isUpperTriangular(const RealScalar& prec) const
+{
+  RealScalar maxAbsOnUpperPart = static_cast<RealScalar>(-1);
+  for(Index j = 0; j < cols(); ++j)
+  {
+    Index maxi = numext::mini(j, rows()-1);
+    for(Index i = 0; i <= maxi; ++i)
+    {
+      RealScalar absValue = numext::abs(coeff(i,j));
+      if(absValue > maxAbsOnUpperPart) maxAbsOnUpperPart = absValue;
+    }
+  }
+  RealScalar threshold = maxAbsOnUpperPart * prec;
+  for(Index j = 0; j < cols(); ++j)
+    for(Index i = j+1; i < rows(); ++i)
+      if(numext::abs(coeff(i, j)) > threshold) return false;
+  return true;
+}
+
+/** \returns true if *this is approximately equal to a lower triangular matrix,
+  *          within the precision given by \a prec.
+  *
+  * \sa isUpperTriangular()
+  */
+template<typename Derived>
+bool MatrixBase<Derived>::isLowerTriangular(const RealScalar& prec) const
+{
+  RealScalar maxAbsOnLowerPart = static_cast<RealScalar>(-1);
+  for(Index j = 0; j < cols(); ++j)
+    for(Index i = j; i < rows(); ++i)
+    {
+      RealScalar absValue = numext::abs(coeff(i,j));
+      if(absValue > maxAbsOnLowerPart) maxAbsOnLowerPart = absValue;
+    }
+  RealScalar threshold = maxAbsOnLowerPart * prec;
+  for(Index j = 1; j < cols(); ++j)
+  {
+    Index maxi = numext::mini(j, rows()-1);
+    for(Index i = 0; i < maxi; ++i)
+      if(numext::abs(coeff(i, j)) > threshold) return false;
+  }
+  return true;
+}
+
+
+/***************************************************************************
+****************************************************************************
+* Evaluators and Assignment of triangular expressions
+***************************************************************************
+***************************************************************************/
+
+namespace internal {
+
+
+// TODO currently a triangular expression has the form TriangularView<.,.>
+//      in the future triangular-ness should be defined by the expression traits
+//      such that Transpose<TriangularView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to make it work)
+template<typename MatrixType, unsigned int Mode>
+struct evaluator_traits<TriangularView<MatrixType,Mode> >
+{
+  typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
+  typedef typename glue_shapes<typename evaluator_traits<MatrixType>::Shape, TriangularShape>::type Shape;
+};
+
+template<typename MatrixType, unsigned int Mode>
+struct unary_evaluator<TriangularView<MatrixType,Mode>, IndexBased>
+ : evaluator<typename internal::remove_all<MatrixType>::type>
+{
+  typedef TriangularView<MatrixType,Mode> XprType;
+  typedef evaluator<typename internal::remove_all<MatrixType>::type> Base;
+  EIGEN_DEVICE_FUNC
+  unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {}
+};
+
+// Additional assignment kinds:
+struct Triangular2Triangular    {};
+struct Triangular2Dense         {};
+struct Dense2Triangular         {};
+
+
+template<typename Kernel, unsigned int Mode, int UnrollCount, bool ClearOpposite> struct triangular_assignment_loop;
+
+
+/** \internal Specialization of the dense assignment kernel for triangular matrices.
+  * The main difference is that the triangular, diagonal, and opposite parts are processed through three different functions.
+  * \tparam UpLo must be either Lower or Upper
+  * \tparam Mode must be either 0, UnitDiag, ZeroDiag, or SelfAdjoint
+  */
+template<int UpLo, int Mode, int SetOpposite, typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version = Specialized>
+class triangular_dense_assignment_kernel : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version>
+{
+protected:
+  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version> Base;
+  typedef typename Base::DstXprType DstXprType;
+  typedef typename Base::SrcXprType SrcXprType;
+  using Base::m_dst;
+  using Base::m_src;
+  using Base::m_functor;
+public:
+
+  typedef typename Base::DstEvaluatorType DstEvaluatorType;
+  typedef typename Base::SrcEvaluatorType SrcEvaluatorType;
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::AssignmentTraits AssignmentTraits;
+
+
+  EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
+    : Base(dst, src, func, dstExpr)
+  {}
+
+#ifdef EIGEN_INTERNAL_DEBUGGING
+  EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
+  {
+    eigen_internal_assert(row!=col);
+    Base::assignCoeff(row,col);
+  }
+#else
+  using Base::assignCoeff;
+#endif
+
+  EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id)
+  {
+         if(Mode==UnitDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(1));
+    else if(Mode==ZeroDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(0));
+    else if(Mode==0)                       Base::assignCoeff(id,id);
+  }
+
+  EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index row, Index col)
+  {
+    eigen_internal_assert(row!=col);
+    if(SetOpposite)
+      m_functor.assignCoeff(m_dst.coeffRef(row,col), Scalar(0));
+  }
+};
+
+template<int Mode, bool SetOpposite, typename DstXprType, typename SrcXprType, typename Functor>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_triangular_assignment_loop(DstXprType& dst, const SrcXprType& src, const Functor &func)
+{
+  typedef evaluator<DstXprType> DstEvaluatorType;
+  typedef evaluator<SrcXprType> SrcEvaluatorType;
+
+  SrcEvaluatorType srcEvaluator(src);
+
+  Index dstRows = src.rows();
+  Index dstCols = src.cols();
+  if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+    dst.resize(dstRows, dstCols);
+  DstEvaluatorType dstEvaluator(dst);
+
+  typedef triangular_dense_assignment_kernel< Mode&(Lower|Upper),Mode&(UnitDiag|ZeroDiag|SelfAdjoint),SetOpposite,
+                                              DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
+  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
+
+  enum {
+      unroll = DstXprType::SizeAtCompileTime != Dynamic
+            && SrcEvaluatorType::CoeffReadCost < HugeCost
+            && DstXprType::SizeAtCompileTime * (int(DstEvaluatorType::CoeffReadCost) + int(SrcEvaluatorType::CoeffReadCost)) / 2 <= EIGEN_UNROLLING_LIMIT
+    };
+
+  triangular_assignment_loop<Kernel, Mode, unroll ? int(DstXprType::SizeAtCompileTime) : Dynamic, SetOpposite>::run(kernel);
+}
+
+template<int Mode, bool SetOpposite, typename DstXprType, typename SrcXprType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_triangular_assignment_loop(DstXprType& dst, const SrcXprType& src)
+{
+  call_triangular_assignment_loop<Mode,SetOpposite>(dst, src, internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
+}
+
+template<> struct AssignmentKind<TriangularShape,TriangularShape> { typedef Triangular2Triangular Kind; };
+template<> struct AssignmentKind<DenseShape,TriangularShape>      { typedef Triangular2Dense      Kind; };
+template<> struct AssignmentKind<TriangularShape,DenseShape>      { typedef Dense2Triangular      Kind; };
+
+
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Triangular>
+{
+  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  {
+    eigen_assert(int(DstXprType::Mode) == int(SrcXprType::Mode));
+
+    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);
+  }
+};
+
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Dense>
+{
+  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  {
+    call_triangular_assignment_loop<SrcXprType::Mode, (int(SrcXprType::Mode) & int(SelfAdjoint)) == 0>(dst, src, func);
+  }
+};
+
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Dense2Triangular>
+{
+  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  {
+    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);
+  }
+};
+
+
+template<typename Kernel, unsigned int Mode, int UnrollCount, bool SetOpposite>
+struct triangular_assignment_loop
+{
+  // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
+  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
+  typedef typename DstEvaluatorType::XprType DstXprType;
+
+  enum {
+    col = (UnrollCount-1) / DstXprType::RowsAtCompileTime,
+    row = (UnrollCount-1) % DstXprType::RowsAtCompileTime
+  };
+
+  typedef typename Kernel::Scalar Scalar;
+
+  EIGEN_DEVICE_FUNC
+  static inline void run(Kernel &kernel)
+  {
+    triangular_assignment_loop<Kernel, Mode, UnrollCount-1, SetOpposite>::run(kernel);
+
+    if(row==col)
+      kernel.assignDiagonalCoeff(row);
+    else if( ((Mode&Lower) && row>col) || ((Mode&Upper) && row<col) )
+      kernel.assignCoeff(row,col);
+    else if(SetOpposite)
+      kernel.assignOppositeCoeff(row,col);
+  }
+};
+
+// prevent buggy user code from causing an infinite recursion
+template<typename Kernel, unsigned int Mode, bool SetOpposite>
+struct triangular_assignment_loop<Kernel, Mode, 0, SetOpposite>
+{
+  EIGEN_DEVICE_FUNC
+  static inline void run(Kernel &) {}
+};
+
+
+
+// TODO: experiment with a recursive assignment procedure splitting the current
+//       triangular part into one rectangular and two triangular parts.
+
+
+template<typename Kernel, unsigned int Mode, bool SetOpposite>
+struct triangular_assignment_loop<Kernel, Mode, Dynamic, SetOpposite>
+{
+  typedef typename Kernel::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
+  static inline void run(Kernel &kernel)
+  {
+    for(Index j = 0; j < kernel.cols(); ++j)
+    {
+      Index maxi = numext::mini(j, kernel.rows());
+      Index i = 0;
+      if (((Mode&Lower) && SetOpposite) || (Mode&Upper))
+      {
+        for(; i < maxi; ++i)
+          if(Mode&Upper) kernel.assignCoeff(i, j);
+          else           kernel.assignOppositeCoeff(i, j);
+      }
+      else
+        i = maxi;
+
+      if(i<kernel.rows()) // then i==j
+        kernel.assignDiagonalCoeff(i++);
+
+      if (((Mode&Upper) && SetOpposite) || (Mode&Lower))
+      {
+        for(; i < kernel.rows(); ++i)
+          if(Mode&Lower) kernel.assignCoeff(i, j);
+          else           kernel.assignOppositeCoeff(i, j);
+      }
+    }
+  }
+};
+
+} // end namespace internal
+
+/** Assigns a triangular or selfadjoint matrix to a dense matrix.
+  * If the matrix is triangular, the opposite part is set to zero. */
+template<typename Derived>
+template<typename DenseDerived>
+EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
+{
+  other.derived().resize(this->rows(), this->cols());
+  internal::call_triangular_assignment_loop<Derived::Mode, (int(Derived::Mode) & int(SelfAdjoint)) == 0 /* SetOpposite */>(other.derived(), derived().nestedExpression());
+}
+
+namespace internal {
+
+// Triangular = Product
+template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
+{
+  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,typename SrcXprType::Scalar> &)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
+
+    dst._assignProduct(src, Scalar(1), false);
+  }
+};
+
+// Triangular += Product
+template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::add_assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
+{
+  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,typename SrcXprType::Scalar> &)
+  {
+    dst._assignProduct(src, Scalar(1), true);
+  }
+};
+
+// Triangular -= Product
+template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::sub_assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
+{
+  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,typename SrcXprType::Scalar> &)
+  {
+    dst._assignProduct(src, Scalar(-1), true);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULARMATRIX_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/VectorBlock.h b/src/3rdparty/eigen/Eigen/src/Core/VectorBlock.h
new file mode 100644
index 000000000..71c5b95ee
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/VectorBlock.h
@@ -0,0 +1,96 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_VECTORBLOCK_H
+#define EIGEN_VECTORBLOCK_H
+
+namespace Eigen { 
+
+namespace internal {
+template<typename VectorType, int Size>
+struct traits<VectorBlock<VectorType, Size> >
+  : public traits<Block<VectorType,
+                     traits<VectorType>::Flags & RowMajorBit ? 1 : Size,
+                     traits<VectorType>::Flags & RowMajorBit ? Size : 1> >
+{
+};
+}
+
+/** \class VectorBlock
+  * \ingroup Core_Module
+  *
+  * \brief Expression of a fixed-size or dynamic-size sub-vector
+  *
+  * \tparam VectorType the type of the object in which we are taking a sub-vector
+  * \tparam Size size of the sub-vector we are taking at compile time (optional)
+  *
+  * This class represents an expression of either a fixed-size or dynamic-size sub-vector.
+  * It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment<int>(Index) and
+  * most of the time this is the only way it is used.
+  *
+  * However, if you want to directly manipulate sub-vector expressions,
+  * for instance if you want to write a function returning such an expression, you
+  * will need to use this class.
+  *
+  * Here is an example illustrating the dynamic case:
+  * \include class_VectorBlock.cpp
+  * Output: \verbinclude class_VectorBlock.out
+  *
+  * \note Even though this expression has dynamic size, in the case where \a VectorType
+  * has fixed size, this expression inherits a fixed maximal size which means that evaluating
+  * it does not cause a dynamic memory allocation.
+  *
+  * Here is an example illustrating the fixed-size case:
+  * \include class_FixedVectorBlock.cpp
+  * Output: \verbinclude class_FixedVectorBlock.out
+  *
+  * \sa class Block, DenseBase::segment(Index,Index,Index,Index), DenseBase::segment(Index,Index)
+  */
+template<typename VectorType, int Size> class VectorBlock
+  : public Block<VectorType,
+                     internal::traits<VectorType>::Flags & RowMajorBit ? 1 : Size,
+                     internal::traits<VectorType>::Flags & RowMajorBit ? Size : 1>
+{
+    typedef Block<VectorType,
+                     internal::traits<VectorType>::Flags & RowMajorBit ? 1 : Size,
+                     internal::traits<VectorType>::Flags & RowMajorBit ? Size : 1> Base;
+    enum {
+      IsColVector = !(internal::traits<VectorType>::Flags & RowMajorBit)
+    };
+  public:
+    EIGEN_DENSE_PUBLIC_INTERFACE(VectorBlock)
+
+    using Base::operator=;
+
+    /** Dynamic-size constructor
+      */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    VectorBlock(VectorType& vector, Index start, Index size)
+      : Base(vector,
+             IsColVector ? start : 0, IsColVector ? 0 : start,
+             IsColVector ? size  : 1, IsColVector ? 1 : size)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorBlock);
+    }
+
+    /** Fixed-size constructor
+      */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    VectorBlock(VectorType& vector, Index start)
+      : Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorBlock);
+    }
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_VECTORBLOCK_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/VectorwiseOp.h b/src/3rdparty/eigen/Eigen/src/Core/VectorwiseOp.h
new file mode 100644
index 000000000..870f4f1e4
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/VectorwiseOp.h
@@ -0,0 +1,784 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PARTIAL_REDUX_H
+#define EIGEN_PARTIAL_REDUX_H
+
+namespace Eigen {
+
+/** \class PartialReduxExpr
+  * \ingroup Core_Module
+  *
+  * \brief Generic expression of a partially reduxed matrix
+  *
+  * \tparam MatrixType the type of the matrix we are applying the redux operation
+  * \tparam MemberOp type of the member functor
+  * \tparam Direction indicates the direction of the redux (#Vertical or #Horizontal)
+  *
+  * This class represents an expression of a partial redux operator of a matrix.
+  * It is the return type of some VectorwiseOp functions,
+  * and most of the time this is the only way it is used.
+  *
+  * \sa class VectorwiseOp
+  */
+
+template< typename MatrixType, typename MemberOp, int Direction>
+class PartialReduxExpr;
+
+namespace internal {
+template<typename MatrixType, typename MemberOp, int Direction>
+struct traits<PartialReduxExpr<MatrixType, MemberOp, Direction> >
+ : traits<MatrixType>
+{
+  typedef typename MemberOp::result_type Scalar;
+  typedef typename traits<MatrixType>::StorageKind StorageKind;
+  typedef typename traits<MatrixType>::XprKind XprKind;
+  typedef typename MatrixType::Scalar InputScalar;
+  enum {
+    RowsAtCompileTime = Direction==Vertical   ? 1 : MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = Direction==Horizontal ? 1 : MatrixType::ColsAtCompileTime,
+    MaxRowsAtCompileTime = Direction==Vertical   ? 1 : MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = Direction==Horizontal ? 1 : MatrixType::MaxColsAtCompileTime,
+    Flags = RowsAtCompileTime == 1 ? RowMajorBit : 0,
+    TraversalSize = Direction==Vertical ? MatrixType::RowsAtCompileTime :  MatrixType::ColsAtCompileTime
+  };
+};
+}
+
+template< typename MatrixType, typename MemberOp, int Direction>
+class PartialReduxExpr : public internal::dense_xpr_base< PartialReduxExpr<MatrixType, MemberOp, Direction> >::type,
+                         internal::no_assignment_operator
+{
+  public:
+
+    typedef typename internal::dense_xpr_base<PartialReduxExpr>::type Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(PartialReduxExpr)
+
+    EIGEN_DEVICE_FUNC
+    explicit PartialReduxExpr(const MatrixType& mat, const MemberOp& func = MemberOp())
+      : m_matrix(mat), m_functor(func) {}
+
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return (Direction==Vertical   ? 1 : m_matrix.rows()); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return (Direction==Horizontal ? 1 : m_matrix.cols()); }
+
+    EIGEN_DEVICE_FUNC
+    typename MatrixType::Nested nestedExpression() const { return m_matrix; }
+
+    EIGEN_DEVICE_FUNC
+    const MemberOp& functor() const { return m_functor; }
+
+  protected:
+    typename MatrixType::Nested m_matrix;
+    const MemberOp m_functor;
+};
+
+template<typename A,typename B> struct partial_redux_dummy_func;
+
+#define EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(MEMBER,COST,VECTORIZABLE,BINARYOP)                \
+  template <typename ResultType,typename Scalar>                                                            \
+  struct member_##MEMBER {                                                                  \
+    EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER)                                                \
+    typedef ResultType result_type;                                                         \
+    typedef BINARYOP<Scalar,Scalar> BinaryOp;   \
+    template<int Size> struct Cost { enum { value = COST }; };             \
+    enum { Vectorizable = VECTORIZABLE };                                                   \
+    template<typename XprType>                                                              \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                   \
+    ResultType operator()(const XprType& mat) const                                         \
+    { return mat.MEMBER(); }                                                                \
+    BinaryOp binaryFunc() const { return BinaryOp(); }                                      \
+  }
+
+#define EIGEN_MEMBER_FUNCTOR(MEMBER,COST) \
+  EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(MEMBER,COST,0,partial_redux_dummy_func)
+
+namespace internal {
+
+EIGEN_MEMBER_FUNCTOR(norm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
+EIGEN_MEMBER_FUNCTOR(stableNorm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
+EIGEN_MEMBER_FUNCTOR(blueNorm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
+EIGEN_MEMBER_FUNCTOR(hypotNorm, (Size-1) * functor_traits<scalar_hypot_op<Scalar> >::Cost );
+EIGEN_MEMBER_FUNCTOR(all, (Size-1)*NumTraits<Scalar>::AddCost);
+EIGEN_MEMBER_FUNCTOR(any, (Size-1)*NumTraits<Scalar>::AddCost);
+EIGEN_MEMBER_FUNCTOR(count, (Size-1)*NumTraits<Scalar>::AddCost);
+
+EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(sum, (Size-1)*NumTraits<Scalar>::AddCost, 1, internal::scalar_sum_op);
+EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(minCoeff, (Size-1)*NumTraits<Scalar>::AddCost, 1, internal::scalar_min_op);
+EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(maxCoeff, (Size-1)*NumTraits<Scalar>::AddCost, 1, internal::scalar_max_op);
+EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(prod, (Size-1)*NumTraits<Scalar>::MulCost, 1, internal::scalar_product_op);
+
+template <int p, typename ResultType,typename Scalar>
+struct member_lpnorm {
+  typedef ResultType result_type;
+  enum { Vectorizable = 0 };
+  template<int Size> struct Cost
+  { enum { value = (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost }; };
+  EIGEN_DEVICE_FUNC member_lpnorm() {}
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC inline ResultType operator()(const XprType& mat) const
+  { return mat.template lpNorm<p>(); }
+};
+
+template <typename BinaryOpT, typename Scalar>
+struct member_redux {
+  typedef BinaryOpT BinaryOp;
+  typedef typename result_of<
+                     BinaryOp(const Scalar&,const Scalar&)
+                   >::type  result_type;
+
+  enum { Vectorizable = functor_traits<BinaryOp>::PacketAccess };
+  template<int Size> struct Cost { enum { value = (Size-1) * functor_traits<BinaryOp>::Cost }; };
+  EIGEN_DEVICE_FUNC explicit member_redux(const BinaryOp func) : m_functor(func) {}
+  template<typename Derived>
+  EIGEN_DEVICE_FUNC inline result_type operator()(const DenseBase<Derived>& mat) const
+  { return mat.redux(m_functor); }
+  const BinaryOp& binaryFunc() const { return m_functor; }
+  const BinaryOp m_functor;
+};
+}
+
+/** \class VectorwiseOp
+  * \ingroup Core_Module
+  *
+  * \brief Pseudo expression providing broadcasting and partial reduction operations
+  *
+  * \tparam ExpressionType the type of the object on which to do partial reductions
+  * \tparam Direction indicates whether to operate on columns (#Vertical) or rows (#Horizontal)
+  *
+  * This class represents a pseudo expression with broadcasting and partial reduction features.
+  * It is the return type of DenseBase::colwise() and DenseBase::rowwise()
+  * and most of the time this is the only way it is explicitly used.
+  *
+  * To understand the logic of rowwise/colwise expression, let's consider a generic case `A.colwise().foo()`
+  * where `foo` is any method of `VectorwiseOp`. This expression is equivalent to applying `foo()` to each
+  * column of `A` and then re-assemble the outputs in a matrix expression:
+  * \code [A.col(0).foo(), A.col(1).foo(), ..., A.col(A.cols()-1).foo()] \endcode
+  *
+  * Example: \include MatrixBase_colwise.cpp
+  * Output: \verbinclude MatrixBase_colwise.out
+  *
+  * The begin() and end() methods are obviously exceptions to the previous rule as they
+  * return STL-compatible begin/end iterators to the rows or columns of the nested expression.
+  * Typical use cases include for-range-loop and calls to STL algorithms:
+  *
+  * Example: \include MatrixBase_colwise_iterator_cxx11.cpp
+  * Output: \verbinclude MatrixBase_colwise_iterator_cxx11.out
+  *
+  * For a partial reduction on an empty input, some rules apply.
+  * For the sake of clarity, let's consider a vertical reduction:
+  *   - If the number of columns is zero, then a 1x0 row-major vector expression is returned.
+  *   - Otherwise, if the number of rows is zero, then
+  *       - a row vector of zeros is returned for sum-like reductions (sum, squaredNorm, norm, etc.)
+  *       - a row vector of ones is returned for a product reduction (e.g., <code>MatrixXd(n,0).colwise().prod()</code>)
+  *       - an assert is triggered for all other reductions (minCoeff,maxCoeff,redux(bin_op))
+  *
+  * \sa DenseBase::colwise(), DenseBase::rowwise(), class PartialReduxExpr
+  */
+template<typename ExpressionType, int Direction> class VectorwiseOp
+{
+  public:
+
+    typedef typename ExpressionType::Scalar Scalar;
+    typedef typename ExpressionType::RealScalar RealScalar;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
+    typedef typename internal::ref_selector<ExpressionType>::non_const_type ExpressionTypeNested;
+    typedef typename internal::remove_all<ExpressionTypeNested>::type ExpressionTypeNestedCleaned;
+
+    template<template<typename OutScalar,typename InputScalar> class Functor,
+                      typename ReturnScalar=Scalar> struct ReturnType
+    {
+      typedef PartialReduxExpr<ExpressionType,
+                               Functor<ReturnScalar,Scalar>,
+                               Direction
+                              > Type;
+    };
+
+    template<typename BinaryOp> struct ReduxReturnType
+    {
+      typedef PartialReduxExpr<ExpressionType,
+                               internal::member_redux<BinaryOp,Scalar>,
+                               Direction
+                              > Type;
+    };
+
+    enum {
+      isVertical   = (Direction==Vertical) ? 1 : 0,
+      isHorizontal = (Direction==Horizontal) ? 1 : 0
+    };
+
+  protected:
+
+    template<typename OtherDerived> struct ExtendedType {
+      typedef Replicate<OtherDerived,
+                        isVertical   ? 1 : ExpressionType::RowsAtCompileTime,
+                        isHorizontal ? 1 : ExpressionType::ColsAtCompileTime> Type;
+    };
+
+    /** \internal
+      * Replicates a vector to match the size of \c *this */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    typename ExtendedType<OtherDerived>::Type
+    extendedTo(const DenseBase<OtherDerived>& other) const
+    {
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isVertical, OtherDerived::MaxColsAtCompileTime==1),
+                          YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED)
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isHorizontal, OtherDerived::MaxRowsAtCompileTime==1),
+                          YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED)
+      return typename ExtendedType<OtherDerived>::Type
+                      (other.derived(),
+                       isVertical   ? 1 : m_matrix.rows(),
+                       isHorizontal ? 1 : m_matrix.cols());
+    }
+
+    template<typename OtherDerived> struct OppositeExtendedType {
+      typedef Replicate<OtherDerived,
+                        isHorizontal ? 1 : ExpressionType::RowsAtCompileTime,
+                        isVertical   ? 1 : ExpressionType::ColsAtCompileTime> Type;
+    };
+
+    /** \internal
+      * Replicates a vector in the opposite direction to match the size of \c *this */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    typename OppositeExtendedType<OtherDerived>::Type
+    extendedToOpposite(const DenseBase<OtherDerived>& other) const
+    {
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isHorizontal, OtherDerived::MaxColsAtCompileTime==1),
+                          YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED)
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isVertical, OtherDerived::MaxRowsAtCompileTime==1),
+                          YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED)
+      return typename OppositeExtendedType<OtherDerived>::Type
+                      (other.derived(),
+                       isHorizontal  ? 1 : m_matrix.rows(),
+                       isVertical    ? 1 : m_matrix.cols());
+    }
+
+  public:
+    EIGEN_DEVICE_FUNC
+    explicit inline VectorwiseOp(ExpressionType& matrix) : m_matrix(matrix) {}
+
+    /** \internal */
+    EIGEN_DEVICE_FUNC
+    inline const ExpressionType& _expression() const { return m_matrix; }
+
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
+    /** STL-like <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">RandomAccessIterator</a>
+      * iterator type over the columns or rows as returned by the begin() and end() methods.
+      */
+    random_access_iterator_type iterator;
+    /** This is the const version of iterator (aka read-only) */
+    random_access_iterator_type const_iterator;
+    #else
+    typedef internal::subvector_stl_iterator<ExpressionType,               DirectionType(Direction)> iterator;
+    typedef internal::subvector_stl_iterator<const ExpressionType,         DirectionType(Direction)> const_iterator;
+    typedef internal::subvector_stl_reverse_iterator<ExpressionType,       DirectionType(Direction)> reverse_iterator;
+    typedef internal::subvector_stl_reverse_iterator<const ExpressionType, DirectionType(Direction)> const_reverse_iterator;
+    #endif
+
+    /** returns an iterator to the first row (rowwise) or column (colwise) of the nested expression.
+      * \sa end(), cbegin()
+      */
+    iterator                 begin()       { return iterator      (m_matrix, 0); }
+    /** const version of begin() */
+    const_iterator           begin() const { return const_iterator(m_matrix, 0); }
+    /** const version of begin() */
+    const_iterator          cbegin() const { return const_iterator(m_matrix, 0); }
+
+    /** returns a reverse iterator to the last row (rowwise) or column (colwise) of the nested expression.
+      * \sa rend(), crbegin()
+      */
+    reverse_iterator        rbegin()       { return reverse_iterator       (m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()-1); }
+	/** const version of rbegin() */
+    const_reverse_iterator  rbegin() const { return const_reverse_iterator (m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()-1); }
+	/** const version of rbegin() */
+	const_reverse_iterator crbegin() const { return const_reverse_iterator (m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()-1); }
+
+    /** returns an iterator to the row (resp. column) following the last row (resp. column) of the nested expression
+      * \sa begin(), cend()
+      */
+    iterator                 end()         { return iterator      (m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()); }
+    /** const version of end() */
+    const_iterator           end()  const  { return const_iterator(m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()); }
+    /** const version of end() */
+    const_iterator          cend()  const  { return const_iterator(m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()); }
+
+    /** returns a reverse iterator to the row (resp. column) before the first row (resp. column) of the nested expression
+      * \sa begin(), cend()
+      */
+    reverse_iterator        rend()         { return reverse_iterator       (m_matrix, -1); }
+    /** const version of rend() */
+    const_reverse_iterator  rend()  const  { return const_reverse_iterator (m_matrix, -1); }
+    /** const version of rend() */
+    const_reverse_iterator crend()  const  { return const_reverse_iterator (m_matrix, -1); }
+
+    /** \returns a row or column vector expression of \c *this reduxed by \a func
+      *
+      * The template parameter \a BinaryOp is the type of the functor
+      * of the custom redux operator. Note that func must be an associative operator.
+      *
+      * \warning the size along the reduction direction must be strictly positive,
+      *          otherwise an assertion is triggered.
+      *
+      * \sa class VectorwiseOp, DenseBase::colwise(), DenseBase::rowwise()
+      */
+    template<typename BinaryOp>
+    EIGEN_DEVICE_FUNC
+    const typename ReduxReturnType<BinaryOp>::Type
+    redux(const BinaryOp& func = BinaryOp()) const
+    {
+      eigen_assert(redux_length()>0 && "you are using an empty matrix");
+      return typename ReduxReturnType<BinaryOp>::Type(_expression(), internal::member_redux<BinaryOp,Scalar>(func));
+    }
+
+    typedef typename ReturnType<internal::member_minCoeff>::Type MinCoeffReturnType;
+    typedef typename ReturnType<internal::member_maxCoeff>::Type MaxCoeffReturnType;
+    typedef PartialReduxExpr<const CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const ExpressionTypeNestedCleaned>,internal::member_sum<RealScalar,RealScalar>,Direction> SquaredNormReturnType;
+    typedef CwiseUnaryOp<internal::scalar_sqrt_op<RealScalar>, const SquaredNormReturnType> NormReturnType;
+    typedef typename ReturnType<internal::member_blueNorm,RealScalar>::Type BlueNormReturnType;
+    typedef typename ReturnType<internal::member_stableNorm,RealScalar>::Type StableNormReturnType;
+    typedef typename ReturnType<internal::member_hypotNorm,RealScalar>::Type HypotNormReturnType;
+    typedef typename ReturnType<internal::member_sum>::Type SumReturnType;
+    typedef EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(SumReturnType,Scalar,quotient) MeanReturnType;
+    typedef typename ReturnType<internal::member_all>::Type AllReturnType;
+    typedef typename ReturnType<internal::member_any>::Type AnyReturnType;
+    typedef PartialReduxExpr<ExpressionType, internal::member_count<Index,Scalar>, Direction> CountReturnType;
+    typedef typename ReturnType<internal::member_prod>::Type ProdReturnType;
+    typedef Reverse<const ExpressionType, Direction> ConstReverseReturnType;
+    typedef Reverse<ExpressionType, Direction> ReverseReturnType;
+
+    template<int p> struct LpNormReturnType {
+      typedef PartialReduxExpr<ExpressionType, internal::member_lpnorm<p,RealScalar,Scalar>,Direction> Type;
+    };
+
+    /** \returns a row (or column) vector expression of the smallest coefficient
+      * of each column (or row) of the referenced expression.
+      *
+      * \warning the size along the reduction direction must be strictly positive,
+      *          otherwise an assertion is triggered.
+      *
+      * \warning the result is undefined if \c *this contains NaN.
+      *
+      * Example: \include PartialRedux_minCoeff.cpp
+      * Output: \verbinclude PartialRedux_minCoeff.out
+      *
+      * \sa DenseBase::minCoeff() */
+    EIGEN_DEVICE_FUNC
+    const MinCoeffReturnType minCoeff() const
+    {
+      eigen_assert(redux_length()>0 && "you are using an empty matrix");
+      return MinCoeffReturnType(_expression());
+    }
+
+    /** \returns a row (or column) vector expression of the largest coefficient
+      * of each column (or row) of the referenced expression.
+      *
+      * \warning the size along the reduction direction must be strictly positive,
+      *          otherwise an assertion is triggered.
+      *
+      * \warning the result is undefined if \c *this contains NaN.
+      *
+      * Example: \include PartialRedux_maxCoeff.cpp
+      * Output: \verbinclude PartialRedux_maxCoeff.out
+      *
+      * \sa DenseBase::maxCoeff() */
+    EIGEN_DEVICE_FUNC
+    const MaxCoeffReturnType maxCoeff() const
+    {
+      eigen_assert(redux_length()>0 && "you are using an empty matrix");
+      return MaxCoeffReturnType(_expression());
+    }
+
+    /** \returns a row (or column) vector expression of the squared norm
+      * of each column (or row) of the referenced expression.
+      * This is a vector with real entries, even if the original matrix has complex entries.
+      *
+      * Example: \include PartialRedux_squaredNorm.cpp
+      * Output: \verbinclude PartialRedux_squaredNorm.out
+      *
+      * \sa DenseBase::squaredNorm() */
+    EIGEN_DEVICE_FUNC
+    const SquaredNormReturnType squaredNorm() const
+    { return SquaredNormReturnType(m_matrix.cwiseAbs2()); }
+
+    /** \returns a row (or column) vector expression of the norm
+      * of each column (or row) of the referenced expression.
+      * This is a vector with real entries, even if the original matrix has complex entries.
+      *
+      * Example: \include PartialRedux_norm.cpp
+      * Output: \verbinclude PartialRedux_norm.out
+      *
+      * \sa DenseBase::norm() */
+    EIGEN_DEVICE_FUNC
+    const NormReturnType norm() const
+    { return NormReturnType(squaredNorm()); }
+
+    /** \returns a row (or column) vector expression of the norm
+      * of each column (or row) of the referenced expression.
+      * This is a vector with real entries, even if the original matrix has complex entries.
+      *
+      * Example: \include PartialRedux_norm.cpp
+      * Output: \verbinclude PartialRedux_norm.out
+      *
+      * \sa DenseBase::norm() */
+    template<int p>
+    EIGEN_DEVICE_FUNC
+    const typename LpNormReturnType<p>::Type lpNorm() const
+    { return typename LpNormReturnType<p>::Type(_expression()); }
+
+
+    /** \returns a row (or column) vector expression of the norm
+      * of each column (or row) of the referenced expression, using
+      * Blue's algorithm.
+      * This is a vector with real entries, even if the original matrix has complex entries.
+      *
+      * \sa DenseBase::blueNorm() */
+    EIGEN_DEVICE_FUNC
+    const BlueNormReturnType blueNorm() const
+    { return BlueNormReturnType(_expression()); }
+
+
+    /** \returns a row (or column) vector expression of the norm
+      * of each column (or row) of the referenced expression, avoiding
+      * underflow and overflow.
+      * This is a vector with real entries, even if the original matrix has complex entries.
+      *
+      * \sa DenseBase::stableNorm() */
+    EIGEN_DEVICE_FUNC
+    const StableNormReturnType stableNorm() const
+    { return StableNormReturnType(_expression()); }
+
+
+    /** \returns a row (or column) vector expression of the norm
+      * of each column (or row) of the referenced expression, avoiding
+      * underflow and overflow using a concatenation of hypot() calls.
+      * This is a vector with real entries, even if the original matrix has complex entries.
+      *
+      * \sa DenseBase::hypotNorm() */
+    EIGEN_DEVICE_FUNC
+    const HypotNormReturnType hypotNorm() const
+    { return HypotNormReturnType(_expression()); }
+
+    /** \returns a row (or column) vector expression of the sum
+      * of each column (or row) of the referenced expression.
+      *
+      * Example: \include PartialRedux_sum.cpp
+      * Output: \verbinclude PartialRedux_sum.out
+      *
+      * \sa DenseBase::sum() */
+    EIGEN_DEVICE_FUNC
+    const SumReturnType sum() const
+    { return SumReturnType(_expression()); }
+
+    /** \returns a row (or column) vector expression of the mean
+    * of each column (or row) of the referenced expression.
+    *
+    * \sa DenseBase::mean() */
+    EIGEN_DEVICE_FUNC
+    const MeanReturnType mean() const
+    { return sum() / Scalar(Direction==Vertical?m_matrix.rows():m_matrix.cols()); }
+
+    /** \returns a row (or column) vector expression representing
+      * whether \b all coefficients of each respective column (or row) are \c true.
+      * This expression can be assigned to a vector with entries of type \c bool.
+      *
+      * \sa DenseBase::all() */
+    EIGEN_DEVICE_FUNC
+    const AllReturnType all() const
+    { return AllReturnType(_expression()); }
+
+    /** \returns a row (or column) vector expression representing
+      * whether \b at \b least one coefficient of each respective column (or row) is \c true.
+      * This expression can be assigned to a vector with entries of type \c bool.
+      *
+      * \sa DenseBase::any() */
+    EIGEN_DEVICE_FUNC
+    const AnyReturnType any() const
+    { return AnyReturnType(_expression()); }
+
+    /** \returns a row (or column) vector expression representing
+      * the number of \c true coefficients of each respective column (or row).
+      * This expression can be assigned to a vector whose entries have the same type as is used to
+      * index entries of the original matrix; for dense matrices, this is \c std::ptrdiff_t .
+      *
+      * Example: \include PartialRedux_count.cpp
+      * Output: \verbinclude PartialRedux_count.out
+      *
+      * \sa DenseBase::count() */
+    EIGEN_DEVICE_FUNC
+    const CountReturnType count() const
+    { return CountReturnType(_expression()); }
+
+    /** \returns a row (or column) vector expression of the product
+      * of each column (or row) of the referenced expression.
+      *
+      * Example: \include PartialRedux_prod.cpp
+      * Output: \verbinclude PartialRedux_prod.out
+      *
+      * \sa DenseBase::prod() */
+    EIGEN_DEVICE_FUNC
+    const ProdReturnType prod() const
+    { return ProdReturnType(_expression()); }
+
+
+    /** \returns a matrix expression
+      * where each column (or row) are reversed.
+      *
+      * Example: \include Vectorwise_reverse.cpp
+      * Output: \verbinclude Vectorwise_reverse.out
+      *
+      * \sa DenseBase::reverse() */
+    EIGEN_DEVICE_FUNC
+    const ConstReverseReturnType reverse() const
+    { return ConstReverseReturnType( _expression() ); }
+
+    /** \returns a writable matrix expression
+      * where each column (or row) are reversed.
+      *
+      * \sa reverse() const */
+    EIGEN_DEVICE_FUNC
+    ReverseReturnType reverse()
+    { return ReverseReturnType( _expression() ); }
+
+    typedef Replicate<ExpressionType,(isVertical?Dynamic:1),(isHorizontal?Dynamic:1)> ReplicateReturnType;
+    EIGEN_DEVICE_FUNC
+    const ReplicateReturnType replicate(Index factor) const;
+
+    /**
+      * \return an expression of the replication of each column (or row) of \c *this
+      *
+      * Example: \include DirectionWise_replicate.cpp
+      * Output: \verbinclude DirectionWise_replicate.out
+      *
+      * \sa VectorwiseOp::replicate(Index), DenseBase::replicate(), class Replicate
+      */
+    // NOTE implemented here because of sunstudio's compilation errors
+    // isVertical*Factor+isHorizontal instead of (isVertical?Factor:1) to handle CUDA bug with ternary operator
+    template<int Factor> const Replicate<ExpressionType,isVertical*Factor+isHorizontal,isHorizontal*Factor+isVertical>
+    EIGEN_DEVICE_FUNC
+    replicate(Index factor = Factor) const
+    {
+      return Replicate<ExpressionType,(isVertical?Factor:1),(isHorizontal?Factor:1)>
+          (_expression(),isVertical?factor:1,isHorizontal?factor:1);
+    }
+
+/////////// Artithmetic operators ///////////
+
+    /** Copies the vector \a other to each subvector of \c *this */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    ExpressionType& operator=(const DenseBase<OtherDerived>& other)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+      //eigen_assert((m_matrix.isNull()) == (other.isNull())); FIXME
+      return m_matrix = extendedTo(other.derived());
+    }
+
+    /** Adds the vector \a other to each subvector of \c *this */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    ExpressionType& operator+=(const DenseBase<OtherDerived>& other)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+      return m_matrix += extendedTo(other.derived());
+    }
+
+    /** Substracts the vector \a other to each subvector of \c *this */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    ExpressionType& operator-=(const DenseBase<OtherDerived>& other)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+      return m_matrix -= extendedTo(other.derived());
+    }
+
+    /** Multiples each subvector of \c *this by the vector \a other */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    ExpressionType& operator*=(const DenseBase<OtherDerived>& other)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+      EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
+      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+      m_matrix *= extendedTo(other.derived());
+      return m_matrix;
+    }
+
+    /** Divides each subvector of \c *this by the vector \a other */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    ExpressionType& operator/=(const DenseBase<OtherDerived>& other)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+      EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
+      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+      m_matrix /= extendedTo(other.derived());
+      return m_matrix;
+    }
+
+    /** Returns the expression of the sum of the vector \a other to each subvector of \c *this */
+    template<typename OtherDerived> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+    CwiseBinaryOp<internal::scalar_sum_op<Scalar,typename OtherDerived::Scalar>,
+                  const ExpressionTypeNestedCleaned,
+                  const typename ExtendedType<OtherDerived>::Type>
+    operator+(const DenseBase<OtherDerived>& other) const
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+      return m_matrix + extendedTo(other.derived());
+    }
+
+    /** Returns the expression of the difference between each subvector of \c *this and the vector \a other */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    CwiseBinaryOp<internal::scalar_difference_op<Scalar,typename OtherDerived::Scalar>,
+                  const ExpressionTypeNestedCleaned,
+                  const typename ExtendedType<OtherDerived>::Type>
+    operator-(const DenseBase<OtherDerived>& other) const
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+      return m_matrix - extendedTo(other.derived());
+    }
+
+    /** Returns the expression where each subvector is the product of the vector \a other
+      * by the corresponding subvector of \c *this */
+    template<typename OtherDerived> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+    CwiseBinaryOp<internal::scalar_product_op<Scalar>,
+                  const ExpressionTypeNestedCleaned,
+                  const typename ExtendedType<OtherDerived>::Type>
+    EIGEN_DEVICE_FUNC
+    operator*(const DenseBase<OtherDerived>& other) const
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+      EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
+      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+      return m_matrix * extendedTo(other.derived());
+    }
+
+    /** Returns the expression where each subvector is the quotient of the corresponding
+      * subvector of \c *this by the vector \a other */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    CwiseBinaryOp<internal::scalar_quotient_op<Scalar>,
+                  const ExpressionTypeNestedCleaned,
+                  const typename ExtendedType<OtherDerived>::Type>
+    operator/(const DenseBase<OtherDerived>& other) const
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+      EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
+      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+      return m_matrix / extendedTo(other.derived());
+    }
+
+    /** \returns an expression where each column (or row) of the referenced matrix are normalized.
+      * The referenced matrix is \b not modified.
+      * \sa MatrixBase::normalized(), normalize()
+      */
+    EIGEN_DEVICE_FUNC
+    CwiseBinaryOp<internal::scalar_quotient_op<Scalar>,
+                  const ExpressionTypeNestedCleaned,
+                  const typename OppositeExtendedType<NormReturnType>::Type>
+    normalized() const { return m_matrix.cwiseQuotient(extendedToOpposite(this->norm())); }
+
+
+    /** Normalize in-place each row or columns of the referenced matrix.
+      * \sa MatrixBase::normalize(), normalized()
+      */
+    EIGEN_DEVICE_FUNC void normalize() {
+      m_matrix = this->normalized();
+    }
+
+    EIGEN_DEVICE_FUNC inline void reverseInPlace();
+
+/////////// Geometry module ///////////
+
+    typedef Homogeneous<ExpressionType,Direction> HomogeneousReturnType;
+    EIGEN_DEVICE_FUNC
+    HomogeneousReturnType homogeneous() const;
+
+    typedef typename ExpressionType::PlainObject CrossReturnType;
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    const CrossReturnType cross(const MatrixBase<OtherDerived>& other) const;
+
+    enum {
+      HNormalized_Size = Direction==Vertical ? internal::traits<ExpressionType>::RowsAtCompileTime
+                                             : internal::traits<ExpressionType>::ColsAtCompileTime,
+      HNormalized_SizeMinusOne = HNormalized_Size==Dynamic ? Dynamic : HNormalized_Size-1
+    };
+    typedef Block<const ExpressionType,
+                  Direction==Vertical   ? int(HNormalized_SizeMinusOne)
+                                        : int(internal::traits<ExpressionType>::RowsAtCompileTime),
+                  Direction==Horizontal ? int(HNormalized_SizeMinusOne)
+                                        : int(internal::traits<ExpressionType>::ColsAtCompileTime)>
+            HNormalized_Block;
+    typedef Block<const ExpressionType,
+                  Direction==Vertical   ? 1 : int(internal::traits<ExpressionType>::RowsAtCompileTime),
+                  Direction==Horizontal ? 1 : int(internal::traits<ExpressionType>::ColsAtCompileTime)>
+            HNormalized_Factors;
+    typedef CwiseBinaryOp<internal::scalar_quotient_op<typename internal::traits<ExpressionType>::Scalar>,
+                const HNormalized_Block,
+                const Replicate<HNormalized_Factors,
+                  Direction==Vertical   ? HNormalized_SizeMinusOne : 1,
+                  Direction==Horizontal ? HNormalized_SizeMinusOne : 1> >
+            HNormalizedReturnType;
+
+    EIGEN_DEVICE_FUNC
+    const HNormalizedReturnType hnormalized() const;
+
+#   ifdef EIGEN_VECTORWISEOP_PLUGIN
+#     include EIGEN_VECTORWISEOP_PLUGIN
+#   endif
+
+  protected:
+    Index redux_length() const
+    {
+      return Direction==Vertical ? m_matrix.rows() : m_matrix.cols();
+    }
+    ExpressionTypeNested m_matrix;
+};
+
+//const colwise moved to DenseBase.h due to CUDA compiler bug
+
+
+/** \returns a writable VectorwiseOp wrapper of *this providing additional partial reduction operations
+  *
+  * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ColwiseReturnType
+DenseBase<Derived>::colwise()
+{
+  return ColwiseReturnType(derived());
+}
+
+//const rowwise moved to DenseBase.h due to CUDA compiler bug
+
+
+/** \returns a writable VectorwiseOp wrapper of *this providing additional partial reduction operations
+  *
+  * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::RowwiseReturnType
+DenseBase<Derived>::rowwise()
+{
+  return RowwiseReturnType(derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_PARTIAL_REDUX_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/Visitor.h b/src/3rdparty/eigen/Eigen/src/Core/Visitor.h
new file mode 100644
index 000000000..00bcca877
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/Visitor.h
@@ -0,0 +1,381 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_VISITOR_H
+#define EIGEN_VISITOR_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename Visitor, typename Derived, int UnrollCount>
+struct visitor_impl
+{
+  enum {
+    col = (UnrollCount-1) / Derived::RowsAtCompileTime,
+    row = (UnrollCount-1) % Derived::RowsAtCompileTime
+  };
+
+  EIGEN_DEVICE_FUNC
+  static inline void run(const Derived &mat, Visitor& visitor)
+  {
+    visitor_impl<Visitor, Derived, UnrollCount-1>::run(mat, visitor);
+    visitor(mat.coeff(row, col), row, col);
+  }
+};
+
+template<typename Visitor, typename Derived>
+struct visitor_impl<Visitor, Derived, 1>
+{
+  EIGEN_DEVICE_FUNC
+  static inline void run(const Derived &mat, Visitor& visitor)
+  {
+    return visitor.init(mat.coeff(0, 0), 0, 0);
+  }
+};
+
+// This specialization enables visitors on empty matrices at compile-time
+template<typename Visitor, typename Derived>
+struct visitor_impl<Visitor, Derived, 0> {
+  EIGEN_DEVICE_FUNC
+  static inline void run(const Derived &/*mat*/, Visitor& /*visitor*/)
+  {}
+};
+
+template<typename Visitor, typename Derived>
+struct visitor_impl<Visitor, Derived, Dynamic>
+{
+  EIGEN_DEVICE_FUNC
+  static inline void run(const Derived& mat, Visitor& visitor)
+  {
+    visitor.init(mat.coeff(0,0), 0, 0);
+    for(Index i = 1; i < mat.rows(); ++i)
+      visitor(mat.coeff(i, 0), i, 0);
+    for(Index j = 1; j < mat.cols(); ++j)
+      for(Index i = 0; i < mat.rows(); ++i)
+        visitor(mat.coeff(i, j), i, j);
+  }
+};
+
+// evaluator adaptor
+template<typename XprType>
+class visitor_evaluator
+{
+public:
+  EIGEN_DEVICE_FUNC
+  explicit visitor_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  enum {
+    RowsAtCompileTime = XprType::RowsAtCompileTime,
+    CoeffReadCost = internal::evaluator<XprType>::CoeffReadCost
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_xpr.size(); }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+  { return m_evaluator.coeff(row, col); }
+
+protected:
+  internal::evaluator<XprType> m_evaluator;
+  const XprType &m_xpr;
+};
+} // end namespace internal
+
+/** Applies the visitor \a visitor to the whole coefficients of the matrix or vector.
+  *
+  * The template parameter \a Visitor is the type of the visitor and provides the following interface:
+  * \code
+  * struct MyVisitor {
+  *   // called for the first coefficient
+  *   void init(const Scalar& value, Index i, Index j);
+  *   // called for all other coefficients
+  *   void operator() (const Scalar& value, Index i, Index j);
+  * };
+  * \endcode
+  *
+  * \note compared to one or two \em for \em loops, visitors offer automatic
+  * unrolling for small fixed size matrix.
+  *
+  * \note if the matrix is empty, then the visitor is left unchanged.
+  *
+  * \sa minCoeff(Index*,Index*), maxCoeff(Index*,Index*), DenseBase::redux()
+  */
+template<typename Derived>
+template<typename Visitor>
+EIGEN_DEVICE_FUNC
+void DenseBase<Derived>::visit(Visitor& visitor) const
+{
+  if(size()==0)
+    return;
+
+  typedef typename internal::visitor_evaluator<Derived> ThisEvaluator;
+  ThisEvaluator thisEval(derived());
+
+  enum {
+    unroll =  SizeAtCompileTime != Dynamic
+           && SizeAtCompileTime * int(ThisEvaluator::CoeffReadCost) + (SizeAtCompileTime-1) * int(internal::functor_traits<Visitor>::Cost) <= EIGEN_UNROLLING_LIMIT
+  };
+  return internal::visitor_impl<Visitor, ThisEvaluator, unroll ? int(SizeAtCompileTime) : Dynamic>::run(thisEval, visitor);
+}
+
+namespace internal {
+
+/** \internal
+  * \brief Base class to implement min and max visitors
+  */
+template <typename Derived>
+struct coeff_visitor
+{
+  // default initialization to avoid countless invalid maybe-uninitialized warnings by gcc
+  EIGEN_DEVICE_FUNC
+  coeff_visitor() : row(-1), col(-1), res(0) {}
+  typedef typename Derived::Scalar Scalar;
+  Index row, col;
+  Scalar res;
+  EIGEN_DEVICE_FUNC
+  inline void init(const Scalar& value, Index i, Index j)
+  {
+    res = value;
+    row = i;
+    col = j;
+  }
+};
+
+/** \internal
+  * \brief Visitor computing the min coefficient with its value and coordinates
+  *
+  * \sa DenseBase::minCoeff(Index*, Index*)
+  */
+template <typename Derived, int NaNPropagation>
+struct min_coeff_visitor : coeff_visitor<Derived>
+{
+  typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
+  void operator() (const Scalar& value, Index i, Index j)
+  {
+    if(value < this->res)
+    {
+      this->res = value;
+      this->row = i;
+      this->col = j;
+    }
+  }
+};
+
+template <typename Derived>
+struct min_coeff_visitor<Derived, PropagateNumbers> : coeff_visitor<Derived>
+{
+  typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
+  void operator() (const Scalar& value, Index i, Index j)
+  {
+    if((numext::isnan)(this->res) || (!(numext::isnan)(value) && value < this->res))
+    {
+      this->res = value;
+      this->row = i;
+      this->col = j;
+    }
+  }
+};
+
+template <typename Derived>
+struct min_coeff_visitor<Derived, PropagateNaN> : coeff_visitor<Derived>
+{
+  typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
+  void operator() (const Scalar& value, Index i, Index j)
+  {
+    if((numext::isnan)(value) || value < this->res)
+    {
+      this->res = value;
+      this->row = i;
+      this->col = j;
+    }
+  }
+};
+
+template<typename Scalar, int NaNPropagation>
+    struct functor_traits<min_coeff_visitor<Scalar, NaNPropagation> > {
+  enum {
+    Cost = NumTraits<Scalar>::AddCost
+  };
+};
+
+/** \internal
+  * \brief Visitor computing the max coefficient with its value and coordinates
+  *
+  * \sa DenseBase::maxCoeff(Index*, Index*)
+  */
+template <typename Derived, int NaNPropagation>
+struct max_coeff_visitor : coeff_visitor<Derived>
+{
+  typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
+  void operator() (const Scalar& value, Index i, Index j)
+  {
+    if(value > this->res)
+    {
+      this->res = value;
+      this->row = i;
+      this->col = j;
+    }
+  }
+};
+
+template <typename Derived>
+struct max_coeff_visitor<Derived, PropagateNumbers> : coeff_visitor<Derived>
+{
+  typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
+  void operator() (const Scalar& value, Index i, Index j)
+  {
+    if((numext::isnan)(this->res) || (!(numext::isnan)(value) && value > this->res))
+    {
+      this->res = value;
+      this->row = i;
+      this->col = j;
+    }
+  }
+};
+
+template <typename Derived>
+struct max_coeff_visitor<Derived, PropagateNaN> : coeff_visitor<Derived>
+{
+  typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
+  void operator() (const Scalar& value, Index i, Index j)
+  {
+    if((numext::isnan)(value) || value > this->res)
+    {
+      this->res = value;
+      this->row = i;
+      this->col = j;
+    }
+  }
+};
+
+template<typename Scalar, int NaNPropagation>
+struct functor_traits<max_coeff_visitor<Scalar, NaNPropagation> > {
+  enum {
+    Cost = NumTraits<Scalar>::AddCost
+  };
+};
+
+} // end namespace internal
+
+/** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
+  * \returns the minimum of all coefficients of *this and puts in *row and *col its location.
+  *
+  * In case \c *this contains NaN, NaNPropagation determines the behavior:
+  *   NaNPropagation == PropagateFast : undefined
+  *   NaNPropagation == PropagateNaN : result is NaN
+  *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
+  *
+  * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff()
+  */
+template<typename Derived>
+template<int NaNPropagation, typename IndexType>
+EIGEN_DEVICE_FUNC
+typename internal::traits<Derived>::Scalar
+DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
+{
+  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
+  internal::min_coeff_visitor<Derived, NaNPropagation> minVisitor;
+  this->visit(minVisitor);
+  *rowId = minVisitor.row;
+  if (colId) *colId = minVisitor.col;
+  return minVisitor.res;
+}
+
+/** \returns the minimum of all coefficients of *this and puts in *index its location.
+  *
+  * In case \c *this contains NaN, NaNPropagation determines the behavior:
+  *   NaNPropagation == PropagateFast : undefined
+  *   NaNPropagation == PropagateNaN : result is NaN
+  *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
+  *
+  * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::minCoeff()
+  */
+template<typename Derived>
+template<int NaNPropagation, typename IndexType>
+EIGEN_DEVICE_FUNC
+typename internal::traits<Derived>::Scalar
+DenseBase<Derived>::minCoeff(IndexType* index) const
+{
+  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+      internal::min_coeff_visitor<Derived, NaNPropagation> minVisitor;
+  this->visit(minVisitor);
+  *index = IndexType((RowsAtCompileTime==1) ? minVisitor.col : minVisitor.row);
+  return minVisitor.res;
+}
+
+/** \fn DenseBase<Derived>::maxCoeff(IndexType* rowId, IndexType* colId) const
+  * \returns the maximum of all coefficients of *this and puts in *row and *col its location.
+  *
+  * In case \c *this contains NaN, NaNPropagation determines the behavior:
+  *   NaNPropagation == PropagateFast : undefined
+  *   NaNPropagation == PropagateNaN : result is NaN
+  *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
+  *
+  * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
+  */
+template<typename Derived>
+template<int NaNPropagation, typename IndexType>
+EIGEN_DEVICE_FUNC
+typename internal::traits<Derived>::Scalar
+DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const
+{
+  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
+  internal::max_coeff_visitor<Derived, NaNPropagation> maxVisitor;
+  this->visit(maxVisitor);
+  *rowPtr = maxVisitor.row;
+  if (colPtr) *colPtr = maxVisitor.col;
+  return maxVisitor.res;
+}
+
+/** \returns the maximum of all coefficients of *this and puts in *index its location.
+  *
+  * In case \c *this contains NaN, NaNPropagation determines the behavior:
+  *   NaNPropagation == PropagateFast : undefined
+  *   NaNPropagation == PropagateNaN : result is NaN
+  *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
+  *
+  * \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::maxCoeff()
+  */
+template<typename Derived>
+template<int NaNPropagation, typename IndexType>
+EIGEN_DEVICE_FUNC
+typename internal::traits<Derived>::Scalar
+DenseBase<Derived>::maxCoeff(IndexType* index) const
+{
+  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+      internal::max_coeff_visitor<Derived, NaNPropagation> maxVisitor;
+  this->visit(maxVisitor);
+  *index = (RowsAtCompileTime==1) ? maxVisitor.col : maxVisitor.row;
+  return maxVisitor.res;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_VISITOR_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/AVX/Complex.h b/src/3rdparty/eigen/Eigen/src/Core/arch/AVX/Complex.h
new file mode 100644
index 000000000..ab7bd6c65
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/AVX/Complex.h
@@ -0,0 +1,372 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_AVX_H
+#define EIGEN_COMPLEX_AVX_H
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- float ----------
+struct Packet4cf
+{
+  EIGEN_STRONG_INLINE Packet4cf() {}
+  EIGEN_STRONG_INLINE explicit Packet4cf(const __m256& a) : v(a) {}
+  __m256  v;
+};
+
+#ifndef EIGEN_VECTORIZE_AVX512
+template<> struct packet_traits<std::complex<float> >  : default_packet_traits
+{
+  typedef Packet4cf type;
+  typedef Packet2cf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 1,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasSqrt   = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+#endif
+
+template<> struct unpacket_traits<Packet4cf> {
+  typedef std::complex<float> type;
+  typedef Packet2cf half;
+  typedef Packet8f as_real;
+  enum {
+    size=4,
+    alignment=Aligned32,
+    vectorizable=true,
+    masked_load_available=false,
+    masked_store_available=false
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a)
+{
+  return Packet4cf(pnegate(a.v));
+}
+template<> EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a)
+{
+  const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
+  return Packet4cf(_mm256_xor_ps(a.v,mask));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
+{
+  __m256 tmp1 = _mm256_mul_ps(_mm256_moveldup_ps(a.v), b.v);
+  __m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1)));
+  __m256 result = _mm256_addsub_ps(tmp1, tmp2);
+  return Packet4cf(result);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) {
+  __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ);
+  return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf ptrue<Packet4cf>(const Packet4cf& a) { return Packet4cf(ptrue(Packet8f(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet4cf pand   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf por    <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pxor   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(b.v,a.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet4cf pload <Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from))); }
+template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from))); }
+
+
+template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from)
+{
+  return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from)
+{
+  // FIXME The following might be optimized using _mm256_movedup_pd
+  Packet2cf a = ploaddup<Packet2cf>(from);
+  Packet2cf b = ploaddup<Packet2cf>(from+1);
+  return  Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from, Index stride)
+{
+  return Packet4cf(_mm256_set_ps(std::imag(from[3*stride]), std::real(from[3*stride]),
+                                 std::imag(from[2*stride]), std::real(from[2*stride]),
+                                 std::imag(from[1*stride]), std::real(from[1*stride]),
+                                 std::imag(from[0*stride]), std::real(from[0*stride])));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, Index stride)
+{
+  __m128 low = _mm256_extractf128_ps(from.v, 0);
+  to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)));
+  to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)));
+
+  __m128 high = _mm256_extractf128_ps(from.v, 1);
+  to[stride*2] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)));
+  to[stride*3] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)));
+
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet4cf>(const Packet4cf& a)
+{
+  return pfirst(Packet2cf(_mm256_castps256_ps128(a.v)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) {
+  __m128 low  = _mm256_extractf128_ps(a.v, 0);
+  __m128 high = _mm256_extractf128_ps(a.v, 1);
+  __m128d lowd  = _mm_castps_pd(low);
+  __m128d highd = _mm_castps_pd(high);
+  low  = _mm_castpd_ps(_mm_shuffle_pd(lowd,lowd,0x1));
+  high = _mm_castpd_ps(_mm_shuffle_pd(highd,highd,0x1));
+  __m256 result = _mm256_setzero_ps();
+  result = _mm256_insertf128_ps(result, low, 1);
+  result = _mm256_insertf128_ps(result, high, 0);
+  return Packet4cf(result);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packet4cf& a)
+{
+  return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v,0)),
+                     Packet2cf(_mm256_extractf128_ps(a.v,1))));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a)
+{
+  return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)),
+                         Packet2cf(_mm256_extractf128_ps(a.v, 1))));
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f)
+
+template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
+{
+  Packet4cf num = pmul(a, pconj(b));
+  __m256 tmp = _mm256_mul_ps(b.v, b.v);
+  __m256 tmp2    = _mm256_shuffle_ps(tmp,tmp,0xB1);
+  __m256 denom = _mm256_add_ps(tmp, tmp2);
+  return Packet4cf(_mm256_div_ps(num.v, denom));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x)
+{
+  return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1)));
+}
+
+//---------- double ----------
+struct Packet2cd
+{
+  EIGEN_STRONG_INLINE Packet2cd() {}
+  EIGEN_STRONG_INLINE explicit Packet2cd(const __m256d& a) : v(a) {}
+  __m256d  v;
+};
+
+#ifndef EIGEN_VECTORIZE_AVX512
+template<> struct packet_traits<std::complex<double> >  : default_packet_traits
+{
+  typedef Packet2cd type;
+  typedef Packet1cd half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 0,
+    size = 2,
+    HasHalfPacket = 1,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasSqrt   = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+#endif
+
+template<> struct unpacket_traits<Packet2cd> {
+  typedef std::complex<double> type;
+  typedef Packet1cd half;
+  typedef Packet4d as_real;
+  enum {
+    size=2,
+    alignment=Aligned32,
+    vectorizable=true,
+    masked_load_available=false,
+    masked_store_available=false
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) { return Packet2cd(pnegate(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a)
+{
+  const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0));
+  return Packet2cd(_mm256_xor_pd(a.v,mask));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
+{
+  __m256d tmp1 = _mm256_shuffle_pd(a.v,a.v,0x0);
+  __m256d even = _mm256_mul_pd(tmp1, b.v);
+  __m256d tmp2 = _mm256_shuffle_pd(a.v,a.v,0xF);
+  __m256d tmp3 = _mm256_shuffle_pd(b.v,b.v,0x5);
+  __m256d odd  = _mm256_mul_pd(tmp2, tmp3);
+  return Packet2cd(_mm256_addsub_pd(even, odd));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) {
+  __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ);
+  return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd ptrue<Packet2cd>(const Packet2cd& a) { return Packet2cd(ptrue(Packet4d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet2cd pand   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd por    <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd pxor   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(b.v,a.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet2cd pload <Packet2cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu<Packet4d>((const double*)from)); }
+
+template<> EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from)
+{
+  // in case casting to a __m128d* is really not safe, then we can still fallback to this version: (much slower though)
+//   return Packet2cd(_mm256_loadu2_m128d((const double*)&from,(const double*)&from));
+    return Packet2cd(_mm256_broadcast_pd((const __m128d*)(const void*)&from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) { return pset1<Packet2cd>(*from); }
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet2cd pgather<std::complex<double>, Packet2cd>(const std::complex<double>* from, Index stride)
+{
+  return Packet2cd(_mm256_set_pd(std::imag(from[1*stride]), std::real(from[1*stride]),
+				 std::imag(from[0*stride]), std::real(from[0*stride])));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from, Index stride)
+{
+  __m128d low = _mm256_extractf128_pd(from.v, 0);
+  to[stride*0] = std::complex<double>(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)));
+  __m128d high = _mm256_extractf128_pd(from.v, 1);
+  to[stride*1] = std::complex<double>(_mm_cvtsd_f64(high), _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a)
+{
+  __m128d low = _mm256_extractf128_pd(a.v, 0);
+  EIGEN_ALIGN16 double res[2];
+  _mm_store_pd(res, low);
+  return std::complex<double>(res[0],res[1]);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) {
+  __m256d result = _mm256_permute2f128_pd(a.v, a.v, 1);
+  return Packet2cd(result);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a)
+{
+  return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v,0)),
+                     Packet1cd(_mm256_extractf128_pd(a.v,1))));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a)
+{
+  return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)),
+                     Packet1cd(_mm256_extractf128_pd(a.v,1))));
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d)
+
+template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
+{
+  Packet2cd num = pmul(a, pconj(b));
+  __m256d tmp = _mm256_mul_pd(b.v, b.v);
+  __m256d denom = _mm256_hadd_pd(tmp, tmp);
+  return Packet2cd(_mm256_div_pd(num.v, denom));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x)
+{
+  return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4cf,4>& kernel) {
+  __m256d P0 = _mm256_castps_pd(kernel.packet[0].v);
+  __m256d P1 = _mm256_castps_pd(kernel.packet[1].v);
+  __m256d P2 = _mm256_castps_pd(kernel.packet[2].v);
+  __m256d P3 = _mm256_castps_pd(kernel.packet[3].v);
+
+  __m256d T0 = _mm256_shuffle_pd(P0, P1, 15);
+  __m256d T1 = _mm256_shuffle_pd(P0, P1, 0);
+  __m256d T2 = _mm256_shuffle_pd(P2, P3, 15);
+  __m256d T3 = _mm256_shuffle_pd(P2, P3, 0);
+
+  kernel.packet[1].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 32));
+  kernel.packet[3].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 49));
+  kernel.packet[0].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 32));
+  kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2cd,2>& kernel) {
+  __m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0+(2<<4));
+  kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1+(3<<4));
+ kernel.packet[0].v = tmp;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd psqrt<Packet2cd>(const Packet2cd& a) {
+  return psqrt_complex<Packet2cd>(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf psqrt<Packet4cf>(const Packet4cf& a) {
+  return psqrt_complex<Packet4cf>(a);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_AVX_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h b/src/3rdparty/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h
new file mode 100644
index 000000000..67041c812
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -0,0 +1,228 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_AVX_H
+#define EIGEN_MATH_FUNCTIONS_AVX_H
+
+/* The sin and cos functions of this file are loosely derived from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+namespace Eigen {
+
+namespace internal {
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
+psin<Packet8f>(const Packet8f& _x) {
+  return psin_float(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
+pcos<Packet8f>(const Packet8f& _x) {
+  return pcos_float(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
+plog<Packet8f>(const Packet8f& _x) {
+  return plog_float(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
+plog<Packet4d>(const Packet4d& _x) {
+  return plog_double(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
+plog2<Packet8f>(const Packet8f& _x) {
+  return plog2_float(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
+plog2<Packet4d>(const Packet4d& _x) {
+  return plog2_double(_x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f plog1p<Packet8f>(const Packet8f& _x) {
+  return generic_plog1p(_x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f pexpm1<Packet8f>(const Packet8f& _x) {
+  return generic_expm1(_x);
+}
+
+// Exponential function. Works by writing "x = m*log(2) + r" where
+// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
+// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
+pexp<Packet8f>(const Packet8f& _x) {
+  return pexp_float(_x);
+}
+
+// Hyperbolic Tangent function.
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
+ptanh<Packet8f>(const Packet8f& _x) {
+  return internal::generic_fast_tanh_float(_x);
+}
+
+// Exponential function for doubles.
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
+pexp<Packet4d>(const Packet4d& _x) {
+  return pexp_double(_x);
+}
+
+// Functions for sqrt.
+// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
+// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
+// exact solution. It does not handle +inf, or denormalized numbers correctly.
+// The main advantage of this approach is not just speed, but also the fact that
+// it can be inlined and pipelined with other computations, further reducing its
+// effective latency. This is similar to Quake3's fast inverse square root.
+// For detail see here: http://www.beyond3d.com/content/articles/8/
+#if EIGEN_FAST_MATH
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f psqrt<Packet8f>(const Packet8f& _x) {
+  Packet8f minus_half_x = pmul(_x, pset1<Packet8f>(-0.5f));
+  Packet8f denormal_mask = pandnot(
+      pcmp_lt(_x, pset1<Packet8f>((std::numeric_limits<float>::min)())),
+      pcmp_lt(_x, pzero(_x)));
+
+  // Compute approximate reciprocal sqrt.
+  Packet8f x = _mm256_rsqrt_ps(_x);
+  // Do a single step of Newton's iteration.
+  x = pmul(x, pmadd(minus_half_x, pmul(x,x), pset1<Packet8f>(1.5f)));
+  // Flush results for denormals to zero.
+  return pandnot(pmul(_x,x), denormal_mask);
+}
+
+#else
+
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f psqrt<Packet8f>(const Packet8f& _x) {
+  return _mm256_sqrt_ps(_x);
+}
+
+#endif
+
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4d psqrt<Packet4d>(const Packet4d& _x) {
+  return _mm256_sqrt_pd(_x);
+}
+
+#if EIGEN_FAST_MATH
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
+  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000);
+  _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f);
+  _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f);
+  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000);
+
+  Packet8f neg_half = pmul(_x, p8f_minus_half);
+
+  // select only the inverse sqrt of positive normal inputs (denormals are
+  // flushed to zero and cause infs as well).
+  Packet8f lt_min_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ);
+  Packet8f inf_mask =  _mm256_cmp_ps(_x, p8f_inf, _CMP_EQ_OQ);
+  Packet8f not_normal_finite_mask = _mm256_or_ps(lt_min_mask, inf_mask);
+
+  // Compute an approximate result using the rsqrt intrinsic.
+  Packet8f y_approx = _mm256_rsqrt_ps(_x);
+
+  // Do a single step of Newton-Raphson iteration to improve the approximation.
+  // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
+  // It is essential to evaluate the inner term like this because forming
+  // y_n^2 may over- or underflow.
+  Packet8f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p8f_one_point_five));
+
+  // Select the result of the Newton-Raphson step for positive normal arguments.
+  // For other arguments, choose the output of the intrinsic. This will
+  // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if
+  // x is zero or a positive denormalized float (equivalent to flushing positive
+  // denormalized inputs to zero).
+  return pselect<Packet8f>(not_normal_finite_mask, y_approx, y_newton);
+}
+
+#else
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
+  _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
+  return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(_x));
+}
+#endif
+
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4d prsqrt<Packet4d>(const Packet4d& _x) {
+  _EIGEN_DECLARE_CONST_Packet4d(one, 1.0);
+  return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(_x));
+}
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, psin)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pcos)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, plog)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, plog2)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, plog1p)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pexpm1)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pexp)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, ptanh)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, psqrt)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, prsqrt)
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pfrexp(const Packet8h& a, Packet8h& exponent) {
+  Packet8f fexponent;
+  const Packet8h out = float2half(pfrexp<Packet8f>(half2float(a), fexponent));
+  exponent = float2half(fexponent);
+  return out;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pldexp(const Packet8h& a, const Packet8h& exponent) {
+  return float2half(pldexp<Packet8f>(half2float(a), half2float(exponent)));
+}
+
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psin)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pcos)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog2)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog1p)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexpm1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexp)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, ptanh)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psqrt)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, prsqrt)
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pfrexp(const Packet8bf& a, Packet8bf& exponent) {
+  Packet8f fexponent;
+  const Packet8bf out = F32ToBf16(pfrexp<Packet8f>(Bf16ToF32(a), fexponent));
+  exponent = F32ToBf16(fexponent);
+  return out;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pldexp(const Packet8bf& a, const Packet8bf& exponent) {
+  return F32ToBf16(pldexp<Packet8f>(Bf16ToF32(a), Bf16ToF32(exponent)));
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_AVX_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/AVX/PacketMath.h b/src/3rdparty/eigen/Eigen/src/Core/arch/AVX/PacketMath.h
new file mode 100644
index 000000000..7fc32fd71
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -0,0 +1,1574 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_AVX_H
+#define EIGEN_PACKET_MATH_AVX_H
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#if !defined(EIGEN_VECTORIZE_AVX512) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
+#endif
+
+#ifdef EIGEN_VECTORIZE_FMA
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+#endif
+
+typedef __m256  Packet8f;
+typedef __m256i Packet8i;
+typedef __m256d Packet4d;
+typedef eigen_packet_wrapper<__m128i, 2> Packet8h;
+typedef eigen_packet_wrapper<__m128i, 3> Packet8bf;
+
+template<> struct is_arithmetic<__m256>  { enum { value = true }; };
+template<> struct is_arithmetic<__m256i> { enum { value = true }; };
+template<> struct is_arithmetic<__m256d> { enum { value = true }; };
+template<> struct is_arithmetic<Packet8h> { enum { value = true }; };
+template<> struct is_arithmetic<Packet8bf> { enum { value = true }; };
+
+#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \
+  const Packet8f p8f_##NAME = pset1<Packet8f>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \
+  const Packet4d p4d_##NAME = pset1<Packet4d>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(NAME,X) \
+  const Packet8f p8f_##NAME = _mm256_castsi256_ps(pset1<Packet8i>(X))
+
+#define _EIGEN_DECLARE_CONST_Packet8i(NAME,X) \
+  const Packet8i p8i_##NAME = pset1<Packet8i>(X)
+
+// Use the packet_traits defined in AVX512/PacketMath.h instead if we're going
+// to leverage AVX512 instructions.
+#ifndef EIGEN_VECTORIZE_AVX512
+template<> struct packet_traits<float>  : default_packet_traits
+{
+  typedef Packet8f type;
+  typedef Packet4f half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 1,
+
+    HasCmp  = 1,
+    HasDiv = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
+    HasExp = 1,
+    HasNdtri = 1,
+    HasBessel = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasBlend = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1
+  };
+};
+template<> struct packet_traits<double> : default_packet_traits
+{
+  typedef Packet4d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=4,
+    HasHalfPacket = 1,
+
+    HasCmp  = 1,
+    HasDiv  = 1,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasBlend = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1
+  };
+};
+
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {
+  typedef Packet8h type;
+  // There is no half-size packet for Packet8h.
+  typedef Packet8h half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 0,
+
+    HasCmp    = 1,
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasSin    = EIGEN_FAST_MATH,
+    HasCos    = EIGEN_FAST_MATH,
+    HasNegate = 1,
+    HasAbs    = 1,
+    HasAbs2   = 0,
+    HasMin    = 1,
+    HasMax    = 1,
+    HasConj   = 1,
+    HasSetLinear = 0,
+    HasLog    = 1,
+    HasLog1p  = 1,
+    HasExpm1  = 1,
+    HasExp    = 1,
+    HasSqrt   = 1,
+    HasRsqrt  = 1,
+    HasTanh   = EIGEN_FAST_MATH,
+    HasErf    = EIGEN_FAST_MATH,
+    HasBlend  = 0,
+    HasRound  = 1,
+    HasFloor  = 1,
+    HasCeil   = 1,
+    HasRint   = 1,
+    HasBessel = 1,
+    HasNdtri  = 1
+  };
+};
+
+template <>
+struct packet_traits<bfloat16> : default_packet_traits {
+  typedef Packet8bf type;
+  // There is no half-size packet for current Packet8bf.
+  // TODO: support as SSE path.
+  typedef Packet8bf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 0,
+
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasNegate = 1,
+    HasAbs    = 1,
+    HasAbs2   = 0,
+    HasMin    = 1,
+    HasMax    = 1,
+    HasConj   = 1,
+    HasSetLinear = 0,
+    HasLog = 1,
+    HasLog1p  = 1,
+    HasExpm1  = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasBlend = 0,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1,
+    HasBessel = 1,
+    HasNdtri  = 1
+  };
+};
+#endif
+
+template<> struct scalar_div_cost<float,true> { enum { value = 14 }; };
+template<> struct scalar_div_cost<double,true> { enum { value = 16 }; };
+
+/* Proper support for integers is only provided by AVX2. In the meantime, we'll
+   use SSE instructions and packets to deal with integers.
+template<> struct packet_traits<int>    : default_packet_traits
+{
+  typedef Packet8i type;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=8
+  };
+};
+*/
+
+template<> struct unpacket_traits<Packet8f> {
+  typedef float     type;
+  typedef Packet4f  half;
+  typedef Packet8i  integer_packet;
+  typedef uint8_t   mask_t;
+  enum {size=8, alignment=Aligned32, vectorizable=true, masked_load_available=true, masked_store_available=true};
+};
+template<> struct unpacket_traits<Packet4d> {
+  typedef double type;
+  typedef Packet2d half;
+  enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+template<> struct unpacket_traits<Packet8i> { typedef int    type; typedef Packet4i half; enum {size=8, alignment=Aligned32, vectorizable=false, masked_load_available=false, masked_store_available=false}; };
+template<> struct unpacket_traits<Packet8bf> { typedef bfloat16 type; typedef Packet8bf half; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; };
+
+// Helper function for bit packing snippet of low precision comparison.
+// It packs the flags from 16x16 to 8x16.
+EIGEN_STRONG_INLINE __m128i Pack16To8(Packet8f rf) {
+  return _mm_packs_epi32(_mm256_extractf128_si256(_mm256_castps_si256(rf), 0),
+                         _mm256_extractf128_si256(_mm256_castps_si256(rf), 1));
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float&  from) { return _mm256_set1_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int&    from) { return _mm256_set1_epi32(from); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pset1frombits<Packet8f>(unsigned int from) { return _mm256_castsi256_ps(pset1<Packet8i>(from)); }
+template<> EIGEN_STRONG_INLINE Packet4d pset1frombits<Packet4d>(uint64_t from) { return _mm256_castsi256_pd(_mm256_set1_epi64x(from)); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f& /*a*/) { return _mm256_setzero_ps(); }
+template<> EIGEN_STRONG_INLINE Packet4d pzero(const Packet4d& /*a*/) { return _mm256_setzero_pd(); }
+template<> EIGEN_STRONG_INLINE Packet8i pzero(const Packet8i& /*a*/) { return _mm256_setzero_si256(); }
+
+
+template<> EIGEN_STRONG_INLINE Packet8f peven_mask(const Packet8f& /*a*/) { return _mm256_castsi256_ps(_mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1)); }
+template<> EIGEN_STRONG_INLINE Packet8i peven_mask(const Packet8i& /*a*/) { return _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); }
+template<> EIGEN_STRONG_INLINE Packet4d peven_mask(const Packet4d& /*a*/) { return _mm256_castsi256_pd(_mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1)); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float*  from) { return _mm256_broadcast_ss(from); }
+template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }
+
+template<> EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
+template<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }
+
+template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_add_epi32(a,b);
+#else
+  __m128i lo = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_sub_epi32(a,b);
+#else
+  __m128i lo = _mm_sub_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  __m128i hi = _mm_sub_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
+{
+  return _mm256_sub_ps(_mm256_set1_ps(0.0),a);
+}
+template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
+{
+  return _mm256_sub_pd(_mm256_set1_pd(0.0),a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i pmul<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_mullo_epi32(a,b);
+#else
+  const __m128i lo = _mm_mullo_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  const __m128i hi = _mm_mullo_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/)
+{ eigen_assert(false && "packet integer division are not supported by AVX");
+  return pset1<Packet8i>(0);
+}
+
+#ifdef EIGEN_VECTORIZE_FMA
+template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
+#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
+  // Clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
+  //  and even register spilling with clang>=6.0 (bug 1637).
+  // Gcc stupidly generates a vfmadd132ps instruction.
+  // So let's enforce it to generate a vfmadd231ps instruction since the most common use
+  //  case is to accumulate the result of the product.
+  Packet8f res = c;
+  __asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
+  return res;
+#else
+  return _mm256_fmadd_ps(a,b,c);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
+#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
+  // see above
+  Packet4d res = c;
+  __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
+  return res;
+#else
+  return _mm256_fmadd_pd(a,b,c);
+#endif
+}
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LE_OQ); }
+template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LT_OQ); }
+template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a, b, _CMP_NGE_UQ); }
+template<> EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_EQ_OQ); }
+
+template<> EIGEN_STRONG_INLINE Packet4d pcmp_le(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_LE_OQ); }
+template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_LT_OQ); }
+template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt_or_nan(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a, b, _CMP_NGE_UQ); }
+template<> EIGEN_STRONG_INLINE Packet4d pcmp_eq(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_EQ_OQ); }
+
+
+template<> EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_cmpeq_epi32(a,b);
+#else
+  __m128i lo = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  __m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+  // There appears to be a bug in GCC, by which the optimizer may flip
+  // the argument order in calls to _mm_min_ps/_mm_max_ps, so we have to
+  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  Packet8f res;
+  asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  return res;
+#else
+  // Arguments are swapped to match NaN propagation behavior of std::min.
+  return _mm256_min_ps(b,a);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+  // See pmin above
+  Packet4d res;
+  asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  return res;
+#else
+  // Arguments are swapped to match NaN propagation behavior of std::min.
+  return _mm256_min_pd(b,a);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+  // See pmin above
+  Packet8f res;
+  asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  return res;
+#else
+  // Arguments are swapped to match NaN propagation behavior of std::max.
+  return _mm256_max_ps(b,a);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+  // See pmin above
+  Packet4d res;
+  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  return res;
+#else
+  // Arguments are swapped to match NaN propagation behavior of std::max.
+  return _mm256_max_pd(b,a);
+#endif
+}
+
+// Add specializations for min/max with prescribed NaN progation.
+template<>
+EIGEN_STRONG_INLINE Packet8f pmin<PropagateNumbers, Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return pminmax_propagate_numbers(a, b, pmin<Packet8f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet4d pmin<PropagateNumbers, Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return pminmax_propagate_numbers(a, b, pmin<Packet4d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet8f pmax<PropagateNumbers, Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return pminmax_propagate_numbers(a, b, pmax<Packet8f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet4d pmax<PropagateNumbers, Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return pminmax_propagate_numbers(a, b, pmax<Packet4d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet8f pmin<PropagateNaN, Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return pminmax_propagate_nan(a, b, pmin<Packet8f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet4d pmin<PropagateNaN, Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return pminmax_propagate_nan(a, b, pmin<Packet4d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet8f pmax<PropagateNaN, Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return pminmax_propagate_nan(a, b, pmax<Packet8f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet4d pmax<PropagateNaN, Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return pminmax_propagate_nan(a, b, pmax<Packet4d>);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f print<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
+template<> EIGEN_STRONG_INLINE Packet4d print<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pceil<Packet8f>(const Packet8f& a) { return _mm256_ceil_ps(a); }
+template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { return _mm256_ceil_pd(a); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); }
+template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); }
+
+
+template<> EIGEN_STRONG_INLINE Packet8i ptrue<Packet8i>(const Packet8i& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  // vpcmpeqd has lower latency than the more general vcmpps
+  return _mm256_cmpeq_epi32(a,a);
+#else
+  const __m256 b = _mm256_castsi256_ps(a);
+  return _mm256_castps_si256(_mm256_cmp_ps(b,b,_CMP_TRUE_UQ));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f ptrue<Packet8f>(const Packet8f& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  // vpcmpeqd has lower latency than the more general vcmpps
+  const __m256i b = _mm256_castps_si256(a);
+  return _mm256_castsi256_ps(_mm256_cmpeq_epi32(b,b));
+#else
+  return _mm256_cmp_ps(a,a,_CMP_TRUE_UQ);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet4d ptrue<Packet4d>(const Packet4d& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  // vpcmpeqq has lower latency than the more general vcmppd
+  const __m256i b = _mm256_castpd_si256(a);
+  return _mm256_castsi256_pd(_mm256_cmpeq_epi64(b,b));
+#else
+  return _mm256_cmp_pd(a,a,_CMP_TRUE_UQ);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_and_si256(a,b);
+#else
+  return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i por<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_or_si256(a,b);
+#else
+  return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_xor_si256(a,b);
+#else
+  return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(b,a); }
+template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(b,a); }
+template<> EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_andnot_si256(b,a);
+#else
+  return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b),_mm256_castsi256_ps(a)));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a)
+{
+  const Packet8f mask = pset1frombits<Packet8f>(static_cast<numext::uint32_t>(0x80000000u));
+  const Packet8f prev0dot5 = pset1frombits<Packet8f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));
+  return _mm256_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a)
+{
+  const Packet4d mask = pset1frombits<Packet4d>(static_cast<numext::uint64_t>(0x8000000000000000ull));
+  const Packet4d prev0dot5 = pset1frombits<Packet4d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));
+  return _mm256_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pselect<Packet8f>(const Packet8f& mask, const Packet8f& a, const Packet8f& b)
+{ return _mm256_blendv_ps(b,a,mask); }
+template<> EIGEN_STRONG_INLINE Packet4d pselect<Packet4d>(const Packet4d& mask, const Packet4d& a, const Packet4d& b)
+{ return _mm256_blendv_pd(b,a,mask); }
+
+template<int N> EIGEN_STRONG_INLINE Packet8i parithmetic_shift_right(Packet8i a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_srai_epi32(a, N);
+#else
+  __m128i lo = _mm_srai_epi32(_mm256_extractf128_si256(a, 0), N);
+  __m128i hi = _mm_srai_epi32(_mm256_extractf128_si256(a, 1), N);
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet8i plogical_shift_right(Packet8i a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_srli_epi32(a, N);
+#else
+  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(a, 0), N);
+  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(a, 1), N);
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet8i plogical_shift_left(Packet8i a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_slli_epi32(a, N);
+#else
+  __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(a, 0), N);
+  __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(a, 1), N);
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); }
+
+template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }
+
+template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from, uint8_t umask) {
+  Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
+  const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe);
+  mask = por<Packet8i>(mask, bit_mask);
+  mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_maskload_ps(from, mask);
+}
+
+// Loads 4 floats from memory a returns the packet {a0, a0  a1, a1, a2, a2, a3, a3}
+template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
+{
+  // TODO try to find a way to avoid the need of a temporary register
+//   Packet8f tmp  = _mm256_castps128_ps256(_mm_loadu_ps(from));
+//   tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
+//   return _mm256_unpacklo_ps(tmp,tmp);
+
+  // _mm256_insertf128_ps is very slow on Haswell, thus:
+  Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
+  // mimic an "inplace" permutation of the lower 128bits using a blend
+  tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
+  // then we can perform a consistent permutation on the global register to get everything in shape:
+  return  _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
+}
+// Loads 2 doubles from memory a returns the packet {a0, a0  a1, a1}
+template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
+{
+  Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
+  return  _mm256_permute_pd(tmp, 3<<2);
+}
+
+// Loads 2 floats from memory a returns the packet {a0, a0  a0, a0, a1, a1, a1, a1}
+template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from)
+{
+  Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
+  return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet8f& from, uint8_t umask) {
+  Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
+  const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe);
+  mask = por<Packet8i>(mask, bit_mask);
+  mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
+  EIGEN_DEBUG_UNALIGNED_STORE return _mm256_maskstore_ps(to, mask, from);
+}
+
+// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
+// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
+template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride)
+{
+  return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
+                       from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, Index stride)
+{
+  return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride)
+{
+  __m128 low = _mm256_extractf128_ps(from, 0);
+  to[stride*0] = _mm_cvtss_f32(low);
+  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1));
+  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2));
+  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3));
+
+  __m128 high = _mm256_extractf128_ps(from, 1);
+  to[stride*4] = _mm_cvtss_f32(high);
+  to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1));
+  to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
+  to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, Index stride)
+{
+  __m128d low = _mm256_extractf128_pd(from, 0);
+  to[stride*0] = _mm_cvtsd_f64(low);
+  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1));
+  __m128d high = _mm256_extractf128_pd(from, 1);
+  to[stride*2] = _mm_cvtsd_f64(high);
+  to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a)
+{
+  Packet8f pa = pset1<Packet8f>(a);
+  pstore(to, pa);
+}
+template<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a)
+{
+  Packet4d pa = pset1<Packet4d>(a);
+  pstore(to, pa);
+}
+template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
+{
+  Packet8i pa = pset1<Packet8i>(a);
+  pstore(to, pa);
+}
+
+#ifndef EIGEN_VECTORIZE_AVX512
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+#endif
+
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet8f>(const Packet8f& a) {
+  return _mm_cvtss_f32(_mm256_castps256_ps128(a));
+}
+template<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
+  return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
+}
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet8i>(const Packet8i& a) {
+  return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a)
+{
+  __m256 tmp = _mm256_shuffle_ps(a,a,0x1b);
+  return _mm256_permute2f128_ps(tmp, tmp, 1);
+}
+template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
+{
+   __m256d tmp = _mm256_shuffle_pd(a,a,5);
+  return _mm256_permute2f128_pd(tmp, tmp, 1);
+  #if 0
+  // This version is unlikely to be faster as _mm256_shuffle_ps and _mm256_permute_pd
+  // exhibit the same latency/throughput, but it is here for future reference/benchmarking...
+  __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
+    return _mm256_permute_pd(swap_halves,5);
+  #endif
+}
+
+// pabs should be ok
+template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a)
+{
+  const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
+  return _mm256_and_ps(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
+{
+  const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
+  return _mm256_and_pd(a,mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pfrexp<Packet8f>(const Packet8f& a, Packet8f& exponent) {
+  return pfrexp_generic(a,exponent);
+}
+
+// Extract exponent without existence of Packet4l.
+template<>
+EIGEN_STRONG_INLINE  
+Packet4d pfrexp_generic_get_biased_exponent(const Packet4d& a) {
+  const Packet4d cst_exp_mask  = pset1frombits<Packet4d>(static_cast<uint64_t>(0x7ff0000000000000ull));
+  __m256i a_expo = _mm256_castpd_si256(pand(a, cst_exp_mask));
+#ifdef EIGEN_VECTORIZE_AVX2
+  a_expo = _mm256_srli_epi64(a_expo, 52);
+  __m128i lo = _mm256_extractf128_si256(a_expo, 0);
+  __m128i hi = _mm256_extractf128_si256(a_expo, 1);
+#else
+  __m128i lo = _mm256_extractf128_si256(a_expo, 0);
+  __m128i hi = _mm256_extractf128_si256(a_expo, 1);
+  lo = _mm_srli_epi64(lo, 52);
+  hi = _mm_srli_epi64(hi, 52);
+#endif
+  Packet2d exponent_lo = _mm_cvtepi32_pd(vec4i_swizzle1(lo, 0, 2, 1, 3));
+  Packet2d exponent_hi = _mm_cvtepi32_pd(vec4i_swizzle1(hi, 0, 2, 1, 3));
+  Packet4d exponent = _mm256_insertf128_pd(_mm256_setzero_pd(), exponent_lo, 0);
+  exponent = _mm256_insertf128_pd(exponent, exponent_hi, 1);
+  return exponent;
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet4d pfrexp<Packet4d>(const Packet4d& a, Packet4d& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pldexp<Packet8f>(const Packet8f& a, const Packet8f& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4d pldexp<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
+  // Clamp exponent to [-2099, 2099]
+  const Packet4d max_exponent = pset1<Packet4d>(2099.0);
+  const Packet4i e = _mm256_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
+  
+  // Split 2^e into four factors and multiply.
+  const Packet4i bias = pset1<Packet4i>(1023);
+  Packet4i b = parithmetic_shift_right<2>(e);  // floor(e/4)
+  
+  // 2^b
+  Packet4i hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3);
+  Packet4i lo = _mm_slli_epi64(hi, 52);
+  hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
+  Packet4d c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
+  Packet4d out = pmul(pmul(pmul(a, c), c), c);  // a * 2^(3b)
+  
+  // 2^(e - 3b)
+  b = psub(psub(psub(e, b), b), b);  // e - 3b
+  hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3);
+  lo = _mm_slli_epi64(hi, 52);
+  hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
+  c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
+  out = pmul(out, c); // a * 2^e
+  return out;
+}
+
+template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
+{
+  return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1))));
+}
+template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
+{
+  return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f predux_half_dowto4<Packet8f>(const Packet8f& a)
+{
+  return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a)
+{
+  Packet8f tmp;
+  tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1));
+  tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
+  return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a)
+{
+  Packet4d tmp;
+  tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1));
+  return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1)));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a)
+{
+  Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1));
+  tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
+  return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a)
+{
+  Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1));
+  return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a)
+{
+  Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1));
+  tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
+  return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
+}
+
+template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
+{
+  Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1));
+  return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
+}
+
+// not needed yet
+// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x)
+// {
+//   return _mm256_movemask_ps(x)==0xFF;
+// }
+
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x)
+{
+  return _mm256_movemask_ps(x)!=0;
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8f,8>& kernel) {
+  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
+  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
+  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
+  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
+  __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
+  __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
+  __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
+  __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
+  __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
+  __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
+  __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
+  __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
+  __m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0));
+  __m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2));
+  __m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0));
+  __m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2));
+  kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20);
+  kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20);
+  kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20);
+  kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20);
+  kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31);
+  kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31);
+  kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31);
+  kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8f,4>& kernel) {
+  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
+  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
+  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
+  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
+
+  __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
+  __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
+  __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
+  __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
+
+  kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20);
+  kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20);
+  kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31);
+  kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4d,4>& kernel) {
+  __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);
+  __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
+  __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15);
+  __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);
+
+  kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32);
+  kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49);
+  kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32);
+  kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) {
+  const __m256 zero = _mm256_setzero_ps();
+  const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
+  __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ);
+  return _mm256_blendv_ps(thenPacket, elsePacket, false_mask);
+}
+template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) {
+  const __m256d zero = _mm256_setzero_pd();
+  const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
+  __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ);
+  return _mm256_blendv_pd(thenPacket, elsePacket, false_mask);
+}
+
+// Packet math for Eigen::half
+
+template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet8h half; };
+
+template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
+  return _mm_set1_epi16(numext::bit_cast<numext::uint16_t>(from));
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
+  return numext::bit_cast<Eigen::half>(static_cast<numext::uint16_t>(_mm_extract_epi16(from, 0)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
+  return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
+  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
+  _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h
+ploaddup<Packet8h>(const Eigen::half*  from) {
+  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
+  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
+  const numext::uint16_t c = numext::bit_cast<numext::uint16_t>(from[2]);
+  const numext::uint16_t d = numext::bit_cast<numext::uint16_t>(from[3]);
+  return _mm_set_epi16(d, d, c, c, b, b, a, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h
+ploadquad<Packet8h>(const Eigen::half* from) {
+  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
+  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
+  return _mm_set_epi16(b, b, b, b, a, a, a, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) {
+ return _mm_cmpeq_epi32(a, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pabs(const Packet8h& a) {
+  const __m128i sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  return _mm_andnot_si128(sign_mask, a);
+}
+
+EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) {
+#ifdef EIGEN_HAS_FP16_C
+  return _mm256_cvtph_ps(a);
+#else
+  EIGEN_ALIGN32 Eigen::half aux[8];
+  pstore(aux, a);
+  float f0(aux[0]);
+  float f1(aux[1]);
+  float f2(aux[2]);
+  float f3(aux[3]);
+  float f4(aux[4]);
+  float f5(aux[5]);
+  float f6(aux[6]);
+  float f7(aux[7]);
+
+  return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0);
+#endif
+}
+
+EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
+#ifdef EIGEN_HAS_FP16_C
+  return _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
+#else
+  EIGEN_ALIGN32 float aux[8];
+  pstore(aux, a);
+  const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[0]));
+  const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[1]));
+  const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[2]));
+  const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[3]));
+  const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[4]));
+  const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[5]));
+  const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[6]));
+  const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[7]));
+  return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmin<Packet8h>(const Packet8h& a,
+                                            const Packet8h& b) {
+  return float2half(pmin<Packet8f>(half2float(a), half2float(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmax<Packet8h>(const Packet8h& a,
+                                            const Packet8h& b) {
+  return float2half(pmax<Packet8f>(half2float(a), half2float(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h plset<Packet8h>(const half& a) {
+  return float2half(plset<Packet8f>(static_cast<float>(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) {
+  // in some cases Packet4i is a wrapper around __m128i, so we either need to
+  // cast to Packet4i to directly call the intrinsics as below:
+  return _mm_or_si128(a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a,const Packet8h& b) {
+  return _mm_xor_si128(a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a,const Packet8h& b) {
+  return _mm_and_si128(a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h& b) {
+  return _mm_andnot_si128(b,a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) {
+  return _mm_blendv_epi8(b, a, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pround<Packet8h>(const Packet8h& a) {
+  return float2half(pround<Packet8f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h print<Packet8h>(const Packet8h& a) {
+  return float2half(print<Packet8f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pceil<Packet8h>(const Packet8h& a) {
+  return float2half(pceil<Packet8f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pfloor<Packet8h>(const Packet8h& a) {
+  return float2half(pfloor<Packet8f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a,const Packet8h& b) {
+  return Pack16To8(pcmp_eq(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pcmp_le(const Packet8h& a,const Packet8h& b) {
+  return Pack16To8(pcmp_le(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pcmp_lt(const Packet8h& a,const Packet8h& b) {
+  return Pack16To8(pcmp_lt(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pcmp_lt_or_nan(const Packet8h& a,const Packet8h& b) {
+  return Pack16To8(pcmp_lt_or_nan(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
+  Packet8h sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  return _mm_xor_si128(a, sign_mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  Packet8f af = half2float(a);
+  Packet8f bf = half2float(b);
+  Packet8f rf = padd(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  Packet8f af = half2float(a);
+  Packet8f bf = half2float(b);
+  Packet8f rf = psub(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  Packet8f af = half2float(a);
+  Packet8f bf = half2float(b);
+  Packet8f rf = pmul(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  Packet8f af = half2float(a);
+  Packet8f bf = half2float(b);
+  Packet8f rf = pdiv(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride)
+{
+  const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0*stride]);
+  const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1*stride]);
+  const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2*stride]);
+  const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3*stride]);
+  const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4*stride]);
+  const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5*stride]);
+  const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6*stride]);
+  const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7*stride]);
+  return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
+}
+
+template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride)
+{
+  EIGEN_ALIGN32 Eigen::half aux[8];
+  pstore(aux, from);
+  to[stride*0] = aux[0];
+  to[stride*1] = aux[1];
+  to[stride*2] = aux[2];
+  to[stride*3] = aux[3];
+  to[stride*4] = aux[4];
+  to[stride*5] = aux[5];
+  to[stride*6] = aux[6];
+  to[stride*7] = aux[7];
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
+  Packet8f af = half2float(a);
+  float reduced = predux<Packet8f>(af);
+  return Eigen::half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
+  Packet8f af = half2float(a);
+  float reduced = predux_max<Packet8f>(af);
+  return Eigen::half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
+  Packet8f af = half2float(a);
+  float reduced = predux_min<Packet8f>(af);
+  return Eigen::half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
+  Packet8f af = half2float(a);
+  float reduced = predux_mul<Packet8f>(af);
+  return Eigen::half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a)
+{
+  __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
+  return _mm_shuffle_epi8(a,m);
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet8h,8>& kernel) {
+  __m128i a = kernel.packet[0];
+  __m128i b = kernel.packet[1];
+  __m128i c = kernel.packet[2];
+  __m128i d = kernel.packet[3];
+  __m128i e = kernel.packet[4];
+  __m128i f = kernel.packet[5];
+  __m128i g = kernel.packet[6];
+  __m128i h = kernel.packet[7];
+
+  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
+  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
+  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
+  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
+  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
+  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
+  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
+  __m128i g47h47 = _mm_unpackhi_epi16(g, h);
+
+  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
+  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
+  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
+  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
+  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
+  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
+  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
+  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
+
+  __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
+  __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
+  __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
+  __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
+  __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
+  __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
+  __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
+  __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
+
+  kernel.packet[0] = a0b0c0d0e0f0g0h0;
+  kernel.packet[1] = a1b1c1d1e1f1g1h1;
+  kernel.packet[2] = a2b2c2d2e2f2g2h2;
+  kernel.packet[3] = a3b3c3d3e3f3g3h3;
+  kernel.packet[4] = a4b4c4d4e4f4g4h4;
+  kernel.packet[5] = a5b5c5d5e5f5g5h5;
+  kernel.packet[6] = a6b6c6d6e6f6g6h6;
+  kernel.packet[7] = a7b7c7d7e7f7g7h7;
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet8h,4>& kernel) {
+  EIGEN_ALIGN32 Eigen::half in[4][8];
+  pstore<Eigen::half>(in[0], kernel.packet[0]);
+  pstore<Eigen::half>(in[1], kernel.packet[1]);
+  pstore<Eigen::half>(in[2], kernel.packet[2]);
+  pstore<Eigen::half>(in[3], kernel.packet[3]);
+
+  EIGEN_ALIGN32 Eigen::half out[4][8];
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      out[i][j] = in[j][2*i];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j+4] = in[j][2*i+1];
+    }
+  }
+
+  kernel.packet[0] = pload<Packet8h>(out[0]);
+  kernel.packet[1] = pload<Packet8h>(out[1]);
+  kernel.packet[2] = pload<Packet8h>(out[2]);
+  kernel.packet[3] = pload<Packet8h>(out[3]);
+}
+
+// BFloat16 implementation.
+
+EIGEN_STRONG_INLINE Packet8f Bf16ToF32(const Packet8bf& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  __m256i extend = _mm256_cvtepu16_epi32(a);
+  return _mm256_castsi256_ps(_mm256_slli_epi32(extend, 16));
+#else
+  __m128i lo = _mm_cvtepu16_epi32(a);
+  __m128i hi = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8));
+  __m128i lo_shift = _mm_slli_epi32(lo, 16);
+  __m128i hi_shift = _mm_slli_epi32(hi, 16);
+  return _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo_shift), hi_shift, 1));
+#endif
+}
+
+// Convert float to bfloat16 according to round-to-nearest-even/denormals algorithm.
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16(const Packet8f& a) {
+  Packet8bf r;
+
+  __m256i input = _mm256_castps_si256(a);
+
+#ifdef EIGEN_VECTORIZE_AVX2
+  // uint32_t lsb = (input >> 16);
+  __m256i t = _mm256_srli_epi32(input, 16);
+  // uint32_t lsb = lsb & 1;
+  t = _mm256_and_si256(t, _mm256_set1_epi32(1));
+  // uint32_t rounding_bias = 0x7fff + lsb;
+  t = _mm256_add_epi32(t, _mm256_set1_epi32(0x7fff));
+  // input += rounding_bias;
+  t = _mm256_add_epi32(t, input);
+  // input = input >> 16;
+  t = _mm256_srli_epi32(t, 16);
+  // Check NaN before converting back to bf16
+  __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q);
+  __m256i nan = _mm256_set1_epi32(0x7fc0);
+  t = _mm256_blendv_epi8(nan, t, _mm256_castps_si256(mask));
+  // output = numext::bit_cast<uint16_t>(input);
+  return _mm_packus_epi32(_mm256_extractf128_si256(t, 0),
+                         _mm256_extractf128_si256(t, 1));
+#else
+  // uint32_t lsb = (input >> 16);
+  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(input, 0), 16);
+  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(input, 1), 16);
+  // uint32_t lsb = lsb & 1;
+  lo = _mm_and_si128(lo, _mm_set1_epi32(1));
+  hi = _mm_and_si128(hi, _mm_set1_epi32(1));
+  // uint32_t rounding_bias = 0x7fff + lsb;
+  lo = _mm_add_epi32(lo, _mm_set1_epi32(0x7fff));
+  hi = _mm_add_epi32(hi, _mm_set1_epi32(0x7fff));
+  // input += rounding_bias;
+  lo = _mm_add_epi32(lo, _mm256_extractf128_si256(input, 0));
+  hi = _mm_add_epi32(hi, _mm256_extractf128_si256(input, 1));
+  // input = input >> 16;
+  lo = _mm_srli_epi32(lo, 16);
+  hi = _mm_srli_epi32(hi, 16);
+  // Check NaN before converting back to bf16
+  __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q);
+  __m128i nan = _mm_set1_epi32(0x7fc0);
+  lo = _mm_blendv_epi8(nan, lo, _mm_castps_si128(_mm256_castps256_ps128(mask)));
+  hi = _mm_blendv_epi8(nan, hi, _mm_castps_si128(_mm256_extractf128_ps(mask, 1)));
+  // output = numext::bit_cast<uint16_t>(input);
+  return _mm_packus_epi32(lo, hi);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
+  return _mm_set1_epi16(numext::bit_cast<numext::uint16_t>(from));
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 pfirst<Packet8bf>(const Packet8bf& from) {
+  return numext::bit_cast<bfloat16>(static_cast<numext::uint16_t>(_mm_extract_epi16(from, 0)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from) {
+  return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from) {
+  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from) {
+  _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from) {
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf
+ploaddup<Packet8bf>(const bfloat16* from) {
+  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
+  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
+  const numext::uint16_t c = numext::bit_cast<numext::uint16_t>(from[2]);
+  const numext::uint16_t d = numext::bit_cast<numext::uint16_t>(from[3]);
+  return _mm_set_epi16(d, d, c, c, b, b, a, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf
+ploadquad<Packet8bf>(const bfloat16* from) {
+  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
+  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
+  return _mm_set_epi16(b, b, b, b, a, a, a, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf ptrue(const Packet8bf& a) {
+ return _mm_cmpeq_epi32(a, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) {
+  const __m128i sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  return _mm_andnot_si128(sign_mask, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a,
+                                                const Packet8bf& b) {
+  return F32ToBf16(pmin<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a,
+                                                const Packet8bf& b) {
+  return F32ToBf16(pmax<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
+  return F32ToBf16(plset<Packet8f>(static_cast<float>(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf por(const Packet8bf& a,const Packet8bf& b) {
+  return _mm_or_si128(a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pxor(const Packet8bf& a,const Packet8bf& b) {
+  return _mm_xor_si128(a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pand(const Packet8bf& a,const Packet8bf& b) {
+  return _mm_and_si128(a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pandnot(const Packet8bf& a,const Packet8bf& b) {
+  return _mm_andnot_si128(b,a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pselect(const Packet8bf& mask, const Packet8bf& a, const Packet8bf& b) {
+  return _mm_blendv_epi8(b, a, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf>(const Packet8bf& a)
+{
+  return F32ToBf16(pround<Packet8f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf print<Packet8bf>(const Packet8bf& a) {
+  return F32ToBf16(print<Packet8f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf>(const Packet8bf& a) {
+  return F32ToBf16(pceil<Packet8f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf>(const Packet8bf& a) {
+  return F32ToBf16(pfloor<Packet8f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a,const Packet8bf& b) {
+  return Pack16To8(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a,const Packet8bf& b) {
+  return Pack16To8(pcmp_le(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a,const Packet8bf& b) {
+  return Pack16To8(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a,const Packet8bf& b) {
+  return Pack16To8(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pconj(const Packet8bf& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet8bf pnegate(const Packet8bf& a) {
+  Packet8bf sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  return _mm_xor_si128(a, sign_mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return F32ToBf16(padd<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return F32ToBf16(psub<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return F32ToBf16(pmul<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return F32ToBf16(pdiv<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride)
+{
+  const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0*stride]);
+  const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1*stride]);
+  const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2*stride]);
+  const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3*stride]);
+  const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4*stride]);
+  const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5*stride]);
+  const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6*stride]);
+  const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7*stride]);
+  return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
+}
+
+template<> EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride)
+{
+  EIGEN_ALIGN32 bfloat16 aux[8];
+  pstore(aux, from);
+  to[stride*0] = aux[0];
+  to[stride*1] = aux[1];
+  to[stride*2] = aux[2];
+  to[stride*3] = aux[3];
+  to[stride*4] = aux[4];
+  to[stride*5] = aux[5];
+  to[stride*6] = aux[6];
+  to[stride*7] = aux[7];
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux<Packet8f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_max<Packet8f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_min<Packet8f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_mul<Packet8f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a)
+{
+  __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
+  return _mm_shuffle_epi8(a,m);
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet8bf,8>& kernel) {
+  __m128i a = kernel.packet[0];
+  __m128i b = kernel.packet[1];
+  __m128i c = kernel.packet[2];
+  __m128i d = kernel.packet[3];
+  __m128i e = kernel.packet[4];
+  __m128i f = kernel.packet[5];
+  __m128i g = kernel.packet[6];
+  __m128i h = kernel.packet[7];
+
+  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
+  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
+  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
+  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
+  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
+  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
+  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
+  __m128i g47h47 = _mm_unpackhi_epi16(g, h);
+
+  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
+  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
+  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
+  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
+  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
+  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
+  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
+  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
+
+  kernel.packet[0] = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
+  kernel.packet[1] = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
+  kernel.packet[2] = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
+  kernel.packet[3] = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
+  kernel.packet[4] = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
+  kernel.packet[5] = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
+  kernel.packet[6] = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
+  kernel.packet[7] = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet8bf,4>& kernel) {
+  __m128i a = kernel.packet[0];
+  __m128i b = kernel.packet[1];
+  __m128i c = kernel.packet[2];
+  __m128i d = kernel.packet[3];
+
+  __m128i ab_03 = _mm_unpacklo_epi16(a, b);
+  __m128i cd_03 = _mm_unpacklo_epi16(c, d);
+  __m128i ab_47 = _mm_unpackhi_epi16(a, b);
+  __m128i cd_47 = _mm_unpackhi_epi16(c, d);
+
+  kernel.packet[0] = _mm_unpacklo_epi32(ab_03, cd_03);
+  kernel.packet[1] = _mm_unpackhi_epi32(ab_03, cd_03);
+  kernel.packet[2] = _mm_unpacklo_epi32(ab_47, cd_47);
+  kernel.packet[3] = _mm_unpackhi_epi32(ab_47, cd_47);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PACKET_MATH_AVX_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h b/src/3rdparty/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h
new file mode 100644
index 000000000..d507fb67b
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h
@@ -0,0 +1,115 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_AVX_H
+#define EIGEN_TYPE_CASTING_AVX_H
+
+namespace Eigen {
+
+namespace internal {
+
+// For now we use SSE to handle integers, so we can't use AVX instructions to cast
+// from int to float
+template <>
+struct type_casting_traits<float, int> {
+  enum {
+    VectorizedCast = 0,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template <>
+struct type_casting_traits<int, float> {
+  enum {
+    VectorizedCast = 0,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+
+#ifndef EIGEN_VECTORIZE_AVX512
+
+template <>
+struct type_casting_traits<Eigen::half, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+
+template <>
+struct type_casting_traits<float, Eigen::half> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template <>
+struct type_casting_traits<bfloat16, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template <>
+struct type_casting_traits<float, bfloat16> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+#endif  // EIGEN_VECTORIZE_AVX512
+
+template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
+  return _mm256_cvttps_epi32(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
+  return _mm256_cvtepi32_ps(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8i preinterpret<Packet8i,Packet8f>(const Packet8f& a) {
+  return _mm256_castps_si256(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f preinterpret<Packet8f,Packet8i>(const Packet8i& a) {
+  return _mm256_castsi256_ps(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
+  return half2float(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8bf, Packet8f>(const Packet8bf& a) {
+  return Bf16ToF32(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
+  return float2half(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pcast<Packet8f, Packet8bf>(const Packet8f& a) {
+  return F32ToBf16(a);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_AVX_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/AVX512/Complex.h b/src/3rdparty/eigen/Eigen/src/Core/arch/AVX512/Complex.h
new file mode 100644
index 000000000..49c72b3f1
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/AVX512/Complex.h
@@ -0,0 +1,422 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_AVX512_H
+#define EIGEN_COMPLEX_AVX512_H
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- float ----------
+struct Packet8cf
+{
+  EIGEN_STRONG_INLINE Packet8cf() {}
+  EIGEN_STRONG_INLINE explicit Packet8cf(const __m512& a) : v(a) {}
+  __m512  v;
+};
+
+template<> struct packet_traits<std::complex<float> >  : default_packet_traits
+{
+  typedef Packet8cf type;
+  typedef Packet4cf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 1,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasSqrt   = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet8cf> {
+  typedef std::complex<float> type;
+  typedef Packet4cf half;
+  typedef Packet16f as_real;
+  enum {
+    size = 8,
+    alignment=unpacket_traits<Packet16f>::alignment,
+    vectorizable=true,
+    masked_load_available=false,
+    masked_store_available=false
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet8cf ptrue<Packet8cf>(const Packet8cf& a) { return Packet8cf(ptrue(Packet16f(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet8cf padd<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_add_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf psub<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_sub_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf pnegate(const Packet8cf& a)
+{
+  return Packet8cf(pnegate(a.v));
+}
+template<> EIGEN_STRONG_INLINE Packet8cf pconj(const Packet8cf& a)
+{
+  const __m512 mask = _mm512_castsi512_ps(_mm512_setr_epi32(
+    0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,
+    0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
+  return Packet8cf(pxor(a.v,mask));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf pmul<Packet8cf>(const Packet8cf& a, const Packet8cf& b)
+{
+  __m512 tmp2 = _mm512_mul_ps(_mm512_movehdup_ps(a.v), _mm512_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1)));
+  return Packet8cf(_mm512_fmaddsub_ps(_mm512_moveldup_ps(a.v), b.v, tmp2));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf pand   <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pand(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf por    <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(por(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf pxor   <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pxor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf pandnot<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pandnot(a.v,b.v)); }
+
+template <>
+EIGEN_STRONG_INLINE Packet8cf pcmp_eq(const Packet8cf& a, const Packet8cf& b) {
+  __m512 eq = pcmp_eq<Packet16f>(a.v, b.v);
+  return Packet8cf(pand(eq, _mm512_permute_ps(eq, 0xB1)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf pload <Packet8cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload<Packet16f>(&numext::real_ref(*from))); }
+template<> EIGEN_STRONG_INLINE Packet8cf ploadu<Packet8cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet8cf(ploadu<Packet16f>(&numext::real_ref(*from))); }
+
+
+template<> EIGEN_STRONG_INLINE Packet8cf pset1<Packet8cf>(const std::complex<float>& from)
+{
+  return Packet8cf(_mm512_castpd_ps(pload1<Packet8d>((const double*)(const void*)&from)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf ploaddup<Packet8cf>(const std::complex<float>* from)
+{
+  return Packet8cf( _mm512_castpd_ps( ploaddup<Packet8d>((const double*)(const void*)from )) );
+}
+template<> EIGEN_STRONG_INLINE Packet8cf ploadquad<Packet8cf>(const std::complex<float>* from)
+{
+  return Packet8cf( _mm512_castpd_ps( ploadquad<Packet8d>((const double*)(const void*)from )) );
+}
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet8cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet8cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet8cf pgather<std::complex<float>, Packet8cf>(const std::complex<float>* from, Index stride)
+{
+  return Packet8cf(_mm512_castpd_ps(pgather<double,Packet8d>((const double*)(const void*)from, stride)));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet8cf>(std::complex<float>* to, const Packet8cf& from, Index stride)
+{
+  pscatter((double*)(void*)to, _mm512_castps_pd(from.v), stride);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet8cf>(const Packet8cf& a)
+{
+  return pfirst(Packet2cf(_mm512_castps512_ps128(a.v)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf preverse(const Packet8cf& a) {
+  return Packet8cf(_mm512_castsi512_ps(
+            _mm512_permutexvar_epi64( _mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7),
+                                      _mm512_castps_si512(a.v))));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet8cf>(const Packet8cf& a)
+{
+  return predux(padd(Packet4cf(extract256<0>(a.v)),
+                     Packet4cf(extract256<1>(a.v))));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet8cf>(const Packet8cf& a)
+{
+  return predux_mul(pmul(Packet4cf(extract256<0>(a.v)),
+                         Packet4cf(extract256<1>(a.v))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cf predux_half_dowto4<Packet8cf>(const Packet8cf& a) {
+  __m256 lane0 = extract256<0>(a.v);
+  __m256 lane1 = extract256<1>(a.v);
+  __m256 res = _mm256_add_ps(lane0, lane1);
+  return Packet4cf(res);
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf,Packet16f)
+
+template<> EIGEN_STRONG_INLINE Packet8cf pdiv<Packet8cf>(const Packet8cf& a, const Packet8cf& b)
+{
+  Packet8cf num = pmul(a, pconj(b));
+  __m512 tmp = _mm512_mul_ps(b.v, b.v);
+  __m512 tmp2    = _mm512_shuffle_ps(tmp,tmp,0xB1);
+  __m512 denom = _mm512_add_ps(tmp, tmp2);
+  return Packet8cf(_mm512_div_ps(num.v, denom));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf pcplxflip<Packet8cf>(const Packet8cf& x)
+{
+  return Packet8cf(_mm512_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1)));
+}
+
+//---------- double ----------
+struct Packet4cd
+{
+  EIGEN_STRONG_INLINE Packet4cd() {}
+  EIGEN_STRONG_INLINE explicit Packet4cd(const __m512d& a) : v(a) {}
+  __m512d  v;
+};
+
+template<> struct packet_traits<std::complex<double> >  : default_packet_traits
+{
+  typedef Packet4cd type;
+  typedef Packet2cd half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 0,
+    size = 4,
+    HasHalfPacket = 1,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasSqrt   = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet4cd> {
+  typedef std::complex<double> type;
+  typedef Packet2cd half;
+  typedef Packet8d as_real;
+  enum {
+    size = 4,
+    alignment = unpacket_traits<Packet8d>::alignment,
+    vectorizable=true,
+    masked_load_available=false,
+    masked_store_available=false
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4cd padd<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_add_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd psub<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_sub_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd pnegate(const Packet4cd& a) { return Packet4cd(pnegate(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd pconj(const Packet4cd& a)
+{
+  const __m512d mask = _mm512_castsi512_pd(
+          _mm512_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0,
+                           0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0));
+  return Packet4cd(pxor(a.v,mask));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd pmul<Packet4cd>(const Packet4cd& a, const Packet4cd& b)
+{
+  __m512d tmp1 = _mm512_shuffle_pd(a.v,a.v,0x0);
+  __m512d tmp2 = _mm512_shuffle_pd(a.v,a.v,0xFF);
+  __m512d tmp3 = _mm512_shuffle_pd(b.v,b.v,0x55);
+  __m512d odd  = _mm512_mul_pd(tmp2, tmp3);
+  return Packet4cd(_mm512_fmaddsub_pd(tmp1, b.v, odd));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd ptrue<Packet4cd>(const Packet4cd& a) { return Packet4cd(ptrue(Packet8d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet4cd pand   <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pand(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd por    <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(por(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd pxor   <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pxor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd pandnot<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pandnot(a.v,b.v)); }
+
+template <>
+EIGEN_STRONG_INLINE Packet4cd pcmp_eq(const Packet4cd& a, const Packet4cd& b) {
+  __m512d eq = pcmp_eq<Packet8d>(a.v, b.v);
+  return Packet4cd(pand(eq, _mm512_permute_pd(eq, 0x55)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd pload <Packet4cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return Packet4cd(pload<Packet8d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet4cd ploadu<Packet4cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cd(ploadu<Packet8d>((const double*)from)); }
+
+template<> EIGEN_STRONG_INLINE Packet4cd pset1<Packet4cd>(const std::complex<double>& from)
+{
+  #ifdef EIGEN_VECTORIZE_AVX512DQ
+  return Packet4cd(_mm512_broadcast_f64x2(pset1<Packet1cd>(from).v));
+  #else
+  return Packet4cd(_mm512_castps_pd(_mm512_broadcast_f32x4( _mm_castpd_ps(pset1<Packet1cd>(from).v))));
+  #endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd ploaddup<Packet4cd>(const std::complex<double>* from) {
+  return Packet4cd(_mm512_insertf64x4(
+          _mm512_castpd256_pd512(ploaddup<Packet2cd>(from).v), ploaddup<Packet2cd>(from+1).v, 1));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet4cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet4cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet4cd pgather<std::complex<double>, Packet4cd>(const std::complex<double>* from, Index stride)
+{
+  return Packet4cd(_mm512_insertf64x4(_mm512_castpd256_pd512(
+            _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from+0*stride).v), ploadu<Packet1cd>(from+1*stride).v,1)),
+            _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from+2*stride).v), ploadu<Packet1cd>(from+3*stride).v,1), 1));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet4cd>(std::complex<double>* to, const Packet4cd& from, Index stride)
+{
+  __m512i fromi = _mm512_castpd_si512(from.v);
+  double* tod = (double*)(void*)to;
+  _mm_storeu_pd(tod+0*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,0)) );
+  _mm_storeu_pd(tod+2*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,1)) );
+  _mm_storeu_pd(tod+4*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,2)) );
+  _mm_storeu_pd(tod+6*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,3)) );
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet4cd>(const Packet4cd& a)
+{
+  __m128d low = extract128<0>(a.v);
+  EIGEN_ALIGN16 double res[2];
+  _mm_store_pd(res, low);
+  return std::complex<double>(res[0],res[1]);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd preverse(const Packet4cd& a) {
+  return Packet4cd(_mm512_shuffle_f64x2(a.v, a.v, (shuffle_mask<3,2,1,0>::mask)));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet4cd>(const Packet4cd& a)
+{
+  return predux(padd(Packet2cd(_mm512_extractf64x4_pd(a.v,0)),
+                     Packet2cd(_mm512_extractf64x4_pd(a.v,1))));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet4cd>(const Packet4cd& a)
+{
+  return predux_mul(pmul(Packet2cd(_mm512_extractf64x4_pd(a.v,0)),
+                         Packet2cd(_mm512_extractf64x4_pd(a.v,1))));
+}
+
+template<> struct conj_helper<Packet4cd, Packet4cd, false,true>
+{
+  EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const
+  {
+    return internal::pmul(a, pconj(b));
+  }
+};
+
+template<> struct conj_helper<Packet4cd, Packet4cd, true,false>
+{
+  EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const
+  {
+    return internal::pmul(pconj(a), b);
+  }
+};
+
+template<> struct conj_helper<Packet4cd, Packet4cd, true,true>
+{
+  EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const
+  {
+    return pconj(internal::pmul(a, b));
+  }
+};
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cd,Packet8d)
+
+template<> EIGEN_STRONG_INLINE Packet4cd pdiv<Packet4cd>(const Packet4cd& a, const Packet4cd& b)
+{
+  Packet4cd num = pmul(a, pconj(b));
+  __m512d tmp = _mm512_mul_pd(b.v, b.v);
+  __m512d denom =  padd(_mm512_permute_pd(tmp,0x55), tmp);
+  return Packet4cd(_mm512_div_pd(num.v, denom));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd pcplxflip<Packet4cd>(const Packet4cd& x)
+{
+  return Packet4cd(_mm512_permute_pd(x.v,0x55));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8cf,4>& kernel) {
+  PacketBlock<Packet8d,4> pb;
+  
+  pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v);
+  pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v);
+  pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v);
+  pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v);
+  ptranspose(pb);
+  kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]);
+  kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]);
+  kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]);
+  kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8cf,8>& kernel) {
+  PacketBlock<Packet8d,8> pb;
+  
+  pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v);
+  pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v);
+  pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v);
+  pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v);
+  pb.packet[4] = _mm512_castps_pd(kernel.packet[4].v);
+  pb.packet[5] = _mm512_castps_pd(kernel.packet[5].v);
+  pb.packet[6] = _mm512_castps_pd(kernel.packet[6].v);
+  pb.packet[7] = _mm512_castps_pd(kernel.packet[7].v);
+  ptranspose(pb);
+  kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]);
+  kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]);
+  kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]);
+  kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]);
+  kernel.packet[4].v = _mm512_castpd_ps(pb.packet[4]);
+  kernel.packet[5].v = _mm512_castpd_ps(pb.packet[5]);
+  kernel.packet[6].v = _mm512_castpd_ps(pb.packet[6]);
+  kernel.packet[7].v = _mm512_castpd_ps(pb.packet[7]);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4cd,4>& kernel) {
+  __m512d T0 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, (shuffle_mask<0,1,0,1>::mask)); // [a0 a1 b0 b1]
+  __m512d T1 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, (shuffle_mask<2,3,2,3>::mask)); // [a2 a3 b2 b3]
+  __m512d T2 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, (shuffle_mask<0,1,0,1>::mask)); // [c0 c1 d0 d1]
+  __m512d T3 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, (shuffle_mask<2,3,2,3>::mask)); // [c2 c3 d2 d3]
+
+  kernel.packet[3] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, (shuffle_mask<1,3,1,3>::mask))); // [a3 b3 c3 d3]
+  kernel.packet[2] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, (shuffle_mask<0,2,0,2>::mask))); // [a2 b2 c2 d2]
+  kernel.packet[1] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<1,3,1,3>::mask))); // [a1 b1 c1 d1]
+  kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<0,2,0,2>::mask))); // [a0 b0 c0 d0]
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd psqrt<Packet4cd>(const Packet4cd& a) {
+  return psqrt_complex<Packet4cd>(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf psqrt<Packet8cf>(const Packet8cf& a) {
+  return psqrt_complex<Packet8cf>(a);
+}
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_AVX512_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h b/src/3rdparty/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h
new file mode 100644
index 000000000..6fd726d29
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -0,0 +1,362 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Pedro Gonnet (pedro.gonnet@gmail.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
+#define THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
+
+namespace Eigen {
+
+namespace internal {
+
+// Disable the code for older versions of gcc that don't support many of the required avx512 instrinsics.
+#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG  || EIGEN_COMP_MSVC >= 1923
+
+#define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \
+  const Packet16f p16f_##NAME = pset1<Packet16f>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(NAME, X) \
+  const Packet16f p16f_##NAME =  preinterpret<Packet16f,Packet16i>(pset1<Packet16i>(X))
+
+#define _EIGEN_DECLARE_CONST_Packet8d(NAME, X) \
+  const Packet8d p8d_##NAME = pset1<Packet8d>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \
+  const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X))
+
+#define _EIGEN_DECLARE_CONST_Packet16bf(NAME, X) \
+  const Packet16bf p16bf_##NAME = pset1<Packet16bf>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet16bf_FROM_INT(NAME, X) \
+  const Packet16bf p16bf_##NAME =  preinterpret<Packet16bf,Packet16i>(pset1<Packet16i>(X))
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+plog<Packet16f>(const Packet16f& _x) {
+  return plog_float(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
+plog<Packet8d>(const Packet8d& _x) {
+  return plog_double(_x);
+}
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, plog)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog)
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+plog2<Packet16f>(const Packet16f& _x) {
+  return plog2_float(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
+plog2<Packet8d>(const Packet8d& _x) {
+  return plog2_double(_x);
+}
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, plog2)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog2)
+
+// Exponential function. Works by writing "x = m*log(2) + r" where
+// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
+// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+pexp<Packet16f>(const Packet16f& _x) {
+  _EIGEN_DECLARE_CONST_Packet16f(1, 1.0f);
+  _EIGEN_DECLARE_CONST_Packet16f(half, 0.5f);
+  _EIGEN_DECLARE_CONST_Packet16f(127, 127.0f);
+
+  _EIGEN_DECLARE_CONST_Packet16f(exp_hi, 88.3762626647950f);
+  _EIGEN_DECLARE_CONST_Packet16f(exp_lo, -88.3762626647949f);
+
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_LOG2EF, 1.44269504088896341f);
+
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p0, 1.9875691500E-4f);
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p1, 1.3981999507E-3f);
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p2, 8.3334519073E-3f);
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p3, 4.1665795894E-2f);
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p4, 1.6666665459E-1f);
+  _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p5, 5.0000001201E-1f);
+
+  // Clamp x.
+  Packet16f x = pmax(pmin(_x, p16f_exp_hi), p16f_exp_lo);
+
+  // Express exp(x) as exp(m*ln(2) + r), start by extracting
+  // m = floor(x/ln(2) + 0.5).
+  Packet16f m = _mm512_floor_ps(pmadd(x, p16f_cephes_LOG2EF, p16f_half));
+
+  // Get r = x - m*ln(2). Note that we can do this without losing more than one
+  // ulp precision due to the FMA instruction.
+  _EIGEN_DECLARE_CONST_Packet16f(nln2, -0.6931471805599453f);
+  Packet16f r = _mm512_fmadd_ps(m, p16f_nln2, x);
+  Packet16f r2 = pmul(r, r);
+  Packet16f r3 = pmul(r2, r);
+
+  // Evaluate the polynomial approximant,improved by instruction-level parallelism.
+  Packet16f y, y1, y2;
+  y  = pmadd(p16f_cephes_exp_p0, r, p16f_cephes_exp_p1);
+  y1 = pmadd(p16f_cephes_exp_p3, r, p16f_cephes_exp_p4);
+  y2 = padd(r, p16f_1);
+  y  = pmadd(y, r, p16f_cephes_exp_p2);
+  y1 = pmadd(y1, r, p16f_cephes_exp_p5);
+  y  = pmadd(y, r3, y1);
+  y  = pmadd(y, r2, y2);
+
+  // Build emm0 = 2^m.
+  Packet16i emm0 = _mm512_cvttps_epi32(padd(m, p16f_127));
+  emm0 = _mm512_slli_epi32(emm0, 23);
+
+  // Return 2^m * exp(r).
+  return pmax(pmul(y, _mm512_castsi512_ps(emm0)), _x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
+pexp<Packet8d>(const Packet8d& _x) {
+  return pexp_double(_x);
+}
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexp)
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pfrexp(const Packet16h& a, Packet16h& exponent) {
+  Packet16f fexponent;
+  const Packet16h out = float2half(pfrexp<Packet16f>(half2float(a), fexponent));
+  exponent = float2half(fexponent);
+  return out;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pldexp(const Packet16h& a, const Packet16h& exponent) {
+  return float2half(pldexp<Packet16f>(half2float(a), half2float(exponent)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pfrexp(const Packet16bf& a, Packet16bf& exponent) {
+  Packet16f fexponent;
+  const Packet16bf out = F32ToBf16(pfrexp<Packet16f>(Bf16ToF32(a), fexponent));
+  exponent = F32ToBf16(fexponent);
+  return out;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pldexp(const Packet16bf& a, const Packet16bf& exponent) {
+  return F32ToBf16(pldexp<Packet16f>(Bf16ToF32(a), Bf16ToF32(exponent)));
+}
+
+// Functions for sqrt.
+// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
+// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
+// exact solution. The main advantage of this approach is not just speed, but
+// also the fact that it can be inlined and pipelined with other computations,
+// further reducing its effective latency.
+#if EIGEN_FAST_MATH
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+psqrt<Packet16f>(const Packet16f& _x) {
+  Packet16f neg_half = pmul(_x, pset1<Packet16f>(-.5f));
+  __mmask16 denormal_mask = _mm512_kand(
+      _mm512_cmp_ps_mask(_x, pset1<Packet16f>((std::numeric_limits<float>::min)()),
+                        _CMP_LT_OQ),
+      _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ));
+
+  Packet16f x = _mm512_rsqrt14_ps(_x);
+
+  // Do a single step of Newton's iteration.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet16f>(1.5f)));
+
+  // Flush results for denormals to zero.
+  return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps());
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
+psqrt<Packet8d>(const Packet8d& _x) {
+  Packet8d neg_half = pmul(_x, pset1<Packet8d>(-.5));
+  __mmask16 denormal_mask = _mm512_kand(
+      _mm512_cmp_pd_mask(_x, pset1<Packet8d>((std::numeric_limits<double>::min)()),
+                        _CMP_LT_OQ),
+      _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ));
+
+  Packet8d x = _mm512_rsqrt14_pd(_x);
+
+  // Do a single step of Newton's iteration.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
+
+  // Do a second step of Newton's iteration.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
+
+  return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd());
+}
+#else
+template <>
+EIGEN_STRONG_INLINE Packet16f psqrt<Packet16f>(const Packet16f& x) {
+  return _mm512_sqrt_ps(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d psqrt<Packet8d>(const Packet8d& x) {
+  return _mm512_sqrt_pd(x);
+}
+#endif
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, psqrt)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psqrt)
+
+// prsqrt for float.
+#if defined(EIGEN_VECTORIZE_AVX512ER)
+
+template <>
+EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
+  return _mm512_rsqrt28_ps(x);
+}
+#elif EIGEN_FAST_MATH
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+prsqrt<Packet16f>(const Packet16f& _x) {
+  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000);
+  _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
+  _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
+
+  Packet16f neg_half = pmul(_x, p16f_minus_half);
+
+  // Identity infinite, negative and denormal arguments.
+  __mmask16 inf_mask = _mm512_cmp_ps_mask(_x, p16f_inf, _CMP_EQ_OQ);
+  __mmask16 not_pos_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LE_OQ);
+  __mmask16 not_finite_pos_mask = not_pos_mask | inf_mask;
+
+  // Compute an approximate result using the rsqrt intrinsic, forcing +inf
+  // for denormals for consistency with AVX and SSE implementations.
+  Packet16f y_approx = _mm512_rsqrt14_ps(_x);
+
+  // Do a single step of Newton-Raphson iteration to improve the approximation.
+  // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
+  // It is essential to evaluate the inner term like this because forming
+  // y_n^2 may over- or underflow.
+  Packet16f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p16f_one_point_five));
+
+  // Select the result of the Newton-Raphson step for positive finite arguments.
+  // For other arguments, choose the output of the intrinsic. This will
+  // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf.
+  return _mm512_mask_blend_ps(not_finite_pos_mask, y_newton, y_approx);
+}
+#else
+
+template <>
+EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
+  _EIGEN_DECLARE_CONST_Packet16f(one, 1.0f);
+  return _mm512_div_ps(p16f_one, _mm512_sqrt_ps(x));
+}
+#endif
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, prsqrt)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, prsqrt)
+
+// prsqrt for double.
+#if EIGEN_FAST_MATH
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
+prsqrt<Packet8d>(const Packet8d& _x) {
+  _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
+  _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
+  _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL);
+
+  Packet8d neg_half = pmul(_x, p8d_minus_half);
+
+  // Identity infinite, negative and denormal arguments.
+  __mmask8 inf_mask = _mm512_cmp_pd_mask(_x, p8d_inf, _CMP_EQ_OQ);
+  __mmask8 not_pos_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LE_OQ);
+  __mmask8 not_finite_pos_mask = not_pos_mask | inf_mask;
+
+  // Compute an approximate result using the rsqrt intrinsic, forcing +inf
+  // for denormals for consistency with AVX and SSE implementations.
+#if defined(EIGEN_VECTORIZE_AVX512ER)
+  Packet8d y_approx = _mm512_rsqrt28_pd(_x);
+#else
+  Packet8d y_approx = _mm512_rsqrt14_pd(_x);
+#endif
+  // Do one or two steps of Newton-Raphson's to improve the approximation, depending on the
+  // starting accuracy (either 2^-14 or 2^-28, depending on whether AVX512ER is available).
+  // The Newton-Raphson algorithm has quadratic convergence and roughly doubles the number
+  // of correct digits for each step.
+  // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
+  // It is essential to evaluate the inner term like this because forming
+  // y_n^2 may over- or underflow.
+  Packet8d y_newton = pmul(y_approx, pmadd(neg_half, pmul(y_approx, y_approx), p8d_one_point_five));
+#if !defined(EIGEN_VECTORIZE_AVX512ER)
+  y_newton = pmul(y_newton, pmadd(y_newton, pmul(neg_half, y_newton), p8d_one_point_five));
+#endif
+  // Select the result of the Newton-Raphson step for positive finite arguments.
+  // For other arguments, choose the output of the intrinsic. This will
+  // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf.
+  return _mm512_mask_blend_pd(not_finite_pos_mask, y_newton, y_approx);
+}
+#else
+template <>
+EIGEN_STRONG_INLINE Packet8d prsqrt<Packet8d>(const Packet8d& x) {
+  _EIGEN_DECLARE_CONST_Packet8d(one, 1.0f);
+  return _mm512_div_pd(p8d_one, _mm512_sqrt_pd(x));
+}
+#endif
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet16f plog1p<Packet16f>(const Packet16f& _x) {
+  return generic_plog1p(_x);
+}
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, plog1p)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog1p)
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet16f pexpm1<Packet16f>(const Packet16f& _x) {
+  return generic_expm1(_x);
+}
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pexpm1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexpm1)
+
+#endif
+
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+psin<Packet16f>(const Packet16f& _x) {
+  return psin_float(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+pcos<Packet16f>(const Packet16f& _x) {
+  return pcos_float(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+ptanh<Packet16f>(const Packet16f& _x) {
+  return internal::generic_fast_tanh_float(_x);
+}
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, psin)
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pcos)
+F16_PACKET_FUNCTION(Packet16f, Packet16h, ptanh)
+
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psin)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pcos)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, ptanh)
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h b/src/3rdparty/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h
new file mode 100644
index 000000000..34d49ab66
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -0,0 +1,2303 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner (benoit.steiner.goog@gmail.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_AVX512_H
+#define EIGEN_PACKET_MATH_AVX512_H
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+#endif
+
+#ifdef EIGEN_VECTORIZE_FMA
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+#endif
+
+typedef __m512 Packet16f;
+typedef __m512i Packet16i;
+typedef __m512d Packet8d;
+typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
+typedef eigen_packet_wrapper<__m256i, 2> Packet16bf;
+
+template <>
+struct is_arithmetic<__m512> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<__m512i> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<__m512d> {
+  enum { value = true };
+};
+
+template<> struct is_arithmetic<Packet16h> { enum { value = true }; };
+
+template <>
+struct packet_traits<half> : default_packet_traits {
+  typedef Packet16h type;
+  // There is no half-size packet for Packet16h.
+  typedef Packet16h half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+    HasHalfPacket = 1,
+
+    HasCmp    = 1,
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 1,
+    HasAbs2   = 0,
+    HasMin    = 1,
+    HasMax    = 1,
+    HasConj   = 1,
+    HasSetLinear = 0,
+    HasLog    = 1,
+    HasLog1p  = 1,
+    HasExpm1  = 1,
+    HasExp    = 1,
+    HasSqrt   = 1,
+    HasRsqrt  = 1,
+    HasSin    = EIGEN_FAST_MATH,
+    HasCos    = EIGEN_FAST_MATH,
+    HasTanh   = EIGEN_FAST_MATH,
+    HasErf    = EIGEN_FAST_MATH,
+    HasBlend = 0,
+    HasRound  = 1,
+    HasFloor  = 1,
+    HasCeil   = 1,
+    HasRint   = 1,
+    HasBessel = 1,
+    HasNdtri  = 1
+  };
+};
+
+template<> struct packet_traits<float>  : default_packet_traits
+{
+  typedef Packet16f type;
+  typedef Packet8f half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+    HasHalfPacket = 1,
+
+    HasAbs = 1,
+    HasMin    = 1,
+    HasMax    = 1,
+    HasConj   = 1,
+    HasBlend = 0,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
+    HasLog = 1,
+    HasLog1p  = 1,
+    HasExpm1  = 1,
+    HasNdtri = 1,
+    HasBessel  = 1,
+    HasExp = 1,
+    HasSqrt = EIGEN_FAST_MATH,
+    HasRsqrt = EIGEN_FAST_MATH,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+#endif
+    HasCmp  = 1,
+    HasDiv = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1
+  };
+ };
+template<> struct packet_traits<double> : default_packet_traits
+{
+  typedef Packet8d type;
+  typedef Packet4d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 1,
+#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
+    HasLog  = 1,
+    HasExp = 1,
+    HasSqrt = EIGEN_FAST_MATH,
+    HasRsqrt = EIGEN_FAST_MATH,
+#endif
+    HasCmp  = 1,
+    HasDiv = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1
+  };
+};
+
+/* TODO Implement AVX512 for integers
+template<> struct packet_traits<int>    : default_packet_traits
+{
+  typedef Packet16i type;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=8
+  };
+};
+*/
+
+template <>
+struct unpacket_traits<Packet16f> {
+  typedef float type;
+  typedef Packet8f half;
+  typedef Packet16i integer_packet;
+  typedef uint16_t mask_t;
+  enum { size = 16, alignment=Aligned64, vectorizable=true, masked_load_available=true, masked_store_available=true };
+};
+template <>
+struct unpacket_traits<Packet8d> {
+  typedef double type;
+  typedef Packet4d half;
+  enum { size = 8, alignment=Aligned64, vectorizable=true, masked_load_available=false, masked_store_available=false };
+};
+template <>
+struct unpacket_traits<Packet16i> {
+  typedef int type;
+  typedef Packet8i half;
+  enum { size = 16, alignment=Aligned64, vectorizable=false, masked_load_available=false, masked_store_available=false };
+};
+
+template<>
+struct unpacket_traits<Packet16h> {
+  typedef Eigen::half type;
+  typedef Packet8h half;
+  enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(const float& from) {
+  return _mm512_set1_ps(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pset1<Packet8d>(const double& from) {
+  return _mm512_set1_pd(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i pset1<Packet16i>(const int& from) {
+  return _mm512_set1_epi32(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pset1frombits<Packet16f>(unsigned int from) {
+  return _mm512_castsi512_ps(_mm512_set1_epi32(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d pset1frombits<Packet8d>(const numext::uint64_t from) {
+  return _mm512_castsi512_pd(_mm512_set1_epi64(from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pzero(const Packet16f& /*a*/) { return _mm512_setzero_ps(); }
+template<> EIGEN_STRONG_INLINE Packet8d pzero(const Packet8d& /*a*/) { return _mm512_setzero_pd(); }
+template<> EIGEN_STRONG_INLINE Packet16i pzero(const Packet16i& /*a*/) { return _mm512_setzero_si512(); }
+
+template<> EIGEN_STRONG_INLINE Packet16f peven_mask(const Packet16f& /*a*/) {
+  return _mm512_castsi512_ps(_mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1,
+                                              0, -1, 0, -1, 0, -1, 0, -1));
+}
+template<> EIGEN_STRONG_INLINE Packet16i peven_mask(const Packet16i& /*a*/) {
+  return _mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1,
+                          0, -1, 0, -1, 0, -1, 0, -1);
+}
+template<> EIGEN_STRONG_INLINE Packet8d peven_mask(const Packet8d& /*a*/) {
+  return _mm512_castsi512_pd(_mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1,
+                                              0, 0, -1, -1, 0, 0, -1, -1));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pload1<Packet16f>(const float* from) {
+  return _mm512_broadcastss_ps(_mm_load_ps1(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pload1<Packet8d>(const double* from) {
+  return _mm512_set1_pd(*from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f plset<Packet16f>(const float& a) {
+  return _mm512_add_ps(
+      _mm512_set1_ps(a),
+      _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f,
+                    4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d plset<Packet8d>(const double& a) {
+  return _mm512_add_pd(_mm512_set1_pd(a),
+                       _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a,
+                                              const Packet16f& b) {
+  return _mm512_add_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a,
+                                            const Packet8d& b) {
+  return _mm512_add_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i padd<Packet16i>(const Packet16i& a,
+                                              const Packet16i& b) {
+  return _mm512_add_epi32(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a,
+                                              const Packet16f& b) {
+  return _mm512_sub_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d psub<Packet8d>(const Packet8d& a,
+                                            const Packet8d& b) {
+  return _mm512_sub_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i psub<Packet16i>(const Packet16i& a,
+                                              const Packet16i& b) {
+  return _mm512_sub_epi32(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
+  return _mm512_sub_ps(_mm512_set1_ps(0.0), a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) {
+  return _mm512_sub_pd(_mm512_set1_pd(0.0), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pconj(const Packet16f& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pconj(const Packet8d& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i pconj(const Packet16i& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a,
+                                              const Packet16f& b) {
+  return _mm512_mul_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(const Packet8d& a,
+                                            const Packet8d& b) {
+  return _mm512_mul_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i pmul<Packet16i>(const Packet16i& a,
+                                              const Packet16i& b) {
+  return _mm512_mullo_epi32(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a,
+                                              const Packet16f& b) {
+  return _mm512_div_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a,
+                                            const Packet8d& b) {
+  return _mm512_div_pd(a, b);
+}
+
+#ifdef EIGEN_VECTORIZE_FMA
+template <>
+EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b,
+                                    const Packet16f& c) {
+  return _mm512_fmadd_ps(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b,
+                                   const Packet8d& c) {
+  return _mm512_fmadd_pd(a, b, c);
+}
+#endif
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet16f pselect(const Packet16f& mask,
+                                           const Packet16f& a,
+                                           const Packet16f& b) {
+  __mmask16 mask16 = _mm512_cmp_epi32_mask(
+      _mm512_castps_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
+  return _mm512_mask_blend_ps(mask16, a, b);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet8d pselect(const Packet8d& mask,
+                                          const Packet8d& a,
+                                          const Packet8d& b) {
+  __mmask8 mask8 = _mm512_cmp_epi64_mask(_mm512_castpd_si512(mask),
+                                         _mm512_setzero_epi32(), _MM_CMPINT_EQ);
+  return _mm512_mask_blend_pd(mask8, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a,
+                                              const Packet16f& b) {
+  // Arguments are reversed to match NaN propagation behavior of std::min.
+  return _mm512_min_ps(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(const Packet8d& a,
+                                            const Packet8d& b) {
+  // Arguments are reversed to match NaN propagation behavior of std::min.
+  return _mm512_min_pd(b, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a,
+                                              const Packet16f& b) {
+  // Arguments are reversed to match NaN propagation behavior of std::max.
+  return _mm512_max_ps(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a,
+                                            const Packet8d& b) {
+  // Arguments are reversed to match NaN propagation behavior of std::max.
+  return _mm512_max_pd(b, a);
+}
+
+// Add specializations for min/max with prescribed NaN progation.
+template<>
+EIGEN_STRONG_INLINE Packet16f pmin<PropagateNumbers, Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return pminmax_propagate_numbers(a, b, pmin<Packet16f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet8d pmin<PropagateNumbers, Packet8d>(const Packet8d& a, const Packet8d& b) {
+  return pminmax_propagate_numbers(a, b, pmin<Packet8d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet16f pmax<PropagateNumbers, Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return pminmax_propagate_numbers(a, b, pmax<Packet16f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet8d pmax<PropagateNumbers, Packet8d>(const Packet8d& a, const Packet8d& b) {
+  return pminmax_propagate_numbers(a, b, pmax<Packet8d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet16f pmin<PropagateNaN, Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return pminmax_propagate_nan(a, b, pmin<Packet16f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet8d pmin<PropagateNaN, Packet8d>(const Packet8d& a, const Packet8d& b) {
+  return pminmax_propagate_nan(a, b, pmin<Packet8d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet16f pmax<PropagateNaN, Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return pminmax_propagate_nan(a, b, pmax<Packet16f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet8d pmax<PropagateNaN, Packet8d>(const Packet8d& a, const Packet8d& b) {
+  return pminmax_propagate_nan(a, b, pmax<Packet8d>);
+}
+
+
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I_); }
+template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { return _mm512_extractf64x2_pd(x,I_); }
+EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); }
+#else
+// AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512
+template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
+  return  _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I_));
+}
+
+// AVX512F does not define _mm512_extractf64x2_pd to extract _m128 from _m512
+template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
+  return _mm_castsi128_pd(_mm512_extracti32x4_epi32( _mm512_castpd_si512(x),I_));
+}
+
+EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
+  return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)),
+                                                _mm256_castps_si256(b),1));
+}
+#endif
+
+// Helper function for bit packing snippet of low precision comparison.
+// It packs the flags from 32x16 to 16x16.
+EIGEN_STRONG_INLINE __m256i Pack32To16(Packet16f rf) {
+  // Split data into small pieces and handle with AVX instructions
+  // to guarantee internal order of vector.
+  // Operation:
+  //   dst[15:0]    := Saturate16(rf[31:0])
+  //   dst[31:16]   := Saturate16(rf[63:32])
+  //   ...
+  //   dst[255:240] := Saturate16(rf[255:224])
+  __m256i lo = _mm256_castps_si256(extract256<0>(rf));
+  __m256i hi = _mm256_castps_si256(extract256<1>(rf));
+  __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0),
+                                      _mm256_extractf128_si256(lo, 1));
+  __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0),
+                                      _mm256_extractf128_si256(hi, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
+  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ);
+  return _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
+  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ);
+  return _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
+  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ);
+  return _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
+  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGE_UQ);
+  return _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) {
+  __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _CMP_EQ_OQ);
+  return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu);
+}
+
+
+template <>
+EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) {
+  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
+  return _mm512_castsi512_pd(
+      _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pcmp_le(const Packet8d& a, const Packet8d& b) {
+  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LE_OQ);
+  return _mm512_castsi512_pd(
+      _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pcmp_lt(const Packet8d& a, const Packet8d& b) {
+  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ);
+  return _mm512_castsi512_pd(
+      _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pcmp_lt_or_nan(const Packet8d& a, const Packet8d& b) {
+  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_NGE_UQ);
+  return _mm512_castsi512_pd(
+      _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f print<Packet16f>(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_CUR_DIRECTION); }
+template<> EIGEN_STRONG_INLINE Packet8d print<Packet8d>(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_CUR_DIRECTION); }
+
+template<> EIGEN_STRONG_INLINE Packet16f pceil<Packet16f>(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_TO_POS_INF); }
+template<> EIGEN_STRONG_INLINE Packet8d pceil<Packet8d>(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_TO_POS_INF); }
+
+template<> EIGEN_STRONG_INLINE Packet16f pfloor<Packet16f>(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEG_INF); }
+template<> EIGEN_STRONG_INLINE Packet8d pfloor<Packet8d>(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_TO_NEG_INF); }
+
+template <>
+EIGEN_STRONG_INLINE Packet16i ptrue<Packet16i>(const Packet16i& /*a*/) {
+  return _mm512_set1_epi32(0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f ptrue<Packet16f>(const Packet16f& a) {
+  return _mm512_castsi512_ps(ptrue<Packet16i>(_mm512_castps_si512(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d ptrue<Packet8d>(const Packet8d& a) {
+  return _mm512_castsi512_pd(ptrue<Packet16i>(_mm512_castpd_si512(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a,
+                                              const Packet16i& b) {
+  return _mm512_and_si512(a,b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a,
+                                              const Packet16f& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_and_ps(a, b);
+#else
+  return _mm512_castsi512_ps(pand(_mm512_castps_si512(a),_mm512_castps_si512(b)));
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a,
+                                            const Packet8d& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_and_pd(a, b);
+#else
+  Packet8d res = _mm512_undefined_pd();
+  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
+  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
+  res = _mm512_insertf64x4(res, _mm256_and_pd(lane0_a, lane0_b), 0);
+
+  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
+  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
+  return _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i por<Packet16i>(const Packet16i& a, const Packet16i& b) {
+  return _mm512_or_si512(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a, const Packet16f& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_or_ps(a, b);
+#else
+  return _mm512_castsi512_ps(por(_mm512_castps_si512(a),_mm512_castps_si512(b)));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a,
+                                           const Packet8d& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_or_pd(a, b);
+#else
+  return _mm512_castsi512_pd(por(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i pxor<Packet16i>(const Packet16i& a, const Packet16i& b) {
+  return _mm512_xor_si512(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a, const Packet16f& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_xor_ps(a, b);
+#else
+  return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a),_mm512_castps_si512(b)));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a, const Packet8d& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_xor_pd(a, b);
+#else
+  return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i pandnot<Packet16i>(const Packet16i& a, const Packet16i& b) {
+  return _mm512_andnot_si512(b, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a, const Packet16f& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_andnot_ps(b, a);
+#else
+  return _mm512_castsi512_ps(pandnot(_mm512_castps_si512(a),_mm512_castps_si512(b)));
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,const Packet8d& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_andnot_pd(b, a);
+#else
+  return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pround<Packet16f>(const Packet16f& a)
+{
+  // Work-around for default std::round rounding mode.
+  const Packet16f mask = pset1frombits<Packet16f>(static_cast<numext::uint32_t>(0x80000000u));
+  const Packet16f prev0dot5 = pset1frombits<Packet16f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));
+  return _mm512_roundscale_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+template<> EIGEN_STRONG_INLINE Packet8d pround<Packet8d>(const Packet8d& a)
+{
+  // Work-around for default std::round rounding mode.
+  const Packet8d mask = pset1frombits<Packet8d>(static_cast<numext::uint64_t>(0x8000000000000000ull));
+  const Packet8d prev0dot5 = pset1frombits<Packet8d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));
+  return _mm512_roundscale_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet16i parithmetic_shift_right(Packet16i a) {
+  return _mm512_srai_epi32(a, N);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet16i plogical_shift_right(Packet16i a) {
+  return _mm512_srli_epi32(a, N);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet16i plogical_shift_left(Packet16i a) {
+  return _mm512_slli_epi32(a, N);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ps(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pload<Packet8d>(const double* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_pd(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i pload<Packet16i>(const int* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
+      reinterpret_cast<const __m512i*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ps(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d ploadu<Packet8d>(const double* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_pd(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i ploadu<Packet16i>(const int* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
+      reinterpret_cast<const __m512i*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from, uint16_t umask) {
+  __mmask16 mask = static_cast<__mmask16>(umask);
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_maskz_loadu_ps(mask, from);
+}
+
+// Loads 8 floats from memory a returns the packet
+// {a0, a0  a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7}
+template <>
+EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
+  // an unaligned load is required here as there is no requirement
+  // on the alignment of input pointer 'from'
+  __m256i low_half = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
+  __m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half));
+  __m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
+  return pairs;
+}
+
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+// FIXME: this does not look optimal, better load a Packet4d and shuffle...
+// Loads 4 doubles from memory a returns the packet {a0, a0  a1, a1, a2, a2, a3,
+// a3}
+template <>
+EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
+ __m512d x = _mm512_setzero_pd();
+  x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[0]), 0);
+  x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[1]), 1);
+  x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[2]), 2);
+  x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3);
+  return x;
+}
+#else
+template <>
+EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
+  __m512d x = _mm512_setzero_pd();
+  x = _mm512_mask_broadcastsd_pd(x, 0x3<<0, _mm_load_sd(from+0));
+  x = _mm512_mask_broadcastsd_pd(x, 0x3<<2, _mm_load_sd(from+1));
+  x = _mm512_mask_broadcastsd_pd(x, 0x3<<4, _mm_load_sd(from+2));
+  x = _mm512_mask_broadcastsd_pd(x, 0x3<<6, _mm_load_sd(from+3));
+  return x;
+}
+#endif
+
+// Loads 4 floats from memory a returns the packet
+// {a0, a0  a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
+template <>
+EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
+  Packet16f tmp = _mm512_castps128_ps512(ploadu<Packet4f>(from));
+  const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
+  return _mm512_permutexvar_ps(scatter_mask, tmp);
+}
+
+// Loads 2 doubles from memory a returns the packet
+// {a0, a0  a0, a0, a1, a1, a1, a1}
+template <>
+EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
+  __m256d lane0 = _mm256_set1_pd(*from);
+  __m256d lane1 = _mm256_set1_pd(*(from+1));
+  __m512d tmp = _mm512_undefined_pd();
+  tmp = _mm512_insertf64x4(tmp, lane0, 0);
+  return _mm512_insertf64x4(tmp, lane1, 1);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ps(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet8d& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_pd(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet16i& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm512_storeu_si512(reinterpret_cast<__m512i*>(to),
+                                                from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ps(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet8d& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_pd(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
+      reinterpret_cast<__m512i*>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from, uint16_t umask) {
+  __mmask16 mask = static_cast<__mmask16>(umask);
+  EIGEN_DEBUG_UNALIGNED_STORE return _mm512_mask_storeu_ps(to, mask, from);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
+                                                             Index stride) {
+  Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
+  Packet16i stride_multiplier =
+      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
+
+  return _mm512_i32gather_ps(indices, from, 4);
+}
+template <>
+EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
+                                                            Index stride) {
+  Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
+  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
+
+  return _mm512_i32gather_pd(indices, from, 8);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
+                                                         const Packet16f& from,
+                                                         Index stride) {
+  Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
+  Packet16i stride_multiplier =
+      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
+  _mm512_i32scatter_ps(to, indices, from, 4);
+}
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
+                                                         const Packet8d& from,
+                                                         Index stride) {
+  Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
+  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
+  _mm512_i32scatter_pd(to, indices, from, 8);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet16f>(float* to, const float& a) {
+  Packet16f pa = pset1<Packet16f>(a);
+  pstore(to, pa);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet8d>(double* to, const double& a) {
+  Packet8d pa = pset1<Packet8d>(a);
+  pstore(to, pa);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet16i>(int* to, const int& a) {
+  Packet16i pa = pset1<Packet16i>(a);
+  pstore(to, pa);
+}
+
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {
+  return _mm_cvtss_f32(_mm512_extractf32x4_ps(a, 0));
+}
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet8d>(const Packet8d& a) {
+  return _mm_cvtsd_f64(_mm256_extractf128_pd(_mm512_extractf64x4_pd(a, 0), 0));
+}
+template <>
+EIGEN_STRONG_INLINE int pfirst<Packet16i>(const Packet16i& a) {
+  return _mm_extract_epi32(_mm512_extracti32x4_epi32(a, 0), 0);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a)
+{
+  return _mm512_permutexvar_ps(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a)
+{
+  return _mm512_permutexvar_pd(_mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a)
+{
+  // _mm512_abs_ps intrinsic not found, so hack around it
+  return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
+  // _mm512_abs_ps intrinsic not found, so hack around it
+  return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a),
+                                   _mm512_set1_epi64(0x7fffffffffffffff)));
+}
+
+template<>
+EIGEN_STRONG_INLINE Packet16f pfrexp<Packet16f>(const Packet16f& a, Packet16f& exponent){
+  return pfrexp_generic(a, exponent);
+}
+
+// Extract exponent without existence of Packet8l.
+template<>
+EIGEN_STRONG_INLINE  
+Packet8d pfrexp_generic_get_biased_exponent(const Packet8d& a) {
+  const Packet8d cst_exp_mask  = pset1frombits<Packet8d>(static_cast<uint64_t>(0x7ff0000000000000ull));
+  #ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_cvtepi64_pd(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52));
+  #else
+  return _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52)));
+  #endif
+}
+
+template<>
+EIGEN_STRONG_INLINE Packet8d pfrexp<Packet8d>(const Packet8d& a, Packet8d& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pldexp<Packet16f>(const Packet16f& a, const Packet16f& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, const Packet8d& exponent) {
+  // Clamp exponent to [-2099, 2099]
+  const Packet8d max_exponent = pset1<Packet8d>(2099.0);
+  const Packet8i e = _mm512_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
+  
+  // Split 2^e into four factors and multiply.
+  const Packet8i bias = pset1<Packet8i>(1023);
+  Packet8i b = parithmetic_shift_right<2>(e);  // floor(e/4)
+  
+  // 2^b
+  const Packet8i permute_idx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
+  Packet8i hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx);
+  Packet8i lo = _mm256_slli_epi64(hi, 52);
+  hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52);
+  Packet8d c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
+  Packet8d out = pmul(pmul(pmul(a, c), c), c);  // a * 2^(3b)
+  
+  // 2^(e - 3b)
+  b = psub(psub(psub(e, b), b), b);  // e - 3b
+  hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx);
+  lo = _mm256_slli_epi64(hi, 52);
+  hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52);
+  c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
+  out = pmul(out, c);  // a * 2^e
+  return out;
+}
+
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+// AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512
+#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)                           \
+  __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0);                    \
+  __m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1)
+#else
+#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)                \
+  __m256 OUTPUT##_0 = _mm256_insertf128_ps(                     \
+      _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 0)), \
+      _mm512_extractf32x4_ps(INPUT, 1), 1);                     \
+  __m256 OUTPUT##_1 = _mm256_insertf128_ps(                     \
+      _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)), \
+      _mm512_extractf32x4_ps(INPUT, 3), 1);
+#endif
+
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
+  OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1);
+#else
+#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB)                    \
+  OUTPUT = _mm512_undefined_ps();                                           \
+  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
+  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \
+  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \
+  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3);
+#endif
+
+template <>
+EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  __m256 lane0 = _mm512_extractf32x8_ps(a, 0);
+  __m256 lane1 = _mm512_extractf32x8_ps(a, 1);
+  Packet8f x = _mm256_add_ps(lane0, lane1);
+  return predux<Packet8f>(x);
+#else
+  __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
+  __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
+  __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
+  __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
+  __m128 sum = _mm_add_ps(_mm_add_ps(lane0, lane1), _mm_add_ps(lane2, lane3));
+  sum = _mm_hadd_ps(sum, sum);
+  sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1));
+  return _mm_cvtss_f32(sum);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
+  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
+  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
+  __m256d sum = _mm256_add_pd(lane0, lane1);
+  __m256d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1));
+  return _mm_cvtsd_f64(_mm256_castpd256_pd128(_mm256_hadd_pd(tmp0, tmp0)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f predux_half_dowto4<Packet16f>(const Packet16f& a) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  __m256 lane0 = _mm512_extractf32x8_ps(a, 0);
+  __m256 lane1 = _mm512_extractf32x8_ps(a, 1);
+  return _mm256_add_ps(lane0, lane1);
+#else
+  __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
+  __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
+  __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
+  __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
+  __m128 sum0 = _mm_add_ps(lane0, lane2);
+  __m128 sum1 = _mm_add_ps(lane1, lane3);
+  return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d predux_half_dowto4<Packet8d>(const Packet8d& a) {
+  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
+  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
+  return _mm256_add_pd(lane0, lane1);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
+//#ifdef EIGEN_VECTORIZE_AVX512DQ
+#if 0
+  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
+  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
+  Packet8f res = pmul(lane0, lane1);
+  res = pmul(res, _mm256_permute2f128_ps(res, res, 1));
+  res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
+  return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
+#else
+  __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
+  __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
+  __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
+  __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
+  __m128 res = pmul(pmul(lane0, lane1), pmul(lane2, lane3));
+  res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
+  return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) {
+  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
+  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
+  __m256d res = pmul(lane0, lane1);
+  res = pmul(res, _mm256_permute2f128_pd(res, res, 1));
+  return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1)));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) {
+  __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
+  __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
+  __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
+  __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
+  __m128 res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
+  res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
+  return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
+}
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet8d>(const Packet8d& a) {
+  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
+  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
+  __m256d res = _mm256_min_pd(lane0, lane1);
+  res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1));
+  return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1)));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet16f>(const Packet16f& a) {
+  __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
+  __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
+  __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
+  __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
+  __m128 res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
+  res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
+  return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
+  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
+  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
+  __m256d res = _mm256_max_pd(lane0, lane1);
+  res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1));
+  return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
+}
+
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x)
+{
+  Packet16i xi = _mm512_castps_si512(x);
+  __mmask16 tmp = _mm512_test_epi32_mask(xi,xi);
+  return !_mm512_kortestz(tmp,tmp);
+}
+
+
+
+#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
+  EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]);
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
+  __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
+  __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
+  __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
+  __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
+  __m512 T4 = _mm512_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
+  __m512 T5 = _mm512_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
+  __m512 T6 = _mm512_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
+  __m512 T7 = _mm512_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
+  __m512 T8 = _mm512_unpacklo_ps(kernel.packet[8], kernel.packet[9]);
+  __m512 T9 = _mm512_unpackhi_ps(kernel.packet[8], kernel.packet[9]);
+  __m512 T10 = _mm512_unpacklo_ps(kernel.packet[10], kernel.packet[11]);
+  __m512 T11 = _mm512_unpackhi_ps(kernel.packet[10], kernel.packet[11]);
+  __m512 T12 = _mm512_unpacklo_ps(kernel.packet[12], kernel.packet[13]);
+  __m512 T13 = _mm512_unpackhi_ps(kernel.packet[12], kernel.packet[13]);
+  __m512 T14 = _mm512_unpacklo_ps(kernel.packet[14], kernel.packet[15]);
+  __m512 T15 = _mm512_unpackhi_ps(kernel.packet[14], kernel.packet[15]);
+  __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512 S4 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S5 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512 S6 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S7 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512 S8 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S9 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512 S10 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S11 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512 S12 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S13 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512 S14 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S15 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(3, 2, 3, 2));
+
+  EIGEN_EXTRACT_8f_FROM_16f(S0, S0);
+  EIGEN_EXTRACT_8f_FROM_16f(S1, S1);
+  EIGEN_EXTRACT_8f_FROM_16f(S2, S2);
+  EIGEN_EXTRACT_8f_FROM_16f(S3, S3);
+  EIGEN_EXTRACT_8f_FROM_16f(S4, S4);
+  EIGEN_EXTRACT_8f_FROM_16f(S5, S5);
+  EIGEN_EXTRACT_8f_FROM_16f(S6, S6);
+  EIGEN_EXTRACT_8f_FROM_16f(S7, S7);
+  EIGEN_EXTRACT_8f_FROM_16f(S8, S8);
+  EIGEN_EXTRACT_8f_FROM_16f(S9, S9);
+  EIGEN_EXTRACT_8f_FROM_16f(S10, S10);
+  EIGEN_EXTRACT_8f_FROM_16f(S11, S11);
+  EIGEN_EXTRACT_8f_FROM_16f(S12, S12);
+  EIGEN_EXTRACT_8f_FROM_16f(S13, S13);
+  EIGEN_EXTRACT_8f_FROM_16f(S14, S14);
+  EIGEN_EXTRACT_8f_FROM_16f(S15, S15);
+
+  PacketBlock<Packet8f, 32> tmp;
+
+  tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S4_0, 0x20);
+  tmp.packet[1] = _mm256_permute2f128_ps(S1_0, S5_0, 0x20);
+  tmp.packet[2] = _mm256_permute2f128_ps(S2_0, S6_0, 0x20);
+  tmp.packet[3] = _mm256_permute2f128_ps(S3_0, S7_0, 0x20);
+  tmp.packet[4] = _mm256_permute2f128_ps(S0_0, S4_0, 0x31);
+  tmp.packet[5] = _mm256_permute2f128_ps(S1_0, S5_0, 0x31);
+  tmp.packet[6] = _mm256_permute2f128_ps(S2_0, S6_0, 0x31);
+  tmp.packet[7] = _mm256_permute2f128_ps(S3_0, S7_0, 0x31);
+
+  tmp.packet[8] = _mm256_permute2f128_ps(S0_1, S4_1, 0x20);
+  tmp.packet[9] = _mm256_permute2f128_ps(S1_1, S5_1, 0x20);
+  tmp.packet[10] = _mm256_permute2f128_ps(S2_1, S6_1, 0x20);
+  tmp.packet[11] = _mm256_permute2f128_ps(S3_1, S7_1, 0x20);
+  tmp.packet[12] = _mm256_permute2f128_ps(S0_1, S4_1, 0x31);
+  tmp.packet[13] = _mm256_permute2f128_ps(S1_1, S5_1, 0x31);
+  tmp.packet[14] = _mm256_permute2f128_ps(S2_1, S6_1, 0x31);
+  tmp.packet[15] = _mm256_permute2f128_ps(S3_1, S7_1, 0x31);
+
+  // Second set of _m256 outputs
+  tmp.packet[16] = _mm256_permute2f128_ps(S8_0, S12_0, 0x20);
+  tmp.packet[17] = _mm256_permute2f128_ps(S9_0, S13_0, 0x20);
+  tmp.packet[18] = _mm256_permute2f128_ps(S10_0, S14_0, 0x20);
+  tmp.packet[19] = _mm256_permute2f128_ps(S11_0, S15_0, 0x20);
+  tmp.packet[20] = _mm256_permute2f128_ps(S8_0, S12_0, 0x31);
+  tmp.packet[21] = _mm256_permute2f128_ps(S9_0, S13_0, 0x31);
+  tmp.packet[22] = _mm256_permute2f128_ps(S10_0, S14_0, 0x31);
+  tmp.packet[23] = _mm256_permute2f128_ps(S11_0, S15_0, 0x31);
+
+  tmp.packet[24] = _mm256_permute2f128_ps(S8_1, S12_1, 0x20);
+  tmp.packet[25] = _mm256_permute2f128_ps(S9_1, S13_1, 0x20);
+  tmp.packet[26] = _mm256_permute2f128_ps(S10_1, S14_1, 0x20);
+  tmp.packet[27] = _mm256_permute2f128_ps(S11_1, S15_1, 0x20);
+  tmp.packet[28] = _mm256_permute2f128_ps(S8_1, S12_1, 0x31);
+  tmp.packet[29] = _mm256_permute2f128_ps(S9_1, S13_1, 0x31);
+  tmp.packet[30] = _mm256_permute2f128_ps(S10_1, S14_1, 0x31);
+  tmp.packet[31] = _mm256_permute2f128_ps(S11_1, S15_1, 0x31);
+
+  // Pack them into the output
+  PACK_OUTPUT(kernel.packet, tmp.packet, 0, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 1, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 2, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 3, 16);
+
+  PACK_OUTPUT(kernel.packet, tmp.packet, 4, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 5, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 6, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 7, 16);
+
+  PACK_OUTPUT(kernel.packet, tmp.packet, 8, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 9, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 10, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 11, 16);
+
+  PACK_OUTPUT(kernel.packet, tmp.packet, 12, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 13, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 14, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 15, 16);
+}
+#define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE)         \
+  EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[2 * INDEX], \
+                           INPUT[2 * INDEX + STRIDE]);
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
+  __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
+  __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
+  __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
+  __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
+
+  __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
+
+  EIGEN_EXTRACT_8f_FROM_16f(S0, S0);
+  EIGEN_EXTRACT_8f_FROM_16f(S1, S1);
+  EIGEN_EXTRACT_8f_FROM_16f(S2, S2);
+  EIGEN_EXTRACT_8f_FROM_16f(S3, S3);
+
+  PacketBlock<Packet8f, 8> tmp;
+
+  tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S1_0, 0x20);
+  tmp.packet[1] = _mm256_permute2f128_ps(S2_0, S3_0, 0x20);
+  tmp.packet[2] = _mm256_permute2f128_ps(S0_0, S1_0, 0x31);
+  tmp.packet[3] = _mm256_permute2f128_ps(S2_0, S3_0, 0x31);
+
+  tmp.packet[4] = _mm256_permute2f128_ps(S0_1, S1_1, 0x20);
+  tmp.packet[5] = _mm256_permute2f128_ps(S2_1, S3_1, 0x20);
+  tmp.packet[6] = _mm256_permute2f128_ps(S0_1, S1_1, 0x31);
+  tmp.packet[7] = _mm256_permute2f128_ps(S2_1, S3_1, 0x31);
+
+  PACK_OUTPUT_2(kernel.packet, tmp.packet, 0, 1);
+  PACK_OUTPUT_2(kernel.packet, tmp.packet, 1, 1);
+  PACK_OUTPUT_2(kernel.packet, tmp.packet, 2, 1);
+  PACK_OUTPUT_2(kernel.packet, tmp.packet, 3, 1);
+}
+
+#define PACK_OUTPUT_SQ_D(OUTPUT, INPUT, INDEX, STRIDE)                \
+  OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX], 0); \
+  OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX + STRIDE], 1);
+
+#define PACK_OUTPUT_D(OUTPUT, INPUT, INDEX, STRIDE)                         \
+  OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX)], 0); \
+  OUTPUT[INDEX] =                                                           \
+      _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1);
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 4>& kernel) {
+  __m512d T0 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
+  __m512d T1 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0xff);
+  __m512d T2 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);
+  __m512d T3 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0xff);
+
+  PacketBlock<Packet4d, 8> tmp;
+
+  tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
+                                         _mm512_extractf64x4_pd(T2, 0), 0x20);
+  tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
+                                         _mm512_extractf64x4_pd(T3, 0), 0x20);
+  tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
+                                         _mm512_extractf64x4_pd(T2, 0), 0x31);
+  tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
+                                         _mm512_extractf64x4_pd(T3, 0), 0x31);
+
+  tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
+                                         _mm512_extractf64x4_pd(T2, 1), 0x20);
+  tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
+                                         _mm512_extractf64x4_pd(T3, 1), 0x20);
+  tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
+                                         _mm512_extractf64x4_pd(T2, 1), 0x31);
+  tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
+                                         _mm512_extractf64x4_pd(T3, 1), 0x31);
+
+  PACK_OUTPUT_D(kernel.packet, tmp.packet, 0, 1);
+  PACK_OUTPUT_D(kernel.packet, tmp.packet, 1, 1);
+  PACK_OUTPUT_D(kernel.packet, tmp.packet, 2, 1);
+  PACK_OUTPUT_D(kernel.packet, tmp.packet, 3, 1);
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 8>& kernel) {
+  __m512d T0 = _mm512_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
+  __m512d T1 = _mm512_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
+  __m512d T2 = _mm512_unpacklo_pd(kernel.packet[2], kernel.packet[3]);
+  __m512d T3 = _mm512_unpackhi_pd(kernel.packet[2], kernel.packet[3]);
+  __m512d T4 = _mm512_unpacklo_pd(kernel.packet[4], kernel.packet[5]);
+  __m512d T5 = _mm512_unpackhi_pd(kernel.packet[4], kernel.packet[5]);
+  __m512d T6 = _mm512_unpacklo_pd(kernel.packet[6], kernel.packet[7]);
+  __m512d T7 = _mm512_unpackhi_pd(kernel.packet[6], kernel.packet[7]);
+
+  PacketBlock<Packet4d, 16> tmp;
+
+  tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
+                                         _mm512_extractf64x4_pd(T2, 0), 0x20);
+  tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
+                                         _mm512_extractf64x4_pd(T3, 0), 0x20);
+  tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
+                                         _mm512_extractf64x4_pd(T2, 0), 0x31);
+  tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
+                                         _mm512_extractf64x4_pd(T3, 0), 0x31);
+
+  tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
+                                         _mm512_extractf64x4_pd(T2, 1), 0x20);
+  tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
+                                         _mm512_extractf64x4_pd(T3, 1), 0x20);
+  tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
+                                         _mm512_extractf64x4_pd(T2, 1), 0x31);
+  tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
+                                         _mm512_extractf64x4_pd(T3, 1), 0x31);
+
+  tmp.packet[8] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0),
+                                         _mm512_extractf64x4_pd(T6, 0), 0x20);
+  tmp.packet[9] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0),
+                                         _mm512_extractf64x4_pd(T7, 0), 0x20);
+  tmp.packet[10] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0),
+                                          _mm512_extractf64x4_pd(T6, 0), 0x31);
+  tmp.packet[11] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0),
+                                          _mm512_extractf64x4_pd(T7, 0), 0x31);
+
+  tmp.packet[12] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1),
+                                          _mm512_extractf64x4_pd(T6, 1), 0x20);
+  tmp.packet[13] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1),
+                                          _mm512_extractf64x4_pd(T7, 1), 0x20);
+  tmp.packet[14] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1),
+                                          _mm512_extractf64x4_pd(T6, 1), 0x31);
+  tmp.packet[15] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1),
+                                          _mm512_extractf64x4_pd(T7, 1), 0x31);
+
+  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 0, 8);
+  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 1, 8);
+  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 2, 8);
+  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 3, 8);
+
+  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 4, 8);
+  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 5, 8);
+  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 6, 8);
+  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 7, 8);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/,
+                                     const Packet16f& /*thenPacket*/,
+                                     const Packet16f& /*elsePacket*/) {
+  assert(false && "To be implemented");
+  return Packet16f();
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket,
+                                    const Packet8d& thenPacket,
+                                    const Packet8d& elsePacket) {
+  __mmask8 m = (ifPacket.select[0]   )
+             | (ifPacket.select[1]<<1)
+             | (ifPacket.select[2]<<2)
+             | (ifPacket.select[3]<<3)
+             | (ifPacket.select[4]<<4)
+             | (ifPacket.select[5]<<5)
+             | (ifPacket.select[6]<<6)
+             | (ifPacket.select[7]<<7);
+  return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
+}
+
+// Packet math for Eigen::half
+template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
+  return _mm256_set1_epi16(from.x);
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
+  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from, 0)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
+  return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
+  return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
+  // (void*) -> workaround clang warning:
+  // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
+  _mm256_store_si256((__m256i*)(void*)to, from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
+  // (void*) -> workaround clang warning:
+  // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
+  _mm256_storeu_si256((__m256i*)(void*)to, from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h
+ploaddup<Packet16h>(const Eigen::half*  from) {
+  unsigned short a = from[0].x;
+  unsigned short b = from[1].x;
+  unsigned short c = from[2].x;
+  unsigned short d = from[3].x;
+  unsigned short e = from[4].x;
+  unsigned short f = from[5].x;
+  unsigned short g = from[6].x;
+  unsigned short h = from[7].x;
+  return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h
+ploadquad(const Eigen::half* from) {
+  unsigned short a = from[0].x;
+  unsigned short b = from[1].x;
+  unsigned short c = from[2].x;
+  unsigned short d = from[3].x;
+  return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
+}
+
+EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) {
+#ifdef EIGEN_HAS_FP16_C
+  return _mm512_cvtph_ps(a);
+#else
+  EIGEN_ALIGN64 half aux[16];
+  pstore(aux, a);
+  float f0(aux[0]);
+  float f1(aux[1]);
+  float f2(aux[2]);
+  float f3(aux[3]);
+  float f4(aux[4]);
+  float f5(aux[5]);
+  float f6(aux[6]);
+  float f7(aux[7]);
+  float f8(aux[8]);
+  float f9(aux[9]);
+  float fa(aux[10]);
+  float fb(aux[11]);
+  float fc(aux[12]);
+  float fd(aux[13]);
+  float fe(aux[14]);
+  float ff(aux[15]);
+
+  return _mm512_set_ps(
+      ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0);
+#endif
+}
+
+EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
+#ifdef EIGEN_HAS_FP16_C
+  return _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
+#else
+  EIGEN_ALIGN64 float aux[16];
+  pstore(aux, a);
+  half h0(aux[0]);
+  half h1(aux[1]);
+  half h2(aux[2]);
+  half h3(aux[3]);
+  half h4(aux[4]);
+  half h5(aux[5]);
+  half h6(aux[6]);
+  half h7(aux[7]);
+  half h8(aux[8]);
+  half h9(aux[9]);
+  half ha(aux[10]);
+  half hb(aux[11]);
+  half hc(aux[12]);
+  half hd(aux[13]);
+  half he(aux[14]);
+  half hf(aux[15]);
+
+  return _mm256_set_epi16(
+      hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x,
+      h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) {
+  return ptrue(Packet8i(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pabs(const Packet16h& a) {
+  const __m256i sign_mask = _mm256_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  return _mm256_andnot_si256(sign_mask, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmin<Packet16h>(const Packet16h& a,
+                                              const Packet16h& b) {
+  return float2half(pmin<Packet16f>(half2float(a), half2float(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmax<Packet16h>(const Packet16h& a,
+                                              const Packet16h& b) {
+  return float2half(pmax<Packet16f>(half2float(a), half2float(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h plset<Packet16h>(const half& a) {
+  return float2half(plset<Packet16f>(static_cast<float>(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) {
+  // in some cases Packet8i is a wrapper around __m256i, so we need to
+  // cast to Packet8i to call the correct overload.
+  return por(Packet8i(a),Packet8i(b));
+}
+template<> EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a,const Packet16h& b) {
+  return pxor(Packet8i(a),Packet8i(b));
+}
+template<> EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a,const Packet16h& b) {
+  return pand(Packet8i(a),Packet8i(b));
+}
+template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet16h& b) {
+  return pandnot(Packet8i(a),Packet8i(b));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) {
+  return _mm256_blendv_epi8(b, a, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pround<Packet16h>(const Packet16h& a) {
+  return float2half(pround<Packet16f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h print<Packet16h>(const Packet16h& a) {
+  return float2half(print<Packet16f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pceil<Packet16h>(const Packet16h& a) {
+  return float2half(pceil<Packet16f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pfloor<Packet16h>(const Packet16h& a) {
+  return float2half(pfloor<Packet16f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a,const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  return Pack32To16(pcmp_eq(af, bf));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pcmp_le(const Packet16h& a,const Packet16h& b) {
+  return Pack32To16(pcmp_le(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pcmp_lt(const Packet16h& a,const Packet16h& b) {
+  return Pack32To16(pcmp_lt(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pcmp_lt_or_nan(const Packet16h& a,const Packet16h& b) {
+  return Pack32To16(pcmp_lt_or_nan(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pconj(const Packet16h& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
+  Packet16h sign_mask = _mm256_set1_epi16(static_cast<unsigned short>(0x8000));
+  return _mm256_xor_si256(a, sign_mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  Packet16f rf = padd(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  Packet16f rf = psub(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  Packet16f rf = pmul(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  Packet16f rf = pdiv(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
+  Packet16f from_float = half2float(from);
+  return half(predux(from_float));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) {
+  Packet8h lane0 = _mm256_extractf128_si256(a, 0);
+  Packet8h lane1 = _mm256_extractf128_si256(a, 1);
+  return padd<Packet8h>(lane0, lane1);
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet16h>(const Packet16h& a) {
+  Packet16f af = half2float(a);
+  float reduced = predux_max<Packet16f>(af);
+  return Eigen::half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet16h>(const Packet16h& a) {
+  Packet16f af = half2float(a);
+  float reduced = predux_min<Packet16f>(af);
+  return Eigen::half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& from) {
+  Packet16f from_float = half2float(from);
+  return half(predux_mul(from_float));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a)
+{
+  __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
+  return _mm256_insertf128_si256(
+                    _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a,1),m)),
+                                           _mm_shuffle_epi8(_mm256_extractf128_si256(a,0),m), 1);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
+{
+  return _mm256_set_epi16(
+      from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x,
+      from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x,
+      from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x,
+      from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
+}
+
+template<> EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride)
+{
+  EIGEN_ALIGN64 half aux[16];
+  pstore(aux, from);
+  to[stride*0] = aux[0];
+  to[stride*1] = aux[1];
+  to[stride*2] = aux[2];
+  to[stride*3] = aux[3];
+  to[stride*4] = aux[4];
+  to[stride*5] = aux[5];
+  to[stride*6] = aux[6];
+  to[stride*7] = aux[7];
+  to[stride*8] = aux[8];
+  to[stride*9] = aux[9];
+  to[stride*10] = aux[10];
+  to[stride*11] = aux[11];
+  to[stride*12] = aux[12];
+  to[stride*13] = aux[13];
+  to[stride*14] = aux[14];
+  to[stride*15] = aux[15];
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet16h,16>& kernel) {
+  __m256i a = kernel.packet[0];
+  __m256i b = kernel.packet[1];
+  __m256i c = kernel.packet[2];
+  __m256i d = kernel.packet[3];
+  __m256i e = kernel.packet[4];
+  __m256i f = kernel.packet[5];
+  __m256i g = kernel.packet[6];
+  __m256i h = kernel.packet[7];
+  __m256i i = kernel.packet[8];
+  __m256i j = kernel.packet[9];
+  __m256i k = kernel.packet[10];
+  __m256i l = kernel.packet[11];
+  __m256i m = kernel.packet[12];
+  __m256i n = kernel.packet[13];
+  __m256i o = kernel.packet[14];
+  __m256i p = kernel.packet[15];
+
+  __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
+  __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
+  __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
+  __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
+  __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
+  __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
+  __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
+  __m256i op_07 = _mm256_unpacklo_epi16(o, p);
+
+  __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
+  __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
+  __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
+  __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
+  __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
+  __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
+  __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
+  __m256i op_8f = _mm256_unpackhi_epi16(o, p);
+
+  __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
+  __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
+  __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
+  __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
+  __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
+  __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
+  __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
+  __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
+
+  __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
+  __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
+  __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
+  __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
+  __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
+  __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
+  __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
+  __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
+
+  __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
+  __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
+  __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
+  __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
+  __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
+  __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
+  __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
+  __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
+  __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
+  __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
+  __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
+  __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
+  __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
+  __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
+  __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
+  __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
+
+  // NOTE: no unpacklo/hi instr in this case, so using permute instr.
+  __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
+  __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
+  __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
+  __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
+  __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
+  __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
+  __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
+  __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
+  __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
+  __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
+  __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
+  __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
+  __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
+  __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
+  __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
+  __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
+
+  kernel.packet[0] = a_p_0;
+  kernel.packet[1] = a_p_1;
+  kernel.packet[2] = a_p_2;
+  kernel.packet[3] = a_p_3;
+  kernel.packet[4] = a_p_4;
+  kernel.packet[5] = a_p_5;
+  kernel.packet[6] = a_p_6;
+  kernel.packet[7] = a_p_7;
+  kernel.packet[8] = a_p_8;
+  kernel.packet[9] = a_p_9;
+  kernel.packet[10] = a_p_a;
+  kernel.packet[11] = a_p_b;
+  kernel.packet[12] = a_p_c;
+  kernel.packet[13] = a_p_d;
+  kernel.packet[14] = a_p_e;
+  kernel.packet[15] = a_p_f;
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet16h,8>& kernel) {
+  EIGEN_ALIGN64 half in[8][16];
+  pstore<half>(in[0], kernel.packet[0]);
+  pstore<half>(in[1], kernel.packet[1]);
+  pstore<half>(in[2], kernel.packet[2]);
+  pstore<half>(in[3], kernel.packet[3]);
+  pstore<half>(in[4], kernel.packet[4]);
+  pstore<half>(in[5], kernel.packet[5]);
+  pstore<half>(in[6], kernel.packet[6]);
+  pstore<half>(in[7], kernel.packet[7]);
+
+  EIGEN_ALIGN64 half out[8][16];
+
+  for (int i = 0; i < 8; ++i) {
+    for (int j = 0; j < 8; ++j) {
+      out[i][j] = in[j][2*i];
+    }
+    for (int j = 0; j < 8; ++j) {
+      out[i][j+8] = in[j][2*i+1];
+    }
+  }
+
+  kernel.packet[0] = pload<Packet16h>(out[0]);
+  kernel.packet[1] = pload<Packet16h>(out[1]);
+  kernel.packet[2] = pload<Packet16h>(out[2]);
+  kernel.packet[3] = pload<Packet16h>(out[3]);
+  kernel.packet[4] = pload<Packet16h>(out[4]);
+  kernel.packet[5] = pload<Packet16h>(out[5]);
+  kernel.packet[6] = pload<Packet16h>(out[6]);
+  kernel.packet[7] = pload<Packet16h>(out[7]);
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet16h,4>& kernel) {
+  EIGEN_ALIGN64 half in[4][16];
+  pstore<half>(in[0], kernel.packet[0]);
+  pstore<half>(in[1], kernel.packet[1]);
+  pstore<half>(in[2], kernel.packet[2]);
+  pstore<half>(in[3], kernel.packet[3]);
+
+  EIGEN_ALIGN64 half out[4][16];
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      out[i][j] = in[j][4*i];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j+4] = in[j][4*i+1];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j+8] = in[j][4*i+2];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j+12] = in[j][4*i+3];
+    }
+  }
+
+  kernel.packet[0] = pload<Packet16h>(out[0]);
+  kernel.packet[1] = pload<Packet16h>(out[1]);
+  kernel.packet[2] = pload<Packet16h>(out[2]);
+  kernel.packet[3] = pload<Packet16h>(out[3]);
+}
+
+template <> struct is_arithmetic<Packet16bf> { enum { value = true }; };
+
+template <>
+struct packet_traits<bfloat16> : default_packet_traits {
+  typedef Packet16bf type;
+  typedef Packet8bf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+    HasHalfPacket = 1,
+    HasBlend = 0,
+    HasInsert = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+    HasLog = 1,  // Currently fails test with bad accuracy.
+    HasLog1p  = 1,
+    HasExpm1  = 1,
+    HasNdtri = 1,
+    HasBessel  = 1,
+#endif
+    HasExp = 1,
+    HasSqrt = EIGEN_FAST_MATH,
+    HasRsqrt = EIGEN_FAST_MATH,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+#endif
+    HasCmp  = 1,
+    HasDiv = 1
+  };
+};
+
+template <>
+struct unpacket_traits<Packet16bf>
+{
+  typedef bfloat16 type;
+  enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
+  typedef Packet8bf half;
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pset1<Packet16bf>(const bfloat16& from) {
+  return _mm256_set1_epi16(from.value);
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 pfirst<Packet16bf>(const Packet16bf& from) {
+  bfloat16 t;
+  t.value = static_cast<unsigned short>(_mm256_extract_epi16(from, 0));
+  return t;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pload<Packet16bf>(const bfloat16* from) {
+  return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf ploadu<Packet16bf>(const bfloat16* from) {
+  return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to,
+                                          const Packet16bf& from) {
+  _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to,
+                                           const Packet16bf& from) {
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16bf
+ploaddup<Packet16bf>(const bfloat16* from) {
+  Packet16bf r;
+  unsigned short a = from[0].value;
+  unsigned short b = from[1].value;
+  unsigned short c = from[2].value;
+  unsigned short d = from[3].value;
+  unsigned short e = from[4].value;
+  unsigned short f = from[5].value;
+  unsigned short g = from[6].value;
+  unsigned short h = from[7].value;
+  return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16bf
+ploadquad(const bfloat16* from) {
+  Packet16bf r;
+  unsigned short a = from[0].value;
+  unsigned short b = from[1].value;
+  unsigned short c = from[2].value;
+  unsigned short d = from[3].value;
+  return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
+}
+
+EIGEN_STRONG_INLINE Packet16f Bf16ToF32(const Packet16bf& a) {
+  return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(a), 16));
+}
+
+// Convert float to bfloat16 according to round-to-nearest-even/denormals algorithm.
+EIGEN_STRONG_INLINE Packet16bf F32ToBf16(const Packet16f& a) {
+  Packet16bf r;
+
+#if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_AT_LEAST(10, 1)
+  // Since GCC 10.1 supports avx512bf16 and C style explicit cast
+  // (C++ static_cast is not supported yet), do converion via intrinsic
+  // and register path for performance.
+  r = (__m256i)(_mm512_cvtneps_pbh(a));
+
+#else
+  __m512i t;
+  __m512i input = _mm512_castps_si512(a);
+  __m512i nan = _mm512_set1_epi32(0x7fc0);
+
+  // uint32_t lsb = (input >> 16) & 1;
+  t = _mm512_and_si512(_mm512_srli_epi32(input, 16), _mm512_set1_epi32(1));
+  // uint32_t rounding_bias = 0x7fff + lsb;
+  t = _mm512_add_epi32(t, _mm512_set1_epi32(0x7fff));
+  // input += rounding_bias;
+  t = _mm512_add_epi32(t, input);
+  // input = input >> 16;
+  t = _mm512_srli_epi32(t, 16);
+
+  // Check NaN before converting back to bf16
+  __mmask16 mask = _mm512_cmp_ps_mask(a, a, _CMP_ORD_Q);
+
+  t = _mm512_mask_blend_epi32(mask, nan, t);
+  // output.value = static_cast<uint16_t>(input);
+  r = _mm512_cvtepi32_epi16(t);
+#endif // EIGEN_VECTORIZE_AVX512BF16
+
+  return r;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf ptrue(const Packet16bf& a) {
+  return ptrue<Packet8i>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf por(const Packet16bf& a, const Packet16bf& b) {
+  return por<Packet8i>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pxor(const Packet16bf& a, const Packet16bf& b) {
+  return pxor<Packet8i>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pand(const Packet16bf& a, const Packet16bf& b) {
+  return pand<Packet8i>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pandnot(const Packet16bf& a,
+                                       const Packet16bf& b) {
+  return pandnot<Packet8i>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pselect(const Packet16bf& mask,
+                                       const Packet16bf& a,
+                                       const Packet16bf& b) {
+  // Input mask is expected to be all 0/1, handle it with 8-bit
+  // intrinsic for performance.
+  return _mm256_blendv_epi8(b, a, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16bf pround<Packet16bf>(const Packet16bf& a)
+{
+  return F32ToBf16(pround<Packet16f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16bf print<Packet16bf>(const Packet16bf& a) {
+  return F32ToBf16(print<Packet16f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16bf pceil<Packet16bf>(const Packet16bf& a) {
+  return F32ToBf16(pceil<Packet16f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16bf pfloor<Packet16bf>(const Packet16bf& a) {
+  return F32ToBf16(pfloor<Packet16f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pcmp_eq(const Packet16bf& a,
+                                       const Packet16bf& b) {
+  return Pack32To16(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pcmp_le(const Packet16bf& a,
+                                       const Packet16bf& b) {
+  return Pack32To16(pcmp_le(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pcmp_lt(const Packet16bf& a,
+                                       const Packet16bf& b) {
+  return Pack32To16(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pcmp_lt_or_nan(const Packet16bf& a,
+                                              const Packet16bf& b) {
+  return Pack32To16(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pnegate(const Packet16bf& a) {
+  Packet16bf sign_mask = _mm256_set1_epi16(static_cast<unsigned short>(0x8000));
+  return _mm256_xor_si256(a, sign_mask);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pconj(const Packet16bf& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pabs(const Packet16bf& a) {
+  const __m256i sign_mask = _mm256_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  return _mm256_andnot_si256(sign_mask, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf padd<Packet16bf>(const Packet16bf& a,
+                                                const Packet16bf& b) {
+  return F32ToBf16(padd<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf psub<Packet16bf>(const Packet16bf& a,
+                                                const Packet16bf& b) {
+  return F32ToBf16(psub<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pmul<Packet16bf>(const Packet16bf& a,
+                                                const Packet16bf& b) {
+  return F32ToBf16(pmul<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pdiv<Packet16bf>(const Packet16bf& a,
+                                                const Packet16bf& b) {
+  return F32ToBf16(pdiv<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pmin<Packet16bf>(const Packet16bf& a,
+                                                const Packet16bf& b) {
+  return F32ToBf16(pmin<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pmax<Packet16bf>(const Packet16bf& a,
+                                                const Packet16bf& b) {
+  return F32ToBf16(pmax<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf plset<Packet16bf>(const bfloat16& a) {
+  return F32ToBf16(plset<Packet16f>(static_cast<float>(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf predux_half_dowto4<Packet16bf>(const Packet16bf& a) {
+  Packet8bf lane0 = _mm256_extractf128_si256(a, 0);
+  Packet8bf lane1 = _mm256_extractf128_si256(a, 1);
+  return padd<Packet8bf>(lane0, lane1);
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux<Packet16bf>(const Packet16bf& p) {
+  return static_cast<bfloat16>(predux<Packet16f>(Bf16ToF32(p)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet16bf>(const Packet16bf& from) {
+  return static_cast<bfloat16>(predux_mul<Packet16f>(Bf16ToF32(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min<Packet16bf>(const Packet16bf& from) {
+  return static_cast<bfloat16>(predux_min<Packet16f>(Bf16ToF32(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_max<Packet16bf>(const Packet16bf& from) {
+  return static_cast<bfloat16>(predux_max<Packet16f>(Bf16ToF32(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf preverse(const Packet16bf& a) {
+  __m256i m = _mm256_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,
+                               14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
+
+  Packet16bf res;
+  // Swap hi and lo first because shuffle is in 128-bit lanes.
+  res = _mm256_permute2x128_si256(a, a, 1);
+  // Shuffle 8-bit values in src within 2*128-bit lanes.
+  return _mm256_shuffle_epi8(res, m);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pgather<bfloat16, Packet16bf>(const bfloat16* from,
+                                                             Index stride) {
+  return _mm256_set_epi16(
+      from[15*stride].value, from[14*stride].value, from[13*stride].value, from[12*stride].value,
+      from[11*stride].value, from[10*stride].value, from[9*stride].value, from[8*stride].value,
+      from[7*stride].value, from[6*stride].value, from[5*stride].value, from[4*stride].value,
+      from[3*stride].value, from[2*stride].value, from[1*stride].value, from[0*stride].value);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet16bf>(bfloat16* to,
+                                                        const Packet16bf& from,
+                                                        Index stride) {
+  EIGEN_ALIGN64 bfloat16 aux[16];
+  pstore(aux, from);
+  to[stride*0] = aux[0];
+  to[stride*1] = aux[1];
+  to[stride*2] = aux[2];
+  to[stride*3] = aux[3];
+  to[stride*4] = aux[4];
+  to[stride*5] = aux[5];
+  to[stride*6] = aux[6];
+  to[stride*7] = aux[7];
+  to[stride*8] = aux[8];
+  to[stride*9] = aux[9];
+  to[stride*10] = aux[10];
+  to[stride*11] = aux[11];
+  to[stride*12] = aux[12];
+  to[stride*13] = aux[13];
+  to[stride*14] = aux[14];
+  to[stride*15] = aux[15];
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf,16>& kernel) {
+  __m256i a = kernel.packet[0];
+  __m256i b = kernel.packet[1];
+  __m256i c = kernel.packet[2];
+  __m256i d = kernel.packet[3];
+  __m256i e = kernel.packet[4];
+  __m256i f = kernel.packet[5];
+  __m256i g = kernel.packet[6];
+  __m256i h = kernel.packet[7];
+  __m256i i = kernel.packet[8];
+  __m256i j = kernel.packet[9];
+  __m256i k = kernel.packet[10];
+  __m256i l = kernel.packet[11];
+  __m256i m = kernel.packet[12];
+  __m256i n = kernel.packet[13];
+  __m256i o = kernel.packet[14];
+  __m256i p = kernel.packet[15];
+
+  __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
+  __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
+  __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
+  __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
+  __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
+  __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
+  __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
+  __m256i op_07 = _mm256_unpacklo_epi16(o, p);
+
+  __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
+  __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
+  __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
+  __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
+  __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
+  __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
+  __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
+  __m256i op_8f = _mm256_unpackhi_epi16(o, p);
+
+  __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
+  __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
+  __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
+  __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
+  __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
+  __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
+  __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
+  __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
+
+  __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
+  __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
+  __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
+  __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
+  __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
+  __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
+  __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
+  __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
+
+  __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
+  __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
+  __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
+  __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
+  __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
+  __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
+  __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
+  __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
+  __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
+  __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
+  __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
+  __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
+  __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
+  __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
+  __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
+  __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
+
+  // NOTE: no unpacklo/hi instr in this case, so using permute instr.
+  kernel.packet[0] = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
+  kernel.packet[1] = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
+  kernel.packet[2] = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
+  kernel.packet[3] = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
+  kernel.packet[4] = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
+  kernel.packet[5] = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
+  kernel.packet[6] = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
+  kernel.packet[7] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
+  kernel.packet[8] = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
+  kernel.packet[9] = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
+  kernel.packet[10] = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
+  kernel.packet[11] = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
+  kernel.packet[12] = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
+  kernel.packet[13] = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
+  kernel.packet[14] = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
+  kernel.packet[15] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf,4>& kernel) {
+  __m256i a = kernel.packet[0];
+  __m256i b = kernel.packet[1];
+  __m256i c = kernel.packet[2];
+  __m256i d = kernel.packet[3];
+
+  __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
+  __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
+  __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
+  __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
+
+  __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
+  __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
+  __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
+  __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
+
+  // NOTE: no unpacklo/hi instr in this case, so using permute instr.
+  kernel.packet[0] = _mm256_permute2x128_si256(abcd_03, abcd_47, 0x20);
+  kernel.packet[1] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x20);
+  kernel.packet[2] = _mm256_permute2x128_si256(abcd_03, abcd_47, 0x31);
+  kernel.packet[3] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x31);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PACKET_MATH_AVX512_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h b/src/3rdparty/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h
new file mode 100644
index 000000000..330412729
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h
@@ -0,0 +1,89 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2019 Rasmus Munk Larsen <rmlarsen@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_AVX512_H
+#define EIGEN_TYPE_CASTING_AVX512_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<> EIGEN_STRONG_INLINE Packet16i pcast<Packet16f, Packet16i>(const Packet16f& a) {
+  return _mm512_cvttps_epi32(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16i, Packet16f>(const Packet16i& a) {
+  return _mm512_cvtepi32_ps(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16i preinterpret<Packet16i, Packet16f>(const Packet16f& a) {
+  return _mm512_castps_si512(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet16i>(const Packet16i& a) {
+  return _mm512_castsi512_ps(a);
+}
+
+template <>
+struct type_casting_traits<half, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {
+  return half2float(a);
+}
+
+template <>
+struct type_casting_traits<float, half> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) {
+  return float2half(a);
+}
+
+template <>
+struct type_casting_traits<bfloat16, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16bf, Packet16f>(const Packet16bf& a) {
+  return Bf16ToF32(a);
+}
+
+template <>
+struct type_casting_traits<float, bfloat16> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet16bf pcast<Packet16f, Packet16bf>(const Packet16f& a) {
+  return F32ToBf16(a);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_AVX512_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/AltiVec/Complex.h b/src/3rdparty/eigen/Eigen/src/Core/arch/AltiVec/Complex.h
new file mode 100644
index 000000000..f424f11cf
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -0,0 +1,417 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2010-2016 Konstantinos Margaritis <markos@freevec.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX32_ALTIVEC_H
+#define EIGEN_COMPLEX32_ALTIVEC_H
+
+namespace Eigen {
+
+namespace internal {
+
+static Packet4ui  p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+#ifdef __VSX__
+#if defined(_BIG_ENDIAN)
+static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_MZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+#else
+static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_MZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+#endif
+#endif
+
+//---------- float ----------
+struct Packet2cf
+{
+  EIGEN_STRONG_INLINE explicit Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b)
+  {
+    Packet4f v1, v2;
+
+    // Permute and multiply the real parts of a and b
+    v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
+    // Get the imaginary parts of a
+    v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
+    // multiply a_re * b
+    v1 = vec_madd(v1, b.v, p4f_ZERO);
+    // multiply a_im * b and get the conjugate result
+    v2 = vec_madd(v2, b.v, p4f_ZERO);
+    v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR)));
+    // permute back to a proper order
+    v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
+
+    return Packet2cf(padd<Packet4f>(v1, v2));
+  }
+
+  EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) {
+    v = pmul(Packet2cf(*this), b).v;
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const {
+    return Packet2cf(*this) *= b;
+  }
+
+  EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) {
+    v = padd(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const {
+    return Packet2cf(*this) += b;
+  }
+  EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) {
+    v = psub(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const {
+    return Packet2cf(*this) -= b;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator-(void) const {
+    return Packet2cf(-v);
+  }
+
+  Packet4f  v;
+};
+
+template<> struct packet_traits<std::complex<float> >  : default_packet_traits
+{
+  typedef Packet2cf type;
+  typedef Packet2cf half;
+  typedef Packet4f as_real;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket = 0,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+#ifdef __VSX__
+    HasBlend  = 1,
+#endif
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; typedef Packet4f as_real; };
+
+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
+{
+  Packet2cf res;
+  if((std::ptrdiff_t(&from) % 16) == 0)
+    res.v = pload<Packet4f>((const float *)&from);
+  else
+    res.v = ploadu<Packet4f>((const float *)&from);
+  res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>*        from) { return Packet2cf(pload<Packet4f>((const float *) from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>*       from) { return Packet2cf(ploadu<Packet4f>((const float*) from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>*     from) { return pset1<Packet2cf>(*from); }
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstore((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstoreu((float*)to, from.v); }
+
+EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>* from0, const std::complex<float>* from1)
+{
+  Packet4f res0, res1;
+#ifdef __VSX__
+  __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (*from0));
+  __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (*from1));
+#ifdef _BIG_ENDIAN
+  __asm__ ("xxpermdi %x0, %x1, %x2, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1));
+#else
+  __asm__ ("xxpermdi %x0, %x2, %x1, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1));
+#endif
+#else
+  *reinterpret_cast<std::complex<float> *>(&res0) = *from0;
+  *reinterpret_cast<std::complex<float> *>(&res1) = *from1;
+  res0 = vec_perm(res0, res1, p16uc_TRANSPOSE64_HI);
+#endif
+  return Packet2cf(res0);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+{
+  EIGEN_ALIGN16 std::complex<float> af[2];
+  af[0] = from[0*stride];
+  af[1] = from[1*stride];
+  return pload<Packet2cf>(af);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
+{
+  EIGEN_ALIGN16 std::complex<float> af[2];
+  pstore<std::complex<float> >((std::complex<float> *) af, from);
+  to[0*stride] = af[0];
+  to[1*stride] = af[1];
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v - b.v); }
+template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
+
+template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v, b.v)); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr)    { EIGEN_PPC_PREFETCH(addr); }
+
+template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
+{
+  EIGEN_ALIGN16 std::complex<float> res[2];
+  pstore((float *)&res, a.v);
+
+  return res[0];
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
+{
+  Packet4f rev_a;
+  rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2);
+  return Packet2cf(rev_a);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
+{
+  Packet4f b;
+  b = vec_sld(a.v, a.v, 8);
+  b = padd<Packet4f>(a.v, b);
+  return pfirst<Packet2cf>(Packet2cf(b));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
+{
+  Packet4f b;
+  Packet2cf prod;
+  b = vec_sld(a.v, a.v, 8);
+  prod = pmul<Packet2cf>(a, Packet2cf(b));
+
+  return pfirst<Packet2cf>(prod);
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+
+template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  // TODO optimize it for AltiVec
+  Packet2cf res = pmul(a, pconj(b));
+  Packet4f s = pmul<Packet4f>(b.v, b.v);
+  return Packet2cf(pdiv(res.v, padd<Packet4f>(s, vec_perm(s, s, p16uc_COMPLEX32_REV))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
+{
+  return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
+{
+  Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
+  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
+  kernel.packet[0].v = tmp;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
+  Packet4f eq = reinterpret_cast<Packet4f>(vec_cmpeq(a.v,b.v));
+  return Packet2cf(vec_and(eq, vec_perm(eq, eq, p16uc_COMPLEX32_REV)));
+}
+
+#ifdef __VSX__
+template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+  Packet2cf result;
+  result.v = reinterpret_cast<Packet4f>(pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
+  return result;
+}
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a)
+{
+  return psqrt_complex<Packet2cf>(a);
+}
+
+//---------- double ----------
+#ifdef __VSX__
+struct Packet1cd
+{
+  EIGEN_STRONG_INLINE Packet1cd() {}
+  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b)
+  {
+    Packet2d a_re, a_im, v1, v2;
+
+    // Permute and multiply the real parts of a and b
+    a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
+    // Get the imaginary parts of a
+    a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
+    // multiply a_re * b
+    v1 = vec_madd(a_re, b.v, p2d_ZERO);
+    // multiply a_im * b and get the conjugate result
+    v2 = vec_madd(a_im, b.v, p2d_ZERO);
+    v2 = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v2), reinterpret_cast<Packet4ui>(v2), 8));
+    v2 = pxor(v2, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR1));
+
+    return Packet1cd(padd<Packet2d>(v1, v2));
+  }
+
+  EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) {
+    v = pmul(Packet1cd(*this), b).v;
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const {
+    return Packet1cd(*this) *= b;
+  }
+
+  EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) {
+    v = padd(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const {
+    return Packet1cd(*this) += b;
+  }
+  EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) {
+    v = psub(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const {
+    return Packet1cd(*this) -= b;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator-(void) const {
+    return Packet1cd(-v);
+  }
+
+  Packet2d v;
+};
+
+template<> struct packet_traits<std::complex<double> >  : default_packet_traits
+{
+  typedef Packet1cd type;
+  typedef Packet1cd half;
+  typedef Packet2d as_real;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 0,
+    size = 1,
+    HasHalfPacket = 0,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; typedef Packet2d as_real; };
+
+template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { return Packet1cd(pload<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { return Packet1cd(ploadu<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { pstoreu((double*)to, from.v); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
+{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index)
+{
+  return pload<Packet1cd>(from);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index)
+{
+  pstore<std::complex<double> >(to, from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
+template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR2))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pand(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(por(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pxor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pandnot(a.v, b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>*     from)  { return pset1<Packet1cd>(*from); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr)    { EIGEN_PPC_PREFETCH(addr); }
+
+template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
+{
+  EIGEN_ALIGN16 std::complex<double> res[2];
+  pstore<std::complex<double> >(res, a);
+
+  return res[0];
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
+
+template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  // TODO optimize it for AltiVec
+  Packet1cd res = pmul(a,pconj(b));
+  Packet2d s = pmul<Packet2d>(b.v, b.v);
+  return Packet1cd(pdiv(res.v, padd<Packet2d>(s, vec_perm(s, s, p16uc_REVERSE64))));
+}
+
+EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
+{
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
+{
+  Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
+  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
+  kernel.packet[0].v = tmp;
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
+  // Compare real and imaginary parts of a and b to get the mask vector:
+  // [re(a)==re(b), im(a)==im(b)]
+  Packet2d eq = reinterpret_cast<Packet2d>(vec_cmpeq(a.v,b.v));
+  // Swap real/imag elements in the mask in to get:
+  // [im(a)==im(b), re(a)==re(b)]
+  Packet2d eq_swapped = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(eq), reinterpret_cast<Packet4ui>(eq), 8));
+  // Return re(a)==re(b) & im(a)==im(b) by computing bitwise AND of eq and eq_swapped
+  return Packet1cd(vec_and(eq, eq_swapped));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a)
+{
+  return psqrt_complex<Packet1cd>(a);
+}
+
+#endif // __VSX__
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX32_ALTIVEC_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/src/3rdparty/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h
new file mode 100644
index 000000000..3a7a32936
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@@ -0,0 +1,90 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H
+#define EIGEN_MATH_FUNCTIONS_ALTIVEC_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f plog<Packet4f>(const Packet4f& _x)
+{
+  return plog_float(_x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pexp<Packet4f>(const Packet4f& _x)
+{
+  return pexp_float(_x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psin<Packet4f>(const Packet4f& _x)
+{
+  return psin_float(_x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pcos<Packet4f>(const Packet4f& _x)
+{
+  return pcos_float(_x);
+}
+
+#ifndef EIGEN_COMP_CLANG
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& x)
+{
+  return  vec_rsqrt(x);
+}
+#endif
+
+#ifdef __VSX__
+#ifndef EIGEN_COMP_CLANG
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d prsqrt<Packet2d>(const Packet2d& x)
+{
+  return  vec_rsqrt(x);
+}
+#endif
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psqrt<Packet4f>(const Packet4f& x)
+{
+  return  vec_sqrt(x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d psqrt<Packet2d>(const Packet2d& x)
+{
+  return  vec_sqrt(x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d pexp<Packet2d>(const Packet2d& _x)
+{
+  return pexp_double(_x);
+}
+#endif
+
+// Hyperbolic Tangent function.
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+ptanh<Packet4f>(const Packet4f& x) {
+  return internal::generic_fast_tanh_float(x);
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_ALTIVEC_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/src/3rdparty/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
new file mode 100644
index 000000000..3f79b97df
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
@@ -0,0 +1,2937 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020 Everton Constantino (everton.constantino@ibm.com)
+// Copyright (C) 2021 Chip Kerchner (chip.kerchner@ibm.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_PRODUCT_ALTIVEC_H
+#define EIGEN_MATRIX_PRODUCT_ALTIVEC_H
+
+#ifndef EIGEN_ALTIVEC_USE_CUSTOM_PACK
+#define EIGEN_ALTIVEC_USE_CUSTOM_PACK    1
+#endif
+
+#include "MatrixProductCommon.h"
+
+// Since LLVM doesn't support dynamic dispatching, force either always MMA or VSX
+#if EIGEN_COMP_LLVM
+#if !defined(EIGEN_ALTIVEC_DISABLE_MMA) && !defined(EIGEN_ALTIVEC_MMA_ONLY)
+#ifdef __MMA__
+#define EIGEN_ALTIVEC_MMA_ONLY
+#else
+#define EIGEN_ALTIVEC_DISABLE_MMA
+#endif
+#endif
+#endif
+
+#ifdef __has_builtin
+#if __has_builtin(__builtin_mma_assemble_acc)
+  #define ALTIVEC_MMA_SUPPORT
+#endif
+#endif
+
+#if defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+  #include "MatrixProductMMA.h"
+#endif
+
+/**************************************************************************************************
+ * TODO                                                                                           *
+ * - Check StorageOrder on dhs_pack (the innermost second loop seems unvectorized when it could). *
+ * - Check the possibility of transposing as GETREAL and GETIMAG when needed.                     *
+ **************************************************************************************************/
+namespace Eigen {
+
+namespace internal {
+
+/**************************
+ * Constants and typedefs *
+ **************************/
+template<typename Scalar>
+struct quad_traits
+{
+  typedef typename packet_traits<Scalar>::type    vectortype;
+  typedef PacketBlock<vectortype,4>                     type;
+  typedef vectortype                                 rhstype;
+  enum
+  {
+    vectorsize = packet_traits<Scalar>::size,
+    size = 4,
+    rows = 4
+  };
+};
+
+template<>
+struct quad_traits<double>
+{
+  typedef Packet2d                        vectortype;
+  typedef PacketBlock<vectortype,4>             type;
+  typedef PacketBlock<Packet2d,2>            rhstype;
+  enum
+  {
+    vectorsize = packet_traits<double>::size,
+    size = 2,
+    rows = 4
+  };
+};
+
+// MatrixProduct decomposes real/imaginary vectors into a real vector and an imaginary vector, this turned out
+// to be faster than Eigen's usual approach of having real/imaginary pairs on a single vector. This constants then
+// are responsible to extract from convert between Eigen's and MatrixProduct approach.
+
+const static Packet16uc p16uc_GETREAL32 = {  0,  1,  2,  3,
+                                             8,  9, 10, 11,
+                                            16, 17, 18, 19,
+                                            24, 25, 26, 27};
+
+const static Packet16uc p16uc_GETIMAG32 = {  4,  5,  6,  7,
+                                            12, 13, 14, 15,
+                                            20, 21, 22, 23,
+                                            28, 29, 30, 31};
+const static Packet16uc p16uc_GETREAL64 = {  0,  1,  2,  3,  4,  5,  6,  7,
+                                            16, 17, 18, 19, 20, 21, 22, 23};
+
+//[a,ai],[b,bi] = [ai,bi]
+const static Packet16uc p16uc_GETIMAG64 = {  8,  9, 10, 11, 12, 13, 14, 15,
+                                            24, 25, 26, 27, 28, 29, 30, 31};
+
+/*********************************************
+ * Single precision real and complex packing *
+ * *******************************************/
+
+/**
+ * Symm packing is related to packing of symmetric adjoint blocks, as expected the packing leaves
+ * the diagonal real, whatever is below it is copied from the respective upper diagonal element and 
+ * conjugated. There's no PanelMode available for symm packing.
+ *
+ * Packing in general is supposed to leave the lhs block and the rhs block easy to be read by gemm using 
+ * its respective rank-update instructions. The float32/64 versions are different because at this moment
+ * the size of the accumulator is fixed at 512-bits so you can't have a 4x4 accumulator of 64-bit elements.
+ * 
+ * As mentioned earlier MatrixProduct breaks complex numbers into a real vector and a complex vector so packing has
+ * to take that into account, at the moment, we run pack the real part and then the imaginary part, this is the main
+ * reason why packing for complex is broken down into several different parts, also the reason why we endup having a
+ * float32/64 and complex float32/64 version.
+ **/
+template<typename Scalar, typename Index, int StorageOrder>
+EIGEN_ALWAYS_INLINE std::complex<Scalar> getAdjointVal(Index i, Index j, const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder>& dt)
+{
+  std::complex<Scalar> v;
+  if(i < j)
+  {
+    v.real( dt(j,i).real());
+    v.imag(-dt(j,i).imag());
+  } else if(i > j)
+  {
+    v.real( dt(i,j).real());
+    v.imag( dt(i,j).imag());
+  } else {
+    v.real( dt(i,j).real());
+    v.imag((Scalar)0.0);
+  }
+  return v;
+}
+
+template<typename Scalar, typename Index, int StorageOrder, int N>
+EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex<Scalar>* blockB, const std::complex<Scalar>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
+{
+  const Index depth = k2 + rows;
+  const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder> rhs(_rhs, rhsStride);
+  const Index vectorSize = N*quad_traits<Scalar>::vectorsize;
+  const Index vectorDelta = vectorSize * rows;
+  Scalar* blockBf = reinterpret_cast<Scalar *>(blockB);
+
+  Index rir = 0, rii, j = 0;
+  for(; j + vectorSize <= cols; j+=vectorSize)
+  {
+    rii = rir + vectorDelta;
+
+    for(Index i = k2; i < depth; i++)
+    {
+      for(Index k = 0; k < vectorSize; k++)
+      {
+        std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(i, j + k, rhs);
+
+        blockBf[rir + k] = v.real();
+        blockBf[rii + k] = v.imag();
+      }
+      rir += vectorSize;
+      rii += vectorSize;
+    }
+
+    rir += vectorDelta;
+  }
+  if (j < cols)
+  {
+    rii = rir + ((cols - j) * rows);
+
+    for(Index i = k2; i < depth; i++)
+    {
+      Index k = j;
+      for(; k < cols; k++)
+      {
+        std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(i, k, rhs);
+
+        blockBf[rir] = v.real();
+        blockBf[rii] = v.imag();
+
+        rir += 1;
+        rii += 1;
+      }
+    }
+  }
+}
+
+template<typename Scalar, typename Index, int StorageOrder>
+EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex<Scalar>* blockA, const std::complex<Scalar>* _lhs, Index lhsStride, Index cols, Index rows)
+{
+  const Index depth = cols;
+  const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder> lhs(_lhs, lhsStride);
+  const Index vectorSize = quad_traits<Scalar>::vectorsize;
+  const Index vectorDelta = vectorSize * depth;
+  Scalar* blockAf = (Scalar *)(blockA);
+
+  Index rir = 0, rii, j = 0;
+  for(; j + vectorSize <= rows; j+=vectorSize)
+  {
+    rii = rir + vectorDelta;
+
+    for(Index i = 0; i < depth; i++)
+    {
+      for(Index k = 0; k < vectorSize; k++)
+      {
+        std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(j+k, i, lhs);
+
+        blockAf[rir + k] = v.real();
+        blockAf[rii + k] = v.imag();
+      }
+      rir += vectorSize;
+      rii += vectorSize;
+    }
+
+    rir += vectorDelta;
+  }
+
+  if (j < rows)
+  {
+    rii = rir + ((rows - j) * depth);
+
+    for(Index i = 0; i < depth; i++)
+    {
+      Index k = j;
+      for(; k < rows; k++)
+      {
+        std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(k, i, lhs);
+
+        blockAf[rir] = v.real();
+        blockAf[rii] = v.imag();
+
+        rir += 1;
+        rii += 1;
+      }
+    }
+  }
+}
+
+template<typename Scalar, typename Index, int StorageOrder, int N>
+EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
+{
+  const Index depth = k2 + rows;
+  const_blas_data_mapper<Scalar, Index, StorageOrder> rhs(_rhs, rhsStride);
+  const Index vectorSize = quad_traits<Scalar>::vectorsize;
+
+  Index ri = 0, j = 0;
+  for(; j + N*vectorSize <= cols; j+=N*vectorSize)
+  {
+    Index i = k2;
+    for(; i < depth; i++)
+    {
+      for(Index k = 0; k < N*vectorSize; k++)
+      {
+        if(i <= j+k)
+          blockB[ri + k] = rhs(j+k, i);
+        else
+          blockB[ri + k] = rhs(i, j+k);
+      }
+      ri += N*vectorSize;
+    }
+  }
+
+  if (j < cols)
+  {
+    for(Index i = k2; i < depth; i++)
+    {
+      Index k = j;
+      for(; k < cols; k++)
+      {
+        if(k <= i)
+          blockB[ri] = rhs(i, k);
+        else
+          blockB[ri] = rhs(k, i);
+        ri += 1;
+      }
+    }
+  }
+}
+
+template<typename Scalar, typename Index, int StorageOrder>
+EIGEN_STRONG_INLINE void symm_pack_lhs_helper(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
+{
+  const Index depth = cols;
+  const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs, lhsStride);
+  const Index vectorSize = quad_traits<Scalar>::vectorsize;
+
+  Index ri = 0, j = 0;
+  for(; j + vectorSize <= rows; j+=vectorSize)
+  {
+    Index i = 0;
+
+    for(; i < depth; i++)
+    {
+      for(Index k = 0; k < vectorSize; k++)
+      {
+        if(i <= j+k)
+          blockA[ri + k] = lhs(j+k, i);
+        else
+          blockA[ri + k] = lhs(i, j+k);
+      }
+      ri += vectorSize;
+    }
+  }
+
+  if (j < rows)
+  {
+    for(Index i = 0; i < depth; i++)
+    {
+      Index k = j;
+      for(; k < rows; k++)
+      {
+        if(i <= k)
+          blockA[ri] = lhs(k, i);
+        else
+          blockA[ri] = lhs(i, k);
+        ri += 1;
+      }
+    }
+  }
+}
+
+template<typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs<std::complex<float>, Index, nr, StorageOrder>
+{
+  void operator()(std::complex<float>* blockB, const std::complex<float>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
+  {
+    symm_pack_complex_rhs_helper<float, Index, StorageOrder, 1>(blockB, _rhs, rhsStride, rows, cols, k2);
+  }
+};
+
+template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
+struct symm_pack_lhs<std::complex<float>, Index, Pack1, Pack2_dummy, StorageOrder>
+{
+  void operator()(std::complex<float>* blockA, const std::complex<float>* _lhs, Index lhsStride, Index cols, Index rows)
+  {
+    symm_pack_complex_lhs_helper<float, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
+  }
+};
+
+// *********** symm_pack std::complex<float64> ***********
+
+template<typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs<std::complex<double>, Index, nr, StorageOrder>
+{
+  void operator()(std::complex<double>* blockB, const std::complex<double>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
+  {
+    symm_pack_complex_rhs_helper<double, Index, StorageOrder, 2>(blockB, _rhs, rhsStride, rows, cols, k2);
+  }
+};
+
+template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
+struct symm_pack_lhs<std::complex<double>, Index, Pack1, Pack2_dummy, StorageOrder>
+{
+  void operator()(std::complex<double>* blockA, const std::complex<double>* _lhs, Index lhsStride, Index cols, Index rows)
+  {
+    symm_pack_complex_lhs_helper<double, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
+  }
+};
+
+// *********** symm_pack float32 ***********
+template<typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs<float, Index, nr, StorageOrder>
+{
+  void operator()(float* blockB, const float* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
+  {
+    symm_pack_rhs_helper<float, Index, StorageOrder, 1>(blockB, _rhs, rhsStride, rows, cols, k2);
+  }
+};
+
+template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
+struct symm_pack_lhs<float, Index, Pack1, Pack2_dummy, StorageOrder>
+{
+  void operator()(float* blockA, const float* _lhs, Index lhsStride, Index cols, Index rows)
+  {
+    symm_pack_lhs_helper<float, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
+  }
+};
+
+// *********** symm_pack float64 ***********
+template<typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs<double, Index, nr, StorageOrder>
+{
+  void operator()(double* blockB, const double* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
+  {
+    symm_pack_rhs_helper<double, Index, StorageOrder, 2>(blockB, _rhs, rhsStride, rows, cols, k2);
+  }
+};
+
+template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
+struct symm_pack_lhs<double, Index, Pack1, Pack2_dummy, StorageOrder>
+{
+  void operator()(double* blockA, const double* _lhs, Index lhsStride, Index cols, Index rows)
+  {
+    symm_pack_lhs_helper<double, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
+  }
+};
+
+/**
+ * PanelMode
+ * Packing might be called several times before being multiplied by gebp_kernel, this happens because 
+ * on special occasions it fills part of block with other parts of the matrix. Two variables control
+ * how PanelMode should behave: offset and stride. The idea is that those variables represent whatever
+ * is going to be the real offset and stride in the future and this is what you should obey. The process
+ * is to behave as you would with normal packing but leave the start of each part with the correct offset
+ * and the end as well respecting the real stride the block will have. Gebp is aware of both blocks stride
+ * and offset and behaves accordingly.
+ **/
+
+template<typename Scalar, typename Packet, typename Index>
+EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock<Packet,4>& block)
+{
+  const Index size = 16 / sizeof(Scalar);
+  pstore<Scalar>(to + (0 * size), block.packet[0]);
+  pstore<Scalar>(to + (1 * size), block.packet[1]);
+  pstore<Scalar>(to + (2 * size), block.packet[2]);
+  pstore<Scalar>(to + (3 * size), block.packet[3]);
+}
+
+template<typename Scalar, typename Packet, typename Index>
+EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock<Packet,2>& block)
+{
+  const Index size = 16 / sizeof(Scalar);
+  pstore<Scalar>(to + (0 * size), block.packet[0]);
+  pstore<Scalar>(to + (1 * size), block.packet[1]);
+}
+
+// General template for lhs & rhs complex packing.
+template<typename Scalar, typename Index, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode, bool UseLhs>
+struct dhs_cpack {
+  EIGEN_STRONG_INLINE void operator()(std::complex<Scalar>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+  {
+    const Index vectorSize = quad_traits<Scalar>::vectorsize;
+    const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth);
+    Index rir = ((PanelMode) ? (vectorSize*offset) : 0), rii;
+    Scalar* blockAt = reinterpret_cast<Scalar *>(blockA);
+    Index j = 0;
+
+    for(; j + vectorSize <= rows; j+=vectorSize)
+    {
+      Index i = 0;
+
+      rii = rir + vectorDelta;
+
+      for(; i + vectorSize <= depth; i+=vectorSize)
+      {
+        PacketBlock<Packet,4> blockr, blocki;
+        PacketBlock<PacketC,8> cblock;
+
+        if (UseLhs) {
+          bload<DataMapper, PacketC, Index, 2, 0, StorageOrder>(cblock, lhs, j, i);
+        } else {
+          bload<DataMapper, PacketC, Index, 2, 0, StorageOrder>(cblock, lhs, i, j);
+        }
+
+        blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32);
+        blockr.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETREAL32);
+        blockr.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETREAL32);
+        blockr.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETREAL32);
+
+        blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETIMAG32);
+        blocki.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETIMAG32);
+        blocki.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETIMAG32);
+        blocki.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETIMAG32);
+
+        if(Conjugate)
+        {
+          blocki.packet[0] = -blocki.packet[0];
+          blocki.packet[1] = -blocki.packet[1];
+          blocki.packet[2] = -blocki.packet[2];
+          blocki.packet[3] = -blocki.packet[3];
+        }
+
+        if(((StorageOrder == RowMajor) && UseLhs) || (((StorageOrder == ColMajor) && !UseLhs)))
+        {
+          ptranspose(blockr);
+          ptranspose(blocki);
+        }
+
+        storeBlock<Scalar, Packet, Index>(blockAt + rir, blockr);
+        storeBlock<Scalar, Packet, Index>(blockAt + rii, blocki);
+
+        rir += 4*vectorSize;
+        rii += 4*vectorSize;
+      }
+      for(; i < depth; i++)
+      {
+        PacketBlock<Packet,1> blockr, blocki;
+        PacketBlock<PacketC,2> cblock;
+
+        if(((StorageOrder == ColMajor) && UseLhs) || (((StorageOrder == RowMajor) && !UseLhs)))
+        {
+          if (UseLhs) {
+            cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i);
+            cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 2, i);
+          } else {
+            cblock.packet[0] = lhs.template loadPacket<PacketC>(i, j + 0);
+            cblock.packet[1] = lhs.template loadPacket<PacketC>(i, j + 2);
+          }
+        } else {
+          std::complex<Scalar> lhs0, lhs1;
+          if (UseLhs) {
+            lhs0 = lhs(j + 0, i);
+            lhs1 = lhs(j + 1, i);
+            cblock.packet[0] = pload2(&lhs0, &lhs1);
+            lhs0 = lhs(j + 2, i);
+            lhs1 = lhs(j + 3, i);
+            cblock.packet[1] = pload2(&lhs0, &lhs1);
+          } else {
+            lhs0 = lhs(i, j + 0);
+            lhs1 = lhs(i, j + 1);
+            cblock.packet[0] = pload2(&lhs0, &lhs1);
+            lhs0 = lhs(i, j + 2);
+            lhs1 = lhs(i, j + 3);
+            cblock.packet[1] = pload2(&lhs0, &lhs1);
+          }
+        }
+
+        blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL32);
+        blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG32);
+
+        if(Conjugate)
+        {
+          blocki.packet[0] = -blocki.packet[0];
+        }
+
+        pstore<Scalar>(blockAt + rir, blockr.packet[0]);
+        pstore<Scalar>(blockAt + rii, blocki.packet[0]);
+
+        rir += vectorSize;
+        rii += vectorSize;
+      }
+
+      rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta);
+    }
+
+    if (j < rows)
+    {
+      if(PanelMode) rir += (offset*(rows - j - vectorSize));
+      rii = rir + (((PanelMode) ? stride : depth) * (rows - j));
+
+      for(Index i = 0; i < depth; i++)
+      {
+        Index k = j;
+        for(; k < rows; k++)
+        {
+          if (UseLhs) {
+            blockAt[rir] = lhs(k, i).real();
+
+            if(Conjugate)
+              blockAt[rii] = -lhs(k, i).imag();
+            else
+              blockAt[rii] =  lhs(k, i).imag();
+          } else {
+            blockAt[rir] = lhs(i, k).real();
+
+            if(Conjugate)
+              blockAt[rii] = -lhs(i, k).imag();
+            else
+              blockAt[rii] =  lhs(i, k).imag();
+          }
+
+          rir += 1;
+          rii += 1;
+        }
+      }
+    }
+  }
+};
+
+// General template for lhs & rhs packing.
+template<typename Scalar, typename Index, typename DataMapper, typename Packet, int StorageOrder, bool PanelMode, bool UseLhs>
+struct dhs_pack{
+  EIGEN_STRONG_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+  {
+    const Index vectorSize = quad_traits<Scalar>::vectorsize;
+    Index ri = 0, j = 0;
+
+    for(; j + vectorSize <= rows; j+=vectorSize)
+    {
+      Index i = 0;
+
+      if(PanelMode) ri += vectorSize*offset;
+
+      for(; i + vectorSize <= depth; i+=vectorSize)
+      {
+        PacketBlock<Packet,4> block;
+
+        if (UseLhs) {
+          bload<DataMapper, Packet, Index, 4, 0, StorageOrder>(block, lhs, j, i);
+        } else {
+          bload<DataMapper, Packet, Index, 4, 0, StorageOrder>(block, lhs, i, j);
+        }
+        if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs))
+        {
+          ptranspose(block);
+        }
+
+        storeBlock<Scalar, Packet, Index>(blockA + ri, block);
+
+        ri += 4*vectorSize;
+      }
+      for(; i < depth; i++)
+      {
+        if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs))
+        {
+          if (UseLhs) {
+            blockA[ri+0] = lhs(j+0, i);
+            blockA[ri+1] = lhs(j+1, i);
+            blockA[ri+2] = lhs(j+2, i);
+            blockA[ri+3] = lhs(j+3, i);
+          } else {
+            blockA[ri+0] = lhs(i, j+0);
+            blockA[ri+1] = lhs(i, j+1);
+            blockA[ri+2] = lhs(i, j+2);
+            blockA[ri+3] = lhs(i, j+3);
+          }
+        } else {
+          Packet lhsV;
+          if (UseLhs) {
+            lhsV = lhs.template loadPacket<Packet>(j, i);
+          } else {
+            lhsV = lhs.template loadPacket<Packet>(i, j);
+          }
+          pstore<Scalar>(blockA + ri, lhsV);
+        }
+
+        ri += vectorSize;
+      }
+
+      if(PanelMode) ri += vectorSize*(stride - offset - depth);
+    }
+
+    if (j < rows)
+    {
+      if(PanelMode) ri += offset*(rows - j);
+
+      for(Index i = 0; i < depth; i++)
+      {
+        Index k = j;
+        for(; k < rows; k++)
+        {
+          if (UseLhs) {
+            blockA[ri] = lhs(k, i);
+          } else {
+            blockA[ri] = lhs(i, k);
+          }
+          ri += 1;
+        }
+      }
+    }
+  }
+};
+
+// General template for lhs packing, float64 specialization.
+template<typename Index, typename DataMapper, int StorageOrder, bool PanelMode>
+struct dhs_pack<double, Index, DataMapper, Packet2d, StorageOrder, PanelMode, true>
+{
+  EIGEN_STRONG_INLINE void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+  {
+    const Index vectorSize = quad_traits<double>::vectorsize;
+    Index ri = 0, j = 0;
+
+    for(; j + vectorSize <= rows; j+=vectorSize)
+    {
+      Index i = 0;
+
+      if(PanelMode) ri += vectorSize*offset;
+
+      for(; i + vectorSize <= depth; i+=vectorSize)
+      {
+        PacketBlock<Packet2d,2> block;
+        if(StorageOrder == RowMajor)
+        {
+          block.packet[0] = lhs.template loadPacket<Packet2d>(j + 0, i);
+          block.packet[1] = lhs.template loadPacket<Packet2d>(j + 1, i);
+
+          ptranspose(block);
+        } else {
+          block.packet[0] = lhs.template loadPacket<Packet2d>(j, i + 0);
+          block.packet[1] = lhs.template loadPacket<Packet2d>(j, i + 1);
+        }
+
+        storeBlock<double, Packet2d, Index>(blockA + ri, block);
+
+        ri += 2*vectorSize;
+      }
+      for(; i < depth; i++)
+      {
+        if(StorageOrder == RowMajor)
+        {
+          blockA[ri+0] = lhs(j+0, i);
+          blockA[ri+1] = lhs(j+1, i);
+        } else {
+          Packet2d lhsV = lhs.template loadPacket<Packet2d>(j, i);
+          pstore<double>(blockA + ri, lhsV);
+        }
+
+        ri += vectorSize;
+      }
+
+      if(PanelMode) ri += vectorSize*(stride - offset - depth);
+    }
+
+    if (j < rows)
+    {
+      if(PanelMode) ri += offset*(rows - j);
+
+      for(Index i = 0; i < depth; i++)
+      {
+        Index k = j;
+        for(; k < rows; k++)
+        {
+          blockA[ri] = lhs(k, i);
+          ri += 1;
+        }
+      }
+    }
+  }
+};
+
+// General template for rhs packing, float64 specialization.
+template<typename Index, typename DataMapper, int StorageOrder, bool PanelMode>
+struct dhs_pack<double, Index, DataMapper, Packet2d, StorageOrder, PanelMode, false>
+{
+  EIGEN_STRONG_INLINE void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+  {
+    const Index vectorSize = quad_traits<double>::vectorsize;
+    Index ri = 0, j = 0;
+
+    for(; j + 2*vectorSize <= cols; j+=2*vectorSize)
+    {
+      Index i = 0;
+
+      if(PanelMode) ri += offset*(2*vectorSize);
+
+      for(; i + vectorSize <= depth; i+=vectorSize)
+      {
+        PacketBlock<Packet2d,4> block;
+        if(StorageOrder == ColMajor)
+        {
+          PacketBlock<Packet2d,2> block1, block2;
+          block1.packet[0] = rhs.template loadPacket<Packet2d>(i, j + 0);
+          block1.packet[1] = rhs.template loadPacket<Packet2d>(i, j + 1);
+          block2.packet[0] = rhs.template loadPacket<Packet2d>(i, j + 2);
+          block2.packet[1] = rhs.template loadPacket<Packet2d>(i, j + 3);
+
+          ptranspose(block1);
+          ptranspose(block2);
+
+          pstore<double>(blockB + ri    , block1.packet[0]);
+          pstore<double>(blockB + ri + 2, block2.packet[0]);
+          pstore<double>(blockB + ri + 4, block1.packet[1]);
+          pstore<double>(blockB + ri + 6, block2.packet[1]);
+        } else {
+          block.packet[0] = rhs.template loadPacket<Packet2d>(i + 0, j + 0); //[a1 a2]
+          block.packet[1] = rhs.template loadPacket<Packet2d>(i + 0, j + 2); //[a3 a4]
+          block.packet[2] = rhs.template loadPacket<Packet2d>(i + 1, j + 0); //[b1 b2]
+          block.packet[3] = rhs.template loadPacket<Packet2d>(i + 1, j + 2); //[b3 b4]
+
+          storeBlock<double, Packet2d, Index>(blockB + ri, block);
+        }
+
+        ri += 4*vectorSize;
+      }
+      for(; i < depth; i++)
+      {
+        if(StorageOrder == ColMajor)
+        {
+          blockB[ri+0] = rhs(i, j+0);
+          blockB[ri+1] = rhs(i, j+1);
+
+          ri += vectorSize;
+
+          blockB[ri+0] = rhs(i, j+2);
+          blockB[ri+1] = rhs(i, j+3);
+        } else {
+          Packet2d rhsV = rhs.template loadPacket<Packet2d>(i, j);
+          pstore<double>(blockB + ri, rhsV);
+
+          ri += vectorSize;
+
+          rhsV = rhs.template loadPacket<Packet2d>(i, j + 2);
+          pstore<double>(blockB + ri, rhsV);
+        }
+        ri += vectorSize;
+      }
+
+      if(PanelMode) ri += (2*vectorSize)*(stride - offset - depth);
+    }
+
+    if (j < cols)
+    {
+      if(PanelMode) ri += offset*(cols - j);
+
+      for(Index i = 0; i < depth; i++)
+      {
+        Index k = j;
+        for(; k < cols; k++)
+        {
+          blockB[ri] = rhs(i, k);
+          ri += 1;
+        }
+      }
+    }
+  }
+};
+
+// General template for lhs complex packing, float64 specialization.
+template<typename Index, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
+struct dhs_cpack<double, Index, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, true>
+{
+  EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+  {
+    const Index vectorSize = quad_traits<double>::vectorsize;
+    const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth);
+    Index rir = ((PanelMode) ? (vectorSize*offset) : 0), rii;
+    double* blockAt = reinterpret_cast<double *>(blockA);
+    Index j = 0;
+
+    for(; j + vectorSize <= rows; j+=vectorSize)
+    {
+      Index i = 0;
+
+      rii = rir + vectorDelta;
+
+      for(; i + vectorSize <= depth; i+=vectorSize)
+      {
+        PacketBlock<Packet,2> blockr, blocki;
+        PacketBlock<PacketC,4> cblock;
+
+        if(StorageOrder == ColMajor)
+        {
+          cblock.packet[0] = lhs.template loadPacket<PacketC>(j, i + 0); //[a1 a1i]
+          cblock.packet[1] = lhs.template loadPacket<PacketC>(j, i + 1); //[b1 b1i]
+
+          cblock.packet[2] = lhs.template loadPacket<PacketC>(j + 1, i + 0); //[a2 a2i]
+          cblock.packet[3] = lhs.template loadPacket<PacketC>(j + 1, i + 1); //[b2 b2i]
+
+          blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETREAL64); //[a1 a2]
+          blockr.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2]
+
+          blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETIMAG64);
+          blocki.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETIMAG64);
+        } else {
+          cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i); //[a1 a1i]
+          cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 1, i); //[a2 a2i]
+
+          cblock.packet[2] = lhs.template loadPacket<PacketC>(j + 0, i + 1); //[b1 b1i]
+          cblock.packet[3] = lhs.template loadPacket<PacketC>(j + 1, i + 1); //[b2 b2i
+
+          blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64); //[a1 a2]
+          blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2]
+
+          blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
+          blocki.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64);
+        }
+
+        if(Conjugate)
+        {
+          blocki.packet[0] = -blocki.packet[0];
+          blocki.packet[1] = -blocki.packet[1];
+        }
+
+        storeBlock<double, Packet, Index>(blockAt + rir, blockr);
+        storeBlock<double, Packet, Index>(blockAt + rii, blocki);
+
+        rir += 2*vectorSize;
+        rii += 2*vectorSize;
+      }
+      for(; i < depth; i++)
+      {
+        PacketBlock<Packet,1> blockr, blocki;
+        PacketBlock<PacketC,2> cblock;
+
+        cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i);
+        cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 1, i);
+
+        blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64);
+        blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
+
+        if(Conjugate)
+        {
+          blocki.packet[0] = -blocki.packet[0];
+        }
+
+        pstore<double>(blockAt + rir, blockr.packet[0]);
+        pstore<double>(blockAt + rii, blocki.packet[0]);
+
+        rir += vectorSize;
+        rii += vectorSize;
+      }
+
+      rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta);
+    }
+
+    if (j < rows)
+    {
+      if(PanelMode) rir += (offset*(rows - j - vectorSize));
+      rii = rir + (((PanelMode) ? stride : depth) * (rows - j));
+
+      for(Index i = 0; i < depth; i++)
+      {
+        Index k = j;
+        for(; k < rows; k++)
+        {
+          blockAt[rir] = lhs(k, i).real();
+
+          if(Conjugate)
+            blockAt[rii] = -lhs(k, i).imag();
+          else
+            blockAt[rii] =  lhs(k, i).imag();
+
+          rir += 1;
+          rii += 1;
+        }
+      }
+    }
+  }
+};
+
+// General template for rhs complex packing, float64 specialization.
+template<typename Index, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
+struct dhs_cpack<double, Index, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, false>
+{
+  EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+  {
+    const Index vectorSize = quad_traits<double>::vectorsize;
+    const Index vectorDelta = 2*vectorSize * ((PanelMode) ? stride : depth);
+    Index rir = ((PanelMode) ? (2*vectorSize*offset) : 0), rii;
+    double* blockBt = reinterpret_cast<double *>(blockB);
+    Index j = 0;
+
+    for(; j + 2*vectorSize <= cols; j+=2*vectorSize)
+    {
+      Index i = 0;
+
+      rii = rir + vectorDelta;
+
+      for(; i < depth; i++)
+      {
+        PacketBlock<PacketC,4> cblock;
+        PacketBlock<Packet,2> blockr, blocki;
+
+        bload<DataMapper, PacketC, Index, 2, 0, ColMajor>(cblock, rhs, i, j);
+
+        blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64);
+        blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64);
+
+        blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
+        blocki.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64);
+
+        if(Conjugate)
+        {
+          blocki.packet[0] = -blocki.packet[0];
+          blocki.packet[1] = -blocki.packet[1];
+        }
+
+        storeBlock<double, Packet, Index>(blockBt + rir, blockr);
+        storeBlock<double, Packet, Index>(blockBt + rii, blocki);
+
+        rir += 2*vectorSize;
+        rii += 2*vectorSize;
+      }
+
+      rir += ((PanelMode) ? (2*vectorSize*(2*stride - depth)) : vectorDelta);
+    }
+
+    if (j < cols)
+    {
+      if(PanelMode) rir += (offset*(cols - j - 2*vectorSize));
+      rii = rir + (((PanelMode) ? stride : depth) * (cols - j));
+
+      for(Index i = 0; i < depth; i++)
+      {
+        Index k = j;
+        for(; k < cols; k++)
+        {
+          blockBt[rir] = rhs(i, k).real();
+
+          if(Conjugate)
+            blockBt[rii] = -rhs(i, k).imag();
+          else
+            blockBt[rii] =  rhs(i, k).imag();
+
+          rir += 1;
+          rii += 1;
+        }
+      }
+    }
+  }
+};
+
+/**************
+ * GEMM utils *
+ **************/
+
+// 512-bits rank1-update of acc. It can either positive or negative accumulate (useful for complex gemm).
+template<typename Packet, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pger_common(PacketBlock<Packet,4>* acc, const Packet& lhsV, const Packet* rhsV)
+{
+  if(NegativeAccumulate)
+  {
+    acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
+    acc->packet[1] = vec_nmsub(lhsV, rhsV[1], acc->packet[1]);
+    acc->packet[2] = vec_nmsub(lhsV, rhsV[2], acc->packet[2]);
+    acc->packet[3] = vec_nmsub(lhsV, rhsV[3], acc->packet[3]);
+  } else {
+    acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
+    acc->packet[1] = vec_madd(lhsV, rhsV[1], acc->packet[1]);
+    acc->packet[2] = vec_madd(lhsV, rhsV[2], acc->packet[2]);
+    acc->packet[3] = vec_madd(lhsV, rhsV[3], acc->packet[3]);
+  }
+}
+
+template<typename Packet, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pger_common(PacketBlock<Packet,1>* acc, const Packet& lhsV, const Packet* rhsV)
+{
+  if(NegativeAccumulate)
+  {
+    acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
+  } else {
+    acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
+  }
+}
+
+template<int N, typename Scalar, typename Packet, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pger(PacketBlock<Packet,N>* acc, const Scalar* lhs, const Packet* rhsV)
+{
+  Packet lhsV = pload<Packet>(lhs);
+
+  pger_common<Packet, NegativeAccumulate>(acc, lhsV, rhsV);
+}
+
+template<typename Scalar, typename Packet, typename Index>
+EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV, Index remaining_rows)
+{
+#ifdef _ARCH_PWR9
+  lhsV = vec_xl_len((Scalar *)lhs, remaining_rows * sizeof(Scalar));
+#else
+  Index i = 0;
+  do {
+    lhsV[i] = lhs[i];
+  } while (++i < remaining_rows);
+#endif
+}
+
+template<int N, typename Scalar, typename Packet, typename Index, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pger(PacketBlock<Packet,N>* acc, const Scalar* lhs, const Packet* rhsV, Index remaining_rows)
+{
+  Packet lhsV;
+  loadPacketRemaining<Scalar, Packet, Index>(lhs, lhsV, remaining_rows);
+
+  pger_common<Packet, NegativeAccumulate>(acc, lhsV, rhsV);
+}
+
+// 512-bits rank1-update of complex acc. It takes decoupled accumulators as entries. It also takes cares of mixed types real * complex and complex * real.
+template<int N, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void pgerc_common(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Packet &lhsV, const Packet &lhsVi, const Packet* rhsV, const Packet* rhsVi)
+{
+  pger_common<Packet, false>(accReal, lhsV, rhsV);
+  if(LhsIsReal)
+  {
+    pger_common<Packet, ConjugateRhs>(accImag, lhsV, rhsVi);
+    EIGEN_UNUSED_VARIABLE(lhsVi);
+  } else {
+    if (!RhsIsReal) {
+      pger_common<Packet, ConjugateLhs == ConjugateRhs>(accReal, lhsVi, rhsVi);
+      pger_common<Packet, ConjugateRhs>(accImag, lhsV, rhsVi);
+    } else {
+      EIGEN_UNUSED_VARIABLE(rhsVi);
+    }
+    pger_common<Packet, ConjugateLhs>(accImag, lhsVi, rhsV);
+  }
+}
+
+template<int N, typename Scalar, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi)
+{
+  Packet lhsV = ploadLhs<Scalar, Packet>(lhs_ptr);
+  Packet lhsVi;
+  if(!LhsIsReal) lhsVi = ploadLhs<Scalar, Packet>(lhs_ptr_imag);
+  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+
+  pgerc_common<N, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi);
+}
+
+template<typename Scalar, typename Packet, typename Index, bool LhsIsReal>
+EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi, Index remaining_rows)
+{
+#ifdef _ARCH_PWR9
+  lhsV = vec_xl_len((Scalar *)lhs_ptr, remaining_rows * sizeof(Scalar));
+  if(!LhsIsReal) lhsVi = vec_xl_len((Scalar *)lhs_ptr_imag, remaining_rows * sizeof(Scalar));
+  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+#else
+  Index i = 0;
+  do {
+    lhsV[i] = lhs_ptr[i];
+    if(!LhsIsReal) lhsVi[i] = lhs_ptr_imag[i];
+  } while (++i < remaining_rows);
+  if(LhsIsReal) EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+#endif
+}
+
+template<int N, typename Scalar, typename Packet, typename Index, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi, Index remaining_rows)
+{
+  Packet lhsV, lhsVi;
+  loadPacketRemaining<Scalar, Packet, Index, LhsIsReal>(lhs_ptr, lhs_ptr_imag, lhsV, lhsVi, remaining_rows);
+
+  pgerc_common<N, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi);
+}
+
+template<typename Scalar, typename Packet>
+EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs)
+{
+  return ploadu<Packet>(lhs);
+}
+
+// Zero the accumulator on PacketBlock.
+template<typename Scalar, typename Packet>
+EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock<Packet,4>& acc)
+{
+  acc.packet[0] = pset1<Packet>((Scalar)0);
+  acc.packet[1] = pset1<Packet>((Scalar)0);
+  acc.packet[2] = pset1<Packet>((Scalar)0);
+  acc.packet[3] = pset1<Packet>((Scalar)0);
+}
+
+template<typename Scalar, typename Packet>
+EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock<Packet,1>& acc)
+{
+  acc.packet[0] = pset1<Packet>((Scalar)0);
+}
+
+// Scale the PacketBlock vectors by alpha.
+template<typename Packet>
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha)
+{
+  acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]);
+  acc.packet[1] = pmadd(pAlpha, accZ.packet[1], acc.packet[1]);
+  acc.packet[2] = pmadd(pAlpha, accZ.packet[2], acc.packet[2]);
+  acc.packet[3] = pmadd(pAlpha, accZ.packet[3], acc.packet[3]);
+}
+
+template<typename Packet>
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,1>& acc, PacketBlock<Packet,1>& accZ, const Packet& pAlpha)
+{
+  acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]);
+}
+
+template<typename Packet>
+EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha)
+{
+  acc.packet[0] = pmul<Packet>(accZ.packet[0], pAlpha);
+  acc.packet[1] = pmul<Packet>(accZ.packet[1], pAlpha);
+  acc.packet[2] = pmul<Packet>(accZ.packet[2], pAlpha);
+  acc.packet[3] = pmul<Packet>(accZ.packet[3], pAlpha);
+}
+
+template<typename Packet>
+EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock<Packet,1>& acc, PacketBlock<Packet,1>& accZ, const Packet& pAlpha)
+{
+  acc.packet[0] = pmul<Packet>(accZ.packet[0], pAlpha);
+}
+
+// Complex version of PacketBlock scaling.
+template<typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag)
+{
+  bscalec_common<Packet>(cReal, aReal, bReal);
+
+  bscalec_common<Packet>(cImag, aImag, bReal);
+
+  pger_common<Packet, true>(&cReal, bImag, aImag.packet);
+
+  pger_common<Packet, false>(&cImag, bImag, aReal.packet);
+}
+
+template<typename Packet>
+EIGEN_ALWAYS_INLINE void band(PacketBlock<Packet,4>& acc, const Packet& pMask)
+{
+  acc.packet[0] = pand(acc.packet[0], pMask);
+  acc.packet[1] = pand(acc.packet[1], pMask);
+  acc.packet[2] = pand(acc.packet[2], pMask);
+  acc.packet[3] = pand(acc.packet[3], pMask);
+}
+
+template<typename Packet>
+EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,4>& aReal, PacketBlock<Packet,4>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,4>& cReal, PacketBlock<Packet,4>& cImag, const Packet& pMask)
+{
+  band<Packet>(aReal, pMask);
+  band<Packet>(aImag, pMask);
+
+  bscalec<Packet,4>(aReal, aImag, bReal, bImag, cReal, cImag);
+}
+
+// Load a PacketBlock, the N parameters make tunning gemm easier so we can add more accumulators as needed.
+template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
+EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,4>& acc, const DataMapper& res, Index row, Index col)
+{
+  if (StorageOrder == RowMajor) {
+    acc.packet[0] = res.template loadPacket<Packet>(row + 0, col + N*accCols);
+    acc.packet[1] = res.template loadPacket<Packet>(row + 1, col + N*accCols);
+    acc.packet[2] = res.template loadPacket<Packet>(row + 2, col + N*accCols);
+    acc.packet[3] = res.template loadPacket<Packet>(row + 3, col + N*accCols);
+  } else {
+    acc.packet[0] = res.template loadPacket<Packet>(row + N*accCols, col + 0);
+    acc.packet[1] = res.template loadPacket<Packet>(row + N*accCols, col + 1);
+    acc.packet[2] = res.template loadPacket<Packet>(row + N*accCols, col + 2);
+    acc.packet[3] = res.template loadPacket<Packet>(row + N*accCols, col + 3);
+  }
+}
+
+// An overload of bload when you have a PacketBLock with 8 vectors.
+template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
+EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,8>& acc, const DataMapper& res, Index row, Index col)
+{
+  if (StorageOrder == RowMajor) {
+    acc.packet[0] = res.template loadPacket<Packet>(row + 0, col + N*accCols);
+    acc.packet[1] = res.template loadPacket<Packet>(row + 1, col + N*accCols);
+    acc.packet[2] = res.template loadPacket<Packet>(row + 2, col + N*accCols);
+    acc.packet[3] = res.template loadPacket<Packet>(row + 3, col + N*accCols);
+    acc.packet[4] = res.template loadPacket<Packet>(row + 0, col + (N+1)*accCols);
+    acc.packet[5] = res.template loadPacket<Packet>(row + 1, col + (N+1)*accCols);
+    acc.packet[6] = res.template loadPacket<Packet>(row + 2, col + (N+1)*accCols);
+    acc.packet[7] = res.template loadPacket<Packet>(row + 3, col + (N+1)*accCols);
+  } else {
+    acc.packet[0] = res.template loadPacket<Packet>(row + N*accCols, col + 0);
+    acc.packet[1] = res.template loadPacket<Packet>(row + N*accCols, col + 1);
+    acc.packet[2] = res.template loadPacket<Packet>(row + N*accCols, col + 2);
+    acc.packet[3] = res.template loadPacket<Packet>(row + N*accCols, col + 3);
+    acc.packet[4] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 0);
+    acc.packet[5] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 1);
+    acc.packet[6] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 2);
+    acc.packet[7] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 3);
+  }
+}
+
+template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
+EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,2>& acc, const DataMapper& res, Index row, Index col)
+{
+  acc.packet[0] = res.template loadPacket<Packet>(row + N*accCols, col + 0);
+  acc.packet[1] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 0);
+}
+
+const static Packet4i mask41 = { -1,  0,  0,  0 };
+const static Packet4i mask42 = { -1, -1,  0,  0 };
+const static Packet4i mask43 = { -1, -1, -1,  0 };
+
+const static Packet2l mask21 = { -1, 0 };
+
+template<typename Packet>
+EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows)
+{
+  if (remaining_rows == 0) {
+    return pset1<Packet>(float(0.0));  // Not used
+  } else {
+    switch (remaining_rows) {
+      case 1:  return Packet(mask41);
+      case 2:  return Packet(mask42);
+      default: return Packet(mask43);
+    }
+  }
+}
+
+template<>
+EIGEN_ALWAYS_INLINE Packet2d bmask<Packet2d>(const int remaining_rows)
+{
+  if (remaining_rows == 0) {
+    return pset1<Packet2d>(double(0.0));  // Not used
+  } else {
+    return Packet2d(mask21);
+  }
+}
+
+template<typename Packet>
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha, const Packet& pMask)
+{
+  band<Packet>(accZ, pMask);
+
+  bscale<Packet>(acc, accZ, pAlpha);
+}
+
+template<typename Packet>
+EIGEN_ALWAYS_INLINE void pbroadcast4_old(const __UNPACK_TYPE__(Packet)* a, Packet& a0, Packet& a1, Packet& a2, Packet& a3)
+{
+  pbroadcast4<Packet>(a, a0, a1, a2, a3);
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void pbroadcast4_old<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
+{
+  a1 = pload<Packet2d>(a);
+  a3 = pload<Packet2d>(a + 2);
+  a0 = vec_splat(a1, 0);
+  a1 = vec_splat(a1, 1);
+  a2 = vec_splat(a3, 0);
+  a3 = vec_splat(a3, 1);
+}
+
+// PEEL loop factor.
+#define PEEL 7
+
+template<typename Scalar, typename Packet, typename Index>
+EIGEN_ALWAYS_INLINE void MICRO_EXTRA_COL(
+  const Scalar* &lhs_ptr,
+  const Scalar* &rhs_ptr,
+  PacketBlock<Packet,1> &accZero,
+  Index remaining_rows,
+  Index remaining_cols)
+{
+  Packet rhsV[1];
+  rhsV[0] = pset1<Packet>(rhs_ptr[0]);
+  pger<1,Scalar, Packet, false>(&accZero, lhs_ptr, rhsV);
+  lhs_ptr += remaining_rows;
+  rhs_ptr += remaining_cols;
+}
+
+template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows>
+EIGEN_STRONG_INLINE void gemm_extra_col(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index row,
+  Index col,
+  Index remaining_rows,
+  Index remaining_cols,
+  const Packet& pAlpha)
+{
+  const Scalar* rhs_ptr = rhs_base;
+  const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA;
+  PacketBlock<Packet,1> accZero;
+
+  bsetzero<Scalar, Packet>(accZero);
+
+  Index remaining_depth = (depth & -accRows);
+  Index k = 0;
+  for(; k + PEEL <= remaining_depth; k+= PEEL)
+  {
+    EIGEN_POWER_PREFETCH(rhs_ptr);
+    EIGEN_POWER_PREFETCH(lhs_ptr);
+    for (int l = 0; l < PEEL; l++) {
+      MICRO_EXTRA_COL<Scalar, Packet, Index>(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols);
+    }
+  }
+  for(; k < remaining_depth; k++)
+  {
+    MICRO_EXTRA_COL<Scalar, Packet, Index>(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols);
+  }
+  for(; k < depth; k++)
+  {
+    Packet rhsV[1];
+    rhsV[0] = pset1<Packet>(rhs_ptr[0]);
+    pger<1, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows);
+    lhs_ptr += remaining_rows;
+    rhs_ptr += remaining_cols;
+  }
+
+  accZero.packet[0] = vec_mul(pAlpha, accZero.packet[0]);
+  for(Index i = 0; i < remaining_rows; i++) {
+    res(row + i, col) += accZero.packet[0][i];
+  }
+}
+
+template<typename Scalar, typename Packet, typename Index, const Index accRows>
+EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW(
+  const Scalar* &lhs_ptr,
+  const Scalar* &rhs_ptr,
+  PacketBlock<Packet,4> &accZero,
+  Index remaining_rows)
+{
+  Packet rhsV[4];
+  pbroadcast4<Packet>(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
+  pger<4, Scalar, Packet, false>(&accZero, lhs_ptr, rhsV);
+  lhs_ptr += remaining_rows;
+  rhs_ptr += accRows;
+}
+
+template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
+EIGEN_STRONG_INLINE void gemm_extra_row(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index row,
+  Index col,
+  Index rows,
+  Index cols,
+  Index remaining_rows,
+  const Packet& pAlpha,
+  const Packet& pMask)
+{
+  const Scalar* rhs_ptr = rhs_base;
+  const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA;
+  PacketBlock<Packet,4> accZero, acc;
+
+  bsetzero<Scalar, Packet>(accZero);
+
+  Index remaining_depth = (col + accRows < cols) ? depth : (depth & -accRows);
+  Index k = 0;
+  for(; k + PEEL <= remaining_depth; k+= PEEL)
+  {
+    EIGEN_POWER_PREFETCH(rhs_ptr);
+    EIGEN_POWER_PREFETCH(lhs_ptr);
+    for (int l = 0; l < PEEL; l++) {
+      MICRO_EXTRA_ROW<Scalar, Packet, Index, accRows>(lhs_ptr, rhs_ptr, accZero, remaining_rows);
+    }
+  }
+  for(; k < remaining_depth; k++)
+  {
+    MICRO_EXTRA_ROW<Scalar, Packet, Index, accRows>(lhs_ptr, rhs_ptr, accZero, remaining_rows);
+  }
+
+  if ((remaining_depth == depth) && (rows >= accCols))
+  {
+    for(Index j = 0; j < 4; j++) {
+      acc.packet[j] = res.template loadPacket<Packet>(row, col + j);
+    }
+    bscale<Packet>(acc, accZero, pAlpha, pMask);
+    res.template storePacketBlock<Packet,4>(row, col, acc);
+  } else {
+    for(; k < depth; k++)
+    {
+      Packet rhsV[4];
+      pbroadcast4<Packet>(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
+      pger<4, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows);
+      lhs_ptr += remaining_rows;
+      rhs_ptr += accRows;
+    }
+
+    for(Index j = 0; j < 4; j++) {
+      accZero.packet[j] = vec_mul(pAlpha, accZero.packet[j]);
+    }
+    for(Index j = 0; j < 4; j++) {
+      for(Index i = 0; i < remaining_rows; i++) {
+        res(row + i, col + j) += accZero.packet[j][i];
+      }
+    }
+  }
+}
+
+#define MICRO_UNROLL(func) \
+  func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
+
+#define MICRO_UNROLL_WORK(func, func2, peel) \
+    MICRO_UNROLL(func2); \
+    func(0,peel) func(1,peel) func(2,peel) func(3,peel) \
+    func(4,peel) func(5,peel) func(6,peel) func(7,peel)
+
+#define MICRO_LOAD_ONE(iter) \
+  if (unroll_factor > iter) { \
+    lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr##iter); \
+    lhs_ptr##iter += accCols; \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
+  }
+
+#define MICRO_WORK_ONE(iter, peel) \
+  if (unroll_factor > iter) { \
+    pger_common<Packet, false>(&accZero##iter, lhsV##iter, rhsV##peel); \
+  }
+
+#define MICRO_TYPE_PEEL4(func, func2, peel) \
+  if (PEEL > peel) { \
+    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
+    pbroadcast4<Packet>(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
+    MICRO_UNROLL_WORK(func, func2, peel) \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
+  }
+
+#define MICRO_TYPE_PEEL1(func, func2, peel) \
+  if (PEEL > peel) { \
+    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
+    rhsV##peel[0] = pset1<Packet>(rhs_ptr[remaining_cols * peel]); \
+    MICRO_UNROLL_WORK(func, func2, peel) \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
+  }
+
+#define MICRO_UNROLL_TYPE_PEEL(M, func, func1, func2) \
+  Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \
+  func(func1,func2,0); func(func1,func2,1); \
+  func(func1,func2,2); func(func1,func2,3); \
+  func(func1,func2,4); func(func1,func2,5); \
+  func(func1,func2,6); func(func1,func2,7); \
+  func(func1,func2,8); func(func1,func2,9);
+
+#define MICRO_UNROLL_TYPE_ONE(M, func, func1, func2) \
+  Packet rhsV0[M]; \
+  func(func1,func2,0);
+
+#define MICRO_ONE_PEEL4 \
+  MICRO_UNROLL_TYPE_PEEL(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
+  rhs_ptr += (accRows * PEEL);
+
+#define MICRO_ONE4 \
+  MICRO_UNROLL_TYPE_ONE(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
+  rhs_ptr += accRows;
+
+#define MICRO_ONE_PEEL1 \
+  MICRO_UNROLL_TYPE_PEEL(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
+  rhs_ptr += (remaining_cols * PEEL);
+
+#define MICRO_ONE1 \
+  MICRO_UNROLL_TYPE_ONE(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
+  rhs_ptr += remaining_cols;
+
+#define MICRO_DST_PTR_ONE(iter) \
+  if (unroll_factor > iter) { \
+    bsetzero<Scalar, Packet>(accZero##iter); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(accZero##iter); \
+  }
+
+#define MICRO_DST_PTR MICRO_UNROLL(MICRO_DST_PTR_ONE)
+
+#define MICRO_SRC_PTR_ONE(iter) \
+  if (unroll_factor > iter) { \
+    lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
+  }
+
+#define MICRO_SRC_PTR MICRO_UNROLL(MICRO_SRC_PTR_ONE)
+
+#define MICRO_PREFETCH_ONE(iter) \
+  if (unroll_factor > iter) { \
+    EIGEN_POWER_PREFETCH(lhs_ptr##iter); \
+  }
+
+#define MICRO_PREFETCH MICRO_UNROLL(MICRO_PREFETCH_ONE)
+
+#define MICRO_STORE_ONE(iter) \
+  if (unroll_factor > iter) { \
+    acc.packet[0] = res.template loadPacket<Packet>(row + iter*accCols, col + 0); \
+    acc.packet[1] = res.template loadPacket<Packet>(row + iter*accCols, col + 1); \
+    acc.packet[2] = res.template loadPacket<Packet>(row + iter*accCols, col + 2); \
+    acc.packet[3] = res.template loadPacket<Packet>(row + iter*accCols, col + 3); \
+    bscale<Packet>(acc, accZero##iter, pAlpha); \
+    res.template storePacketBlock<Packet,4>(row + iter*accCols, col, acc); \
+  }
+
+#define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE)
+
+#define MICRO_COL_STORE_ONE(iter) \
+  if (unroll_factor > iter) { \
+    acc.packet[0] = res.template loadPacket<Packet>(row + iter*accCols, col + 0); \
+    bscale<Packet>(acc, accZero##iter, pAlpha); \
+    res.template storePacketBlock<Packet,1>(row + iter*accCols, col, acc); \
+  }
+
+#define MICRO_COL_STORE MICRO_UNROLL(MICRO_COL_STORE_ONE)
+
+template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
+EIGEN_STRONG_INLINE void gemm_unrolled_iteration(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index& row,
+  Index col,
+  const Packet& pAlpha)
+{
+  const Scalar* rhs_ptr = rhs_base;
+  const Scalar* lhs_ptr0 = NULL, *  lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
+  PacketBlock<Packet,4> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
+  PacketBlock<Packet,4> acc;
+
+  MICRO_SRC_PTR
+  MICRO_DST_PTR
+
+  Index k = 0;
+  for(; k + PEEL <= depth; k+= PEEL)
+  {
+    EIGEN_POWER_PREFETCH(rhs_ptr);
+    MICRO_PREFETCH
+    MICRO_ONE_PEEL4
+  }
+  for(; k < depth; k++)
+  {
+    MICRO_ONE4
+  }
+  MICRO_STORE
+
+  row += unroll_factor*accCols;
+}
+
+template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accCols>
+EIGEN_STRONG_INLINE void gemm_unrolled_col_iteration(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index& row,
+  Index col,
+  Index remaining_cols,
+  const Packet& pAlpha)
+{
+  const Scalar* rhs_ptr = rhs_base;
+  const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, *lhs_ptr7 = NULL;
+  PacketBlock<Packet,1> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
+  PacketBlock<Packet,1> acc;
+
+  MICRO_SRC_PTR
+  MICRO_DST_PTR
+
+  Index k = 0;
+  for(; k + PEEL <= depth; k+= PEEL)
+  {
+    EIGEN_POWER_PREFETCH(rhs_ptr);
+    MICRO_PREFETCH
+    MICRO_ONE_PEEL1
+  }
+  for(; k < depth; k++)
+  {
+    MICRO_ONE1
+  }
+  MICRO_COL_STORE
+
+  row += unroll_factor*accCols;
+}
+
+template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accCols>
+EIGEN_STRONG_INLINE void gemm_unrolled_col(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index& row,
+  Index rows,
+  Index col,
+  Index remaining_cols,
+  const Packet& pAlpha)
+{
+#define MAX_UNROLL 6
+  while(row + MAX_UNROLL*accCols <= rows) {
+    gemm_unrolled_col_iteration<MAX_UNROLL, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
+  }
+  switch( (rows-row)/accCols ) {
+#if MAX_UNROLL > 7
+    case 7:
+      gemm_unrolled_col_iteration<7, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
+      break;
+#endif
+#if MAX_UNROLL > 6
+    case 6:
+      gemm_unrolled_col_iteration<6, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
+      break;
+#endif
+#if MAX_UNROLL > 5
+   case 5:
+      gemm_unrolled_col_iteration<5, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
+      break;
+#endif
+#if MAX_UNROLL > 4
+   case 4:
+      gemm_unrolled_col_iteration<4, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
+      break;
+#endif
+#if MAX_UNROLL > 3
+   case 3:
+     gemm_unrolled_col_iteration<3, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
+     break;
+#endif
+#if MAX_UNROLL > 2
+   case 2:
+     gemm_unrolled_col_iteration<2, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
+     break;
+#endif
+#if MAX_UNROLL > 1
+   case 1:
+     gemm_unrolled_col_iteration<1, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
+     break;
+#endif
+   default:
+     break;
+  }
+#undef MAX_UNROLL
+}
+
+/****************
+ * GEMM kernels *
+ * **************/
+template<typename Scalar, typename Index, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
+EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
+{
+      const Index remaining_rows = rows % accCols;
+      const Index remaining_cols = cols % accRows;
+
+      if( strideA == -1 ) strideA = depth;
+      if( strideB == -1 ) strideB = depth;
+
+      const Packet pAlpha = pset1<Packet>(alpha);
+      const Packet pMask  = bmask<Packet>((const int)(remaining_rows));
+
+      Index col = 0;
+      for(; col + accRows <= cols; col += accRows)
+      {
+        const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB;
+        const Scalar* lhs_base = blockA;
+        Index row = 0;
+
+#define MAX_UNROLL 6
+        while(row + MAX_UNROLL*accCols <= rows) {
+          gemm_unrolled_iteration<MAX_UNROLL, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+        }
+        switch( (rows-row)/accCols ) {
+#if MAX_UNROLL > 7
+          case 7:
+            gemm_unrolled_iteration<7, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
+#endif
+#if MAX_UNROLL > 6
+          case 6:
+            gemm_unrolled_iteration<6, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
+#endif
+#if MAX_UNROLL > 5
+          case 5:
+            gemm_unrolled_iteration<5, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
+#endif
+#if MAX_UNROLL > 4
+          case 4:
+            gemm_unrolled_iteration<4, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
+#endif
+#if MAX_UNROLL > 3
+          case 3:
+            gemm_unrolled_iteration<3, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
+#endif
+#if MAX_UNROLL > 2
+          case 2:
+            gemm_unrolled_iteration<2, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
+#endif
+#if MAX_UNROLL > 1
+          case 1:
+            gemm_unrolled_iteration<1, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
+#endif
+          default:
+            break;
+        }
+#undef MAX_UNROLL
+
+        if(remaining_rows > 0)
+        {
+          gemm_extra_row<Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask);
+        }
+    }
+
+    if(remaining_cols > 0)
+    {
+      const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB;
+      const Scalar* lhs_base = blockA;
+
+      for(; col < cols; col++)
+      {
+        Index row = 0;
+
+        gemm_unrolled_col<Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha);
+
+        if (remaining_rows > 0)
+        {
+          gemm_extra_col<Scalar, Packet, DataMapper, Index, accRows>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha);
+        }
+        rhs_base++;
+      }
+    }
+}
+
+#define accColsC (accCols / 2)
+#define advanceRows ((LhsIsReal) ? 1 : 2)
+#define advanceCols ((RhsIsReal) ? 1 : 2)
+
+// PEEL_COMPLEX loop factor.
+#define PEEL_COMPLEX 3
+
+template<typename Scalar, typename Packet, typename Index, const Index accRows, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_COL(
+  const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag,
+  const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag,
+  PacketBlock<Packet,1> &accReal, PacketBlock<Packet,1> &accImag,
+  Index remaining_rows,
+  Index remaining_cols)
+{
+  Packet rhsV[1], rhsVi[1];
+  rhsV[0] = pset1<Packet>(rhs_ptr_real[0]);
+  if(!RhsIsReal) rhsVi[0] = pset1<Packet>(rhs_ptr_imag[0]);
+  pgerc<1, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi);
+  lhs_ptr_real += remaining_rows;
+  if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
+  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+  rhs_ptr_real += remaining_cols;
+  if(!RhsIsReal) rhs_ptr_imag += remaining_cols;
+  else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
+}
+
+template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex_extra_col(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index row,
+  Index col,
+  Index remaining_rows,
+  Index remaining_cols,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag)
+{
+  const Scalar* rhs_ptr_real = rhs_base;
+  const Scalar* rhs_ptr_imag;
+  if(!RhsIsReal) rhs_ptr_imag = rhs_base + remaining_cols*strideB;
+  else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
+  const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA;
+  const Scalar* lhs_ptr_imag;
+  if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA;
+  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+  PacketBlock<Packet,1> accReal, accImag;
+  PacketBlock<Packet,1> taccReal, taccImag;
+  PacketBlock<Packetc,1> acc0, acc1;
+
+  bsetzero<Scalar, Packet>(accReal);
+  bsetzero<Scalar, Packet>(accImag);
+
+  Index remaining_depth = (depth & -accRows);
+  Index k = 0;
+  for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX)
+  {
+    EIGEN_POWER_PREFETCH(rhs_ptr_real);
+    if(!RhsIsReal) {
+      EIGEN_POWER_PREFETCH(rhs_ptr_imag);
+    }
+    EIGEN_POWER_PREFETCH(lhs_ptr_real);
+    if(!LhsIsReal) {
+      EIGEN_POWER_PREFETCH(lhs_ptr_imag);
+    }
+    for (int l = 0; l < PEEL_COMPLEX; l++) {
+      MICRO_COMPLEX_EXTRA_COL<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols);
+    }
+  }
+  for(; k < remaining_depth; k++)
+  {
+    MICRO_COMPLEX_EXTRA_COL<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols);
+  }
+
+  for(; k < depth; k++)
+  {
+    Packet rhsV[1], rhsVi[1];
+    rhsV[0] = pset1<Packet>(rhs_ptr_real[0]);
+    if(!RhsIsReal) rhsVi[0] = pset1<Packet>(rhs_ptr_imag[0]);
+    pgerc<1, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows);
+    lhs_ptr_real += remaining_rows;
+    if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
+    rhs_ptr_real += remaining_cols;
+    if(!RhsIsReal) rhs_ptr_imag += remaining_cols;
+  }
+
+  bscalec<Packet,1>(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag);
+  bcouple_common<Packet, Packetc>(taccReal, taccImag, acc0, acc1);
+
+  if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1))
+  {
+    res(row + 0, col + 0) += pfirst<Packetc>(acc0.packet[0]);
+  } else {
+    acc0.packet[0] += res.template loadPacket<Packetc>(row + 0, col + 0);
+    res.template storePacketBlock<Packetc,1>(row + 0, col + 0, acc0);
+    if(remaining_rows > accColsC) {
+      res(row + accColsC, col + 0) += pfirst<Packetc>(acc1.packet[0]);
+    }
+  }
+}
+
+template<typename Scalar, typename Packet, typename Index, const Index accRows, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW(
+  const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag,
+  const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag,
+  PacketBlock<Packet,4> &accReal, PacketBlock<Packet,4> &accImag,
+  Index remaining_rows)
+{
+  Packet rhsV[4], rhsVi[4];
+  pbroadcast4_old<Packet>(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
+  if(!RhsIsReal) pbroadcast4_old<Packet>(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]);
+  pgerc<4, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi);
+  lhs_ptr_real += remaining_rows;
+  if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
+  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+  rhs_ptr_real += accRows;
+  if(!RhsIsReal) rhs_ptr_imag += accRows;
+  else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
+}
+
+template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex_extra_row(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index row,
+  Index col,
+  Index rows,
+  Index cols,
+  Index remaining_rows,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag,
+  const Packet& pMask)
+{
+  const Scalar* rhs_ptr_real = rhs_base;
+  const Scalar* rhs_ptr_imag;
+  if(!RhsIsReal) rhs_ptr_imag = rhs_base + accRows*strideB;
+  else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
+  const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA;
+  const Scalar* lhs_ptr_imag;
+  if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA;
+  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+  PacketBlock<Packet,4> accReal, accImag;
+  PacketBlock<Packet,4> taccReal, taccImag;
+  PacketBlock<Packetc,4> acc0, acc1;
+  PacketBlock<Packetc,8> tRes;
+
+  bsetzero<Scalar, Packet>(accReal);
+  bsetzero<Scalar, Packet>(accImag);
+
+  Index remaining_depth = (col + accRows < cols) ? depth : (depth & -accRows);
+  Index k = 0;
+  for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX)
+  {
+    EIGEN_POWER_PREFETCH(rhs_ptr_real);
+    if(!RhsIsReal) {
+      EIGEN_POWER_PREFETCH(rhs_ptr_imag);
+    }
+    EIGEN_POWER_PREFETCH(lhs_ptr_real);
+    if(!LhsIsReal) {
+      EIGEN_POWER_PREFETCH(lhs_ptr_imag);
+    }
+    for (int l = 0; l < PEEL_COMPLEX; l++) {
+      MICRO_COMPLEX_EXTRA_ROW<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows);
+    }
+  }
+  for(; k < remaining_depth; k++)
+  {
+    MICRO_COMPLEX_EXTRA_ROW<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows);
+  }
+
+  if ((remaining_depth == depth) && (rows >= accCols))
+  {
+    bload<DataMapper, Packetc, Index, accColsC, 0, ColMajor>(tRes, res, row, col);
+    bscalec<Packet>(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask);
+    bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc0, acc1);
+    res.template storePacketBlock<Packetc,4>(row + 0, col, acc0);
+    res.template storePacketBlock<Packetc,4>(row + accColsC, col, acc1);
+  } else {
+    for(; k < depth; k++)
+    {
+      Packet rhsV[4], rhsVi[4];
+      pbroadcast4_old<Packet>(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
+      if(!RhsIsReal) pbroadcast4_old<Packet>(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]);
+      pgerc<4, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows);
+      lhs_ptr_real += remaining_rows;
+      if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
+      rhs_ptr_real += accRows;
+      if(!RhsIsReal) rhs_ptr_imag += accRows;
+    }
+
+    bscalec<Packet,4>(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag);
+    bcouple_common<Packet, Packetc>(taccReal, taccImag, acc0, acc1);
+
+    if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1))
+    {
+      for(Index j = 0; j < 4; j++) {
+        res(row + 0, col + j) += pfirst<Packetc>(acc0.packet[j]);
+      }
+    } else {
+      for(Index j = 0; j < 4; j++) {
+        PacketBlock<Packetc,1> acc2;
+        acc2.packet[0] = res.template loadPacket<Packetc>(row + 0, col + j) + acc0.packet[j];
+        res.template storePacketBlock<Packetc,1>(row + 0, col + j, acc2);
+        if(remaining_rows > accColsC) {
+          res(row + accColsC, col + j) += pfirst<Packetc>(acc1.packet[j]);
+        }
+      }
+    }
+  }
+}
+
+#define MICRO_COMPLEX_UNROLL(func) \
+  func(0) func(1) func(2) func(3) func(4)
+
+#define MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
+    MICRO_COMPLEX_UNROLL(func2); \
+    func(0,peel) func(1,peel) func(2,peel) func(3,peel) func(4,peel)
+
+#define MICRO_COMPLEX_LOAD_ONE(iter) \
+  if (unroll_factor > iter) { \
+    lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr_real##iter); \
+    lhs_ptr_real##iter += accCols; \
+    if(!LhsIsReal) { \
+      lhsVi##iter = ploadLhs<Scalar, Packet>(lhs_ptr_imag##iter); \
+      lhs_ptr_imag##iter += accCols; \
+    } else { \
+      EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
+    } \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
+    EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
+  }
+
+#define MICRO_COMPLEX_WORK_ONE4(iter, peel) \
+  if (unroll_factor > iter) { \
+    pgerc_common<4, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
+  }
+
+#define MICRO_COMPLEX_WORK_ONE1(iter, peel) \
+  if (unroll_factor > iter) { \
+    pgerc_common<1, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
+  }
+
+#define MICRO_COMPLEX_TYPE_PEEL4(func, func2, peel) \
+  if (PEEL_COMPLEX > peel) { \
+    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \
+    Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \
+    pbroadcast4_old<Packet>(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
+    if(!RhsIsReal) { \
+      pbroadcast4_old<Packet>(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \
+    } else { \
+      EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
+    } \
+    MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
+    EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
+  }
+
+#define MICRO_COMPLEX_TYPE_PEEL1(func, func2, peel) \
+  if (PEEL_COMPLEX > peel) { \
+    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \
+    Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \
+    rhsV##peel[0] = pset1<Packet>(rhs_ptr_real[remaining_cols * peel]); \
+    if(!RhsIsReal) { \
+      rhsVi##peel[0] = pset1<Packet>(rhs_ptr_imag[remaining_cols * peel]); \
+    } else { \
+      EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
+    } \
+    MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
+    EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
+  }
+
+#define MICRO_COMPLEX_UNROLL_TYPE_PEEL(M, func, func1, func2) \
+  Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \
+  Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M], rhsVi4[M], rhsVi5[M], rhsVi6[M], rhsVi7[M], rhsVi8[M], rhsVi9[M]; \
+  func(func1,func2,0); func(func1,func2,1); \
+  func(func1,func2,2); func(func1,func2,3); \
+  func(func1,func2,4); func(func1,func2,5); \
+  func(func1,func2,6); func(func1,func2,7); \
+  func(func1,func2,8); func(func1,func2,9);
+
+#define MICRO_COMPLEX_UNROLL_TYPE_ONE(M, func, func1, func2) \
+  Packet rhsV0[M], rhsVi0[M];\
+  func(func1,func2,0);
+
+#define MICRO_COMPLEX_ONE_PEEL4 \
+  MICRO_COMPLEX_UNROLL_TYPE_PEEL(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE); \
+  rhs_ptr_real += (accRows * PEEL_COMPLEX); \
+  if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX);
+
+#define MICRO_COMPLEX_ONE4 \
+  MICRO_COMPLEX_UNROLL_TYPE_ONE(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE); \
+  rhs_ptr_real += accRows; \
+  if(!RhsIsReal) rhs_ptr_imag += accRows;
+
+#define MICRO_COMPLEX_ONE_PEEL1 \
+  MICRO_COMPLEX_UNROLL_TYPE_PEEL(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \
+  rhs_ptr_real += (remaining_cols * PEEL_COMPLEX); \
+  if(!RhsIsReal) rhs_ptr_imag += (remaining_cols * PEEL_COMPLEX);
+
+#define MICRO_COMPLEX_ONE1 \
+  MICRO_COMPLEX_UNROLL_TYPE_ONE(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \
+  rhs_ptr_real += remaining_cols; \
+  if(!RhsIsReal) rhs_ptr_imag += remaining_cols;
+
+#define MICRO_COMPLEX_DST_PTR_ONE(iter) \
+  if (unroll_factor > iter) { \
+    bsetzero<Scalar, Packet>(accReal##iter); \
+    bsetzero<Scalar, Packet>(accImag##iter); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(accReal##iter); \
+    EIGEN_UNUSED_VARIABLE(accImag##iter); \
+  }
+
+#define MICRO_COMPLEX_DST_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_DST_PTR_ONE)
+
+#define MICRO_COMPLEX_SRC_PTR_ONE(iter) \
+  if (unroll_factor > iter) { \
+    lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \
+    if(!LhsIsReal) { \
+      lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \
+    } else { \
+      EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \
+    } \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \
+    EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \
+  }
+
+#define MICRO_COMPLEX_SRC_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_SRC_PTR_ONE)
+
+#define MICRO_COMPLEX_PREFETCH_ONE(iter) \
+  if (unroll_factor > iter) { \
+    EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \
+    if(!LhsIsReal) { \
+      EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \
+    } \
+  }
+
+#define MICRO_COMPLEX_PREFETCH MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_PREFETCH_ONE)
+
+#define MICRO_COMPLEX_STORE_ONE(iter) \
+  if (unroll_factor > iter) { \
+    bload<DataMapper, Packetc, Index, accColsC, 0, ColMajor>(tRes, res, row + iter*accCols, col); \
+    bscalec<Packet,4>(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \
+    bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc0, acc1); \
+    res.template storePacketBlock<Packetc,4>(row + iter*accCols + 0, col, acc0); \
+    res.template storePacketBlock<Packetc,4>(row + iter*accCols + accColsC, col, acc1); \
+  }
+
+#define MICRO_COMPLEX_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_STORE_ONE)
+
+#define MICRO_COMPLEX_COL_STORE_ONE(iter) \
+  if (unroll_factor > iter) { \
+    bload<DataMapper, Packetc, Index, accColsC, 0, ColMajor>(tRes, res, row + iter*accCols, col); \
+    bscalec<Packet,1>(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \
+    bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc0, acc1); \
+    res.template storePacketBlock<Packetc,1>(row + iter*accCols + 0, col, acc0); \
+    res.template storePacketBlock<Packetc,1>(row + iter*accCols + accColsC, col, acc1); \
+  }
+
+#define MICRO_COMPLEX_COL_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_COL_STORE_ONE)
+
+template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index& row,
+  Index col,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag)
+{
+  const Scalar* rhs_ptr_real = rhs_base;
+  const Scalar* rhs_ptr_imag;
+  if(!RhsIsReal) {
+    rhs_ptr_imag = rhs_base + accRows*strideB;
+  } else {
+    EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
+  }
+  const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL;
+  const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL;
+  const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL;
+  PacketBlock<Packet,4> accReal0, accImag0, accReal1, accImag1;
+  PacketBlock<Packet,4> accReal2, accImag2, accReal3, accImag3;
+  PacketBlock<Packet,4> accReal4, accImag4;
+  PacketBlock<Packet,4> taccReal, taccImag;
+  PacketBlock<Packetc,4> acc0, acc1;
+  PacketBlock<Packetc,8> tRes;
+
+  MICRO_COMPLEX_SRC_PTR
+  MICRO_COMPLEX_DST_PTR
+
+  Index k = 0;
+  for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX)
+  {
+    EIGEN_POWER_PREFETCH(rhs_ptr_real);
+    if(!RhsIsReal) {
+      EIGEN_POWER_PREFETCH(rhs_ptr_imag);
+    }
+    MICRO_COMPLEX_PREFETCH
+    MICRO_COMPLEX_ONE_PEEL4
+  }
+  for(; k < depth; k++)
+  {
+    MICRO_COMPLEX_ONE4
+  }
+  MICRO_COMPLEX_STORE
+
+  row += unroll_factor*accCols;
+}
+
+template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex_unrolled_col_iteration(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index& row,
+  Index col,
+  Index remaining_cols,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag)
+{
+  const Scalar* rhs_ptr_real = rhs_base;
+  const Scalar* rhs_ptr_imag;
+  if(!RhsIsReal) {
+    rhs_ptr_imag = rhs_base + remaining_cols*strideB;
+  } else {
+    EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
+  }
+  const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL;
+  const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL;
+  const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL;
+  PacketBlock<Packet,1> accReal0, accImag0, accReal1, accImag1;
+  PacketBlock<Packet,1> accReal2, accImag2, accReal3, accImag3;
+  PacketBlock<Packet,1> accReal4, accImag4;
+  PacketBlock<Packet,1> taccReal, taccImag;
+  PacketBlock<Packetc,1> acc0, acc1;
+  PacketBlock<Packetc,2> tRes;
+
+  MICRO_COMPLEX_SRC_PTR
+  MICRO_COMPLEX_DST_PTR
+
+  Index k = 0;
+  for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX)
+  {
+    EIGEN_POWER_PREFETCH(rhs_ptr_real);
+    if(!RhsIsReal) {
+      EIGEN_POWER_PREFETCH(rhs_ptr_imag);
+    }
+    MICRO_COMPLEX_PREFETCH
+    MICRO_COMPLEX_ONE_PEEL1
+  }
+  for(; k < depth; k++)
+  {
+    MICRO_COMPLEX_ONE1
+  }
+  MICRO_COMPLEX_COL_STORE
+
+  row += unroll_factor*accCols;
+}
+
+template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex_unrolled_col(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index& row,
+  Index rows,
+  Index col,
+  Index remaining_cols,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag)
+{
+#define MAX_COMPLEX_UNROLL 3
+  while(row + MAX_COMPLEX_UNROLL*accCols <= rows) {
+    gemm_complex_unrolled_col_iteration<MAX_COMPLEX_UNROLL, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag);
+  }
+  switch( (rows-row)/accCols ) {
+#if MAX_COMPLEX_UNROLL > 4
+   case 4:
+     gemm_complex_unrolled_col_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag);
+     break;
+#endif
+#if MAX_COMPLEX_UNROLL > 3
+   case 3:
+     gemm_complex_unrolled_col_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag);
+     break;
+#endif
+#if MAX_COMPLEX_UNROLL > 2
+   case 2:
+     gemm_complex_unrolled_col_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag);
+     break;
+#endif
+#if MAX_COMPLEX_UNROLL > 1
+   case 1:
+     gemm_complex_unrolled_col_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag);
+     break;
+#endif
+   default:
+     break;
+  }
+#undef MAX_COMPLEX_UNROLL
+}
+
+template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Index, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
+{
+      const Index remaining_rows = rows % accCols;
+      const Index remaining_cols = cols % accRows;
+
+      if( strideA == -1 ) strideA = depth;
+      if( strideB == -1 ) strideB = depth;
+
+      const Packet pAlphaReal = pset1<Packet>(alpha.real());
+      const Packet pAlphaImag = pset1<Packet>(alpha.imag());
+      const Packet pMask = bmask<Packet>((const int)(remaining_rows));
+
+      const Scalar* blockA = (Scalar *) blockAc;
+      const Scalar* blockB = (Scalar *) blockBc;
+
+      Index col = 0;
+      for(; col + accRows <= cols; col += accRows)
+      {
+        const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB;
+        const Scalar* lhs_base = blockA;
+        Index row = 0;
+
+#define MAX_COMPLEX_UNROLL 3
+        while(row + MAX_COMPLEX_UNROLL*accCols <= rows) {
+          gemm_complex_unrolled_iteration<MAX_COMPLEX_UNROLL, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
+        }
+        switch( (rows-row)/accCols ) {
+#if MAX_COMPLEX_UNROLL > 4
+          case 4:
+            gemm_complex_unrolled_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
+            break;
+#endif
+#if MAX_COMPLEX_UNROLL > 3
+          case 3:
+            gemm_complex_unrolled_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
+            break;
+#endif
+#if MAX_COMPLEX_UNROLL > 2
+          case 2:
+            gemm_complex_unrolled_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
+            break;
+#endif
+#if MAX_COMPLEX_UNROLL > 1
+          case 1:
+            gemm_complex_unrolled_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
+            break;
+#endif
+          default:
+            break;
+        }
+#undef MAX_COMPLEX_UNROLL
+
+        if(remaining_rows > 0)
+        {
+          gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+        }
+      }
+
+      if(remaining_cols > 0)
+      {
+        const Scalar* rhs_base = blockB + advanceCols*col*strideB + remaining_cols*offsetB;
+        const Scalar* lhs_base = blockA;
+
+        for(; col < cols; col++)
+        {
+          Index row = 0;
+
+          gemm_complex_unrolled_col<Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, col, remaining_cols, pAlphaReal, pAlphaImag);
+
+          if (remaining_rows > 0)
+          {
+            gemm_complex_extra_col<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag);
+          }
+          rhs_base++;
+        }
+      }
+}
+
+#undef accColsC
+#undef advanceCols
+#undef advanceRows
+
+/************************************
+ * ppc64le template specializations *
+ * **********************************/
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+{
+  void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+  ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+    dhs_pack<double, Index, DataMapper, Packet2d, ColMajor, PanelMode, true> pack;
+    pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+{
+  void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+  ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+    dhs_pack<double, Index, DataMapper, Packet2d, RowMajor, PanelMode, true> pack;
+    pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+#if EIGEN_ALTIVEC_USE_CUSTOM_PACK
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<double, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+{
+  void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<double, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+  ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+  dhs_pack<double, Index, DataMapper, Packet2d, ColMajor, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<double, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+{
+  void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<double, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+  ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+  dhs_pack<double, Index, DataMapper, Packet2d, RowMajor, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+#endif
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+{
+  void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+  ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+  dhs_pack<float, Index, DataMapper, Packet4f, RowMajor, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+{
+  void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+  ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+  dhs_pack<float, Index, DataMapper, Packet4f, ColMajor, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+{
+  void operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+  ::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+  dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+{
+  void operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+  ::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+  dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+#if EIGEN_ALTIVEC_USE_CUSTOM_PACK
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+{
+  void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+  ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+  dhs_pack<float, Index, DataMapper, Packet4f, ColMajor, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+{
+  void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+  ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+  dhs_pack<float, Index, DataMapper, Packet4f, RowMajor, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+#endif
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+{
+  void operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+  ::operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+  dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+{
+  void operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+  ::operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+  dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+{
+  void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+  ::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+  dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+{
+  void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+  ::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+  dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+{
+  void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+  ::operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+  dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+{
+  void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+  ::operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+  dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+
+// ********* gebp specializations *********
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+{
+  typedef typename quad_traits<float>::vectortype   Packet;
+  typedef typename quad_traits<float>::rhstype      RhsPacket;
+
+  void operator()(const DataMapper& res, const float* blockA, const float* blockB,
+                  Index rows, Index depth, Index cols, float alpha,
+                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+};
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+  ::operator()(const DataMapper& res, const float* blockA, const float* blockB,
+               Index rows, Index depth, Index cols, float alpha,
+               Index strideA, Index strideB, Index offsetA, Index offsetB)
+  {
+    const Index accRows = quad_traits<float>::rows;
+    const Index accCols = quad_traits<float>::size;
+    void (*gemm_function)(const DataMapper&, const float*, const float*, Index, Index, Index, float, Index, Index, Index, Index);
+
+    #ifdef EIGEN_ALTIVEC_MMA_ONLY
+      //generate with MMA only
+      gemm_function = &Eigen::internal::gemmMMA<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
+    #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+      if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
+        gemm_function = &Eigen::internal::gemmMMA<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
+      }
+      else{
+        gemm_function = &Eigen::internal::gemm<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
+      }
+    #else
+      gemm_function = &Eigen::internal::gemm<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
+    #endif
+      gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+  }
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+{
+  typedef Packet4f   Packet;
+  typedef Packet2cf  Packetc;
+  typedef Packet4f   RhsPacket;
+
+  void operator()(const DataMapper& res, const std::complex<float>* blockA, const std::complex<float>* blockB,
+                  Index rows, Index depth, Index cols, std::complex<float> alpha,
+                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+};
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+  ::operator()(const DataMapper& res, const std::complex<float>* blockA, const std::complex<float>* blockB,
+               Index rows, Index depth, Index cols, std::complex<float> alpha,
+               Index strideA, Index strideB, Index offsetA, Index offsetB)
+  {
+    const Index accRows = quad_traits<float>::rows;
+    const Index accCols = quad_traits<float>::size;
+    void (*gemm_function)(const DataMapper&, const std::complex<float>*, const std::complex<float>*,
+          Index, Index, Index, std::complex<float>, Index, Index, Index, Index);
+
+    #ifdef EIGEN_ALTIVEC_MMA_ONLY
+       //generate with MMA only
+       gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
+     #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+       if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
+         gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
+       }
+       else{
+         gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
+       }
+     #else
+       gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
+     #endif
+      gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+  }
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<float, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+{
+  typedef Packet4f   Packet;
+  typedef Packet2cf  Packetc;
+  typedef Packet4f   RhsPacket;
+
+  void operator()(const DataMapper& res, const float* blockA, const std::complex<float>* blockB,
+                  Index rows, Index depth, Index cols, std::complex<float> alpha,
+                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+};
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<float, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+  ::operator()(const DataMapper& res, const float* blockA, const std::complex<float>* blockB,
+               Index rows, Index depth, Index cols, std::complex<float> alpha,
+               Index strideA, Index strideB, Index offsetA, Index offsetB)
+  {
+    const Index accRows = quad_traits<float>::rows;
+    const Index accCols = quad_traits<float>::size;
+    void (*gemm_function)(const DataMapper&, const float*, const std::complex<float>*,
+          Index, Index, Index, std::complex<float>, Index, Index, Index, Index);
+    #ifdef EIGEN_ALTIVEC_MMA_ONLY
+       //generate with MMA only
+       gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
+     #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+       if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
+         gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
+       }
+       else{
+         gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
+       }
+     #else
+       gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
+     #endif
+       gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+  }
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<std::complex<float>, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+{
+  typedef Packet4f   Packet;
+  typedef Packet2cf  Packetc;
+  typedef Packet4f   RhsPacket;
+
+  void operator()(const DataMapper& res, const std::complex<float>* blockA, const float* blockB,
+                  Index rows, Index depth, Index cols, std::complex<float> alpha,
+                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+};
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<std::complex<float>, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+  ::operator()(const DataMapper& res, const std::complex<float>* blockA, const float* blockB,
+               Index rows, Index depth, Index cols, std::complex<float> alpha,
+               Index strideA, Index strideB, Index offsetA, Index offsetB)
+  {
+    const Index accRows = quad_traits<float>::rows;
+    const Index accCols = quad_traits<float>::size;
+    void (*gemm_function)(const DataMapper&, const std::complex<float>*, const float*,
+          Index, Index, Index, std::complex<float>, Index, Index, Index, Index);
+    #ifdef EIGEN_ALTIVEC_MMA_ONLY
+       //generate with MMA only
+       gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
+     #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+       if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
+         gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
+       }
+       else{
+         gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
+       }
+     #else
+       gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
+     #endif
+       gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+  }
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+{
+  typedef typename quad_traits<double>::vectortype  Packet;
+  typedef typename quad_traits<double>::rhstype     RhsPacket;
+
+  void operator()(const DataMapper& res, const double* blockA, const double* blockB,
+                  Index rows, Index depth, Index cols, double alpha,
+                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+};
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+  ::operator()(const DataMapper& res, const double* blockA, const double* blockB,
+               Index rows, Index depth, Index cols, double alpha,
+               Index strideA, Index strideB, Index offsetA, Index offsetB)
+  {
+    const Index accRows = quad_traits<double>::rows;
+    const Index accCols = quad_traits<double>::size;
+    void (*gemm_function)(const DataMapper&, const double*, const double*, Index, Index, Index, double, Index, Index, Index, Index);
+
+    #ifdef EIGEN_ALTIVEC_MMA_ONLY
+      //generate with MMA only
+      gemm_function = &Eigen::internal::gemmMMA<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
+    #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+      if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
+        gemm_function = &Eigen::internal::gemmMMA<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
+      }
+      else{
+        gemm_function = &Eigen::internal::gemm<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
+      }
+    #else
+      gemm_function = &Eigen::internal::gemm<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
+    #endif
+      gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+  }
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+{
+  typedef quad_traits<double>::vectortype   Packet;
+  typedef Packet1cd  Packetc;
+  typedef quad_traits<double>::rhstype   RhsPacket;
+
+  void operator()(const DataMapper& res, const std::complex<double>* blockA, const std::complex<double>* blockB,
+                  Index rows, Index depth, Index cols, std::complex<double> alpha,
+                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+};
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+  ::operator()(const DataMapper& res, const std::complex<double>* blockA, const std::complex<double>* blockB,
+               Index rows, Index depth, Index cols, std::complex<double> alpha,
+               Index strideA, Index strideB, Index offsetA, Index offsetB)
+  {
+    const Index accRows = quad_traits<double>::rows;
+    const Index accCols = quad_traits<double>::size;
+    void (*gemm_function)(const DataMapper&, const std::complex<double>*, const std::complex<double>*,
+          Index, Index, Index, std::complex<double>, Index, Index, Index, Index);
+    #ifdef EIGEN_ALTIVEC_MMA_ONLY
+       //generate with MMA only
+       gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
+     #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+       if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
+         gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
+       }
+       else{
+         gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
+       }
+     #else
+       gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
+     #endif
+       gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+  }
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<std::complex<double>, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+{
+  typedef quad_traits<double>::vectortype   Packet;
+  typedef Packet1cd  Packetc;
+  typedef quad_traits<double>::rhstype   RhsPacket;
+
+  void operator()(const DataMapper& res, const std::complex<double>* blockA, const double* blockB,
+                  Index rows, Index depth, Index cols, std::complex<double> alpha,
+                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+};
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<std::complex<double>, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+  ::operator()(const DataMapper& res, const std::complex<double>* blockA, const double* blockB,
+               Index rows, Index depth, Index cols, std::complex<double> alpha,
+               Index strideA, Index strideB, Index offsetA, Index offsetB)
+  {
+    const Index accRows = quad_traits<double>::rows;
+    const Index accCols = quad_traits<double>::size;
+    void (*gemm_function)(const DataMapper&, const std::complex<double>*, const double*,
+          Index, Index, Index, std::complex<double>, Index, Index, Index, Index);
+    #ifdef EIGEN_ALTIVEC_MMA_ONLY
+       //generate with MMA only
+       gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
+     #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+       if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
+         gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
+       }
+       else{
+         gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
+       }
+     #else
+       gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
+     #endif
+       gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+  }
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<double, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+{
+  typedef quad_traits<double>::vectortype   Packet;
+  typedef Packet1cd  Packetc;
+  typedef quad_traits<double>::rhstype   RhsPacket;
+
+  void operator()(const DataMapper& res, const double* blockA, const std::complex<double>* blockB,
+                  Index rows, Index depth, Index cols, std::complex<double> alpha,
+                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+};
+
+template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<double, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+  ::operator()(const DataMapper& res, const double* blockA, const std::complex<double>* blockB,
+               Index rows, Index depth, Index cols, std::complex<double> alpha,
+               Index strideA, Index strideB, Index offsetA, Index offsetB)
+  {
+    const Index accRows = quad_traits<double>::rows;
+    const Index accCols = quad_traits<double>::size;
+    void (*gemm_function)(const DataMapper&, const double*, const std::complex<double>*,
+          Index, Index, Index, std::complex<double>, Index, Index, Index, Index);
+    #ifdef EIGEN_ALTIVEC_MMA_ONLY
+       //generate with MMA only
+       gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
+     #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+       if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){
+         gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
+       }
+       else{
+         gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
+       }
+     #else
+       gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
+     #endif
+       gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+  }
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATRIX_PRODUCT_ALTIVEC_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h b/src/3rdparty/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
new file mode 100644
index 000000000..33d543494
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
@@ -0,0 +1,221 @@
+//#define EIGEN_POWER_USE_PREFETCH  // Use prefetching in gemm routines
+#ifdef EIGEN_POWER_USE_PREFETCH
+#define EIGEN_POWER_PREFETCH(p)  prefetch(p)
+#else
+#define EIGEN_POWER_PREFETCH(p)
+#endif
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows>
+EIGEN_STRONG_INLINE void gemm_extra_col(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index row,
+  Index col,
+  Index remaining_rows,
+  Index remaining_cols,
+  const Packet& pAlpha);
+
+template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
+EIGEN_STRONG_INLINE void gemm_extra_row(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index row,
+  Index col,
+  Index rows,
+  Index cols,
+  Index remaining_rows,
+  const Packet& pAlpha,
+  const Packet& pMask);
+
+template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accCols>
+EIGEN_STRONG_INLINE void gemm_unrolled_col(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index& row,
+  Index rows,
+  Index col,
+  Index remaining_cols,
+  const Packet& pAlpha);
+
+template<typename Packet>
+EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows);
+
+template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex_extra_col(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index row,
+  Index col,
+  Index remaining_rows,
+  Index remaining_cols,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag);
+
+template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex_extra_row(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index row,
+  Index col,
+  Index rows,
+  Index cols,
+  Index remaining_rows,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag,
+  const Packet& pMask);
+
+template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex_unrolled_col(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index& row,
+  Index rows,
+  Index col,
+  Index remaining_cols,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag);
+
+template<typename Scalar, typename Packet>
+EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs);
+
+template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
+EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,4>& acc, const DataMapper& res, Index row, Index col);
+
+template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
+EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,8>& acc, const DataMapper& res, Index row, Index col);
+
+template<typename Packet>
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha);
+
+template<typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag);
+
+const static Packet16uc p16uc_SETCOMPLEX32_FIRST = {  0,  1,  2,  3,
+                                                     16, 17, 18, 19,
+                                                      4,  5,  6,  7,
+                                                     20, 21, 22, 23};
+
+const static Packet16uc p16uc_SETCOMPLEX32_SECOND = {  8,  9, 10, 11,
+                                                      24, 25, 26, 27,
+                                                      12, 13, 14, 15,
+                                                      28, 29, 30, 31};
+//[a,b],[ai,bi] = [a,ai] - This is equivalent to p16uc_GETREAL64
+const static Packet16uc p16uc_SETCOMPLEX64_FIRST = {  0,  1,  2,  3,  4,  5,  6,  7,
+                                                     16, 17, 18, 19, 20, 21, 22, 23};
+
+//[a,b],[ai,bi] = [b,bi] - This is equivalent to p16uc_GETIMAG64
+const static Packet16uc p16uc_SETCOMPLEX64_SECOND = {  8,  9, 10, 11, 12, 13, 14, 15,
+                                                      24, 25, 26, 27, 28, 29, 30, 31};
+
+
+// Grab two decouples real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks.
+template<typename Packet, typename Packetc>
+EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock<Packet,4>& taccReal, PacketBlock<Packet,4>& taccImag, PacketBlock<Packetc, 4>& acc1, PacketBlock<Packetc, 4>& acc2)
+{
+  acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST);
+  acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_FIRST);
+  acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_FIRST);
+  acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_FIRST);
+
+  acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND);
+  acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_SECOND);
+  acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_SECOND);
+  acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_SECOND);
+}
+
+template<typename Packet, typename Packetc>
+EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,4>& taccReal, PacketBlock<Packet,4>& taccImag, PacketBlock<Packetc,8>& tRes, PacketBlock<Packetc, 4>& acc1, PacketBlock<Packetc, 4>& acc2)
+{
+  bcouple_common<Packet, Packetc>(taccReal, taccImag, acc1, acc2);
+
+  acc1.packet[0] = padd<Packetc>(tRes.packet[0], acc1.packet[0]);
+  acc1.packet[1] = padd<Packetc>(tRes.packet[1], acc1.packet[1]);
+  acc1.packet[2] = padd<Packetc>(tRes.packet[2], acc1.packet[2]);
+  acc1.packet[3] = padd<Packetc>(tRes.packet[3], acc1.packet[3]);
+
+  acc2.packet[0] = padd<Packetc>(tRes.packet[4], acc2.packet[0]);
+  acc2.packet[1] = padd<Packetc>(tRes.packet[5], acc2.packet[1]);
+  acc2.packet[2] = padd<Packetc>(tRes.packet[6], acc2.packet[2]);
+  acc2.packet[3] = padd<Packetc>(tRes.packet[7], acc2.packet[3]);
+}
+
+template<typename Packet, typename Packetc>
+EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock<Packet,1>& taccReal, PacketBlock<Packet,1>& taccImag, PacketBlock<Packetc, 1>& acc1, PacketBlock<Packetc, 1>& acc2)
+{
+  acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST);
+
+  acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND);
+}
+
+template<typename Packet, typename Packetc>
+EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,1>& taccReal, PacketBlock<Packet,1>& taccImag, PacketBlock<Packetc,2>& tRes, PacketBlock<Packetc, 1>& acc1, PacketBlock<Packetc, 1>& acc2)
+{
+  bcouple_common<Packet, Packetc>(taccReal, taccImag, acc1, acc2);
+
+  acc1.packet[0] = padd<Packetc>(tRes.packet[0], acc1.packet[0]);
+
+  acc2.packet[0] = padd<Packetc>(tRes.packet[1], acc2.packet[0]);
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void bcouple_common<Packet2d, Packet1cd>(PacketBlock<Packet2d,4>& taccReal, PacketBlock<Packet2d,4>& taccImag, PacketBlock<Packet1cd, 4>& acc1, PacketBlock<Packet1cd, 4>& acc2)
+{
+  acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST);
+  acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_FIRST);
+  acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_FIRST);
+  acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_FIRST);
+
+  acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND);
+  acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_SECOND);
+  acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_SECOND);
+  acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_SECOND);
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void bcouple_common<Packet2d, Packet1cd>(PacketBlock<Packet2d,1>& taccReal, PacketBlock<Packet2d,1>& taccImag, PacketBlock<Packet1cd, 1>& acc1, PacketBlock<Packet1cd, 1>& acc2)
+{
+  acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST);
+
+  acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND);
+}
+
+// This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled.
+template<typename Scalar, typename Packet>
+EIGEN_ALWAYS_INLINE Packet ploadRhs(const Scalar* rhs)
+{
+  return ploadu<Packet>(rhs);
+}
+
+} // end namespace internal
+} // end namespace Eigen
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/src/3rdparty/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
new file mode 100644
index 000000000..6540c6fa6
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
@@ -0,0 +1,629 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020 Everton Constantino (everton.constantino@ibm.com)
+// Copyright (C) 2021 Chip Kerchner (chip.kerchner@ibm.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+#define EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+
+#pragma GCC target("cpu=power10")
+
+#ifdef __has_builtin
+#if !__has_builtin(__builtin_vsx_assemble_pair)
+#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
+#endif
+#endif
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename Scalar, typename Packet>
+EIGEN_ALWAYS_INLINE void bsetzeroMMA(__vector_quad* acc)
+{
+  __builtin_mma_xxsetaccz(acc);
+}
+
+template<typename DataMapper, typename Index, typename Packet, const Index accCols>
+EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, Index j, const DataMapper& data, const Packet& alpha, __vector_quad* acc)
+{
+  PacketBlock<Packet, 4> result;
+  __builtin_mma_disassemble_acc(&result.packet, acc);
+
+  PacketBlock<Packet, 4> tRes;
+  bload<DataMapper, Packet, Index, accCols, 0, ColMajor>(tRes, data, i, j);
+
+  bscale<Packet>(tRes, result, alpha);
+
+  data.template storePacketBlock<Packet, 4>(i, j, tRes);
+}
+
+template<typename DataMapper, typename Index, typename Packet, typename Packetc, const Index accColsC, int N>
+EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, Index j, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad* accReal, __vector_quad* accImag)
+{
+  PacketBlock<Packet, 4> resultReal, resultImag;
+  __builtin_mma_disassemble_acc(&resultReal.packet, accReal);
+  __builtin_mma_disassemble_acc(&resultImag.packet, accImag);
+
+  PacketBlock<Packetc, 8> tRes;
+  bload<DataMapper, Packetc, Index, accColsC, N, ColMajor>(tRes, data, i, j);
+
+  PacketBlock<Packet,4> taccReal, taccImag;
+  bscalec<Packet,4>(resultReal, resultImag, alphaReal, alphaImag, taccReal, taccImag);
+
+  PacketBlock<Packetc, 4> acc1, acc2;
+  bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc1, acc2);
+
+  data.template storePacketBlock<Packetc, 4>(i + N*accColsC, j, acc1);
+  data.template storePacketBlock<Packetc, 4>(i + (N+1)*accColsC, j, acc2);
+}
+
+// Defaults to float32, since Eigen still supports C++03 we can't use default template arguments
+template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b)
+{
+  if(NegativeAccumulate)
+  {
+    __builtin_mma_xvf32gernp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
+  } else {
+    __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
+  }
+}
+
+template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const PacketBlock<Packet2d,2>& a, const Packet2d& b)
+{
+  __vector_pair* a0 = (__vector_pair *)(&a.packet[0]);
+  if(NegativeAccumulate)
+  {
+    __builtin_mma_xvf64gernp(acc, *a0, (__vector unsigned char)b);
+  } else {
+    __builtin_mma_xvf64gerpp(acc, *a0, (__vector unsigned char)b);
+  }
+}
+
+template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet2d& b)
+{
+  if(NegativeAccumulate)
+  {
+    __builtin_mma_xvf64gernp(acc, (__vector_pair)a, (__vector unsigned char)b);
+  } else {
+    __builtin_mma_xvf64gerpp(acc, (__vector_pair)a, (__vector unsigned char)b);
+  }
+}
+
+template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad*, const __vector_pair&, const Packet4f&)
+{
+  // Just for compilation
+}
+
+template<typename Scalar, typename Packet, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag, const Packet& lhsV, const Packet& lhsVi, const RhsPacket& rhsV, const RhsPacket& rhsVi)
+{
+  pgerMMA<Packet, RhsPacket, false>(accReal,  rhsV,  lhsV);
+  if(LhsIsReal) {
+    pgerMMA<Packet, RhsPacket, ConjugateRhs>(accImag, rhsVi,  lhsV);
+  } else {
+    if(!RhsIsReal) {
+      pgerMMA<Packet, RhsPacket, ConjugateLhs == ConjugateRhs>(accReal, rhsVi, lhsVi);
+      pgerMMA<Packet, RhsPacket, ConjugateRhs>(accImag, rhsVi,  lhsV);
+    } else {
+      EIGEN_UNUSED_VARIABLE(rhsVi);
+    }
+    pgerMMA<Packet, RhsPacket, ConjugateLhs>(accImag,  rhsV, lhsVi);
+  }
+}
+
+// This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled.
+template<typename Scalar, typename Packet>
+EIGEN_ALWAYS_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV)
+{
+  rhsV = ploadRhs<Scalar, Packet>((const Scalar*)(rhs));
+} 
+
+template<>
+EIGEN_ALWAYS_INLINE void ploadRhsMMA<double, PacketBlock<Packet2d, 2> >(const double* rhs, PacketBlock<Packet2d, 2>& rhsV)
+{
+  rhsV.packet[0] = ploadRhs<double, Packet2d>((const double *)((Packet2d *)rhs      ));
+  rhsV.packet[1] = ploadRhs<double, Packet2d>((const double *)(((Packet2d *)rhs) + 1));
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void ploadRhsMMA<double, __vector_pair>(const double* rhs, __vector_pair& rhsV)
+{
+#if EIGEN_COMP_LLVM
+  __builtin_vsx_assemble_pair(&rhsV,
+    (__vector unsigned char)(ploadRhs<double, Packet2d>((const double *)(((Packet2d *)rhs) + 1))),
+    (__vector unsigned char)(ploadRhs<double, Packet2d>((const double *)((Packet2d *)rhs      ))));
+#else
+  __asm__ ("lxvp %x0,%1" : "=wa" (rhsV) : "Y" (*rhs));
+#endif
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&)
+{
+  // Just for compilation
+}
+
+// PEEL_MMA loop factor.
+#define PEEL_MMA 7
+
+#define MICRO_MMA_UNROLL(func) \
+  func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
+
+#define MICRO_MMA_LOAD_ONE(iter) \
+  if (unroll_factor > iter) { \
+    lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr##iter); \
+    lhs_ptr##iter += accCols; \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
+  }
+
+#define MICRO_MMA_WORK_ONE(iter, type, peel) \
+  if (unroll_factor > iter) { \
+    pgerMMA<Packet, type, false>(&accZero##iter, rhsV##peel, lhsV##iter); \
+  }
+
+#define MICRO_MMA_TYPE_PEEL(func, func2, type, peel) \
+  if (PEEL_MMA > peel) { \
+    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
+    ploadRhsMMA<Scalar, type>(rhs_ptr + (accRows * peel), rhsV##peel); \
+    MICRO_MMA_UNROLL(func2); \
+    func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) \
+    func(4,type,peel) func(5,type,peel) func(6,type,peel) func(7,type,peel) \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
+  }
+
+#define MICRO_MMA_UNROLL_TYPE_PEEL(func, func2, type) \
+  type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7, rhsV8, rhsV9; \
+  MICRO_MMA_TYPE_PEEL(func,func2,type,0); MICRO_MMA_TYPE_PEEL(func,func2,type,1); \
+  MICRO_MMA_TYPE_PEEL(func,func2,type,2); MICRO_MMA_TYPE_PEEL(func,func2,type,3); \
+  MICRO_MMA_TYPE_PEEL(func,func2,type,4); MICRO_MMA_TYPE_PEEL(func,func2,type,5); \
+  MICRO_MMA_TYPE_PEEL(func,func2,type,6); MICRO_MMA_TYPE_PEEL(func,func2,type,7); \
+  MICRO_MMA_TYPE_PEEL(func,func2,type,8); MICRO_MMA_TYPE_PEEL(func,func2,type,9);
+
+#define MICRO_MMA_UNROLL_TYPE_ONE(func, func2, type) \
+  type rhsV0; \
+  MICRO_MMA_TYPE_PEEL(func,func2,type,0);
+
+#define MICRO_MMA_ONE_PEEL \
+  if (sizeof(Scalar) == sizeof(float)) { \
+    MICRO_MMA_UNROLL_TYPE_PEEL(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, RhsPacket); \
+  } else { \
+    MICRO_MMA_UNROLL_TYPE_PEEL(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, __vector_pair); \
+  } \
+  rhs_ptr += (accRows * PEEL_MMA);
+
+#define MICRO_MMA_ONE \
+  if (sizeof(Scalar) == sizeof(float)) { \
+    MICRO_MMA_UNROLL_TYPE_ONE(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, RhsPacket); \
+  } else { \
+    MICRO_MMA_UNROLL_TYPE_ONE(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, __vector_pair); \
+  } \
+  rhs_ptr += accRows;
+
+#define MICRO_MMA_DST_PTR_ONE(iter) \
+  if (unroll_factor > iter) { \
+    bsetzeroMMA<Scalar, Packet>(&accZero##iter); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(accZero##iter); \
+  }
+
+#define MICRO_MMA_DST_PTR MICRO_MMA_UNROLL(MICRO_MMA_DST_PTR_ONE)
+
+#define MICRO_MMA_SRC_PTR_ONE(iter) \
+  if (unroll_factor > iter) { \
+    lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
+  }
+
+#define MICRO_MMA_SRC_PTR MICRO_MMA_UNROLL(MICRO_MMA_SRC_PTR_ONE)
+
+#define MICRO_MMA_PREFETCH_ONE(iter) \
+  if (unroll_factor > iter) { \
+    EIGEN_POWER_PREFETCH(lhs_ptr##iter); \
+  }
+
+#define MICRO_MMA_PREFETCH MICRO_MMA_UNROLL(MICRO_MMA_PREFETCH_ONE)
+
+#define MICRO_MMA_STORE_ONE(iter) \
+  if (unroll_factor > iter) { \
+    storeAccumulator<DataMapper, Index, Packet, accCols>(row + iter*accCols, col, res, pAlpha, &accZero##iter); \
+  }
+
+#define MICRO_MMA_STORE MICRO_MMA_UNROLL(MICRO_MMA_STORE_ONE)
+
+template<int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols>
+EIGEN_STRONG_INLINE void gemm_unrolled_MMA_iteration(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index& row,
+  Index col,
+  const Packet& pAlpha)
+{
+  const Scalar* rhs_ptr = rhs_base;
+  const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
+  __vector_quad accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
+
+  MICRO_MMA_SRC_PTR
+  MICRO_MMA_DST_PTR
+
+  Index k = 0;
+  for(; k + PEEL_MMA <= depth; k+= PEEL_MMA)
+  {
+    EIGEN_POWER_PREFETCH(rhs_ptr);
+    MICRO_MMA_PREFETCH
+    MICRO_MMA_ONE_PEEL
+  }
+  for(; k < depth; k++)
+  {
+    MICRO_MMA_ONE
+  }
+  MICRO_MMA_STORE
+
+  row += unroll_factor*accCols;
+}
+
+template<typename Scalar, typename Index, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
+void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
+{
+      const Index remaining_rows = rows % accCols;
+      const Index remaining_cols = cols % accRows;
+
+      if( strideA == -1 ) strideA = depth;
+      if( strideB == -1 ) strideB = depth;
+
+      const Packet pAlpha = pset1<Packet>(alpha);
+      const Packet pMask  = bmask<Packet>((const int)(remaining_rows));
+
+      Index col = 0;
+      for(; col + accRows <= cols; col += accRows)
+      {
+        const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB;
+        const Scalar* lhs_base = blockA;
+
+        Index row = 0;
+#define MAX_MMA_UNROLL 7
+        while(row + MAX_MMA_UNROLL*accCols <= rows) {
+          gemm_unrolled_MMA_iteration<MAX_MMA_UNROLL, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+        }
+        switch( (rows-row)/accCols ) {
+#if MAX_MMA_UNROLL > 7
+          case 7:
+            gemm_unrolled_MMA_iteration<7, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
+#endif
+#if MAX_MMA_UNROLL > 6
+          case 6:
+            gemm_unrolled_MMA_iteration<6, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
+#endif
+#if MAX_MMA_UNROLL > 5
+          case 5:
+            gemm_unrolled_MMA_iteration<5, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
+#endif
+#if MAX_MMA_UNROLL > 4
+          case 4:
+            gemm_unrolled_MMA_iteration<4, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
+#endif
+#if MAX_MMA_UNROLL > 3
+          case 3:
+            gemm_unrolled_MMA_iteration<3, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
+#endif
+#if MAX_MMA_UNROLL > 2
+          case 2:
+            gemm_unrolled_MMA_iteration<2, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
+#endif
+#if MAX_MMA_UNROLL > 1
+          case 1:
+            gemm_unrolled_MMA_iteration<1, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
+            break;
+#endif
+          default:
+            break;
+        }
+#undef MAX_MMA_UNROLL
+
+        if(remaining_rows > 0)
+        {
+          gemm_extra_row<Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask);
+        }
+      }
+
+      if(remaining_cols > 0)
+      {
+        const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB;
+        const Scalar* lhs_base = blockA;
+
+        for(; col < cols; col++)
+        {
+          Index row = 0;
+
+          gemm_unrolled_col<Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha);
+
+          if (remaining_rows > 0)
+          {
+            gemm_extra_col<Scalar, Packet, DataMapper, Index, accRows>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha);
+          }
+          rhs_base++;
+        }
+      }
+}
+
+#define accColsC (accCols / 2)
+#define advanceRows ((LhsIsReal) ? 1 : 2)
+#define advanceCols ((RhsIsReal) ? 1 : 2)
+
+// PEEL_COMPLEX_MMA loop factor.
+#define PEEL_COMPLEX_MMA 7
+
+#define MICRO_COMPLEX_MMA_UNROLL(func) \
+  func(0) func(1) func(2) func(3) func(4)
+
+#define MICRO_COMPLEX_MMA_LOAD_ONE(iter) \
+  if (unroll_factor > iter) { \
+    lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr_real##iter); \
+    lhs_ptr_real##iter += accCols; \
+    if(!LhsIsReal) { \
+      lhsVi##iter = ploadLhs<Scalar, Packet>(lhs_ptr_imag##iter); \
+      lhs_ptr_imag##iter += accCols; \
+    } else { \
+      EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
+    } \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
+    EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
+  }
+
+#define MICRO_COMPLEX_MMA_WORK_ONE(iter, type, peel) \
+  if (unroll_factor > iter) { \
+    pgercMMA<Scalar, Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
+  }
+
+#define MICRO_COMPLEX_MMA_TYPE_PEEL(func, func2, type, peel) \
+  if (PEEL_COMPLEX_MMA > peel) { \
+    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \
+    Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \
+    ploadRhsMMA<Scalar, type>(rhs_ptr_real + (accRows * peel), rhsV##peel); \
+    if(!RhsIsReal) { \
+      ploadRhsMMA<Scalar, type>(rhs_ptr_imag + (accRows * peel), rhsVi##peel); \
+    } else { \
+      EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
+    } \
+    MICRO_COMPLEX_MMA_UNROLL(func2); \
+    func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) func(4,type,peel) \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
+    EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
+  }
+
+#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(func, func2, type) \
+  type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7, rhsV8, rhsV9; \
+  type rhsVi0, rhsVi1, rhsVi2, rhsVi3, rhsVi4, rhsVi5, rhsVi6, rhsVi7, rhsVi8, rhsVi9; \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,0); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,1); \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,2); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,3); \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,4); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,5); \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,6); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,7); \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,8); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,9);
+
+#define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(func, func2, type) \
+  type rhsV0, rhsVi0; \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,0);
+
+#define MICRO_COMPLEX_MMA_ONE_PEEL \
+  if (sizeof(Scalar) == sizeof(float)) { \
+    MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, RhsPacket); \
+  } else { \
+    MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, __vector_pair); \
+  } \
+  rhs_ptr_real += (accRows * PEEL_COMPLEX_MMA); \
+  if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX_MMA);
+
+#define MICRO_COMPLEX_MMA_ONE \
+  if (sizeof(Scalar) == sizeof(float)) { \
+    MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, RhsPacket); \
+  } else { \
+    MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, __vector_pair); \
+  } \
+  rhs_ptr_real += accRows; \
+  if(!RhsIsReal) rhs_ptr_imag += accRows;
+
+#define MICRO_COMPLEX_MMA_DST_PTR_ONE(iter) \
+  if (unroll_factor > iter) { \
+    bsetzeroMMA<Scalar, Packet>(&accReal##iter); \
+    bsetzeroMMA<Scalar, Packet>(&accImag##iter); \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(accReal##iter); \
+    EIGEN_UNUSED_VARIABLE(accImag##iter); \
+  }
+
+#define MICRO_COMPLEX_MMA_DST_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_DST_PTR_ONE)
+
+#define MICRO_COMPLEX_MMA_SRC_PTR_ONE(iter) \
+  if (unroll_factor > iter) { \
+    lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \
+    if(!LhsIsReal) { \
+      lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \
+    } else { \
+      EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \
+    } \
+  } else { \
+    EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \
+    EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \
+  }
+
+#define MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_SRC_PTR_ONE)
+
+#define MICRO_COMPLEX_MMA_PREFETCH_ONE(iter) \
+  if (unroll_factor > iter) { \
+    EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \
+    if(!LhsIsReal) { \
+      EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \
+    } \
+  }
+
+#define MICRO_COMPLEX_MMA_PREFETCH MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_PREFETCH_ONE)
+
+#define MICRO_COMPLEX_MMA_STORE_ONE(iter) \
+  if (unroll_factor > iter) { \
+    storeComplexAccumulator<DataMapper, Index, Packet, Packetc, accColsC, 0>(row + iter*accCols, col, res, pAlphaReal, pAlphaImag, &accReal##iter, &accImag##iter); \
+  }
+
+#define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE)
+
+template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex_unrolled_MMA_iteration(
+  const DataMapper& res,
+  const Scalar* lhs_base,
+  const Scalar* rhs_base,
+  Index depth,
+  Index strideA,
+  Index offsetA,
+  Index strideB,
+  Index& row,
+  Index col,
+  const Packet& pAlphaReal,
+  const Packet& pAlphaImag)
+{
+  const Scalar* rhs_ptr_real = rhs_base;
+  const Scalar* rhs_ptr_imag;
+  if(!RhsIsReal) {
+    rhs_ptr_imag = rhs_base + accRows*strideB;
+  } else {
+    EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
+  }
+  const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL;
+  const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL;
+  const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL;
+  __vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3, accReal4, accImag4;
+
+  MICRO_COMPLEX_MMA_SRC_PTR
+  MICRO_COMPLEX_MMA_DST_PTR
+
+  Index k = 0;
+  for(; k + PEEL_COMPLEX_MMA <= depth; k+= PEEL_COMPLEX_MMA)
+  {
+    EIGEN_POWER_PREFETCH(rhs_ptr_real);
+    if(!RhsIsReal) {
+      EIGEN_POWER_PREFETCH(rhs_ptr_imag);
+    }
+    MICRO_COMPLEX_MMA_PREFETCH
+    MICRO_COMPLEX_MMA_ONE_PEEL
+  }
+  for(; k < depth; k++)
+  {
+    MICRO_COMPLEX_MMA_ONE
+  }
+  MICRO_COMPLEX_MMA_STORE
+
+  row += unroll_factor*accCols;
+}
+
+template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Index, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
+{
+      const Index remaining_rows = rows % accCols;
+      const Index remaining_cols = cols % accRows;
+
+      if( strideA == -1 ) strideA = depth;
+      if( strideB == -1 ) strideB = depth;
+
+      const Packet pAlphaReal = pset1<Packet>(alpha.real());
+      const Packet pAlphaImag = pset1<Packet>(alpha.imag());
+      const Packet pMask = bmask<Packet>((const int)(remaining_rows));
+
+      const Scalar* blockA = (Scalar *) blockAc;
+      const Scalar* blockB = (Scalar *) blockBc;
+
+      Index col = 0;
+      for(; col + accRows <= cols; col += accRows)
+      {
+        const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB;
+        const Scalar* lhs_base = blockA;
+        Index row = 0;
+
+#define MAX_COMPLEX_MMA_UNROLL 4
+        while(row + MAX_COMPLEX_MMA_UNROLL*accCols <= rows) {
+          gemm_complex_unrolled_MMA_iteration<MAX_COMPLEX_MMA_UNROLL, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
+        }
+        switch( (rows-row)/accCols ) {
+#if MAX_COMPLEX_MMA_UNROLL > 4
+          case 4:
+            gemm_complex_unrolled_MMA_iteration<4, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
+            break;
+#endif
+#if MAX_COMPLEX_MMA_UNROLL > 3
+          case 3:
+            gemm_complex_unrolled_MMA_iteration<3, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
+            break;
+#endif
+#if MAX_COMPLEX_MMA_UNROLL > 2
+          case 2:
+            gemm_complex_unrolled_MMA_iteration<2, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
+            break;
+#endif
+#if MAX_COMPLEX_MMA_UNROLL > 1
+          case 1:
+            gemm_complex_unrolled_MMA_iteration<1, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
+            break;
+#endif
+          default:
+            break;
+        }
+#undef MAX_COMPLEX_MMA_UNROLL
+
+        if(remaining_rows > 0)
+        {
+          gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+        }
+      }
+
+      if(remaining_cols > 0)
+      {
+        const Scalar* rhs_base = blockB + advanceCols*col*strideB + remaining_cols*offsetB;
+        const Scalar* lhs_base = blockA;
+
+        for(; col < cols; col++)
+        {
+          Index row = 0;
+
+          gemm_complex_unrolled_col<Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, col, remaining_cols, pAlphaReal, pAlphaImag);
+
+          if (remaining_rows > 0)
+          {
+            gemm_complex_extra_col<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag);
+          }
+          rhs_base++;
+        }
+      }
+}
+
+#undef accColsC
+#undef advanceRows
+#undef advanceCols
+
+#pragma GCC reset_options
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h b/src/3rdparty/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h
new file mode 100644
index 000000000..2a440545b
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -0,0 +1,2711 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2016 Konstantinos Margaritis <markos@freevec.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_ALTIVEC_H
+#define EIGEN_PACKET_MATH_ALTIVEC_H
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS  32
+#endif
+
+typedef __vector float                   Packet4f;
+typedef __vector int                     Packet4i;
+typedef __vector unsigned int            Packet4ui;
+typedef __vector __bool int              Packet4bi;
+typedef __vector short int               Packet8s;
+typedef __vector unsigned short int      Packet8us;
+typedef __vector signed char             Packet16c;
+typedef __vector unsigned char           Packet16uc;
+typedef eigen_packet_wrapper<__vector unsigned short int,0> Packet8bf;
+
+// We don't want to write the same code all the time, but we need to reuse the constants
+// and it doesn't really work to declare them global, so we define macros instead
+#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
+  Packet4f p4f_##NAME = {X, X, X, X}
+
+#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
+  Packet4i p4i_##NAME = vec_splat_s32(X)
+
+#define _EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME,X) \
+  Packet4ui p4ui_##NAME = {X, X, X, X}
+
+#define _EIGEN_DECLARE_CONST_FAST_Packet8us(NAME,X) \
+  Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X}
+
+#define _EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME,X) \
+  Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}
+
+#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
+  Packet4f p4f_##NAME = pset1<Packet4f>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
+  Packet4i p4i_##NAME = pset1<Packet4i>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
+  Packet2d p2d_##NAME = pset1<Packet2d>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \
+  Packet2l p2l_##NAME = pset1<Packet2l>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+  const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
+
+#define DST_CHAN 1
+#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
+#define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type 
+
+// These constants are endian-agnostic
+static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
+static _EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u);
+static _EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu);
+static _EIGEN_DECLARE_CONST_FAST_Packet8us(ONE,1); //{ 1, 1, 1, 1, 1, 1, 1, 1}
+static _EIGEN_DECLARE_CONST_FAST_Packet16uc(ONE,1);
+static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
+#ifndef __VSX__
+static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
+#endif
+
+static Packet4f  p4f_COUNTDOWN  = { 0.0, 1.0, 2.0, 3.0 };
+static Packet4i  p4i_COUNTDOWN  = { 0, 1, 2, 3 };
+static Packet8s  p8s_COUNTDOWN  = { 0, 1, 2, 3, 4, 5, 6, 7 };
+static Packet8us p8us_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 };
+
+static Packet16c  p16c_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7,
+                                    8, 9, 10, 11, 12, 13, 14, 15};
+static Packet16uc p16uc_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7, 
+                                    8, 9, 10, 11, 12, 13, 14, 15};
+
+static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
+static Packet16uc p16uc_REVERSE16 = { 14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1 };
+static Packet16uc p16uc_REVERSE8 = { 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 };
+
+static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
+static Packet16uc p16uc_DUPLICATE16_HI = { 0,1,0,1, 2,3,2,3, 4,5,4,5, 6,7,6,7 };
+static Packet16uc p16uc_DUPLICATE8_HI = { 0,0, 1,1, 2,2, 3,3, 4,4, 5,5, 6,6, 7,7 };
+static const Packet16uc p16uc_DUPLICATE16_EVEN= { 0,1 ,0,1, 4,5, 4,5, 8,9, 8,9, 12,13, 12,13 };
+static const Packet16uc p16uc_DUPLICATE16_ODD = { 2,3 ,2,3, 6,7, 6,7, 10,11, 10,11, 14,15, 14,15 };
+
+static Packet16uc p16uc_QUADRUPLICATE16_HI = { 0,1,0,1,0,1,0,1, 2,3,2,3,2,3,2,3 };
+
+// Handle endianness properly while loading constants
+// Define global static constants:
+#ifdef _BIG_ENDIAN
+static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
+#ifdef __VSX__
+static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+#endif
+static Packet16uc p16uc_PSET32_WODD   = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
+static Packet16uc p16uc_PSET32_WEVEN  = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
+static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8);      //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
+#else
+static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
+static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
+static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
+static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8);      //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
+#endif // _BIG_ENDIAN
+
+static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);     //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
+static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);     //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
+static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16;                                         //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
+static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16;                                         //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
+
+static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);                                         //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
+
+#ifdef _BIG_ENDIAN
+static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);                                            //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+#else
+static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8);                                            //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+#endif // _BIG_ENDIAN
+
+#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+  #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
+#else
+  #define EIGEN_PPC_PREFETCH(ADDR) asm( "   dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
+#endif
+
+template <>
+struct packet_traits<float> : default_packet_traits {
+  typedef Packet4f type;
+  typedef Packet4f half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 1,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasAbs = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasExp = 1,
+#ifdef __VSX__
+    HasSqrt = 1,
+#if !EIGEN_COMP_CLANG
+    HasRsqrt = 1,
+#else
+    HasRsqrt = 0,
+#endif
+#else
+    HasSqrt = 0,
+    HasRsqrt = 0,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+#endif
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1,
+    HasNegate = 1,
+    HasBlend = 1
+  };
+};
+template <>
+struct packet_traits<bfloat16> : default_packet_traits {
+  typedef Packet8bf type;
+  typedef Packet8bf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 0,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasAbs = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasExp = 1,
+#ifdef __VSX__
+    HasSqrt = 1,
+#if !EIGEN_COMP_CLANG
+    HasRsqrt = 1,
+#else
+    HasRsqrt = 0,
+#endif
+#else
+    HasSqrt = 0,
+    HasRsqrt = 0,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+#endif
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1,
+    HasNegate = 1,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct packet_traits<int> : default_packet_traits {
+  typedef Packet4i type;
+  typedef Packet4i half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 0,
+
+    HasAdd   = 1,
+    HasSub   = 1,
+    HasShift = 1,
+    HasMul   = 1,
+    HasDiv   = 0,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct packet_traits<short int> : default_packet_traits {
+  typedef Packet8s type;
+  typedef Packet8s half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 0,
+
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 0,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct packet_traits<unsigned short int> : default_packet_traits {
+  typedef Packet8us type;
+  typedef Packet8us half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 0,
+
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 0,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct packet_traits<signed char> : default_packet_traits {
+  typedef Packet16c type;
+  typedef Packet16c half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+    HasHalfPacket = 0,
+
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 0,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct packet_traits<unsigned char> : default_packet_traits {
+  typedef Packet16uc type;
+  typedef Packet16uc half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+    HasHalfPacket = 0,
+
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 0,
+    HasBlend = 1
+  };
+};
+
+template<> struct unpacket_traits<Packet4f>
+{
+  typedef float     type;
+  typedef Packet4f  half;
+  typedef Packet4i  integer_packet;
+  enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+template<> struct unpacket_traits<Packet4i>
+{
+  typedef int       type;
+  typedef Packet4i  half;
+  enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+template<> struct unpacket_traits<Packet8s>
+{
+  typedef short int type;
+  typedef Packet8s  half;
+  enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+template<> struct unpacket_traits<Packet8us>
+{
+  typedef unsigned short int type;
+  typedef Packet8us          half;
+  enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+
+template<> struct unpacket_traits<Packet16c>
+{
+  typedef signed char type;
+  typedef Packet16c  half;
+  enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+template<> struct unpacket_traits<Packet16uc>
+{
+  typedef unsigned char type;
+  typedef Packet16uc  half;
+  enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+
+template<> struct unpacket_traits<Packet8bf>
+{
+  typedef bfloat16 type;
+  typedef Packet8bf          half;
+  enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+inline std::ostream & operator <<(std::ostream & s, const Packet16c & v)
+{
+  union {
+    Packet16c   v;
+    signed char n[16];
+  } vt;
+  vt.v = v;
+  for (int i=0; i< 16; i++)
+    s << vt.n[i] << ", ";
+  return s;
+}
+
+inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v)
+{
+  union {
+    Packet16uc   v;
+    unsigned char n[16];
+  } vt;
+  vt.v = v;
+  for (int i=0; i< 16; i++)
+    s << vt.n[i] << ", ";
+  return s;
+}
+
+inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
+{
+  union {
+    Packet4f   v;
+    float n[4];
+  } vt;
+  vt.v = v;
+  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
+  return s;
+}
+
+inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
+{
+  union {
+    Packet4i   v;
+    int n[4];
+  } vt;
+  vt.v = v;
+  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
+  return s;
+}
+
+inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
+{
+  union {
+    Packet4ui   v;
+    unsigned int n[4];
+  } vt;
+  vt.v = v;
+  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
+  return s;
+}
+
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet)* from)
+{
+  // some versions of GCC throw "unused-but-set-parameter".
+  // ignoring these warnings for now.
+  EIGEN_UNUSED_VARIABLE(from);
+  EIGEN_DEBUG_ALIGNED_LOAD
+#ifdef __VSX__
+  return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
+#else
+  return vec_ld(0, from);
+#endif
+}
+
+// Need to define them first or we get specialization after instantiation errors
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
+{
+  return pload_common<Packet4f>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from)
+{
+  return pload_common<Packet4i>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const short int* from)
+{
+  return pload_common<Packet8s>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const unsigned short int* from)
+{
+  return pload_common<Packet8us>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const signed char*     from)
+{
+  return pload_common<Packet16c>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const unsigned char*     from)
+{
+  return pload_common<Packet16uc>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16*     from)
+{
+  return pload_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
+}
+
+template <typename Packet>
+EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){
+  // some versions of GCC throw "unused-but-set-parameter" (float *to).
+  // ignoring these warnings for now.
+  EIGEN_UNUSED_VARIABLE(to);
+  EIGEN_DEBUG_ALIGNED_STORE
+#ifdef __VSX__
+  vec_xst(from, 0, to);
+#else
+  vec_st(from, 0, to);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from)
+{
+  pstore_common<Packet4f>(to, from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from)
+{
+  pstore_common<Packet4i>(to, from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<short int>(short int*       to, const Packet8s& from)
+{
+  pstore_common<Packet8s>(to, from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<unsigned short int>(unsigned short int*       to, const Packet8us& from)
+{
+  pstore_common<Packet8us>(to, from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16*       to, const Packet8bf& from)
+{
+  pstore_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<signed char>(signed char*       to, const Packet16c& from)
+{
+  pstore_common<Packet16c>(to, from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<unsigned char>(unsigned char*       to, const Packet16uc& from)
+{
+  pstore_common<Packet16uc>(to, from);
+}
+
+template<typename Packet>
+EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from)
+{
+  Packet v = {from, from, from, from};
+  return v;
+}
+
+template<typename Packet>
+EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet)& from)
+{
+  Packet v = {from, from, from, from, from, from, from, from};
+  return v;
+}
+
+template<typename Packet>
+EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet)& from)
+{
+  Packet v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from};
+  return v;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) {
+  return pset1_size4<Packet4f>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   {
+  return pset1_size4<Packet4i>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const short int&    from)   {
+  return pset1_size8<Packet8s>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const unsigned short int&    from)   {
+  return pset1_size8<Packet8us>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const signed char&    from)   {
+  return pset1_size16<Packet16c>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const unsigned char&    from)   {
+  return pset1_size16<Packet16uc>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
+  return reinterpret_cast<Packet4f>(pset1<Packet4i>(from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16&    from)   {
+  return pset1_size8<Packet8us>(reinterpret_cast<const unsigned short int&>(from));
+}
+
+template<typename Packet> EIGEN_STRONG_INLINE void
+pbroadcast4_common(const __UNPACK_TYPE__(Packet) *a,
+                      Packet& a0, Packet& a1, Packet& a2, Packet& a3)
+{
+  a3 = pload<Packet>(a);
+  a0 = vec_splat(a3, 0);
+  a1 = vec_splat(a3, 1);
+  a2 = vec_splat(a3, 2);
+  a3 = vec_splat(a3, 3);
+}
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  pbroadcast4_common<Packet4f>(a, a0, a1, a2, a3);
+}
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4i>(const int *a,
+                      Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
+{
+  pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
+}
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride)
+{
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
+  a[0] = from[0*stride];
+  a[1] = from[1*stride];
+  a[2] = from[2*stride];
+  a[3] = from[3*stride];
+  return pload<Packet>(a);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+  return pgather_common<Packet4f>(from, stride);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
+{
+  return pgather_common<Packet4i>(from, stride);
+}
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size8(const __UNPACK_TYPE__(Packet)* from, Index stride)
+{
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8];
+  a[0] = from[0*stride];
+  a[1] = from[1*stride];
+  a[2] = from[2*stride];
+  a[3] = from[3*stride];
+  a[4] = from[4*stride];
+  a[5] = from[5*stride];
+  a[6] = from[6*stride];
+  a[7] = from[7*stride];
+  return pload<Packet>(a);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet8s pgather<short int, Packet8s>(const short int* from, Index stride)
+{
+  return pgather_size8<Packet8s>(from, stride);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from, Index stride)
+{
+  return pgather_size8<Packet8us>(from, stride);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride)
+{
+  return pgather_size8<Packet8bf>(from, stride);
+}
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size16(const __UNPACK_TYPE__(Packet)* from, Index stride)
+{
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16];
+  a[0] = from[0*stride];
+  a[1] = from[1*stride];
+  a[2] = from[2*stride];
+  a[3] = from[3*stride];
+  a[4] = from[4*stride];
+  a[5] = from[5*stride];
+  a[6] = from[6*stride];
+  a[7] = from[7*stride];
+  a[8] = from[8*stride];
+  a[9] = from[9*stride];
+  a[10] = from[10*stride];
+  a[11] = from[11*stride];
+  a[12] = from[12*stride];
+  a[13] = from[13*stride];
+  a[14] = from[14*stride];
+  a[15] = from[15*stride];
+  return pload<Packet>(a);
+}
+
+
+template<> EIGEN_DEVICE_FUNC inline Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride)
+{
+  return pgather_size16<Packet16c>(from, stride);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from, Index stride)
+{
+  return pgather_size16<Packet16uc>(from, stride);
+}
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size4(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
+{
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
+  pstore<__UNPACK_TYPE__(Packet)>(a, from);
+  to[0*stride] = a[0];
+  to[1*stride] = a[1];
+  to[2*stride] = a[2];
+  to[3*stride] = a[3];
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+  pscatter_size4<Packet4f>(to, from, stride);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
+{
+  pscatter_size4<Packet4i>(to, from, stride);
+}
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size8(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
+{
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8];
+  pstore<__UNPACK_TYPE__(Packet)>(a, from);
+  to[0*stride] = a[0];
+  to[1*stride] = a[1];
+  to[2*stride] = a[2];
+  to[3*stride] = a[3];
+  to[4*stride] = a[4];
+  to[5*stride] = a[5];
+  to[6*stride] = a[6];
+  to[7*stride] = a[7];
+}
+
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<short int, Packet8s>(short int* to, const Packet8s& from, Index stride)
+{
+  pscatter_size8<Packet8s>(to, from, stride);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride)
+{
+  pscatter_size8<Packet8us>(to, from, stride);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride)
+{
+  pscatter_size8<Packet8bf>(to, from, stride);
+}
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size16(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
+{
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16];
+  pstore<__UNPACK_TYPE__(Packet)>(a, from);
+  to[0*stride] = a[0];
+  to[1*stride] = a[1];
+  to[2*stride] = a[2];
+  to[3*stride] = a[3];
+  to[4*stride] = a[4];
+  to[5*stride] = a[5];
+  to[6*stride] = a[6];
+  to[7*stride] = a[7];
+  to[8*stride] = a[8];
+  to[9*stride] = a[9];
+  to[10*stride] = a[10];
+  to[11*stride] = a[11];
+  to[12*stride] = a[12];
+  to[13*stride] = a[13];
+  to[14*stride] = a[14];
+  to[15*stride] = a[15];
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride)
+{
+  pscatter_size16<Packet16c>(to, from, stride);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride)
+{
+  pscatter_size16<Packet16uc>(to, from, stride);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f   plset<Packet4f>(const float&     a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN;  }
+template<> EIGEN_STRONG_INLINE Packet4i   plset<Packet4i>(const int&       a) { return pset1<Packet4i>(a) + p4i_COUNTDOWN;  }
+template<> EIGEN_STRONG_INLINE Packet8s   plset<Packet8s>(const short int& a) { return pset1<Packet8s>(a) + p8s_COUNTDOWN; }
+template<> EIGEN_STRONG_INLINE Packet8us  plset<Packet8us>(const unsigned short int& a) { return pset1<Packet8us>(a) + p8us_COUNTDOWN; }
+template<> EIGEN_STRONG_INLINE Packet16c  plset<Packet16c>(const signed char& a)   { return pset1<Packet16c>(a) + p16c_COUNTDOWN; }
+template<> EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const unsigned char& a)   { return pset1<Packet16uc>(a) + p16uc_COUNTDOWN; }
+
+template<> EIGEN_STRONG_INLINE Packet4f   padd<Packet4f>  (const Packet4f&   a, const Packet4f&   b) { return a + b; }
+template<> EIGEN_STRONG_INLINE Packet4i   padd<Packet4i>  (const Packet4i&   a, const Packet4i&   b) { return a + b; }
+template<> EIGEN_STRONG_INLINE Packet4ui   padd<Packet4ui>  (const Packet4ui&   a, const Packet4ui&   b) { return a + b; }
+template<> EIGEN_STRONG_INLINE Packet8s   padd<Packet8s>  (const Packet8s&   a, const Packet8s&   b) { return a + b; }
+template<> EIGEN_STRONG_INLINE Packet8us  padd<Packet8us> (const Packet8us&  a, const Packet8us&  b) { return a + b; }
+template<> EIGEN_STRONG_INLINE Packet16c  padd<Packet16c> (const Packet16c&  a, const Packet16c&  b) { return a + b; }
+template<> EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a + b; }
+
+template<> EIGEN_STRONG_INLINE Packet4f   psub<Packet4f>  (const Packet4f&   a, const Packet4f&   b) { return a - b; }
+template<> EIGEN_STRONG_INLINE Packet4i   psub<Packet4i>  (const Packet4i&   a, const Packet4i&   b) { return a - b; }
+template<> EIGEN_STRONG_INLINE Packet8s   psub<Packet8s>  (const Packet8s&   a, const Packet8s&   b) { return a - b; }
+template<> EIGEN_STRONG_INLINE Packet8us  psub<Packet8us> (const Packet8us&  a, const Packet8us&  b) { return a - b; }
+template<> EIGEN_STRONG_INLINE Packet16c  psub<Packet16c> (const Packet16c&  a, const Packet16c&  b) { return a - b; }
+template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a - b; }
+
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; }
+template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; }
+
+template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet4f   pmul<Packet4f>  (const Packet4f&   a, const Packet4f&   b) { return vec_madd(a,b, p4f_MZERO); }
+template<> EIGEN_STRONG_INLINE Packet4i   pmul<Packet4i>  (const Packet4i&   a, const Packet4i&   b) { return a * b; }
+template<> EIGEN_STRONG_INLINE Packet8s   pmul<Packet8s>  (const Packet8s&   a, const Packet8s&   b) { return vec_mul(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us  pmul<Packet8us> (const Packet8us&  a, const Packet8us&  b) { return vec_mul(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c  pmul<Packet16c> (const Packet16c&  a, const Packet16c&  b) { return vec_mul(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_mul(a,b); }
+
+
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+#ifndef __VSX__  // VSX actually provides a div instruction
+  Packet4f t, y_0, y_1;
+
+  // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
+  y_0 = vec_re(b);
+
+  // Do one Newton-Raphson iteration to get the needed accuracy
+  t   = vec_nmsub(y_0, b, p4f_ONE);
+  y_1 = vec_madd(y_0, t, y_0);
+
+  return vec_madd(a, y_1, p4f_MZERO);
+#else
+  return vec_div(a, b);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
+{ eigen_assert(false && "packet integer division are not supported by AltiVec");
+  return pset1<Packet4i>(0);
+}
+
+// for some weird raisons, it has to be overloaded for packet of integers
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
+template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }
+template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) { return vec_madd(a,b,c); }
+template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) { return vec_madd(a,b,c); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  #ifdef __VSX__
+  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
+  Packet4f ret;
+  __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+  return ret;
+  #else
+  return vec_min(a, b);
+  #endif
+}
+template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_min(a, b); }
+
+
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  #ifdef __VSX__
+  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
+  Packet4f ret;
+  __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+  return ret;
+  #else
+  return vec_max(a, b);
+  #endif
+}
+template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_max(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmple(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmplt(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmpeq(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
+  Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a,b));
+  return vec_nor(c,c);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmple(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmplt(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmpeq(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmple(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmplt(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmpeq(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmple(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmplt(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmpeq(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmple(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmplt(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmpeq(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmple(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmplt(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmpeq(a,b)); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8bf pand<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return pand<Packet8us>(a, b);
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8bf por<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return por<Packet8us>(a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8bf pxor<Packet8bf>(const Packet8bf& a, const Packet8bf& b) { 
+  return pxor<Packet8us>(a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_andc(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_andc(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
+  return vec_sel(b, a, reinterpret_cast<Packet4ui>(mask));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
+{
+    Packet4f t = vec_add(reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
+    Packet4f res;
+
+#ifdef __VSX__
+    __asm__("xvrspiz %x0, %x1\n\t"
+        : "=&wa" (res)
+        : "wa" (t));
+#else
+    __asm__("vrfiz %0, %1\n\t"
+        : "=v" (res)
+        : "v" (t));
+#endif
+
+    return res;
+}
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const  Packet4f& a) { return vec_ceil(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
+template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
+{
+    Packet4f res;
+
+    __asm__("xvrspic %x0, %x1\n\t"
+        : "=&wa" (res)
+        : "wa" (a));
+
+    return res;
+}
+
+template<typename Packet> EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from)
+{
+  EIGEN_DEBUG_ALIGNED_LOAD
+#ifdef _BIG_ENDIAN
+  Packet16uc MSQ, LSQ;
+  Packet16uc mask;
+  MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
+  LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
+  mask = vec_lvsl(0, from);                        // create the permute mask
+  //TODO: Add static_cast here
+  return (Packet) vec_perm(MSQ, LSQ, mask);           // align the data
+#else
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
+{
+  return ploadu_common<Packet4f>(from);
+}
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
+{
+  return ploadu_common<Packet4i>(from);
+}
+template<> EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const short int* from)
+{
+  return ploadu_common<Packet8s>(from);
+}
+template<> EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const unsigned short int* from)
+{
+  return ploadu_common<Packet8us>(from);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from)
+{
+  return ploadu_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
+}
+template<> EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const signed char* from)
+{
+  return ploadu_common<Packet16c>(from);
+}
+template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char* from)
+{
+  return ploadu_common<Packet16uc>(from);
+}
+
+template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)*   from)
+{
+  Packet p;
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet>(from);
+  else                                  p = ploadu<Packet>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
+}
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
+{
+  return ploaddup_common<Packet4f>(from);
+}
+template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
+{
+  return ploaddup_common<Packet4i>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const short int*     from)
+{
+  Packet8s p;
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet8s>(from);
+  else                                  p = ploadu<Packet8s>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE16_HI);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const unsigned short int*     from)
+{
+  Packet8us p;
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet8us>(from);
+  else                                  p = ploadu<Packet8us>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE16_HI);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const short int*     from)
+{
+  Packet8s p;
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet8s>(from);
+  else                                  p = ploadu<Packet8s>(from);
+  return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const unsigned short int*     from)
+{
+  Packet8us p;
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet8us>(from);
+  else                                  p = ploadu<Packet8us>(from);
+  return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16*     from)
+{
+  return ploadquad<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const signed char*     from)
+{
+  Packet16c p;
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet16c>(from);
+  else                                  p = ploadu<Packet16c>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE8_HI);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const unsigned char*     from)
+{
+  Packet16uc p;
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet16uc>(from);
+  else                                  p = ploadu<Packet16uc>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE8_HI);
+}
+
+template<typename Packet> EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)*  to, const Packet& from)
+{
+  EIGEN_DEBUG_UNALIGNED_STORE
+#ifdef _BIG_ENDIAN
+  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
+  // Warning: not thread safe!
+  Packet16uc MSQ, LSQ, edges;
+  Packet16uc edgeAlign, align;
+
+  MSQ = vec_ld(0, (unsigned char *)to);                     // most significant quadword
+  LSQ = vec_ld(15, (unsigned char *)to);                    // least significant quadword
+  edgeAlign = vec_lvsl(0, to);                              // permute map to extract edges
+  edges=vec_perm(LSQ,MSQ,edgeAlign);                        // extract the edges
+  align = vec_lvsr( 0, to );                                // permute map to misalign data
+  MSQ = vec_perm(edges,(Packet16uc)from,align);             // misalign the data (MSQ)
+  LSQ = vec_perm((Packet16uc)from,edges,align);             // misalign the data (LSQ)
+  vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
+  vec_st( MSQ, 0, (unsigned char *)to );                   // Store the MSQ part second
+#else
+  vec_xst(from, 0, to);
+#endif
+}
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from)
+{
+  pstoreu_common<Packet4f>(to, from);
+}
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from)
+{
+  pstoreu_common<Packet4i>(to, from);
+}
+template<> EIGEN_STRONG_INLINE void pstoreu<short int>(short int*      to, const Packet8s& from)
+{
+  pstoreu_common<Packet8s>(to, from);
+}
+template<> EIGEN_STRONG_INLINE void pstoreu<unsigned short int>(unsigned short int*      to, const Packet8us& from)
+{
+  pstoreu_common<Packet8us>(to, from);
+}
+template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16*      to, const Packet8bf& from)
+{
+  pstoreu_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from);
+}
+template<> EIGEN_STRONG_INLINE void pstoreu<signed char>(signed char*      to, const Packet16c& from)
+{
+  pstoreu_common<Packet16c>(to, from);
+}
+template<> EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char*      to, const Packet16uc& from)
+{
+  pstoreu_common<Packet16uc>(to, from);
+}
+
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr)    { EIGEN_PPC_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr)    { EIGEN_PPC_PREFETCH(addr); }
+
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; }
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int   x; vec_ste(a, 0, &x); return x; }
+
+template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet& a) {
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) x;
+  vec_ste(a, 0, &x);
+  return x;
+}
+
+template<> EIGEN_STRONG_INLINE short int pfirst<Packet8s>(const Packet8s& a) {
+  return pfirst_common<Packet8s>(a);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned short int pfirst<Packet8us>(const Packet8us& a) {
+  return pfirst_common<Packet8us>(a);
+}
+
+template<> EIGEN_STRONG_INLINE signed char pfirst<Packet16c>(const Packet16c& a)
+{
+  return pfirst_common<Packet16c>(a);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned char pfirst<Packet16uc>(const Packet16uc& a)
+{
+  return pfirst_common<Packet16uc>(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{
+  return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+}
+template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
+{
+  return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+}
+template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a)
+{
+  return reinterpret_cast<Packet8s>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
+}
+template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a)
+{
+  return reinterpret_cast<Packet8us>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
+}
+template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a)
+{
+  return vec_perm(a, a, p16uc_REVERSE8);
+}
+template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a)
+{
+  return vec_perm(a, a, p16uc_REVERSE8);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a)
+{
+  return preverse<Packet8us>(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8bf  pabs(const Packet8bf& a) {
+  _EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask,0x7FFF);
+  return pand<Packet8us>(p8us_abs_mask, a);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a)
+{ return vec_sra(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
+template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a)
+{ return vec_sr(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
+template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a)
+{ return vec_sl(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
+template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_left(const Packet4f& a)
+{
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
+  Packet4ui r = vec_sl(reinterpret_cast<Packet4ui>(a), p4ui_mask);
+  return reinterpret_cast<Packet4f>(r);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_right(const Packet4f& a)
+{
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
+  Packet4ui r = vec_sr(reinterpret_cast<Packet4ui>(a), p4ui_mask);
+  return reinterpret_cast<Packet4f>(r);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a)
+{
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
+  return vec_sr(a, p4ui_mask);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a)
+{
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
+  return vec_sl(a, p4ui_mask);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a)
+{
+  const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
+  return vec_sl(a, p8us_mask);
+}
+template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a)
+{
+  const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
+  return vec_sr(a, p8us_mask);
+}
+
+EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf){
+  return plogical_shift_left<16>(reinterpret_cast<Packet4f>(bf.m_val));
+}
+
+EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf){
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
+  return pand<Packet4f>(
+    reinterpret_cast<Packet4f>(bf.m_val),
+    reinterpret_cast<Packet4f>(p4ui_high_mask)
+  );
+}
+
+// Simple interleaving of bool masks, prevents true values from being
+// converted to NaNs.
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16Bool(Packet4f even, Packet4f odd) {
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
+  Packet4f bf_odd, bf_even;
+  bf_odd = pand(reinterpret_cast<Packet4f>(p4ui_high_mask), odd);
+  bf_even = plogical_shift_right<16>(even);
+  return reinterpret_cast<Packet8us>(por<Packet4f>(bf_even, bf_odd));
+}
+
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f){
+  Packet4ui input = reinterpret_cast<Packet4ui>(p4f);
+  Packet4ui lsb = plogical_shift_right<16>(input);
+  lsb = pand<Packet4ui>(lsb, reinterpret_cast<Packet4ui>(p4i_ONE));
+
+  _EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS,0x7FFFu);
+  Packet4ui rounding_bias = padd<Packet4ui>(lsb, p4ui_BIAS);
+  input = padd<Packet4ui>(input, rounding_bias);
+
+  //Test NaN and Subnormal - Begin
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000);
+  Packet4ui exp = pand<Packet4ui>(p4ui_exp_mask, reinterpret_cast<Packet4ui>(p4f));
+
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mantissa_mask, 0x7FFFFF);
+  Packet4ui mantissa = pand<Packet4ui>(p4ui_mantissa_mask, reinterpret_cast<Packet4ui>(p4f));
+
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(max_exp, 0x7F800000);
+  Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_max_exp);
+  Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet4ui>(p4i_ZERO));
+
+  Packet4bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet4ui>(p4i_ZERO));
+  Packet4ui nan_selector = pandnot<Packet4ui>(
+      reinterpret_cast<Packet4ui>(is_max_exp),
+      reinterpret_cast<Packet4ui>(is_mant_zero)
+  );
+
+  Packet4ui subnormal_selector = pandnot<Packet4ui>(
+      reinterpret_cast<Packet4ui>(is_zero_exp),
+      reinterpret_cast<Packet4ui>(is_mant_zero)
+  );
+
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000);
+  input = vec_sel(input, p4ui_nan, nan_selector);
+  input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
+  //Test NaN and Subnormal - End
+
+  input = plogical_shift_right<16>(input);
+  return reinterpret_cast<Packet8us>(input);
+}
+
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd){
+  Packet4f bf_odd, bf_even;
+  bf_odd = reinterpret_cast<Packet4f>(F32ToBf16(odd).m_val);
+  bf_odd = plogical_shift_left<16>(bf_odd);
+  bf_even = reinterpret_cast<Packet4f>(F32ToBf16(even).m_val);
+  return reinterpret_cast<Packet8us>(por<Packet4f>(bf_even, bf_odd));
+}
+#define BF16_TO_F32_UNARY_OP_WRAPPER(OP, A) \
+  Packet4f a_even = Bf16ToF32Even(A);\
+  Packet4f a_odd = Bf16ToF32Odd(A);\
+  Packet4f op_even = OP(a_even);\
+  Packet4f op_odd = OP(a_odd);\
+  return F32ToBf16(op_even, op_odd);\
+
+#define BF16_TO_F32_BINARY_OP_WRAPPER(OP, A, B) \
+  Packet4f a_even = Bf16ToF32Even(A);\
+  Packet4f a_odd = Bf16ToF32Odd(A);\
+  Packet4f b_even = Bf16ToF32Even(B);\
+  Packet4f b_odd = Bf16ToF32Odd(B);\
+  Packet4f op_even = OP(a_even, b_even);\
+  Packet4f op_odd = OP(a_odd, b_odd);\
+  return F32ToBf16(op_even, op_odd);\
+
+#define BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(OP, A, B) \
+  Packet4f a_even = Bf16ToF32Even(A);\
+  Packet4f a_odd = Bf16ToF32Odd(A);\
+  Packet4f b_even = Bf16ToF32Even(B);\
+  Packet4f b_odd = Bf16ToF32Odd(B);\
+  Packet4f op_even = OP(a_even, b_even);\
+  Packet4f op_odd = OP(a_odd, b_odd);\
+  return F32ToBf16Bool(op_even, op_odd);\
+
+template<> EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER(padd<Packet4f>, a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER(pmul<Packet4f>, a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER(pdiv<Packet4f>, a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pnegate<Packet8bf>(const Packet8bf& a) {
+  BF16_TO_F32_UNARY_OP_WRAPPER(pnegate<Packet4f>, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER(psub<Packet4f>, a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf psqrt<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf prsqrt<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt<Packet4f>, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+  return pldexp_generic(a,exponent);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pldexp<Packet8bf> (const Packet8bf& a, const Packet8bf& exponent){
+  BF16_TO_F32_BINARY_OP_WRAPPER(pldexp<Packet4f>, a, exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+  return pfrexp_generic(a,exponent);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf> (const Packet8bf& a, Packet8bf& e){
+  Packet4f a_even = Bf16ToF32Even(a);
+  Packet4f a_odd = Bf16ToF32Odd(a);
+  Packet4f e_even;
+  Packet4f e_odd;
+  Packet4f op_even = pfrexp<Packet4f>(a_even, e_even);
+  Packet4f op_odd = pfrexp<Packet4f>(a_odd, e_odd);
+  e = F32ToBf16(e_even, e_odd);
+  return F32ToBf16(op_even, op_odd);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf psin<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(psin_float, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pcos<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(pcos_float, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf plog<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(plog_float, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(pfloor<Packet4f>, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(pceil<Packet4f>, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf print<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(print<Packet4f>, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+  Packet4f a_even = Bf16ToF32Even(a);
+  Packet4f a_odd = Bf16ToF32Odd(a);
+  Packet4f b_even = Bf16ToF32Even(b);
+  Packet4f b_odd = Bf16ToF32Odd(b);
+  Packet4f c_even = Bf16ToF32Even(c);
+  Packet4f c_odd = Bf16ToF32Odd(c);
+  Packet4f pmadd_even = pmadd<Packet4f>(a_even, b_even, c_even);
+  Packet4f pmadd_odd = pmadd<Packet4f>(a_odd, b_odd, c_odd);
+  return F32ToBf16(pmadd_even, pmadd_odd);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER(pmin<Packet4f>, a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER(pmax<Packet4f>, a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt<Packet4f>, a, b);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt_or_nan<Packet4f>, a, b);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_le<Packet4f>, a, b);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_eq<Packet4f>, a, b);
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& a) {
+  return Eigen::bfloat16_impl::raw_uint16_to_bfloat16((pfirst<Packet8us>(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const  bfloat16*     from)
+{
+  return ploaddup<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
+  bfloat16 countdown[8] = { bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3),
+                            bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7) };
+  return padd<Packet8bf>(pset1<Packet8bf>(a), pload<Packet8bf>(countdown));
+}
+
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+  Packet4f b, sum;
+  b   = vec_sld(a, a, 8);
+  sum = a + b;
+  b   = vec_sld(sum, sum, 4);
+  sum += b;
+  return pfirst(sum);
+}
+
+template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
+{
+  Packet4i sum;
+  sum = vec_sums(a, p4i_ZERO);
+#ifdef _BIG_ENDIAN
+  sum = vec_sld(sum, p4i_ZERO, 12);
+#else
+  sum = vec_sld(p4i_ZERO, sum, 4);
+#endif
+  return pfirst(sum);
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a)
+{
+  float redux_even = predux<Packet4f>(Bf16ToF32Even(a));
+  float redux_odd  = predux<Packet4f>(Bf16ToF32Odd(a));
+  float f32_result = redux_even + redux_odd;
+  return bfloat16(f32_result);
+}
+template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(const Packet& a)
+{
+  union{
+    Packet v;
+    __UNPACK_TYPE__(Packet) n[8];
+  } vt;
+  vt.v = a;
+
+  EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
+  EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
+  Packet4i first_half  = pload<Packet4i>(first_loader);
+  Packet4i second_half = pload<Packet4i>(second_loader);
+
+  return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_half) + predux(second_half));
+}
+
+template<> EIGEN_STRONG_INLINE short int predux<Packet8s>(const Packet8s& a)
+{
+  return predux_size8<Packet8s>(a);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned short int predux<Packet8us>(const Packet8us& a)
+{
+  return predux_size8<Packet8us>(a);
+}
+
+template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(const Packet& a)
+{
+  union{
+    Packet v;
+    __UNPACK_TYPE__(Packet) n[16];
+  } vt;
+  vt.v = a;
+
+  EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
+  EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
+  EIGEN_ALIGN16 int third_loader[4] = { vt.n[8], vt.n[9], vt.n[10], vt.n[11] };
+  EIGEN_ALIGN16 int fourth_loader[4] = { vt.n[12], vt.n[13], vt.n[14], vt.n[15] };
+
+  Packet4i first_quarter = pload<Packet4i>(first_loader);
+  Packet4i second_quarter = pload<Packet4i>(second_loader);
+  Packet4i third_quarter = pload<Packet4i>(third_loader);
+  Packet4i fourth_quarter = pload<Packet4i>(fourth_loader);
+
+  return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter)
+		                  + predux(third_quarter) + predux(fourth_quarter));
+}
+
+template<> EIGEN_STRONG_INLINE signed char predux<Packet16c>(const Packet16c& a)
+{
+  return predux_size16<Packet16c>(a);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned char predux<Packet16uc>(const Packet16uc& a)
+{
+  return predux_size16<Packet16uc>(a);
+}
+
+// Other reduction functions:
+// mul
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{
+  Packet4f prod;
+  prod = pmul(a, vec_sld(a, a, 8));
+  return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
+}
+
+template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
+{
+  EIGEN_ALIGN16 int aux[4];
+  pstore(aux, a);
+  return aux[0] * aux[1] * aux[2] * aux[3];
+}
+
+template<> EIGEN_STRONG_INLINE short int predux_mul<Packet8s>(const Packet8s& a)
+{
+  Packet8s pair, quad, octo;
+
+  pair = vec_mul(a, vec_sld(a, a, 8));
+  quad = vec_mul(pair, vec_sld(pair, pair, 4));
+  octo = vec_mul(quad, vec_sld(quad, quad, 2));
+
+  return pfirst(octo);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned short int predux_mul<Packet8us>(const Packet8us& a)
+{
+  Packet8us pair, quad, octo;
+
+  pair = vec_mul(a, vec_sld(a, a, 8));
+  quad = vec_mul(pair, vec_sld(pair, pair, 4));
+  octo = vec_mul(quad, vec_sld(quad, quad, 2));
+
+  return pfirst(octo);
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a)
+{
+  float redux_even = predux_mul<Packet4f>(Bf16ToF32Even(a));
+  float redux_odd  = predux_mul<Packet4f>(Bf16ToF32Odd(a));
+  float f32_result = redux_even * redux_odd;
+  return bfloat16(f32_result);
+}
+
+
+template<> EIGEN_STRONG_INLINE signed char predux_mul<Packet16c>(const Packet16c& a)
+{
+  Packet16c pair, quad, octo, result;
+
+  pair = vec_mul(a, vec_sld(a, a, 8));
+  quad = vec_mul(pair, vec_sld(pair, pair, 4));
+  octo = vec_mul(quad, vec_sld(quad, quad, 2));
+  result = vec_mul(octo, vec_sld(octo, octo, 1));
+
+  return pfirst(result);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned char predux_mul<Packet16uc>(const Packet16uc& a)
+{
+  Packet16uc pair, quad, octo, result;
+
+  pair = vec_mul(a, vec_sld(a, a, 8));
+  quad = vec_mul(pair, vec_sld(pair, pair, 4));
+  octo = vec_mul(quad, vec_sld(quad, quad, 2));
+  result = vec_mul(octo, vec_sld(octo, octo, 1));
+
+  return pfirst(result);
+}
+
+// min
+template<typename Packet> EIGEN_STRONG_INLINE
+__UNPACK_TYPE__(Packet) predux_min4(const Packet& a)
+{
+  Packet b, res;
+  b = vec_min(a, vec_sld(a, a, 8));
+  res = vec_min(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+  return predux_min4<Packet4f>(a);
+}
+
+template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
+{
+  return predux_min4<Packet4i>(a);
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a)
+{
+  float redux_even = predux_min<Packet4f>(Bf16ToF32Even(a));
+  float redux_odd  = predux_min<Packet4f>(Bf16ToF32Odd(a));
+  float f32_result = (std::min)(redux_even, redux_odd);
+  return bfloat16(f32_result);
+}
+
+template<> EIGEN_STRONG_INLINE short int predux_min<Packet8s>(const Packet8s& a)
+{
+  Packet8s pair, quad, octo;
+  
+  //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
+  pair = vec_min(a, vec_sld(a, a, 8)); 
+
+  //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
+  quad = vec_min(pair, vec_sld(pair, pair, 4));
+
+  //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
+  octo = vec_min(quad, vec_sld(quad, quad, 2));
+  return pfirst(octo);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned short int predux_min<Packet8us>(const Packet8us& a)
+{
+  Packet8us pair, quad, octo;
+  
+  //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
+  pair = vec_min(a, vec_sld(a, a, 8)); 
+
+  //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
+  quad = vec_min(pair, vec_sld(pair, pair, 4));
+
+  //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
+  octo = vec_min(quad, vec_sld(quad, quad, 2));
+  return pfirst(octo);
+}
+
+template<> EIGEN_STRONG_INLINE signed char predux_min<Packet16c>(const Packet16c& a)
+{
+  Packet16c pair, quad, octo, result;
+
+  pair = vec_min(a, vec_sld(a, a, 8));
+  quad = vec_min(pair, vec_sld(pair, pair, 4));
+  octo = vec_min(quad, vec_sld(quad, quad, 2));
+  result = vec_min(octo, vec_sld(octo, octo, 1));
+
+  return pfirst(result);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned char predux_min<Packet16uc>(const Packet16uc& a)
+{
+  Packet16uc pair, quad, octo, result;
+
+  pair = vec_min(a, vec_sld(a, a, 8));
+  quad = vec_min(pair, vec_sld(pair, pair, 4));
+  octo = vec_min(quad, vec_sld(quad, quad, 2));
+  result = vec_min(octo, vec_sld(octo, octo, 1));
+
+  return pfirst(result);
+}
+// max
+template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a)
+{
+  Packet b, res;
+  b = vec_max(a, vec_sld(a, a, 8));
+  res = vec_max(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+  return predux_max4<Packet4f>(a);
+}
+
+template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
+{
+  return predux_max4<Packet4i>(a);
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a)
+{
+  float redux_even = predux_max<Packet4f>(Bf16ToF32Even(a));
+  float redux_odd  = predux_max<Packet4f>(Bf16ToF32Odd(a));
+  float f32_result = (std::max)(redux_even, redux_odd);
+  return bfloat16(f32_result);
+}
+
+template<> EIGEN_STRONG_INLINE short int predux_max<Packet8s>(const Packet8s& a)
+{
+  Packet8s pair, quad, octo;
+  
+  //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
+  pair = vec_max(a, vec_sld(a, a, 8)); 
+
+  //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
+  quad = vec_max(pair, vec_sld(pair, pair, 4));
+
+  //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
+  octo = vec_max(quad, vec_sld(quad, quad, 2));
+  return pfirst(octo);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned short int predux_max<Packet8us>(const Packet8us& a)
+{
+  Packet8us pair, quad, octo;
+  
+  //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
+  pair = vec_max(a, vec_sld(a, a, 8)); 
+
+  //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
+  quad = vec_max(pair, vec_sld(pair, pair, 4));
+
+  //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
+  octo = vec_max(quad, vec_sld(quad, quad, 2));
+  return pfirst(octo);
+}
+
+template<> EIGEN_STRONG_INLINE signed char predux_max<Packet16c>(const Packet16c& a)
+{
+  Packet16c pair, quad, octo, result;
+
+  pair = vec_max(a, vec_sld(a, a, 8));
+  quad = vec_max(pair, vec_sld(pair, pair, 4));
+  octo = vec_max(quad, vec_sld(quad, quad, 2));
+  result = vec_max(octo, vec_sld(octo, octo, 1));
+
+  return pfirst(result);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned char predux_max<Packet16uc>(const Packet16uc& a)
+{
+  Packet16uc pair, quad, octo, result;
+
+  pair = vec_max(a, vec_sld(a, a, 8));
+  quad = vec_max(pair, vec_sld(pair, pair, 4));
+  octo = vec_max(quad, vec_sld(quad, quad, 2));
+  result = vec_max(octo, vec_sld(octo, octo, 1));
+
+  return pfirst(result);
+}
+
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
+{
+  return vec_any_ne(x, pzero(x));
+}
+
+template <typename T> EIGEN_DEVICE_FUNC inline void
+ptranpose_common(PacketBlock<T,4>& kernel){
+  T t0, t1, t2, t3;
+  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  ptranpose_common<Packet4f>(kernel);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4i,4>& kernel) {
+  ptranpose_common<Packet4i>(kernel);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8s,4>& kernel) {
+  Packet8s t0, t1, t2, t3;
+  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8us,4>& kernel) {
+  Packet8us t0, t1, t2, t3;
+  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8bf,4>& kernel) {
+  Packet8us t0, t1, t2, t3;
+
+  t0 = vec_mergeh(kernel.packet[0].m_val, kernel.packet[2].m_val);
+  t1 = vec_mergel(kernel.packet[0].m_val, kernel.packet[2].m_val);
+  t2 = vec_mergeh(kernel.packet[1].m_val, kernel.packet[3].m_val);
+  t3 = vec_mergel(kernel.packet[1].m_val, kernel.packet[3].m_val);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet16c,4>& kernel) {
+  Packet16c t0, t1, t2, t3;
+  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet16uc,4>& kernel) {
+  Packet16uc t0, t1, t2, t3;
+  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8s,8>& kernel) {
+  Packet8s v[8], sum[8];
+
+  v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
+  v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
+  v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
+  v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
+  v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
+  v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
+  v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
+  v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
+  sum[0] = vec_mergeh(v[0], v[4]);
+  sum[1] = vec_mergel(v[0], v[4]);
+  sum[2] = vec_mergeh(v[1], v[5]);
+  sum[3] = vec_mergel(v[1], v[5]);
+  sum[4] = vec_mergeh(v[2], v[6]);
+  sum[5] = vec_mergel(v[2], v[6]);
+  sum[6] = vec_mergeh(v[3], v[7]);
+  sum[7] = vec_mergel(v[3], v[7]);
+
+  kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
+  kernel.packet[1] = vec_mergel(sum[0], sum[4]);
+  kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
+  kernel.packet[3] = vec_mergel(sum[1], sum[5]);
+  kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
+  kernel.packet[5] = vec_mergel(sum[2], sum[6]);
+  kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
+  kernel.packet[7] = vec_mergel(sum[3], sum[7]);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8us,8>& kernel) {
+  Packet8us v[8], sum[8];
+
+  v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
+  v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
+  v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
+  v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
+  v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
+  v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
+  v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
+  v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
+  sum[0] = vec_mergeh(v[0], v[4]);
+  sum[1] = vec_mergel(v[0], v[4]);
+  sum[2] = vec_mergeh(v[1], v[5]);
+  sum[3] = vec_mergel(v[1], v[5]);
+  sum[4] = vec_mergeh(v[2], v[6]);
+  sum[5] = vec_mergel(v[2], v[6]);
+  sum[6] = vec_mergeh(v[3], v[7]);
+  sum[7] = vec_mergel(v[3], v[7]);
+
+  kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
+  kernel.packet[1] = vec_mergel(sum[0], sum[4]);
+  kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
+  kernel.packet[3] = vec_mergel(sum[1], sum[5]);
+  kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
+  kernel.packet[5] = vec_mergel(sum[2], sum[6]);
+  kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
+  kernel.packet[7] = vec_mergel(sum[3], sum[7]);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8bf,8>& kernel) {
+  Packet8bf v[8], sum[8];
+
+  v[0] = vec_mergeh(kernel.packet[0].m_val, kernel.packet[4].m_val);
+  v[1] = vec_mergel(kernel.packet[0].m_val, kernel.packet[4].m_val);
+  v[2] = vec_mergeh(kernel.packet[1].m_val, kernel.packet[5].m_val);
+  v[3] = vec_mergel(kernel.packet[1].m_val, kernel.packet[5].m_val);
+  v[4] = vec_mergeh(kernel.packet[2].m_val, kernel.packet[6].m_val);
+  v[5] = vec_mergel(kernel.packet[2].m_val, kernel.packet[6].m_val);
+  v[6] = vec_mergeh(kernel.packet[3].m_val, kernel.packet[7].m_val);
+  v[7] = vec_mergel(kernel.packet[3].m_val, kernel.packet[7].m_val);
+  sum[0] = vec_mergeh(v[0].m_val, v[4].m_val);
+  sum[1] = vec_mergel(v[0].m_val, v[4].m_val);
+  sum[2] = vec_mergeh(v[1].m_val, v[5].m_val);
+  sum[3] = vec_mergel(v[1].m_val, v[5].m_val);
+  sum[4] = vec_mergeh(v[2].m_val, v[6].m_val);
+  sum[5] = vec_mergel(v[2].m_val, v[6].m_val);
+  sum[6] = vec_mergeh(v[3].m_val, v[7].m_val);
+  sum[7] = vec_mergel(v[3].m_val, v[7].m_val);
+
+  kernel.packet[0] = vec_mergeh(sum[0].m_val, sum[4].m_val);
+  kernel.packet[1] = vec_mergel(sum[0].m_val, sum[4].m_val);
+  kernel.packet[2] = vec_mergeh(sum[1].m_val, sum[5].m_val);
+  kernel.packet[3] = vec_mergel(sum[1].m_val, sum[5].m_val);
+  kernel.packet[4] = vec_mergeh(sum[2].m_val, sum[6].m_val);
+  kernel.packet[5] = vec_mergel(sum[2].m_val, sum[6].m_val);
+  kernel.packet[6] = vec_mergeh(sum[3].m_val, sum[7].m_val);
+  kernel.packet[7] = vec_mergel(sum[3].m_val, sum[7].m_val);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet16c,16>& kernel) {
+  Packet16c step1[16], step2[16], step3[16];
+
+  step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
+  step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
+  step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
+  step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
+  step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
+  step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
+  step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
+  step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
+  step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
+  step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
+  step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
+  step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
+  step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
+  step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
+  step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
+  step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
+
+  step2[0]  = vec_mergeh(step1[0], step1[8]);
+  step2[1]  = vec_mergel(step1[0], step1[8]);
+  step2[2]  = vec_mergeh(step1[1], step1[9]);
+  step2[3]  = vec_mergel(step1[1], step1[9]);
+  step2[4]  = vec_mergeh(step1[2], step1[10]);
+  step2[5]  = vec_mergel(step1[2], step1[10]);
+  step2[6]  = vec_mergeh(step1[3], step1[11]);
+  step2[7]  = vec_mergel(step1[3], step1[11]);
+  step2[8]  = vec_mergeh(step1[4], step1[12]);
+  step2[9]  = vec_mergel(step1[4], step1[12]);
+  step2[10] = vec_mergeh(step1[5], step1[13]);
+  step2[11] = vec_mergel(step1[5], step1[13]);
+  step2[12] = vec_mergeh(step1[6], step1[14]);
+  step2[13] = vec_mergel(step1[6], step1[14]);
+  step2[14] = vec_mergeh(step1[7], step1[15]);
+  step2[15] = vec_mergel(step1[7], step1[15]);
+
+  step3[0]  = vec_mergeh(step2[0], step2[8]);
+  step3[1]  = vec_mergel(step2[0], step2[8]);
+  step3[2]  = vec_mergeh(step2[1], step2[9]);
+  step3[3]  = vec_mergel(step2[1], step2[9]);
+  step3[4]  = vec_mergeh(step2[2], step2[10]);
+  step3[5]  = vec_mergel(step2[2], step2[10]);
+  step3[6]  = vec_mergeh(step2[3], step2[11]);
+  step3[7]  = vec_mergel(step2[3], step2[11]);
+  step3[8]  = vec_mergeh(step2[4], step2[12]);
+  step3[9]  = vec_mergel(step2[4], step2[12]);
+  step3[10] = vec_mergeh(step2[5], step2[13]);
+  step3[11] = vec_mergel(step2[5], step2[13]);
+  step3[12] = vec_mergeh(step2[6], step2[14]);
+  step3[13] = vec_mergel(step2[6], step2[14]);
+  step3[14] = vec_mergeh(step2[7], step2[15]);
+  step3[15] = vec_mergel(step2[7], step2[15]);
+
+  kernel.packet[0]  = vec_mergeh(step3[0], step3[8]);
+  kernel.packet[1]  = vec_mergel(step3[0], step3[8]);
+  kernel.packet[2]  = vec_mergeh(step3[1], step3[9]);
+  kernel.packet[3]  = vec_mergel(step3[1], step3[9]);
+  kernel.packet[4]  = vec_mergeh(step3[2], step3[10]);
+  kernel.packet[5]  = vec_mergel(step3[2], step3[10]);
+  kernel.packet[6]  = vec_mergeh(step3[3], step3[11]);
+  kernel.packet[7]  = vec_mergel(step3[3], step3[11]);
+  kernel.packet[8]  = vec_mergeh(step3[4], step3[12]);
+  kernel.packet[9]  = vec_mergel(step3[4], step3[12]);
+  kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
+  kernel.packet[11] = vec_mergel(step3[5], step3[13]);
+  kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
+  kernel.packet[13] = vec_mergel(step3[6], step3[14]);
+  kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
+  kernel.packet[15] = vec_mergel(step3[7], step3[15]);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet16uc,16>& kernel) {
+  Packet16uc step1[16], step2[16], step3[16];
+
+  step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
+  step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
+  step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
+  step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
+  step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
+  step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
+  step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
+  step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
+  step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
+  step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
+  step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
+  step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
+  step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
+  step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
+  step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
+  step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
+
+  step2[0]  = vec_mergeh(step1[0], step1[8]);
+  step2[1]  = vec_mergel(step1[0], step1[8]);
+  step2[2]  = vec_mergeh(step1[1], step1[9]);
+  step2[3]  = vec_mergel(step1[1], step1[9]);
+  step2[4]  = vec_mergeh(step1[2], step1[10]);
+  step2[5]  = vec_mergel(step1[2], step1[10]);
+  step2[6]  = vec_mergeh(step1[3], step1[11]);
+  step2[7]  = vec_mergel(step1[3], step1[11]);
+  step2[8]  = vec_mergeh(step1[4], step1[12]);
+  step2[9]  = vec_mergel(step1[4], step1[12]);
+  step2[10] = vec_mergeh(step1[5], step1[13]);
+  step2[11] = vec_mergel(step1[5], step1[13]);
+  step2[12] = vec_mergeh(step1[6], step1[14]);
+  step2[13] = vec_mergel(step1[6], step1[14]);
+  step2[14] = vec_mergeh(step1[7], step1[15]);
+  step2[15] = vec_mergel(step1[7], step1[15]);
+
+  step3[0]  = vec_mergeh(step2[0], step2[8]);
+  step3[1]  = vec_mergel(step2[0], step2[8]);
+  step3[2]  = vec_mergeh(step2[1], step2[9]);
+  step3[3]  = vec_mergel(step2[1], step2[9]);
+  step3[4]  = vec_mergeh(step2[2], step2[10]);
+  step3[5]  = vec_mergel(step2[2], step2[10]);
+  step3[6]  = vec_mergeh(step2[3], step2[11]);
+  step3[7]  = vec_mergel(step2[3], step2[11]);
+  step3[8]  = vec_mergeh(step2[4], step2[12]);
+  step3[9]  = vec_mergel(step2[4], step2[12]);
+  step3[10] = vec_mergeh(step2[5], step2[13]);
+  step3[11] = vec_mergel(step2[5], step2[13]);
+  step3[12] = vec_mergeh(step2[6], step2[14]);
+  step3[13] = vec_mergel(step2[6], step2[14]);
+  step3[14] = vec_mergeh(step2[7], step2[15]);
+  step3[15] = vec_mergel(step2[7], step2[15]);
+
+  kernel.packet[0]  = vec_mergeh(step3[0], step3[8]);
+  kernel.packet[1]  = vec_mergel(step3[0], step3[8]);
+  kernel.packet[2]  = vec_mergeh(step3[1], step3[9]);
+  kernel.packet[3]  = vec_mergel(step3[1], step3[9]);
+  kernel.packet[4]  = vec_mergeh(step3[2], step3[10]);
+  kernel.packet[5]  = vec_mergel(step3[2], step3[10]);
+  kernel.packet[6]  = vec_mergeh(step3[3], step3[11]);
+  kernel.packet[7]  = vec_mergel(step3[3], step3[11]);
+  kernel.packet[8]  = vec_mergeh(step3[4], step3[12]);
+  kernel.packet[9]  = vec_mergel(step3[4], step3[12]);
+  kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
+  kernel.packet[11] = vec_mergel(step3[5], step3[13]);
+  kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
+  kernel.packet[13] = vec_mergel(step3[6], step3[14]);
+  kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
+  kernel.packet[15] = vec_mergel(step3[7], step3[15]);
+}
+
+template<typename Packet> EIGEN_STRONG_INLINE
+Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
+  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+  Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+  return pblend4<Packet4i>(ifPacket, thenPacket, elsePacket);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+  return pblend4<Packet4f>(ifPacket, thenPacket, elsePacket);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket, const Packet8s& elsePacket) {
+  Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
+                       ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] };
+  Packet8us mask = reinterpret_cast<Packet8us>(vec_cmpeq(select, p8us_ONE));
+  Packet8s result = vec_sel(elsePacket, thenPacket, mask);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket, const Packet8us& elsePacket) {
+  Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
+                       ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] };
+  Packet8us mask = reinterpret_cast<Packet8us>(vec_cmpeq(reinterpret_cast<Packet8us>(select), p8us_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pblend(const Selector<8>& ifPacket, const Packet8bf& thenPacket, const Packet8bf& elsePacket) {
+  return pblend<Packet8us>(ifPacket, thenPacket, elsePacket);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket, const Packet16c& elsePacket) {
+  Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
+                       ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
+                       ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
+                       ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] };
+
+  Packet16uc mask = reinterpret_cast<Packet16uc>(vec_cmpeq(reinterpret_cast<Packet16uc>(select), p16uc_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket, const Packet16uc& elsePacket) {
+  Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
+                       ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
+                       ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
+                       ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] };
+
+  Packet16uc mask = reinterpret_cast<Packet16uc>(vec_cmpeq(reinterpret_cast<Packet16uc>(select), p16uc_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+template <>
+struct type_casting_traits<float, int> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template <>
+struct type_casting_traits<int, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template <>
+struct type_casting_traits<bfloat16, unsigned short int> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template <>
+struct type_casting_traits<unsigned short int, bfloat16> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+  return vec_cts(a,0);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4ui pcast<Packet4f, Packet4ui>(const Packet4f& a) {
+  return vec_ctu(a,0);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+  return vec_ctf(a,0);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4ui, Packet4f>(const Packet4ui& a) {
+  return vec_ctf(a,0);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8us pcast<Packet8bf, Packet8us>(const Packet8bf& a) {
+  Packet4f float_even = Bf16ToF32Even(a);
+  Packet4f float_odd = Bf16ToF32Odd(a);
+  Packet4ui int_even = pcast<Packet4f, Packet4ui>(float_even);
+  Packet4ui int_odd = pcast<Packet4f, Packet4ui>(float_odd);
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF);
+  Packet4ui low_even = pand<Packet4ui>(int_even, p4ui_low_mask);
+  Packet4ui low_odd = pand<Packet4ui>(int_odd, p4ui_low_mask);
+
+  //Check values that are bigger than USHRT_MAX (0xFFFF)
+  Packet4bi overflow_selector;
+  if(vec_any_gt(int_even, p4ui_low_mask)){
+    overflow_selector = vec_cmpgt(int_even, p4ui_low_mask);
+    low_even = vec_sel(low_even, p4ui_low_mask, overflow_selector);
+  }
+  if(vec_any_gt(int_odd, p4ui_low_mask)){
+    overflow_selector = vec_cmpgt(int_odd, p4ui_low_mask);
+    low_odd = vec_sel(low_even, p4ui_low_mask, overflow_selector);
+  }
+
+  low_odd = plogical_shift_left<16>(low_odd);
+
+  Packet4ui int_final = por<Packet4ui>(low_even, low_odd);
+  return reinterpret_cast<Packet8us>(int_final);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pcast<Packet8us, Packet8bf>(const Packet8us& a) {
+  //short -> int -> float -> bfloat16
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF);
+  Packet4ui int_cast = reinterpret_cast<Packet4ui>(a);
+  Packet4ui int_even = pand<Packet4ui>(int_cast, p4ui_low_mask);
+  Packet4ui int_odd = plogical_shift_right<16>(int_cast);
+  Packet4f float_even = pcast<Packet4ui, Packet4f>(int_even);
+  Packet4f float_odd = pcast<Packet4ui, Packet4f>(int_odd);
+  return F32ToBf16(float_even, float_odd);
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
+  return reinterpret_cast<Packet4i>(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
+  return reinterpret_cast<Packet4f>(a);
+}
+
+
+
+//---------- double ----------
+#ifdef __VSX__
+typedef __vector double              Packet2d;
+typedef __vector unsigned long long  Packet2ul;
+typedef __vector long long           Packet2l;
+#if EIGEN_COMP_CLANG
+typedef Packet2ul                    Packet2bl;
+#else
+typedef __vector __bool long         Packet2bl;
+#endif
+
+static Packet2l  p2l_ONE  = { 1, 1 };
+static Packet2l  p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
+static Packet2ul p2ul_SIGN = { 0x8000000000000000ull, 0x8000000000000000ull };
+static Packet2ul p2ul_PREV0DOT5 = { 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull };
+static Packet2d  p2d_ONE  = { 1.0, 1.0 };
+static Packet2d  p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
+static Packet2d  p2d_MZERO = { numext::bit_cast<double>(0x8000000000000000ull),
+                               numext::bit_cast<double>(0x8000000000000000ull) };
+
+#ifdef _BIG_ENDIAN
+static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
+#else
+static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
+#endif
+
+template<int index> Packet2d vec_splat_dbl(Packet2d& a)
+{
+  return vec_splat(a, index);
+}
+
+template<> struct packet_traits<double> : default_packet_traits
+{
+  typedef Packet2d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=2,
+    HasHalfPacket = 1,
+
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 1,
+    HasMin  = 1,
+    HasMax  = 1,
+    HasAbs  = 1,
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 0,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1,
+    HasNegate = 1,
+    HasBlend = 1
+  };
+};
+
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; };
+
+inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
+{
+  union {
+    Packet2l   v;
+    int64_t n[2];
+  } vt;
+  vt.v = v;
+  s << vt.n[0] << ", " << vt.n[1];
+  return s;
+}
+
+inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
+{
+  union {
+    Packet2d   v;
+    double n[2];
+  } vt;
+  vt.v = v;
+  s << vt.n[0] << ", " << vt.n[1];
+  return s;
+}
+
+// Need to define them first or we get specialization after instantiation errors
+template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
+{
+  EIGEN_DEBUG_ALIGNED_LOAD
+  return vec_xl(0, const_cast<double *>(from)); // cast needed by Clang
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from)
+{
+  EIGEN_DEBUG_ALIGNED_STORE
+  vec_xst(from, 0, to);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) {
+  Packet2d v = {from, from};
+  return v;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(unsigned long from) {
+  Packet2l v = {static_cast<long long>(from), static_cast<long long>(from)};
+  return reinterpret_cast<Packet2d>(v);
+}
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet2d>(const double *a,
+                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
+{
+  //This way is faster than vec_splat (at least for doubles in Power 9)
+  a0 = pset1<Packet2d>(a[0]);
+  a1 = pset1<Packet2d>(a[1]);
+  a2 = pset1<Packet2d>(a[2]);
+  a3 = pset1<Packet2d>(a[3]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
+{
+  EIGEN_ALIGN16 double af[2];
+  af[0] = from[0*stride];
+  af[1] = from[1*stride];
+ return pload<Packet2d>(af);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
+{
+  EIGEN_ALIGN16 double af[2];
+  pstore<double>(af, from);
+  to[0*stride] = af[0];
+  to[1*stride] = af[1];
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; }
+
+template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return a + b; }
+
+template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; }
+
+template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; }
+
+template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_MZERO); }
+template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); }
+
+// for some weird raisons, it has to be overloaded for packet of integers
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
+  Packet2d ret;
+  __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+  return ret;
+ }
+
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
+  Packet2d ret;
+  __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+  return ret;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmple(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmplt(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmpeq(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
+  Packet2d c = reinterpret_cast<Packet2d>(vec_cmpge(a,b));
+  return vec_nor(c,c);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a)
+{
+    Packet2d t = vec_add(reinterpret_cast<Packet2d>(vec_or(vec_and(reinterpret_cast<Packet2ul>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a);
+    Packet2d res;
+
+    __asm__("xvrdpiz %x0, %x1\n\t"
+        : "=&wa" (res)
+        : "wa" (t));
+
+    return res;
+}
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const  Packet2d& a) { return vec_ceil(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
+template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a)
+{
+    Packet2d res;
+
+    __asm__("xvrdpic %x0, %x1\n\t"
+        : "=&wa" (res)
+        : "wa" (a));
+
+    return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return vec_xl(0, const_cast<double*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
+{
+  Packet2d p;
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet2d>(from);
+  else                                  p = ploadu<Packet2d>(from);
+  return vec_splat_dbl<0>(p);
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from)
+{
+  EIGEN_DEBUG_UNALIGNED_STORE
+  vec_xst(from, 0, to);
+}
+
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
+
+template<> EIGEN_STRONG_INLINE double  pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; }
+
+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
+{
+  return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
+}
+template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
+
+// VSX support varies between different compilers and even different
+// versions of the same compiler.  For gcc version >= 4.9.3, we can use
+// vec_cts to efficiently convert Packet2d to Packet2l.  Otherwise, use
+// a slow version that works with older compilers. 
+// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
+// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
+template<>
+inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x) {
+#if EIGEN_GNUC_AT_LEAST(5, 4) || \
+    (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
+  return vec_cts(x, 0);    // TODO: check clang version.
+#else
+  double tmp[2];
+  memcpy(tmp, &x, sizeof(tmp));
+  Packet2l l = { static_cast<long long>(tmp[0]),
+                 static_cast<long long>(tmp[1]) };
+  return l;
+#endif
+}
+
+template<>
+inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x) {
+  unsigned long long tmp[2];
+  memcpy(tmp, &x, sizeof(tmp));
+  Packet2d d = { static_cast<double>(tmp[0]),
+                 static_cast<double>(tmp[1]) };
+  return d;
+}
+
+
+// Packet2l shifts.
+// For POWER8 we simply use vec_sr/l. 
+//
+// Things are more complicated for POWER7. There is actually a
+// vec_xxsxdi intrinsic but it is not supported by some gcc versions.
+// So we need to shift by N % 32 and rearrage bytes.
+#ifdef __POWER8_VECTOR__
+
+template<int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
+  const Packet2ul shift = { N, N };
+  return vec_sl(a, shift); 
+}
+
+template<int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
+  const Packet2ul shift = { N, N };
+  return vec_sr(a, shift); 
+}
+
+#else
+
+// Shifts [A, B, C, D] to [B, 0, D, 0].
+// Used to implement left shifts for Packet2l.
+EIGEN_ALWAYS_INLINE Packet4i shift_even_left(const Packet4i& a) {
+  static const Packet16uc perm = {
+      0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, 
+      0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
+  #ifdef  _BIG_ENDIAN
+    return vec_perm(p4i_ZERO, a, perm);
+  #else
+    return vec_perm(a, p4i_ZERO, perm);
+  #endif
+}
+
+// Shifts [A, B, C, D] to [0, A, 0, C].
+// Used to implement right shifts for Packet2l.
+EIGEN_ALWAYS_INLINE Packet4i shift_odd_right(const Packet4i& a) {
+  static const Packet16uc perm = {
+      0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 
+      0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b };
+  #ifdef  _BIG_ENDIAN
+    return vec_perm(p4i_ZERO, a, perm);
+  #else
+    return vec_perm(a, p4i_ZERO, perm);
+  #endif
+}
+
+template<int N, typename EnableIf = void>
+struct plogical_shift_left_impl;
+
+template<int N>
+struct plogical_shift_left_impl<N, typename enable_if<(N < 32) && (N >= 0)>::type> {
+  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
+    static const unsigned n = static_cast<unsigned>(N);
+    const Packet4ui shift = {n, n, n, n};
+    const Packet4i ai = reinterpret_cast<Packet4i>(a);
+    static const unsigned m = static_cast<unsigned>(32 - N);
+    const Packet4ui shift_right = {m, m, m, m};
+    const Packet4i out_hi = vec_sl(ai, shift);
+    const Packet4i out_lo = shift_even_left(vec_sr(ai, shift_right));
+    return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
+  }
+};
+
+template<int N>
+struct plogical_shift_left_impl<N, typename enable_if<(N >= 32)>::type> {
+  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
+    static const unsigned m = static_cast<unsigned>(N - 32);
+    const Packet4ui shift = {m, m, m, m};
+    const Packet4i ai = reinterpret_cast<Packet4i>(a);
+    return reinterpret_cast<Packet2l>(shift_even_left(vec_sl(ai, shift)));
+  }
+};
+
+template<int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
+  return plogical_shift_left_impl<N>::run(a); 
+}
+
+template<int N, typename EnableIf = void>
+struct plogical_shift_right_impl;
+
+template<int N>
+struct plogical_shift_right_impl<N, typename enable_if<(N < 32) && (N >= 0)>::type> {
+  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
+    static const unsigned n = static_cast<unsigned>(N);
+    const Packet4ui shift = {n, n, n, n};
+    const Packet4i ai = reinterpret_cast<Packet4i>(a);
+    static const unsigned m = static_cast<unsigned>(32 - N);
+    const Packet4ui shift_left = {m, m, m, m};
+    const Packet4i out_lo = vec_sr(ai, shift);
+    const Packet4i out_hi = shift_odd_right(vec_sl(ai, shift_left));
+    return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
+  }
+};
+
+template<int N>
+struct plogical_shift_right_impl<N, typename enable_if<(N >= 32)>::type> {
+  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
+    static const unsigned m = static_cast<unsigned>(N - 32);
+    const Packet4ui shift = {m, m, m, m};
+    const Packet4i ai = reinterpret_cast<Packet4i>(a);
+    return reinterpret_cast<Packet2l>(shift_odd_right(vec_sr(ai, shift)));
+  }
+};
+
+template<int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
+  return plogical_shift_right_impl<N>::run(a); 
+}
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+  // Clamp exponent to [-2099, 2099]
+  const Packet2d max_exponent = pset1<Packet2d>(2099.0);
+  const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
+
+  // Split 2^e into four factors and multiply:
+  const Packet2l  bias = { 1023, 1023 };
+  Packet2l b = plogical_shift_right<2>(e);  // floor(e/4)
+  Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
+  Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
+  b = psub(psub(psub(e, b), b), b);  // e - 3b
+  c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias)); // 2^(e - 3b)
+  out = pmul(out, c); // a * 2^e
+  return out;
+}
+
+
+// Extract exponent without existence of Packet2l.
+template<>
+EIGEN_STRONG_INLINE  
+Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
+  return pcast<Packet2l, Packet2d>(plogical_shift_right<52>(reinterpret_cast<Packet2l>(pabs(a))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d> (const Packet2d& a, Packet2d& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
+{
+  Packet2d b, sum;
+  b   = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8));
+  sum = a + b;
+  return pfirst<Packet2d>(sum);
+}
+
+// Other reduction functions:
+// mul
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
+{
+  return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
+}
+
+// min
+template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
+{
+  return pfirst(pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
+}
+
+// max
+template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
+{
+  return pfirst(pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2d,2>& kernel) {
+  Packet2d t0, t1;
+  t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
+  t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
+  kernel.packet[0] = t0;
+  kernel.packet[1] = t1;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
+  Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
+  Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) );
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+
+#endif // __VSX__
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PACKET_MATH_ALTIVEC_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/CUDA/Complex.h b/src/3rdparty/eigen/Eigen/src/Core/arch/CUDA/Complex.h
new file mode 100644
index 000000000..deb4c8694
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/CUDA/Complex.h
@@ -0,0 +1,258 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2021 C. Antonio Sanchez <cantonios@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_CUDA_H
+#define EIGEN_COMPLEX_CUDA_H
+
+// clang-format off
+// Many std::complex methods such as operator+, operator-, operator* and
+// operator/ are not constexpr. Due to this, GCC and older versions of clang do
+// not treat them as device functions and thus Eigen functors making use of
+// these operators fail to compile. Here, we manually specialize these
+// operators and functors for complex types when building for CUDA to enable
+// their use on-device.
+
+#if defined(EIGEN_CUDACC) && defined(EIGEN_GPU_COMPILE_PHASE)
+    
+// ICC already specializes std::complex<float> and std::complex<double>
+// operators, preventing us from making them device functions here.
+// This will lead to silent runtime errors if the operators are used on device.
+//
+// To allow std::complex operator use on device, define _OVERRIDE_COMPLEX_SPECIALIZATION_
+// prior to first inclusion of <complex>.  This prevents ICC from adding
+// its own specializations, so our custom ones below can be used instead.
+#if !(defined(EIGEN_COMP_ICC) && defined(_USE_COMPLEX_SPECIALIZATION_))
+
+// Import Eigen's internal operator specializations.
+#define EIGEN_USING_STD_COMPLEX_OPERATORS           \
+  using Eigen::complex_operator_detail::operator+;  \
+  using Eigen::complex_operator_detail::operator-;  \
+  using Eigen::complex_operator_detail::operator*;  \
+  using Eigen::complex_operator_detail::operator/;  \
+  using Eigen::complex_operator_detail::operator+=; \
+  using Eigen::complex_operator_detail::operator-=; \
+  using Eigen::complex_operator_detail::operator*=; \
+  using Eigen::complex_operator_detail::operator/=; \
+  using Eigen::complex_operator_detail::operator==; \
+  using Eigen::complex_operator_detail::operator!=;
+
+namespace Eigen {
+
+// Specialized std::complex overloads.
+namespace complex_operator_detail {
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<T> complex_multiply(const std::complex<T>& a, const std::complex<T>& b) {
+  const T a_real = numext::real(a);
+  const T a_imag = numext::imag(a);
+  const T b_real = numext::real(b);
+  const T b_imag = numext::imag(b);
+  return std::complex<T>(
+      a_real * b_real - a_imag * b_imag,
+      a_imag * b_real + a_real * b_imag);
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<T> complex_divide_fast(const std::complex<T>& a, const std::complex<T>& b) {
+  const T a_real = numext::real(a);
+  const T a_imag = numext::imag(a);
+  const T b_real = numext::real(b);
+  const T b_imag = numext::imag(b);
+  const T norm = (b_real * b_real + b_imag * b_imag);
+  return std::complex<T>((a_real * b_real + a_imag * b_imag) / norm,
+                          (a_imag * b_real - a_real * b_imag) / norm);
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<T> complex_divide_stable(const std::complex<T>& a, const std::complex<T>& b) {
+  const T a_real = numext::real(a);
+  const T a_imag = numext::imag(a);
+  const T b_real = numext::real(b);
+  const T b_imag = numext::imag(b);
+  // Smith's complex division (https://arxiv.org/pdf/1210.4539.pdf),
+  // guards against over/under-flow.
+  const bool scale_imag = numext::abs(b_imag) <= numext::abs(b_real);
+  const T rscale = scale_imag ? T(1) : b_real / b_imag;
+  const T iscale = scale_imag ? b_imag / b_real : T(1);
+  const T denominator = b_real * rscale + b_imag * iscale;
+  return std::complex<T>((a_real * rscale + a_imag * iscale) / denominator, 
+                         (a_imag * rscale - a_real * iscale) / denominator);
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<T> complex_divide(const std::complex<T>& a, const std::complex<T>& b) {
+#if EIGEN_FAST_MATH
+  return complex_divide_fast(a, b);
+#else
+  return complex_divide_stable(a, b);
+#endif
+}
+
+// NOTE: We cannot specialize compound assignment operators with Scalar T,
+//         (i.e.  operator@=(const T&), for @=+,-,*,/)
+//       since they are already specialized for float/double/long double within
+//       the standard <complex> header. We also do not specialize the stream
+//       operators.
+#define EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(T)                                    \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator+(const std::complex<T>& a) { return a; }                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator-(const std::complex<T>& a) {                                           \
+  return std::complex<T>(-numext::real(a), -numext::imag(a));                                   \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator+(const std::complex<T>& a, const std::complex<T>& b) {                 \
+  return std::complex<T>(numext::real(a) + numext::real(b), numext::imag(a) + numext::imag(b)); \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator+(const std::complex<T>& a, const T& b) {                               \
+  return std::complex<T>(numext::real(a) + b, numext::imag(a));                                 \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator+(const T& a, const std::complex<T>& b) {                               \
+  return std::complex<T>(a + numext::real(b), numext::imag(b));                                 \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator-(const std::complex<T>& a, const std::complex<T>& b) {                 \
+  return std::complex<T>(numext::real(a) - numext::real(b), numext::imag(a) - numext::imag(b)); \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator-(const std::complex<T>& a, const T& b) {                               \
+  return std::complex<T>(numext::real(a) - b, numext::imag(a));                                 \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator-(const T& a, const std::complex<T>& b) {                               \
+  return std::complex<T>(a - numext::real(b), -numext::imag(b));                                \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator*(const std::complex<T>& a, const std::complex<T>& b) {                 \
+  return complex_multiply(a, b);                                                                \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator*(const std::complex<T>& a, const T& b) {                               \
+  return std::complex<T>(numext::real(a) * b, numext::imag(a) * b);                             \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator*(const T& a, const std::complex<T>& b) {                               \
+  return std::complex<T>(a * numext::real(b), a * numext::imag(b));                             \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator/(const std::complex<T>& a, const std::complex<T>& b) {                 \
+  return complex_divide(a, b);                                                                  \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator/(const std::complex<T>& a, const T& b) {                               \
+  return std::complex<T>(numext::real(a) / b, numext::imag(a) / b);                             \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator/(const T& a, const std::complex<T>& b) {                               \
+  return complex_divide(std::complex<T>(a, 0), b);                                              \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T>& operator+=(std::complex<T>& a, const std::complex<T>& b) {                     \
+  numext::real_ref(a) += numext::real(b);                                                       \
+  numext::imag_ref(a) += numext::imag(b);                                                       \
+  return a;                                                                                     \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T>& operator-=(std::complex<T>& a, const std::complex<T>& b) {                     \
+  numext::real_ref(a) -= numext::real(b);                                                       \
+  numext::imag_ref(a) -= numext::imag(b);                                                       \
+  return a;                                                                                     \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T>& operator*=(std::complex<T>& a, const std::complex<T>& b) {                     \
+  a = complex_multiply(a, b);                                                                   \
+  return a;                                                                                     \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T>& operator/=(std::complex<T>& a, const std::complex<T>& b) {                     \
+  a = complex_divide(a, b);                                                                     \
+  return  a;                                                                                    \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+bool operator==(const std::complex<T>& a, const std::complex<T>& b) {                           \
+  return numext::real(a) == numext::real(b) && numext::imag(a) == numext::imag(b);              \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+bool operator==(const std::complex<T>& a, const T& b) {                                         \
+  return numext::real(a) == b && numext::imag(a) == 0;                                          \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+bool operator==(const T& a, const std::complex<T>& b) {                                         \
+  return a  == numext::real(b) && 0 == numext::imag(b);                                         \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+bool operator!=(const std::complex<T>& a, const std::complex<T>& b) {                           \
+  return !(a == b);                                                                             \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+bool operator!=(const std::complex<T>& a, const T& b) {                                         \
+  return !(a == b);                                                                             \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+bool operator!=(const T& a, const std::complex<T>& b) {                                         \
+  return !(a == b);                                                                             \
+}
+
+// Do not specialize for long double, since that reduces to double on device.
+EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(float)
+EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(double)
+
+#undef EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS
+
+  
+}  // namespace complex_operator_detail
+
+EIGEN_USING_STD_COMPLEX_OPERATORS
+
+namespace numext {
+EIGEN_USING_STD_COMPLEX_OPERATORS
+}  // namespace numext
+
+namespace internal {
+EIGEN_USING_STD_COMPLEX_OPERATORS
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // !(EIGEN_COMP_ICC && _USE_COMPLEX_SPECIALIZATION_)
+
+#endif  // EIGEN_CUDACC && EIGEN_GPU_COMPILE_PHASE
+
+#endif  // EIGEN_COMPLEX_CUDA_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/Default/BFloat16.h b/src/3rdparty/eigen/Eigen/src/Core/arch/Default/BFloat16.h
new file mode 100644
index 000000000..1c28f4f95
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/Default/BFloat16.h
@@ -0,0 +1,700 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef EIGEN_BFLOAT16_H
+#define EIGEN_BFLOAT16_H
+
+#define BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, METHOD)         \
+  template <>                                                       \
+  EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED  \
+  PACKET_BF16 METHOD<PACKET_BF16>(const PACKET_BF16& _x) {          \
+    return F32ToBf16(METHOD<PACKET_F>(Bf16ToF32(_x)));              \
+  }
+
+namespace Eigen {
+
+struct bfloat16;
+
+namespace bfloat16_impl {
+
+// Make our own __bfloat16_raw definition.
+struct __bfloat16_raw {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw() : value(0) {}
+  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw(unsigned short raw) : value(raw) {}
+  unsigned short value;
+};
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(unsigned short value);
+template <bool AssumeArgumentIsNormalOrInfinityOrZero>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(float ff);
+// Forward declarations of template specializations, to avoid Visual C++ 2019 errors, saying:
+// > error C2908: explicit specialization; 'float_to_bfloat16_rtne' has already been instantiated
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<false>(float ff);
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<true>(float ff);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float bfloat16_to_float(__bfloat16_raw h);
+
+struct bfloat16_base : public __bfloat16_raw {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16_base() {}
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16_base(const __bfloat16_raw& h) : __bfloat16_raw(h) {}
+};
+
+} // namespace bfloat16_impl
+
+// Class definition.
+struct bfloat16 : public bfloat16_impl::bfloat16_base {
+
+  typedef bfloat16_impl::__bfloat16_raw __bfloat16_raw;
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16() {}
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(const __bfloat16_raw& h) : bfloat16_impl::bfloat16_base(h) {}
+
+  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(bool b)
+      : bfloat16_impl::bfloat16_base(bfloat16_impl::raw_uint16_to_bfloat16(b ? 0x3f80 : 0)) {}
+
+  template<class T>
+  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(T val)
+      : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne<internal::is_integral<T>::value>(static_cast<float>(val))) {}
+
+  explicit EIGEN_DEVICE_FUNC bfloat16(float f)
+      : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne<false>(f)) {}
+
+  // Following the convention of numpy, converting between complex and
+  // float will lead to loss of imag value.
+  template<typename RealScalar>
+  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(const std::complex<RealScalar>& val)
+      : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne<false>(static_cast<float>(val.real()))) {}
+
+  EIGEN_DEVICE_FUNC operator float() const {  // NOLINT: Allow implicit conversion to float, because it is lossless.
+    return bfloat16_impl::bfloat16_to_float(*this);
+  }
+};
+} // namespace Eigen
+
+namespace std {
+template<>
+struct numeric_limits<Eigen::bfloat16> {
+  static const bool is_specialized = true;
+  static const bool is_signed = true;
+  static const bool is_integer = false;
+  static const bool is_exact = false;
+  static const bool has_infinity = true;
+  static const bool has_quiet_NaN = true;
+  static const bool has_signaling_NaN = true;
+  static const float_denorm_style has_denorm = std::denorm_absent;
+  static const bool has_denorm_loss = false;
+  static const std::float_round_style round_style = numeric_limits<float>::round_style;
+  static const bool is_iec559 = false;
+  static const bool is_bounded = true;
+  static const bool is_modulo = false;
+  static const int digits = 8;
+  static const int digits10 = 2;
+  static const int max_digits10 = 4;
+  static const int radix = 2;
+  static const int min_exponent = numeric_limits<float>::min_exponent;
+  static const int min_exponent10 = numeric_limits<float>::min_exponent10;
+  static const int max_exponent = numeric_limits<float>::max_exponent;
+  static const int max_exponent10 = numeric_limits<float>::max_exponent10;
+  static const bool traps = numeric_limits<float>::traps;
+  static const bool tinyness_before = numeric_limits<float>::tinyness_before;
+
+  static Eigen::bfloat16 (min)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0080); }
+  static Eigen::bfloat16 lowest() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0xff7f); }
+  static Eigen::bfloat16 (max)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f7f); }
+  static Eigen::bfloat16 epsilon() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x3c00); }
+  static Eigen::bfloat16 round_error() { return Eigen::bfloat16(0x3f00); }
+  static Eigen::bfloat16 infinity() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f80); }
+  static Eigen::bfloat16 quiet_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7fc0); }
+  static Eigen::bfloat16 signaling_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f81); }
+  static Eigen::bfloat16 denorm_min() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0001); }
+};
+
+// If std::numeric_limits<T> is specialized, should also specialize
+// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
+// std::numeric_limits<const volatile T>
+// https://stackoverflow.com/a/16519653/
+template<>
+struct numeric_limits<const Eigen::bfloat16> : numeric_limits<Eigen::bfloat16> {};
+template<>
+struct numeric_limits<volatile Eigen::bfloat16> : numeric_limits<Eigen::bfloat16> {};
+template<>
+struct numeric_limits<const volatile Eigen::bfloat16> : numeric_limits<Eigen::bfloat16> {};
+} // namespace std
+
+namespace Eigen {
+
+namespace bfloat16_impl {
+
+// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
+// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
+// of the functions, while the latter can only deal with one of them.
+#if !defined(EIGEN_HAS_NATIVE_BF16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for bfloat16 floats
+
+#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
+// We need to provide emulated *host-side* BF16 operators for clang.
+#pragma push_macro("EIGEN_DEVICE_FUNC")
+#undef EIGEN_DEVICE_FUNC
+#if defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_NATIVE_BF16)
+#define EIGEN_DEVICE_FUNC __host__
+#else // both host and device need emulated ops.
+#define EIGEN_DEVICE_FUNC __host__ __device__
+#endif
+#endif
+
+// Definitions for CPUs, mostly working through conversion
+// to/from fp32.
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const bfloat16& a, const bfloat16& b) {
+  return bfloat16(float(a) + float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const bfloat16& a, const int& b) {
+  return bfloat16(float(a) + static_cast<float>(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const int& a, const bfloat16& b) {
+  return bfloat16(static_cast<float>(a) + float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator * (const bfloat16& a, const bfloat16& b) {
+  return bfloat16(float(a) * float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator - (const bfloat16& a, const bfloat16& b) {
+  return bfloat16(float(a) - float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator / (const bfloat16& a, const bfloat16& b) {
+  return bfloat16(float(a) / float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator - (const bfloat16& a) {
+  bfloat16 result;
+  result.value = a.value ^ 0x8000;
+  return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator += (bfloat16& a, const bfloat16& b) {
+  a = bfloat16(float(a) + float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator *= (bfloat16& a, const bfloat16& b) {
+  a = bfloat16(float(a) * float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator -= (bfloat16& a, const bfloat16& b) {
+  a = bfloat16(float(a) - float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator /= (bfloat16& a, const bfloat16& b) {
+  a = bfloat16(float(a) / float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator++(bfloat16& a) {
+  a += bfloat16(1);
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator--(bfloat16& a) {
+  a -= bfloat16(1);
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator++(bfloat16& a, int) {
+  bfloat16 original_value = a;
+  ++a;
+  return original_value;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator--(bfloat16& a, int) {
+  bfloat16 original_value = a;
+  --a;
+  return original_value;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const bfloat16& a, const bfloat16& b) {
+  return numext::equal_strict(float(a),float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const bfloat16& a, const bfloat16& b) {
+  return numext::not_equal_strict(float(a), float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const bfloat16& a, const bfloat16& b) {
+  return float(a) < float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const bfloat16& a, const bfloat16& b) {
+  return float(a) <= float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const bfloat16& a, const bfloat16& b) {
+  return float(a) > float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const bfloat16& a, const bfloat16& b) {
+  return float(a) >= float(b);
+}
+
+#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
+#pragma pop_macro("EIGEN_DEVICE_FUNC")
+#endif
+#endif  // Emulate support for bfloat16 floats
+
+// Division by an index. Do it in full float precision to avoid accuracy
+// issues in converting the denominator to bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator / (const bfloat16& a, Index b) {
+  return bfloat16(static_cast<float>(a) / static_cast<float>(b));
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw truncate_to_bfloat16(const float v) {
+  __bfloat16_raw output;
+  if (Eigen::numext::isnan EIGEN_NOT_A_MACRO(v)) {
+    output.value = std::signbit(v) ? 0xFFC0: 0x7FC0;
+    return output;
+  }
+  const uint16_t* p = reinterpret_cast<const uint16_t*>(&v);
+#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  output.value = p[0];
+#else
+  output.value = p[1];
+#endif
+  return output;
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(numext::uint16_t value) {
+  return __bfloat16_raw(value);
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR numext::uint16_t raw_bfloat16_as_uint16(const __bfloat16_raw& bf) {
+  return bf.value;
+}
+
+// float_to_bfloat16_rtne template specialization that does not make any
+// assumption about the value of its function argument (ff).
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<false>(float ff) {
+#if (defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_HIP_BF16))
+  // Nothing to do here
+#else
+  __bfloat16_raw output;
+
+  if (Eigen::numext::isnan EIGEN_NOT_A_MACRO(ff)) {
+    // If the value is a NaN, squash it to a qNaN with msb of fraction set,
+    // this makes sure after truncation we don't end up with an inf.
+    //
+    // qNaN magic: All exponent bits set + most significant bit of fraction
+    // set.
+    output.value = std::signbit(ff) ? 0xFFC0: 0x7FC0;
+  } else {
+    // Fast rounding algorithm that rounds a half value to nearest even. This
+    // reduces expected error when we convert a large number of floats. Here
+    // is how it works:
+    //
+    // Definitions:
+    // To convert a float 32 to bfloat16, a float 32 can be viewed as 32 bits
+    // with the following tags:
+    //
+    // Sign |  Exp (8 bits) | Frac (23 bits)
+    //  S     EEEEEEEE         FFFFFFLRTTTTTTTTTTTTTTT
+    //
+    //  S: Sign bit.
+    //  E: Exponent bits.
+    //  F: First 6 bits of fraction.
+    //  L: Least significant bit of resulting bfloat16 if we truncate away the
+    //  rest of the float32. This is also the 7th bit of fraction
+    //  R: Rounding bit, 8th bit of fraction.
+    //  T: Sticky bits, rest of fraction, 15 bits.
+    //
+    // To round half to nearest even, there are 3 cases where we want to round
+    // down (simply truncate the result of the bits away, which consists of
+    // rounding bit and sticky bits) and two cases where we want to round up
+    // (truncate then add one to the result).
+    //
+    // The fast converting algorithm simply adds lsb (L) to 0x7fff (15 bits of
+    // 1s) as the rounding bias, adds the rounding bias to the input, then
+    // truncates the last 16 bits away.
+    //
+    // To understand how it works, we can analyze this algorithm case by case:
+    //
+    // 1. L = 0, R = 0:
+    //   Expect: round down, this is less than half value.
+    //
+    //   Algorithm:
+    //   - Rounding bias: 0x7fff + 0 = 0x7fff
+    //   - Adding rounding bias to input may create any carry, depending on
+    //   whether there is any value set to 1 in T bits.
+    //   - R may be set to 1 if there is a carry.
+    //   - L remains 0.
+    //   - Note that this case also handles Inf and -Inf, where all fraction
+    //   bits, including L, R and Ts are all 0. The output remains Inf after
+    //   this algorithm.
+    //
+    // 2. L = 1, R = 0:
+    //   Expect: round down, this is less than half value.
+    //
+    //   Algorithm:
+    //   - Rounding bias: 0x7fff + 1 = 0x8000
+    //   - Adding rounding bias to input doesn't change sticky bits but
+    //   adds 1 to rounding bit.
+    //   - L remains 1.
+    //
+    // 3. L = 0, R = 1, all of T are 0:
+    //   Expect: round down, this is exactly at half, the result is already
+    //   even (L=0).
+    //
+    //   Algorithm:
+    //   - Rounding bias: 0x7fff + 0 = 0x7fff
+    //   - Adding rounding bias to input sets all sticky bits to 1, but
+    //   doesn't create a carry.
+    //   - R remains 1.
+    //   - L remains 0.
+    //
+    // 4. L = 1, R = 1:
+    //   Expect: round up, this is exactly at half, the result needs to be
+    //   round to the next even number.
+    //
+    //   Algorithm:
+    //   - Rounding bias: 0x7fff + 1 = 0x8000
+    //   - Adding rounding bias to input doesn't change sticky bits, but
+    //   creates a carry from rounding bit.
+    //   - The carry sets L to 0, creates another carry bit and propagate
+    //   forward to F bits.
+    //   - If all the F bits are 1, a carry then propagates to the exponent
+    //   bits, which then creates the minimum value with the next exponent
+    //   value. Note that we won't have the case where exponents are all 1,
+    //   since that's either a NaN (handled in the other if condition) or inf
+    //   (handled in case 1).
+    //
+    // 5. L = 0, R = 1, any of T is 1:
+    //   Expect: round up, this is greater than half.
+    //
+    //   Algorithm:
+    //   - Rounding bias: 0x7fff + 0 = 0x7fff
+    //   - Adding rounding bias to input creates a carry from sticky bits,
+    //   sets rounding bit to 0, then create another carry.
+    //   - The second carry sets L to 1.
+    //
+    // Examples:
+    //
+    //  Exact half value that is already even:
+    //    Input:
+    //    Sign |  Exp (8 bit)     | Frac (first 7 bit) | Frac (last 16 bit)
+    //     S     E E E E E E E E      F F F F F F L     RTTTTTTTTTTTTTTT
+    //     0     0 0 0 0 0 0 0 0      0 0 0 0 0 1 0     1000000000000000
+    //
+    //     This falls into case 3. We truncate the rest of 16 bits and no
+    //     carry is created into F and L:
+    //
+    //    Output:
+    //    Sign |  Exp (8 bit)     | Frac (first 7 bit)
+    //     S     E E E E E E E E      F F F F F F L
+    //     0     0 0 0 0 0 0 0 0      0 0 0 0 0 1 0
+    //
+    //  Exact half value, round to next even number:
+    //    Input:
+    //    Sign |  Exp (8 bit)     | Frac (first 7 bit) | Frac (last 16 bit)
+    //     S     E E E E E E E E      F F F F F F L     RTTTTTTTTTTTTTTT
+    //     0     0 0 0 0 0 0 0 0      0 0 0 0 0 0 1     1000000000000000
+    //
+    //     This falls into case 4. We create a carry from R and T,
+    //     which then propagates into L and F:
+    //
+    //    Output:
+    //    Sign |  Exp (8 bit)     | Frac (first 7 bit)
+    //     S     E E E E E E E E      F F F F F F L
+    //     0     0 0 0 0 0 0 0 0      0 0 0 0 0 1 0
+    //
+    //
+    //  Max denormal value round to min normal value:
+    //    Input:
+    //    Sign |  Exp (8 bit)     | Frac (first 7 bit) | Frac (last 16 bit)
+    //     S     E E E E E E E E      F F F F F F L     RTTTTTTTTTTTTTTT
+    //     0     0 0 0 0 0 0 0 0      1 1 1 1 1 1 1     1111111111111111
+    //
+    //     This falls into case 4. We create a carry from R and T,
+    //     propagate into L and F, which then propagates into exponent
+    //     bits:
+    //
+    //    Output:
+    //    Sign |  Exp (8 bit)     | Frac (first 7 bit)
+    //     S     E E E E E E E E      F F F F F F L
+    //     0     0 0 0 0 0 0 0 1      0 0 0 0 0 0 0
+    //
+    //  Max normal value round to Inf:
+    //    Input:
+    //    Sign |  Exp (8 bit)     | Frac (first 7 bit) | Frac (last 16 bit)
+    //     S     E E E E E E E E      F F F F F F L     RTTTTTTTTTTTTTTT
+    //     0     1 1 1 1 1 1 1 0      1 1 1 1 1 1 1     1111111111111111
+    //
+    //     This falls into case 4. We create a carry from R and T,
+    //     propagate into L and F, which then propagates into exponent
+    //     bits:
+    //
+    //    Sign |  Exp (8 bit)     | Frac (first 7 bit)
+    //     S     E E E E E E E E      F F F F F F L
+    //     0     1 1 1 1 1 1 1 1      0 0 0 0 0 0 0
+
+    // At this point, ff must be either a normal float, or +/-infinity.
+    output = float_to_bfloat16_rtne<true>(ff);
+  }
+  return output;
+#endif
+}
+
+// float_to_bfloat16_rtne template specialization that assumes that its function
+// argument (ff) is either a normal floating point number, or +/-infinity, or
+// zero. Used to improve the runtime performance of conversion from an integer
+// type to bfloat16.
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<true>(float ff) {
+#if (defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_HIP_BF16))
+    // Nothing to do here
+#else
+    numext::uint32_t input = numext::bit_cast<numext::uint32_t>(ff);
+    __bfloat16_raw output;
+
+    // Least significant bit of resulting bfloat.
+    numext::uint32_t lsb = (input >> 16) & 1;
+    numext::uint32_t rounding_bias = 0x7fff + lsb;
+    input += rounding_bias;
+    output.value = static_cast<numext::uint16_t>(input >> 16);
+    return output;
+#endif
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float bfloat16_to_float(__bfloat16_raw h) {
+    float result = 0;
+    unsigned short* q = reinterpret_cast<unsigned short*>(&result);
+#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    q[0] = h.value;
+#else
+    q[1] = h.value;
+#endif
+    return result;
+}
+// --- standard functions ---
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const bfloat16& a) {
+  EIGEN_USING_STD(isinf);
+  return (isinf)(float(a));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const bfloat16& a) {
+  EIGEN_USING_STD(isnan);
+  return (isnan)(float(a));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const bfloat16& a) {
+  return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a));
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 abs(const bfloat16& a) {
+  bfloat16 result;
+  result.value = a.value & 0x7FFF;
+  return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 exp(const bfloat16& a) {
+   return bfloat16(::expf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 expm1(const bfloat16& a) {
+  return bfloat16(numext::expm1(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log(const bfloat16& a) {
+  return bfloat16(::logf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log1p(const bfloat16& a) {
+  return bfloat16(numext::log1p(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log10(const bfloat16& a) {
+  return bfloat16(::log10f(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log2(const bfloat16& a) {
+  return bfloat16(static_cast<float>(EIGEN_LOG2E) * ::logf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sqrt(const bfloat16& a) {
+    return bfloat16(::sqrtf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 pow(const bfloat16& a, const bfloat16& b) {
+  return bfloat16(::powf(float(a), float(b)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sin(const bfloat16& a) {
+  return bfloat16(::sinf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cos(const bfloat16& a) {
+  return bfloat16(::cosf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tan(const bfloat16& a) {
+  return bfloat16(::tanf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asin(const bfloat16& a) {
+  return bfloat16(::asinf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acos(const bfloat16& a) {
+  return bfloat16(::acosf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atan(const bfloat16& a) {
+  return bfloat16(::atanf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sinh(const bfloat16& a) {
+  return bfloat16(::sinhf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cosh(const bfloat16& a) {
+  return bfloat16(::coshf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tanh(const bfloat16& a) {
+  return bfloat16(::tanhf(float(a)));
+}
+#if EIGEN_HAS_CXX11_MATH
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asinh(const bfloat16& a) {
+  return bfloat16(::asinhf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acosh(const bfloat16& a) {
+  return bfloat16(::acoshf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atanh(const bfloat16& a) {
+  return bfloat16(::atanhf(float(a)));
+}
+#endif
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 floor(const bfloat16& a) {
+  return bfloat16(::floorf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 ceil(const bfloat16& a) {
+  return bfloat16(::ceilf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 rint(const bfloat16& a) {
+  return bfloat16(::rintf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 round(const bfloat16& a) {
+  return bfloat16(::roundf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmod(const bfloat16& a, const bfloat16& b) {
+  return bfloat16(::fmodf(float(a), float(b)));
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 (min)(const bfloat16& a, const bfloat16& b) {
+  const float f1 = static_cast<float>(a);
+  const float f2 = static_cast<float>(b);
+  return f2 < f1 ? b : a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 (max)(const bfloat16& a, const bfloat16& b) {
+  const float f1 = static_cast<float>(a);
+  const float f2 = static_cast<float>(b);
+  return f1 < f2 ? b : a;
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmin(const bfloat16& a, const bfloat16& b) {
+  const float f1 = static_cast<float>(a);
+  const float f2 = static_cast<float>(b);
+  return bfloat16(::fminf(f1, f2));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmax(const bfloat16& a, const bfloat16& b) {
+  const float f1 = static_cast<float>(a);
+  const float f2 = static_cast<float>(b);
+  return bfloat16(::fmaxf(f1, f2));
+}
+
+#ifndef EIGEN_NO_IO
+EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const bfloat16& v) {
+  os << static_cast<float>(v);
+  return os;
+}
+#endif
+
+} // namespace bfloat16_impl
+
+namespace internal {
+
+template<>
+struct random_default_impl<bfloat16, false, false>
+{
+  static inline bfloat16 run(const bfloat16& x, const bfloat16& y)
+  {
+    return x + (y-x) * bfloat16(float(std::rand()) / float(RAND_MAX));
+  }
+  static inline bfloat16 run()
+  {
+    return run(bfloat16(-1.f), bfloat16(1.f));
+  }
+};
+
+template<> struct is_arithmetic<bfloat16> { enum { value = true }; };
+
+} // namespace internal
+
+template<> struct NumTraits<Eigen::bfloat16>
+    : GenericNumTraits<Eigen::bfloat16>
+{
+  enum {
+    IsSigned = true,
+    IsInteger = false,
+    IsComplex = false,
+    RequireInitialization = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 epsilon() {
+    return bfloat16_impl::raw_uint16_to_bfloat16(0x3c00);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 dummy_precision() {
+    return bfloat16_impl::raw_uint16_to_bfloat16(0x3D4D);  // bfloat16(5e-2f);
+
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 highest() {
+    return bfloat16_impl::raw_uint16_to_bfloat16(0x7F7F);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 lowest() {
+    return bfloat16_impl::raw_uint16_to_bfloat16(0xFF7F);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 infinity() {
+    return bfloat16_impl::raw_uint16_to_bfloat16(0x7f80);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 quiet_NaN() {
+    return bfloat16_impl::raw_uint16_to_bfloat16(0x7fc0);
+  }
+};
+
+} // namespace Eigen
+
+namespace Eigen {
+namespace numext {
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isnan)(const Eigen::bfloat16& h) {
+  return (bfloat16_impl::isnan)(h);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isinf)(const Eigen::bfloat16& h) {
+  return (bfloat16_impl::isinf)(h);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isfinite)(const Eigen::bfloat16& h) {
+  return (bfloat16_impl::isfinite)(h);
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bit_cast<Eigen::bfloat16, uint16_t>(const uint16_t& src) {
+  return Eigen::bfloat16(Eigen::bfloat16_impl::raw_uint16_to_bfloat16(src));
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast<uint16_t, Eigen::bfloat16>(const Eigen::bfloat16& src) {
+  return Eigen::bfloat16_impl::raw_bfloat16_as_uint16(src);
+}
+
+}  // namespace numext
+}  // namespace Eigen
+
+#if EIGEN_HAS_STD_HASH
+namespace std {
+template <>
+struct hash<Eigen::bfloat16> {
+  EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::bfloat16& a) const {
+    return static_cast<std::size_t>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(a));
+  }
+};
+} // namespace std
+#endif
+
+
+#endif // EIGEN_BFLOAT16_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/Default/ConjHelper.h b/src/3rdparty/eigen/Eigen/src/Core/arch/Default/ConjHelper.h
new file mode 100644
index 000000000..53830b5a2
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/Default/ConjHelper.h
@@ -0,0 +1,117 @@
+
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARCH_CONJ_HELPER_H
+#define EIGEN_ARCH_CONJ_HELPER_H
+
+#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL)      \
+  template <>                                                           \
+  struct conj_helper<PACKET_REAL, PACKET_CPLX, false, false> {          \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x,         \
+                                          const PACKET_CPLX& y,         \
+                                          const PACKET_CPLX& c) const { \
+      return padd(c, this->pmul(x, y));                                 \
+    }                                                                   \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x,          \
+                                         const PACKET_CPLX& y) const {  \
+      return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x, y.v));   \
+    }                                                                   \
+  };                                                                    \
+                                                                        \
+  template <>                                                           \
+  struct conj_helper<PACKET_CPLX, PACKET_REAL, false, false> {          \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x,         \
+                                          const PACKET_REAL& y,         \
+                                          const PACKET_CPLX& c) const { \
+      return padd(c, this->pmul(x, y));                                 \
+    }                                                                   \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x,          \
+                                         const PACKET_REAL& y) const {  \
+      return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x.v, y));   \
+    }                                                                   \
+  };
+
+namespace Eigen {
+namespace internal {
+
+template<bool Conjugate> struct conj_if;
+
+template<> struct conj_if<true> {
+  template<typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { return numext::conj(x); }
+  template<typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T pconj(const T& x) const { return internal::pconj(x); }
+};
+
+template<> struct conj_if<false> {
+  template<typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator()(const T& x) const { return x; }
+  template<typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& pconj(const T& x) const { return x; }
+};
+
+// Generic Implementation, assume scalars since the packet-version is
+// specialized below.
+template<typename LhsType, typename RhsType, bool ConjLhs, bool ConjRhs>
+struct conj_helper {
+  typedef typename ScalarBinaryOpTraits<LhsType, RhsType>::ReturnType ResultType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType
+  pmadd(const LhsType& x, const RhsType& y, const ResultType& c) const
+  { return this->pmul(x, y) + c; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType
+  pmul(const LhsType& x, const RhsType& y) const
+  { return conj_if<ConjLhs>()(x) * conj_if<ConjRhs>()(y); }
+};
+
+template<typename LhsScalar, typename RhsScalar>
+struct conj_helper<LhsScalar, RhsScalar, true, true> {
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar>::ReturnType ResultType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType
+  pmadd(const LhsScalar& x, const RhsScalar& y, const ResultType& c) const
+  { return this->pmul(x, y) + c; }
+
+  // We save a conjuation by using the identity conj(a)*conj(b) = conj(a*b).
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType
+  pmul(const LhsScalar& x, const RhsScalar& y) const
+  { return numext::conj(x * y); }
+};
+
+// Implementation with equal type, use packet operations.
+template<typename Packet, bool ConjLhs, bool ConjRhs>
+struct conj_helper<Packet, Packet, ConjLhs, ConjRhs>
+{
+  typedef Packet ResultType;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const
+  { return Eigen::internal::pmadd(conj_if<ConjLhs>().pconj(x), conj_if<ConjRhs>().pconj(y), c); }
+
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const
+  { return Eigen::internal::pmul(conj_if<ConjLhs>().pconj(x), conj_if<ConjRhs>().pconj(y)); }
+};
+
+template<typename Packet>
+struct conj_helper<Packet, Packet, true, true>
+{
+  typedef Packet ResultType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const
+  { return Eigen::internal::pmadd(pconj(x), pconj(y), c); }
+  // We save a conjuation by using the identity conj(a)*conj(b) = conj(a*b).
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const
+  { return pconj(Eigen::internal::pmul(x, y)); }
+};
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_ARCH_CONJ_HELPER_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/src/3rdparty/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
new file mode 100644
index 000000000..c9fbaf68b
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -0,0 +1,1649 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
+// Copyright (C) 2009-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* The exp and log functions of this file initially come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
+#define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+// Creates a Scalar integer type with same bit-width.
+template<typename T> struct make_integer;
+template<> struct make_integer<float>    { typedef numext::int32_t type; };
+template<> struct make_integer<double>   { typedef numext::int64_t type; };
+template<> struct make_integer<half>     { typedef numext::int16_t type; };
+template<> struct make_integer<bfloat16> { typedef numext::int16_t type; };
+
+template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC  
+Packet pfrexp_generic_get_biased_exponent(const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+  enum { mantissa_bits = numext::numeric_limits<Scalar>::digits - 1};
+  return pcast<PacketI, Packet>(plogical_shift_right<mantissa_bits>(preinterpret<PacketI>(pabs(a))));
+}
+
+// Safely applies frexp, correctly handles denormals.
+// Assumes IEEE floating point format.
+template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+Packet pfrexp_generic(const Packet& a, Packet& exponent) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename make_unsigned<typename make_integer<Scalar>::type>::type ScalarUI;
+  enum {
+    TotalBits = sizeof(Scalar) * CHAR_BIT,
+    MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
+    ExponentBits = int(TotalBits) - int(MantissaBits) - 1
+  };
+
+  EIGEN_CONSTEXPR ScalarUI scalar_sign_mantissa_mask = 
+      ~(((ScalarUI(1) << int(ExponentBits)) - ScalarUI(1)) << int(MantissaBits)); // ~0x7f800000
+  const Packet sign_mantissa_mask = pset1frombits<Packet>(static_cast<ScalarUI>(scalar_sign_mantissa_mask)); 
+  const Packet half = pset1<Packet>(Scalar(0.5));
+  const Packet zero = pzero(a);
+  const Packet normal_min = pset1<Packet>((numext::numeric_limits<Scalar>::min)()); // Minimum normal value, 2^-126
+  
+  // To handle denormals, normalize by multiplying by 2^(int(MantissaBits)+1).
+  const Packet is_denormal = pcmp_lt(pabs(a), normal_min);
+  EIGEN_CONSTEXPR ScalarUI scalar_normalization_offset = ScalarUI(int(MantissaBits) + 1); // 24
+  // The following cannot be constexpr because bfloat16(uint16_t) is not constexpr.
+  const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset)); // 2^24
+  const Packet normalization_factor = pset1<Packet>(scalar_normalization_factor);  
+  const Packet normalized_a = pselect(is_denormal, pmul(a, normalization_factor), a);
+  
+  // Determine exponent offset: -126 if normal, -126-24 if denormal
+  const Scalar scalar_exponent_offset = -Scalar((ScalarUI(1)<<(int(ExponentBits)-1)) - ScalarUI(2)); // -126
+  Packet exponent_offset = pset1<Packet>(scalar_exponent_offset);
+  const Packet normalization_offset = pset1<Packet>(-Scalar(scalar_normalization_offset)); // -24
+  exponent_offset = pselect(is_denormal, padd(exponent_offset, normalization_offset), exponent_offset);
+  
+  // Determine exponent and mantissa from normalized_a.
+  exponent = pfrexp_generic_get_biased_exponent(normalized_a);
+  // Zero, Inf and NaN return 'a' unmodified, exponent is zero
+  // (technically the exponent is unspecified for inf/NaN, but GCC/Clang set it to zero)
+  const Scalar scalar_non_finite_exponent = Scalar((ScalarUI(1) << int(ExponentBits)) - ScalarUI(1));  // 255
+  const Packet non_finite_exponent = pset1<Packet>(scalar_non_finite_exponent);
+  const Packet is_zero_or_not_finite = por(pcmp_eq(a, zero), pcmp_eq(exponent, non_finite_exponent));
+  const Packet m = pselect(is_zero_or_not_finite, a, por(pand(normalized_a, sign_mantissa_mask), half));
+  exponent = pselect(is_zero_or_not_finite, zero, padd(exponent, exponent_offset));  
+  return m;
+}
+
+// Safely applies ldexp, correctly handles overflows, underflows and denormals.
+// Assumes IEEE floating point format.
+template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+Packet pldexp_generic(const Packet& a, const Packet& exponent) {
+  // We want to return a * 2^exponent, allowing for all possible integer
+  // exponents without overflowing or underflowing in intermediate
+  // computations.
+  //
+  // Since 'a' and the output can be denormal, the maximum range of 'exponent'
+  // to consider for a float is:
+  //   -255-23 -> 255+23
+  // Below -278 any finite float 'a' will become zero, and above +278 any
+  // finite float will become inf, including when 'a' is the smallest possible 
+  // denormal.
+  //
+  // Unfortunately, 2^(278) cannot be represented using either one or two
+  // finite normal floats, so we must split the scale factor into at least
+  // three parts. It turns out to be faster to split 'exponent' into four
+  // factors, since [exponent>>2] is much faster to compute that [exponent/3].
+  //
+  // Set e = min(max(exponent, -278), 278);
+  //     b = floor(e/4);
+  //   out = ((((a * 2^(b)) * 2^(b)) * 2^(b)) * 2^(e-3*b))
+  //
+  // This will avoid any intermediate overflows and correctly handle 0, inf,
+  // NaN cases.
+  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename unpacket_traits<PacketI>::type ScalarI;
+  enum {
+    TotalBits = sizeof(Scalar) * CHAR_BIT,
+    MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
+    ExponentBits = int(TotalBits) - int(MantissaBits) - 1
+  };
+
+  const Packet max_exponent = pset1<Packet>(Scalar((ScalarI(1)<<int(ExponentBits)) + ScalarI(int(MantissaBits) - 1)));  // 278
+  const PacketI bias = pset1<PacketI>((ScalarI(1)<<(int(ExponentBits)-1)) - ScalarI(1));  // 127
+  const PacketI e = pcast<Packet, PacketI>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
+  PacketI b = parithmetic_shift_right<2>(e); // floor(e/4);
+  Packet c = preinterpret<Packet>(plogical_shift_left<int(MantissaBits)>(padd(b, bias)));  // 2^b
+  Packet out = pmul(pmul(pmul(a, c), c), c);  // a * 2^(3b)
+  b = psub(psub(psub(e, b), b), b); // e - 3b
+  c = preinterpret<Packet>(plogical_shift_left<int(MantissaBits)>(padd(b, bias)));  // 2^(e-3*b)
+  out = pmul(out, c);
+  return out;
+}
+
+// Explicitly multiplies 
+//    a * (2^e)
+// clamping e to the range
+// [NumTraits<Scalar>::min_exponent()-2, NumTraits<Scalar>::max_exponent()]
+//
+// This is approx 7x faster than pldexp_impl, but will prematurely over/underflow
+// if 2^e doesn't fit into a normal floating-point Scalar.
+//
+// Assumes IEEE floating point format
+template<typename Packet>
+struct pldexp_fast_impl {
+  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename unpacket_traits<PacketI>::type ScalarI;
+  enum {
+    TotalBits = sizeof(Scalar) * CHAR_BIT,
+    MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
+    ExponentBits = int(TotalBits) - int(MantissaBits) - 1
+  };
+  
+  static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+  Packet run(const Packet& a, const Packet& exponent) {
+    const Packet bias = pset1<Packet>(Scalar((ScalarI(1)<<(int(ExponentBits)-1)) - ScalarI(1)));  // 127
+    const Packet limit = pset1<Packet>(Scalar((ScalarI(1)<<int(ExponentBits)) - ScalarI(1)));     // 255
+    // restrict biased exponent between 0 and 255 for float.
+    const PacketI e = pcast<Packet, PacketI>(pmin(pmax(padd(exponent, bias), pzero(limit)), limit)); // exponent + 127
+    // return a * (2^e)
+    return pmul(a, preinterpret<Packet>(plogical_shift_left<int(MantissaBits)>(e)));
+  }
+};
+
+// Natural or base 2 logarithm.
+// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
+// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
+// be easily approximated by a polynomial centered on m=1 for stability.
+// TODO(gonnet): Further reduce the interval allowing for lower-degree
+//               polynomial interpolants -> ... -> profit!
+template <typename Packet, bool base2>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog_impl_float(const Packet _x)
+{
+  Packet x = _x;
+
+  const Packet cst_1              = pset1<Packet>(1.0f);
+  const Packet cst_neg_half       = pset1<Packet>(-0.5f);
+  // The smallest non denormalized float number.
+  const Packet cst_min_norm_pos   = pset1frombits<Packet>( 0x00800000u);
+  const Packet cst_minus_inf      = pset1frombits<Packet>( 0xff800000u);
+  const Packet cst_pos_inf        = pset1frombits<Packet>( 0x7f800000u);
+
+  // Polynomial coefficients.
+  const Packet cst_cephes_SQRTHF = pset1<Packet>(0.707106781186547524f);
+  const Packet cst_cephes_log_p0 = pset1<Packet>(7.0376836292E-2f);
+  const Packet cst_cephes_log_p1 = pset1<Packet>(-1.1514610310E-1f);
+  const Packet cst_cephes_log_p2 = pset1<Packet>(1.1676998740E-1f);
+  const Packet cst_cephes_log_p3 = pset1<Packet>(-1.2420140846E-1f);
+  const Packet cst_cephes_log_p4 = pset1<Packet>(+1.4249322787E-1f);
+  const Packet cst_cephes_log_p5 = pset1<Packet>(-1.6668057665E-1f);
+  const Packet cst_cephes_log_p6 = pset1<Packet>(+2.0000714765E-1f);
+  const Packet cst_cephes_log_p7 = pset1<Packet>(-2.4999993993E-1f);
+  const Packet cst_cephes_log_p8 = pset1<Packet>(+3.3333331174E-1f);
+
+  // Truncate input values to the minimum positive normal.
+  x = pmax(x, cst_min_norm_pos);
+
+  Packet e;
+  // extract significant in the range [0.5,1) and exponent
+  x = pfrexp(x,e);
+
+  // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
+  // and shift by -1. The values are then centered around 0, which improves
+  // the stability of the polynomial evaluation.
+  //   if( x < SQRTHF ) {
+  //     e -= 1;
+  //     x = x + x - 1.0;
+  //   } else { x = x - 1.0; }
+  Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
+  Packet tmp = pand(x, mask);
+  x = psub(x, cst_1);
+  e = psub(e, pand(cst_1, mask));
+  x = padd(x, tmp);
+
+  Packet x2 = pmul(x, x);
+  Packet x3 = pmul(x2, x);
+
+  // Evaluate the polynomial approximant of degree 8 in three parts, probably
+  // to improve instruction-level parallelism.
+  Packet y, y1, y2;
+  y  = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
+  y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
+  y2 = pmadd(cst_cephes_log_p6, x, cst_cephes_log_p7);
+  y  = pmadd(y, x, cst_cephes_log_p2);
+  y1 = pmadd(y1, x, cst_cephes_log_p5);
+  y2 = pmadd(y2, x, cst_cephes_log_p8);
+  y  = pmadd(y, x3, y1);
+  y  = pmadd(y, x3, y2);
+  y  = pmul(y, x3);
+
+  y = pmadd(cst_neg_half, x2, y);
+  x = padd(x, y);
+
+  // Add the logarithm of the exponent back to the result of the interpolation.
+  if (base2) {
+    const Packet cst_log2e = pset1<Packet>(static_cast<float>(EIGEN_LOG2E));
+    x = pmadd(x, cst_log2e, e);
+  } else {
+    const Packet cst_ln2 = pset1<Packet>(static_cast<float>(EIGEN_LN2));
+    x = pmadd(e, cst_ln2, x);
+  }
+
+  Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
+  Packet iszero_mask  = pcmp_eq(_x,pzero(_x));
+  Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf);
+  // Filter out invalid inputs, i.e.:
+  //  - negative arg will be NAN
+  //  - 0 will be -INF
+  //  - +INF will be +INF
+  return pselect(iszero_mask, cst_minus_inf,
+                              por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog_float(const Packet _x)
+{
+  return plog_impl_float<Packet, /* base2 */ false>(_x);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog2_float(const Packet _x)
+{
+  return plog_impl_float<Packet, /* base2 */ true>(_x);
+}
+
+/* Returns the base e (2.718...) or base 2 logarithm of x.
+ * The argument is separated into its exponent and fractional parts.
+ * The logarithm of the fraction in the interval [sqrt(1/2), sqrt(2)],
+ * is approximated by
+ *
+ *     log(1+x) = x - 0.5 x**2 + x**3 P(x)/Q(x).
+ *
+ * for more detail see: http://www.netlib.org/cephes/
+ */
+template <typename Packet, bool base2>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog_impl_double(const Packet _x)
+{
+  Packet x = _x;
+
+  const Packet cst_1              = pset1<Packet>(1.0);
+  const Packet cst_neg_half       = pset1<Packet>(-0.5);
+  // The smallest non denormalized double.
+  const Packet cst_min_norm_pos   = pset1frombits<Packet>( static_cast<uint64_t>(0x0010000000000000ull));
+  const Packet cst_minus_inf      = pset1frombits<Packet>( static_cast<uint64_t>(0xfff0000000000000ull));
+  const Packet cst_pos_inf        = pset1frombits<Packet>( static_cast<uint64_t>(0x7ff0000000000000ull));
+
+
+ // Polynomial Coefficients for log(1+x) = x - x**2/2 + x**3 P(x)/Q(x)
+ //                             1/sqrt(2) <= x < sqrt(2)
+  const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
+  const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
+  const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
+  const Packet cst_cephes_log_p2 = pset1<Packet>(4.70579119878881725854E0);
+  const Packet cst_cephes_log_p3 = pset1<Packet>(1.44989225341610930846E1);
+  const Packet cst_cephes_log_p4 = pset1<Packet>(1.79368678507819816313E1);
+  const Packet cst_cephes_log_p5 = pset1<Packet>(7.70838733755885391666E0);
+
+  const Packet cst_cephes_log_q0 = pset1<Packet>(1.0);
+  const Packet cst_cephes_log_q1 = pset1<Packet>(1.12873587189167450590E1);
+  const Packet cst_cephes_log_q2 = pset1<Packet>(4.52279145837532221105E1);
+  const Packet cst_cephes_log_q3 = pset1<Packet>(8.29875266912776603211E1);
+  const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
+  const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);
+
+  // Truncate input values to the minimum positive normal.
+  x = pmax(x, cst_min_norm_pos);
+
+  Packet e;
+  // extract significant in the range [0.5,1) and exponent
+  x = pfrexp(x,e);
+  
+  // Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
+  // and shift by -1. The values are then centered around 0, which improves
+  // the stability of the polynomial evaluation.
+  //   if( x < SQRTHF ) {
+  //     e -= 1;
+  //     x = x + x - 1.0;
+  //   } else { x = x - 1.0; }
+  Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
+  Packet tmp = pand(x, mask);
+  x = psub(x, cst_1);
+  e = psub(e, pand(cst_1, mask));
+  x = padd(x, tmp);
+
+  Packet x2 = pmul(x, x);
+  Packet x3 = pmul(x2, x);
+
+  // Evaluate the polynomial approximant , probably to improve instruction-level parallelism.
+  // y = x - 0.5*x^2 + x^3 * polevl( x, P, 5 ) / p1evl( x, Q, 5 ) );
+  Packet y, y1, y_;
+  y  = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
+  y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
+  y  = pmadd(y, x, cst_cephes_log_p2);
+  y1 = pmadd(y1, x, cst_cephes_log_p5);
+  y_ = pmadd(y, x3, y1);
+
+  y  = pmadd(cst_cephes_log_q0, x, cst_cephes_log_q1);
+  y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4);
+  y  = pmadd(y, x, cst_cephes_log_q2);
+  y1 = pmadd(y1, x, cst_cephes_log_q5);
+  y  = pmadd(y, x3, y1);
+
+  y_ = pmul(y_, x3);
+  y  = pdiv(y_, y);
+
+  y = pmadd(cst_neg_half, x2, y);
+  x = padd(x, y);
+
+  // Add the logarithm of the exponent back to the result of the interpolation.
+  if (base2) {
+    const Packet cst_log2e = pset1<Packet>(static_cast<double>(EIGEN_LOG2E));
+    x = pmadd(x, cst_log2e, e);
+  } else {
+    const Packet cst_ln2 = pset1<Packet>(static_cast<double>(EIGEN_LN2));
+    x = pmadd(e, cst_ln2, x);
+  }
+
+  Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
+  Packet iszero_mask  = pcmp_eq(_x,pzero(_x));
+  Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf);
+  // Filter out invalid inputs, i.e.:
+  //  - negative arg will be NAN
+  //  - 0 will be -INF
+  //  - +INF will be +INF
+  return pselect(iszero_mask, cst_minus_inf,
+                              por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog_double(const Packet _x)
+{
+  return plog_impl_double<Packet, /* base2 */ false>(_x);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog2_double(const Packet _x)
+{
+  return plog_impl_double<Packet, /* base2 */ true>(_x);
+}
+
+/** \internal \returns log(1 + x) computed using W. Kahan's formula.
+    See: http://www.plunk.org/~hatch/rightway.php
+ */
+template<typename Packet>
+Packet generic_plog1p(const Packet& x)
+{
+  typedef typename unpacket_traits<Packet>::type ScalarType;
+  const Packet one = pset1<Packet>(ScalarType(1));
+  Packet xp1 = padd(x, one);
+  Packet small_mask = pcmp_eq(xp1, one);
+  Packet log1 = plog(xp1);
+  Packet inf_mask = pcmp_eq(xp1, log1);
+  Packet log_large = pmul(x, pdiv(log1, psub(xp1, one)));
+  return pselect(por(small_mask, inf_mask), x, log_large);
+}
+
+/** \internal \returns exp(x)-1 computed using W. Kahan's formula.
+    See: http://www.plunk.org/~hatch/rightway.php
+ */
+template<typename Packet>
+Packet generic_expm1(const Packet& x)
+{
+  typedef typename unpacket_traits<Packet>::type ScalarType;
+  const Packet one = pset1<Packet>(ScalarType(1));
+  const Packet neg_one = pset1<Packet>(ScalarType(-1));
+  Packet u = pexp(x);
+  Packet one_mask = pcmp_eq(u, one);
+  Packet u_minus_one = psub(u, one);
+  Packet neg_one_mask = pcmp_eq(u_minus_one, neg_one);
+  Packet logu = plog(u);
+  // The following comparison is to catch the case where
+  // exp(x) = +inf. It is written in this way to avoid having
+  // to form the constant +inf, which depends on the packet
+  // type.
+  Packet pos_inf_mask = pcmp_eq(logu, u);
+  Packet expm1 = pmul(u_minus_one, pdiv(x, logu));
+  expm1 = pselect(pos_inf_mask, u, expm1);
+  return pselect(one_mask,
+                 x,
+                 pselect(neg_one_mask,
+                         neg_one,
+                         expm1));
+}
+
+
+// Exponential function. Works by writing "x = m*log(2) + r" where
+// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
+// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet pexp_float(const Packet _x)
+{
+  const Packet cst_1      = pset1<Packet>(1.0f);
+  const Packet cst_half   = pset1<Packet>(0.5f);
+  const Packet cst_exp_hi = pset1<Packet>( 88.723f);
+  const Packet cst_exp_lo = pset1<Packet>(-88.723f);
+
+  const Packet cst_cephes_LOG2EF = pset1<Packet>(1.44269504088896341f);
+  const Packet cst_cephes_exp_p0 = pset1<Packet>(1.9875691500E-4f);
+  const Packet cst_cephes_exp_p1 = pset1<Packet>(1.3981999507E-3f);
+  const Packet cst_cephes_exp_p2 = pset1<Packet>(8.3334519073E-3f);
+  const Packet cst_cephes_exp_p3 = pset1<Packet>(4.1665795894E-2f);
+  const Packet cst_cephes_exp_p4 = pset1<Packet>(1.6666665459E-1f);
+  const Packet cst_cephes_exp_p5 = pset1<Packet>(5.0000001201E-1f);
+
+  // Clamp x.
+  Packet x = pmax(pmin(_x, cst_exp_hi), cst_exp_lo);
+
+  // Express exp(x) as exp(m*ln(2) + r), start by extracting
+  // m = floor(x/ln(2) + 0.5).
+  Packet m = pfloor(pmadd(x, cst_cephes_LOG2EF, cst_half));
+
+  // Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
+  // subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
+  // truncation errors.
+  const Packet cst_cephes_exp_C1 = pset1<Packet>(-0.693359375f);
+  const Packet cst_cephes_exp_C2 = pset1<Packet>(2.12194440e-4f);
+  Packet r = pmadd(m, cst_cephes_exp_C1, x);
+  r = pmadd(m, cst_cephes_exp_C2, r);
+
+  Packet r2 = pmul(r, r);
+  Packet r3 = pmul(r2, r);
+
+  // Evaluate the polynomial approximant,improved by instruction-level parallelism.
+  Packet y, y1, y2;
+  y  = pmadd(cst_cephes_exp_p0, r, cst_cephes_exp_p1);
+  y1 = pmadd(cst_cephes_exp_p3, r, cst_cephes_exp_p4);
+  y2 = padd(r, cst_1);
+  y  = pmadd(y, r, cst_cephes_exp_p2);
+  y1 = pmadd(y1, r, cst_cephes_exp_p5);
+  y  = pmadd(y, r3, y1);
+  y  = pmadd(y, r2, y2);
+
+  // Return 2^m * exp(r).
+  // TODO: replace pldexp with faster implementation since y in [-1, 1).
+  return pmax(pldexp(y,m), _x);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet pexp_double(const Packet _x)
+{
+  Packet x = _x;
+
+  const Packet cst_1 = pset1<Packet>(1.0);
+  const Packet cst_2 = pset1<Packet>(2.0);
+  const Packet cst_half = pset1<Packet>(0.5);
+
+  const Packet cst_exp_hi = pset1<Packet>(709.784);
+  const Packet cst_exp_lo = pset1<Packet>(-709.784);
+
+  const Packet cst_cephes_LOG2EF = pset1<Packet>(1.4426950408889634073599);
+  const Packet cst_cephes_exp_p0 = pset1<Packet>(1.26177193074810590878e-4);
+  const Packet cst_cephes_exp_p1 = pset1<Packet>(3.02994407707441961300e-2);
+  const Packet cst_cephes_exp_p2 = pset1<Packet>(9.99999999999999999910e-1);
+  const Packet cst_cephes_exp_q0 = pset1<Packet>(3.00198505138664455042e-6);
+  const Packet cst_cephes_exp_q1 = pset1<Packet>(2.52448340349684104192e-3);
+  const Packet cst_cephes_exp_q2 = pset1<Packet>(2.27265548208155028766e-1);
+  const Packet cst_cephes_exp_q3 = pset1<Packet>(2.00000000000000000009e0);
+  const Packet cst_cephes_exp_C1 = pset1<Packet>(0.693145751953125);
+  const Packet cst_cephes_exp_C2 = pset1<Packet>(1.42860682030941723212e-6);
+
+  Packet tmp, fx;
+
+  // clamp x
+  x = pmax(pmin(x, cst_exp_hi), cst_exp_lo);
+  // Express exp(x) as exp(g + n*log(2)).
+  fx = pmadd(cst_cephes_LOG2EF, x, cst_half);
+
+  // Get the integer modulus of log(2), i.e. the "n" described above.
+  fx = pfloor(fx);
+
+  // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
+  // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
+  // digits right.
+  tmp = pmul(fx, cst_cephes_exp_C1);
+  Packet z = pmul(fx, cst_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  Packet x2 = pmul(x, x);
+
+  // Evaluate the numerator polynomial of the rational interpolant.
+  Packet px = cst_cephes_exp_p0;
+  px = pmadd(px, x2, cst_cephes_exp_p1);
+  px = pmadd(px, x2, cst_cephes_exp_p2);
+  px = pmul(px, x);
+
+  // Evaluate the denominator polynomial of the rational interpolant.
+  Packet qx = cst_cephes_exp_q0;
+  qx = pmadd(qx, x2, cst_cephes_exp_q1);
+  qx = pmadd(qx, x2, cst_cephes_exp_q2);
+  qx = pmadd(qx, x2, cst_cephes_exp_q3);
+
+  // I don't really get this bit, copied from the SSE2 routines, so...
+  // TODO(gonnet): Figure out what is going on here, perhaps find a better
+  // rational interpolant?
+  x = pdiv(px, psub(qx, px));
+  x = pmadd(cst_2, x, cst_1);
+
+  // Construct the result 2^n * exp(g) = e * x. The max is used to catch
+  // non-finite values in the input.
+  // TODO: replace pldexp with faster implementation since x in [-1, 1).
+  return pmax(pldexp(x,fx), _x);
+}
+
+// The following code is inspired by the following stack-overflow answer:
+//   https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
+// It has been largely optimized:
+//  - By-pass calls to frexp.
+//  - Aligned loads of required 96 bits of 2/pi. This is accomplished by
+//    (1) balancing the mantissa and exponent to the required bits of 2/pi are
+//    aligned on 8-bits, and (2) replicating the storage of the bits of 2/pi.
+//  - Avoid a branch in rounding and extraction of the remaining fractional part.
+// Overall, I measured a speed up higher than x2 on x86-64.
+inline float trig_reduce_huge (float xf, int *quadrant)
+{
+  using Eigen::numext::int32_t;
+  using Eigen::numext::uint32_t;
+  using Eigen::numext::int64_t;
+  using Eigen::numext::uint64_t;
+
+  const double pio2_62 = 3.4061215800865545e-19;    // pi/2 * 2^-62
+  const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point foramt
+
+  // 192 bits of 2/pi for Payne-Hanek reduction
+  // Bits are introduced by packet of 8 to enable aligned reads.
+  static const uint32_t two_over_pi [] = 
+  {
+    0x00000028, 0x000028be, 0x0028be60, 0x28be60db,
+    0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a,
+    0x91054a7f, 0x054a7f09, 0x4a7f09d5, 0x7f09d5f4,
+    0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770,
+    0x4d377036, 0x377036d8, 0x7036d8a5, 0x36d8a566,
+    0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410,
+    0x10e41000, 0xe4100000
+  };
+  
+  uint32_t xi = numext::bit_cast<uint32_t>(xf);
+  // Below, -118 = -126 + 8.
+  //   -126 is to get the exponent,
+  //   +8 is to enable alignment of 2/pi's bits on 8 bits.
+  // This is possible because the fractional part of x as only 24 meaningful bits.
+  uint32_t e = (xi >> 23) - 118;
+  // Extract the mantissa and shift it to align it wrt the exponent
+  xi = ((xi & 0x007fffffu)| 0x00800000u) << (e & 0x7);
+
+  uint32_t i = e >> 3;
+  uint32_t twoopi_1  = two_over_pi[i-1];
+  uint32_t twoopi_2  = two_over_pi[i+3];
+  uint32_t twoopi_3  = two_over_pi[i+7];
+
+  // Compute x * 2/pi in 2.62-bit fixed-point format.
+  uint64_t p;
+  p = uint64_t(xi) * twoopi_3;
+  p = uint64_t(xi) * twoopi_2 + (p >> 32);
+  p = (uint64_t(xi * twoopi_1) << 32) + p;
+
+  // Round to nearest: add 0.5 and extract integral part.
+  uint64_t q = (p + zero_dot_five) >> 62;
+  *quadrant = int(q);
+  // Now it remains to compute "r = x - q*pi/2" with high accuracy,
+  // since we have p=x/(pi/2) with high accuracy, we can more efficiently compute r as:
+  //   r = (p-q)*pi/2,
+  // where the product can be be carried out with sufficient accuracy using double precision.
+  p -= q<<62;
+  return float(double(int64_t(p)) * pio2_62);
+}
+
+template<bool ComputeSine,typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+#if EIGEN_GNUC_AT_LEAST(4,4) && EIGEN_COMP_GNUC_STRICT
+__attribute__((optimize("-fno-unsafe-math-optimizations")))
+#endif
+Packet psincos_float(const Packet& _x)
+{
+  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+
+  const Packet  cst_2oPI            = pset1<Packet>(0.636619746685028076171875f); // 2/PI
+  const Packet  cst_rounding_magic  = pset1<Packet>(12582912); // 2^23 for rounding
+  const PacketI csti_1              = pset1<PacketI>(1);
+  const Packet  cst_sign_mask       = pset1frombits<Packet>(0x80000000u);
+
+  Packet x = pabs(_x);
+
+  // Scale x by 2/Pi to find x's octant.
+  Packet y = pmul(x, cst_2oPI);
+
+  // Rounding trick:
+  Packet y_round = padd(y, cst_rounding_magic);
+  EIGEN_OPTIMIZATION_BARRIER(y_round)
+  PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24)
+  y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi
+
+  // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4
+  // using "Extended precision modular arithmetic"
+  #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
+  // This version requires true FMA for high accuracy
+  // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
+  const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
+  x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
+  x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
+  x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
+  #else
+  // Without true FMA, the previous set of coefficients maintain 1ULP accuracy
+  // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7.
+  // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs.
+
+  // The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively.
+  // and 2 ULP up to:
+  const float huge_th = ComputeSine ? 25966.f : 18838.f;
+  x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
+  EIGEN_OPTIMIZATION_BARRIER(x)
+  x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
+  EIGEN_OPTIMIZATION_BARRIER(x)
+  x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x); // = 0x342ee000
+  x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee
+
+  // For the record, the following set of coefficients maintain 2ULP up
+  // to a slightly larger range:
+  // const float huge_th = ComputeSine ? 51981.f : 39086.125f;
+  // but it slightly fails to maintain 1ULP for two values of sin below pi.
+  // x = pmadd(y, pset1<Packet>(-3.140625/2.), x);
+  // x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x);
+  // x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x);
+  // x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x);
+
+  // For the record, with only 3 iterations it is possible to maintain
+  // 1 ULP up to 3PI (maybe more) and 2ULP up to 255.
+  // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee
+  #endif
+
+  if(predux_any(pcmp_le(pset1<Packet>(huge_th),pabs(_x))))
+  {
+    const int PacketSize = unpacket_traits<Packet>::size;
+    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize];
+    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float x_cpy[PacketSize];
+    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) int y_int2[PacketSize];
+    pstoreu(vals, pabs(_x));
+    pstoreu(x_cpy, x);
+    pstoreu(y_int2, y_int);
+    for(int k=0; k<PacketSize;++k)
+    {
+      float val = vals[k];
+      if(val>=huge_th && (numext::isfinite)(val))
+        x_cpy[k] = trig_reduce_huge(val,&y_int2[k]);
+    }
+    x = ploadu<Packet>(x_cpy);
+    y_int = ploadu<PacketI>(y_int2);
+  }
+
+  // Compute the sign to apply to the polynomial.
+  // sin: sign = second_bit(y_int) xor signbit(_x)
+  // cos: sign = second_bit(y_int+1)
+  Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)))
+                                : preinterpret<Packet>(plogical_shift_left<30>(padd(y_int,csti_1)));
+  sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
+
+  // Get the polynomial selection mask from the second bit of y_int
+  // We'll calculate both (sin and cos) polynomials and then select from the two.
+  Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));
+
+  Packet x2 = pmul(x,x);
+
+  // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4)
+  Packet y1 =        pset1<Packet>(2.4372266125283204019069671630859375e-05f);
+  y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f     ));
+  y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f           ));
+  y1 = pmadd(y1, x2, pset1<Packet>(-0.5f));
+  y1 = pmadd(y1, x2, pset1<Packet>(1.f));
+
+  // Evaluate the sin(x) polynomial. (Pi/4 <= x <= Pi/4)
+  // octave/matlab code to compute those coefficients:
+  //    x = (0:0.0001:pi/4)';
+  //    A = [x.^3 x.^5 x.^7];
+  //    w = ((1.-(x/(pi/4)).^2).^5)*2000+1;         # weights trading relative accuracy
+  //    c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1
+  //    printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1))
+  //
+  Packet y2 =        pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f);
+  y2 = pmadd(y2, x2, pset1<Packet>( 0.0083326873655616851693794799871284340042620897293090820312500000f));
+  y2 = pmadd(y2, x2, pset1<Packet>(-0.1666666203982298255503735617821803316473960876464843750000000000f));
+  y2 = pmul(y2, x2);
+  y2 = pmadd(y2, x, x);
+
+  // Select the correct result from the two polynomials.
+  y = ComputeSine ? pselect(poly_mask,y2,y1)
+                  : pselect(poly_mask,y1,y2);
+
+  // Update the sign and filter huge inputs
+  return pxor(y, sign_bit);
+}
+
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet psin_float(const Packet& x)
+{
+  return psincos_float<true>(x);
+}
+
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet pcos_float(const Packet& x)
+{
+  return psincos_float<false>(x);
+}
+
+
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet psqrt_complex(const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename Scalar::value_type RealScalar;
+  typedef typename unpacket_traits<Packet>::as_real RealPacket;
+
+  // Computes the principal sqrt of the complex numbers in the input.
+  //
+  // For example, for packets containing 2 complex numbers stored in interleaved format
+  //    a = [a0, a1] = [x0, y0, x1, y1],
+  // where x0 = real(a0), y0 = imag(a0) etc., this function returns
+  //    b = [b0, b1] = [u0, v0, u1, v1],
+  // such that b0^2 = a0, b1^2 = a1.
+  //
+  // To derive the formula for the complex square roots, let's consider the equation for
+  // a single complex square root of the number x + i*y. We want to find real numbers
+  // u and v such that
+  //    (u + i*v)^2 = x + i*y  <=>
+  //    u^2 - v^2 + i*2*u*v = x + i*v.
+  // By equating the real and imaginary parts we get:
+  //    u^2 - v^2 = x
+  //    2*u*v = y.
+  //
+  // For x >= 0, this has the numerically stable solution
+  //    u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
+  //    v = 0.5 * (y / u)
+  // and for x < 0,
+  //    v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2)))
+  //    u = 0.5 * (y / v)
+  //
+  //  To avoid unnecessary over- and underflow, we compute sqrt(x^2 + y^2) as
+  //     l = max(|x|, |y|) * sqrt(1 + (min(|x|, |y|) / max(|x|, |y|))^2) ,
+
+  // In the following, without lack of generality, we have annotated the code, assuming
+  // that the input is a packet of 2 complex numbers.
+  //
+  // Step 1. Compute l = [l0, l0, l1, l1], where
+  //    l0 = sqrt(x0^2 + y0^2),  l1 = sqrt(x1^2 + y1^2)
+  // To avoid over- and underflow, we use the stable formula for each hypotenuse
+  //    l0 = (min0 == 0 ? max0 : max0 * sqrt(1 + (min0/max0)**2)),
+  // where max0 = max(|x0|, |y0|), min0 = min(|x0|, |y0|), and similarly for l1.
+
+  RealPacket a_abs = pabs(a.v);           // [|x0|, |y0|, |x1|, |y1|]
+  RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v; // [|y0|, |x0|, |y1|, |x1|]
+  RealPacket a_max = pmax(a_abs, a_abs_flip);
+  RealPacket a_min = pmin(a_abs, a_abs_flip);
+  RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min));
+  RealPacket a_max_zero_mask = pcmp_eq(a_max, pzero(a_max));
+  RealPacket r = pdiv(a_min, a_max);
+  const RealPacket cst_one  = pset1<RealPacket>(RealScalar(1));
+  RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r))));  // [l0, l0, l1, l1]
+  // Set l to a_max if a_min is zero.
+  l = pselect(a_min_zero_mask, a_max, l);
+
+  // Step 2. Compute [rho0, *, rho1, *], where
+  // rho0 = sqrt(0.5 * (l0 + |x0|)), rho1 =  sqrt(0.5 * (l1 + |x1|))
+  // We don't care about the imaginary parts computed here. They will be overwritten later.
+  const RealPacket cst_half = pset1<RealPacket>(RealScalar(0.5));
+  Packet rho;
+  rho.v = psqrt(pmul(cst_half, padd(a_abs, l)));
+
+  // Step 3. Compute [rho0, eta0, rho1, eta1], where
+  // eta0 = (y0 / l0) / 2, and eta1 = (y1 / l1) / 2.
+  // set eta = 0 of input is 0 + i0.
+  RealPacket eta = pandnot(pmul(cst_half, pdiv(a.v, pcplxflip(rho).v)), a_max_zero_mask);
+  RealPacket real_mask = peven_mask(a.v);
+  Packet positive_real_result;
+  // Compute result for inputs with positive real part.
+  positive_real_result.v = pselect(real_mask, rho.v, eta);
+
+  // Step 4. Compute solution for inputs with negative real part:
+  //         [|eta0|, sign(y0)*rho0, |eta1|, sign(y1)*rho1]
+  const RealScalar neg_zero = RealScalar(numext::bit_cast<float>(0x80000000u));
+  const RealPacket cst_imag_sign_mask = pset1<Packet>(Scalar(RealScalar(0.0), neg_zero)).v;
+  RealPacket imag_signs = pand(a.v, cst_imag_sign_mask);
+  Packet negative_real_result;
+  // Notice that rho is positive, so taking it's absolute value is a noop.
+  negative_real_result.v = por(pabs(pcplxflip(positive_real_result).v), imag_signs);
+
+  // Step 5. Select solution branch based on the sign of the real parts.
+  Packet negative_real_mask;
+  negative_real_mask.v = pcmp_lt(pand(real_mask, a.v), pzero(a.v));
+  negative_real_mask.v = por(negative_real_mask.v, pcplxflip(negative_real_mask).v);
+  Packet result = pselect(negative_real_mask, negative_real_result, positive_real_result);
+
+  // Step 6. Handle special cases for infinities:
+  // * If z is (x,+∞), the result is (+∞,+∞) even if x is NaN
+  // * If z is (x,-∞), the result is (+∞,-∞) even if x is NaN
+  // * If z is (-∞,y), the result is (0*|y|,+∞) for finite or NaN y
+  // * If z is (+∞,y), the result is (+∞,0*|y|) for finite or NaN y
+  const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());
+  Packet is_inf;
+  is_inf.v = pcmp_eq(a_abs, cst_pos_inf);
+  Packet is_real_inf;
+  is_real_inf.v = pand(is_inf.v, real_mask);
+  is_real_inf = por(is_real_inf, pcplxflip(is_real_inf));
+  // prepare packet of (+∞,0*|y|) or (0*|y|,+∞), depending on the sign of the infinite real part.
+  Packet real_inf_result;
+  real_inf_result.v = pmul(a_abs, pset1<Packet>(Scalar(RealScalar(1.0), RealScalar(0.0))).v);
+  real_inf_result.v = pselect(negative_real_mask.v, pcplxflip(real_inf_result).v, real_inf_result.v);
+  // prepare packet of (+∞,+∞) or (+∞,-∞), depending on the sign of the infinite imaginary part.
+  Packet is_imag_inf;
+  is_imag_inf.v = pandnot(is_inf.v, real_mask);
+  is_imag_inf = por(is_imag_inf, pcplxflip(is_imag_inf));
+  Packet imag_inf_result;
+  imag_inf_result.v = por(pand(cst_pos_inf, real_mask), pandnot(a.v, real_mask));
+
+  return  pselect(is_imag_inf, imag_inf_result,
+                  pselect(is_real_inf, real_inf_result,result));
+}
+
+// TODO(rmlarsen): The following set of utilities for double word arithmetic
+// should perhaps be refactored as a separate file, since it would be generally
+// useful for special function implementation etc. Writing the algorithms in
+// terms if a double word type would also make the code more readable.
+
+// This function splits x into the nearest integer n and fractional part r,
+// such that x = n + r holds exactly.
+template<typename Packet>
+EIGEN_STRONG_INLINE
+void absolute_split(const Packet& x, Packet& n, Packet& r) {
+  n = pround(x);
+  r = psub(x, n);
+}
+
+// This function computes the sum {s, r}, such that x + y = s_hi + s_lo
+// holds exactly, and s_hi = fl(x+y), if |x| >= |y|.
+template<typename Packet>
+EIGEN_STRONG_INLINE
+void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) {
+  s_hi = padd(x, y);
+  const Packet t = psub(s_hi, x);
+  s_lo = psub(y, t);
+}
+
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+// This function implements the extended precision product of
+// a pair of floating point numbers. Given {x, y}, it computes the pair
+// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
+// p_hi = fl(x * y).
+template<typename Packet>
+EIGEN_STRONG_INLINE
+void twoprod(const Packet& x, const Packet& y,
+             Packet& p_hi, Packet& p_lo) {
+  p_hi = pmul(x, y);
+  p_lo = pmadd(x, y, pnegate(p_hi));
+}
+
+#else
+
+// This function implements the Veltkamp splitting. Given a floating point
+// number x it returns the pair {x_hi, x_lo} such that x_hi + x_lo = x holds
+// exactly and that half of the significant of x fits in x_hi.
+// This is Algorithm 3 from Jean-Michel Muller, "Elementary Functions",
+// 3rd edition, Birkh\"auser, 2016.
+template<typename Packet>
+EIGEN_STRONG_INLINE
+void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  EIGEN_CONSTEXPR int shift = (NumTraits<Scalar>::digits() + 1) / 2;
+  const Scalar shift_scale = Scalar(uint64_t(1) << shift);  // Scalar constructor not necessarily constexpr.
+  const Packet gamma = pmul(pset1<Packet>(shift_scale + Scalar(1)), x);
+  Packet rho = psub(x, gamma);
+  x_hi = padd(rho, gamma);
+  x_lo = psub(x, x_hi);
+}
+
+// This function implements Dekker's algorithm for products x * y.
+// Given floating point numbers {x, y} computes the pair
+// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
+// p_hi = fl(x * y).
+template<typename Packet>
+EIGEN_STRONG_INLINE
+void twoprod(const Packet& x, const Packet& y,
+             Packet& p_hi, Packet& p_lo) {
+  Packet x_hi, x_lo, y_hi, y_lo;
+  veltkamp_splitting(x, x_hi, x_lo);
+  veltkamp_splitting(y, y_hi, y_lo);
+
+  p_hi = pmul(x, y);
+  p_lo = pmadd(x_hi, y_hi, pnegate(p_hi));
+  p_lo = pmadd(x_hi, y_lo, p_lo);
+  p_lo = pmadd(x_lo, y_hi, p_lo);
+  p_lo = pmadd(x_lo, y_lo, p_lo);
+}
+
+#endif  // EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+
+
+// This function implements Dekker's algorithm for the addition
+// of two double word numbers represented by {x_hi, x_lo} and {y_hi, y_lo}.
+// It returns the result as a pair {s_hi, s_lo} such that
+// x_hi + x_lo + y_hi + y_lo = s_hi + s_lo holds exactly.
+// This is Algorithm 5 from Jean-Michel Muller, "Elementary Functions",
+// 3rd edition, Birkh\"auser, 2016.
+template<typename Packet>
+EIGEN_STRONG_INLINE
+  void twosum(const Packet& x_hi, const Packet& x_lo,
+              const Packet& y_hi, const Packet& y_lo,
+              Packet& s_hi, Packet& s_lo) {
+  const Packet x_greater_mask = pcmp_lt(pabs(y_hi), pabs(x_hi));
+  Packet r_hi_1, r_lo_1;
+  fast_twosum(x_hi, y_hi,r_hi_1, r_lo_1);
+  Packet r_hi_2, r_lo_2;
+  fast_twosum(y_hi, x_hi,r_hi_2, r_lo_2);
+  const Packet r_hi = pselect(x_greater_mask, r_hi_1, r_hi_2);
+
+  const Packet s1 = padd(padd(y_lo, r_lo_1), x_lo);
+  const Packet s2 = padd(padd(x_lo, r_lo_2), y_lo);
+  const Packet s = pselect(x_greater_mask, s1, s2);
+
+  fast_twosum(r_hi, s, s_hi, s_lo);
+}
+
+// This is a version of twosum for double word numbers,
+// which assumes that |x_hi| >= |y_hi|.
+template<typename Packet>
+EIGEN_STRONG_INLINE
+  void fast_twosum(const Packet& x_hi, const Packet& x_lo,
+              const Packet& y_hi, const Packet& y_lo,
+              Packet& s_hi, Packet& s_lo) {
+  Packet r_hi, r_lo;
+  fast_twosum(x_hi, y_hi, r_hi, r_lo);
+  const Packet s = padd(padd(y_lo, r_lo), x_lo);
+  fast_twosum(r_hi, s, s_hi, s_lo);
+}
+
+// This is a version of twosum for adding a floating point number x to
+// double word number {y_hi, y_lo} number, with the assumption
+// that |x| >= |y_hi|.
+template<typename Packet>
+EIGEN_STRONG_INLINE
+void fast_twosum(const Packet& x,
+                 const Packet& y_hi, const Packet& y_lo,
+                 Packet& s_hi, Packet& s_lo) {
+  Packet r_hi, r_lo;
+  fast_twosum(x, y_hi, r_hi, r_lo);
+  const Packet s = padd(y_lo, r_lo);
+  fast_twosum(r_hi, s, s_hi, s_lo);
+}
+
+// This function implements the multiplication of a double word
+// number represented by {x_hi, x_lo} by a floating point number y.
+// It returns the result as a pair {p_hi, p_lo} such that
+// (x_hi + x_lo) * y = p_hi + p_lo hold with a relative error
+// of less than 2*2^{-2p}, where p is the number of significand bit
+// in the floating point type.
+// This is Algorithm 7 from Jean-Michel Muller, "Elementary Functions",
+// 3rd edition, Birkh\"auser, 2016.
+template<typename Packet>
+EIGEN_STRONG_INLINE
+void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y,
+             Packet& p_hi, Packet& p_lo) {
+  Packet c_hi, c_lo1;
+  twoprod(x_hi, y, c_hi, c_lo1);
+  const Packet c_lo2 = pmul(x_lo, y);
+  Packet t_hi, t_lo1;
+  fast_twosum(c_hi, c_lo2, t_hi, t_lo1);
+  const Packet t_lo2 = padd(t_lo1, c_lo1);
+  fast_twosum(t_hi, t_lo2, p_hi, p_lo);
+}
+
+// This function implements the multiplication of two double word
+// numbers represented by {x_hi, x_lo} and {y_hi, y_lo}.
+// It returns the result as a pair {p_hi, p_lo} such that
+// (x_hi + x_lo) * (y_hi + y_lo) = p_hi + p_lo holds with a relative error
+// of less than 2*2^{-2p}, where p is the number of significand bit
+// in the floating point type.
+template<typename Packet>
+EIGEN_STRONG_INLINE
+void twoprod(const Packet& x_hi, const Packet& x_lo,
+             const Packet& y_hi, const Packet& y_lo,
+             Packet& p_hi, Packet& p_lo) {
+  Packet p_hi_hi, p_hi_lo;
+  twoprod(x_hi, x_lo, y_hi, p_hi_hi, p_hi_lo);
+  Packet p_lo_hi, p_lo_lo;
+  twoprod(x_hi, x_lo, y_lo, p_lo_hi, p_lo_lo);
+  fast_twosum(p_hi_hi, p_hi_lo, p_lo_hi, p_lo_lo, p_hi, p_lo);
+}
+
+// This function computes the reciprocal of a floating point number
+// with extra precision and returns the result as a double word.
+template <typename Packet>
+void doubleword_reciprocal(const Packet& x, Packet& recip_hi, Packet& recip_lo) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  // 1. Approximate the reciprocal as the reciprocal of the high order element.
+  Packet approx_recip = prsqrt(x);
+  approx_recip = pmul(approx_recip, approx_recip);
+
+  // 2. Run one step of Newton-Raphson iteration in double word arithmetic
+  // to get the bottom half. The NR iteration for reciprocal of 'a' is
+  //    x_{i+1} = x_i * (2 - a * x_i)
+
+  // -a*x_i
+  Packet t1_hi, t1_lo;
+  twoprod(pnegate(x), approx_recip, t1_hi, t1_lo);
+  // 2 - a*x_i
+  Packet t2_hi, t2_lo;
+  fast_twosum(pset1<Packet>(Scalar(2)), t1_hi, t2_hi, t2_lo);
+  Packet t3_hi, t3_lo;
+  fast_twosum(t2_hi, padd(t2_lo, t1_lo), t3_hi, t3_lo);
+  // x_i * (2 - a * x_i)
+  twoprod(t3_hi, t3_lo, approx_recip, recip_hi, recip_lo);
+}
+
+
+// This function computes log2(x) and returns the result as a double word.
+template <typename Scalar>
+struct accurate_log2 {
+  template <typename Packet>
+  EIGEN_STRONG_INLINE
+  void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
+    log2_x_hi = plog2(x);
+    log2_x_lo = pzero(x);
+  }
+};
+
+// This specialization uses a more accurate algorithm to compute log2(x) for
+// floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~6.42e-10.
+// This additional accuracy is needed to counter the error-magnification
+// inherent in multiplying by a potentially large exponent in pow(x,y).
+// The minimax polynomial used was calculated using the Sollya tool.
+// See sollya.org.
+template <>
+struct accurate_log2<float> {
+  template <typename Packet>
+  EIGEN_STRONG_INLINE
+  void operator()(const Packet& z, Packet& log2_x_hi, Packet& log2_x_lo) {
+    // The function log(1+x)/x is approximated in the interval
+    // [1/sqrt(2)-1;sqrt(2)-1] by a degree 10 polynomial of the form
+    //  Q(x) = (C0 + x * (C1 + x * (C2 + x * (C3 + x * P(x))))),
+    // where the degree 6 polynomial P(x) is evaluated in single precision,
+    // while the remaining 4 terms of Q(x), as well as the final multiplication by x
+    // to reconstruct log(1+x) are evaluated in extra precision using
+    // double word arithmetic. C0 through C3 are extra precise constants
+    // stored as double words.
+    //
+    // The polynomial coefficients were calculated using Sollya commands:
+    // > n = 10;
+    // > f = log2(1+x)/x;
+    // > interval = [sqrt(0.5)-1;sqrt(2)-1];
+    // > p = fpminimax(f,n,[|double,double,double,double,single...|],interval,relative,floating);
+    
+    const Packet p6 = pset1<Packet>( 9.703654795885e-2f);
+    const Packet p5 = pset1<Packet>(-0.1690667718648f);
+    const Packet p4 = pset1<Packet>( 0.1720575392246f);
+    const Packet p3 = pset1<Packet>(-0.1789081543684f);
+    const Packet p2 = pset1<Packet>( 0.2050433009862f);
+    const Packet p1 = pset1<Packet>(-0.2404672354459f);
+    const Packet p0 = pset1<Packet>( 0.2885761857032f);
+
+    const Packet C3_hi = pset1<Packet>(-0.360674142838f);
+    const Packet C3_lo = pset1<Packet>(-6.13283912543e-09f);
+    const Packet C2_hi = pset1<Packet>(0.480897903442f);
+    const Packet C2_lo = pset1<Packet>(-1.44861207474e-08f);
+    const Packet C1_hi = pset1<Packet>(-0.721347510815f);
+    const Packet C1_lo = pset1<Packet>(-4.84483164698e-09f);
+    const Packet C0_hi = pset1<Packet>(1.44269502163f);
+    const Packet C0_lo = pset1<Packet>(2.01711713999e-08f);
+    const Packet one = pset1<Packet>(1.0f);
+
+    const Packet x = psub(z, one);
+    // Evaluate P(x) in working precision.
+    // We evaluate it in multiple parts to improve instruction level
+    // parallelism.
+    Packet x2 = pmul(x,x);
+    Packet p_even = pmadd(p6, x2, p4);
+    p_even = pmadd(p_even, x2, p2);
+    p_even = pmadd(p_even, x2, p0);
+    Packet p_odd = pmadd(p5, x2, p3);
+    p_odd = pmadd(p_odd, x2, p1);
+    Packet p = pmadd(p_odd, x, p_even);
+
+    // Now evaluate the low-order tems of Q(x) in double word precision.
+    // In the following, due to the alternating signs and the fact that
+    // |x| < sqrt(2)-1, we can assume that |C*_hi| >= q_i, and use
+    // fast_twosum instead of the slower twosum.
+    Packet q_hi, q_lo;
+    Packet t_hi, t_lo;
+    // C3 + x * p(x)
+    twoprod(p, x, t_hi, t_lo);
+    fast_twosum(C3_hi, C3_lo, t_hi, t_lo, q_hi, q_lo);
+    // C2 + x * p(x)
+    twoprod(q_hi, q_lo, x, t_hi, t_lo);
+    fast_twosum(C2_hi, C2_lo, t_hi, t_lo, q_hi, q_lo);
+    // C1 + x * p(x)
+    twoprod(q_hi, q_lo, x, t_hi, t_lo);
+    fast_twosum(C1_hi, C1_lo, t_hi, t_lo, q_hi, q_lo);
+    // C0 + x * p(x)
+    twoprod(q_hi, q_lo, x, t_hi, t_lo);
+    fast_twosum(C0_hi, C0_lo, t_hi, t_lo, q_hi, q_lo);
+
+    // log(z) ~= x * Q(x)
+    twoprod(q_hi, q_lo, x, log2_x_hi, log2_x_lo);
+  }
+};
+
+// This specialization uses a more accurate algorithm to compute log2(x) for
+// floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~1.27e-18.
+// This additional accuracy is needed to counter the error-magnification
+// inherent in multiplying by a potentially large exponent in pow(x,y).
+// The minimax polynomial used was calculated using the Sollya tool.
+// See sollya.org.
+
+template <>
+struct accurate_log2<double> {
+  template <typename Packet>
+  EIGEN_STRONG_INLINE
+  void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
+    // We use a transformation of variables:
+    //    r = c * (x-1) / (x+1),
+    // such that
+    //    log2(x) = log2((1 + r/c) / (1 - r/c)) = f(r).
+    // The function f(r) can be approximated well using an odd polynomial
+    // of the form
+    //   P(r) = ((Q(r^2) * r^2 + C) * r^2 + 1) * r,
+    // For the implementation of log2<double> here, Q is of degree 6 with
+    // coefficient represented in working precision (double), while C is a
+    // constant represented in extra precision as a double word to achieve
+    // full accuracy.
+    //
+    // The polynomial coefficients were computed by the Sollya script:
+    //
+    // c = 2 / log(2);
+    // trans = c * (x-1)/(x+1);
+    // itrans = (1+x/c)/(1-x/c);
+    // interval=[trans(sqrt(0.5)); trans(sqrt(2))];
+    // print(interval);
+    // f = log2(itrans(x));
+    // p=fpminimax(f,[|1,3,5,7,9,11,13,15,17|],[|1,DD,double...|],interval,relative,floating);
+    const Packet q12 = pset1<Packet>(2.87074255468000586e-9);
+    const Packet q10 = pset1<Packet>(2.38957980901884082e-8);
+    const Packet q8 = pset1<Packet>(2.31032094540014656e-7);
+    const Packet q6 = pset1<Packet>(2.27279857398537278e-6);
+    const Packet q4 = pset1<Packet>(2.31271023278625638e-5);
+    const Packet q2 = pset1<Packet>(2.47556738444535513e-4);
+    const Packet q0 = pset1<Packet>(2.88543873228900172e-3);
+    const Packet C_hi = pset1<Packet>(0.0400377511598501157);
+    const Packet C_lo = pset1<Packet>(-4.77726582251425391e-19);
+    const Packet one = pset1<Packet>(1.0);
+
+    const Packet cst_2_log2e_hi = pset1<Packet>(2.88539008177792677);
+    const Packet cst_2_log2e_lo = pset1<Packet>(4.07660016854549667e-17);
+    // c * (x - 1)
+    Packet num_hi, num_lo;
+    twoprod(cst_2_log2e_hi, cst_2_log2e_lo, psub(x, one), num_hi, num_lo);
+    // TODO(rmlarsen): Investigate if using the division algorithm by
+    // Muller et al. is faster/more accurate.
+    // 1 / (x + 1)
+    Packet denom_hi, denom_lo;
+    doubleword_reciprocal(padd(x, one), denom_hi, denom_lo);
+    // r =  c * (x-1) / (x+1),
+    Packet r_hi, r_lo;
+    twoprod(num_hi, num_lo, denom_hi, denom_lo, r_hi, r_lo);
+    // r2 = r * r
+    Packet r2_hi, r2_lo;
+    twoprod(r_hi, r_lo, r_hi, r_lo, r2_hi, r2_lo);
+    // r4 = r2 * r2
+    Packet r4_hi, r4_lo;
+    twoprod(r2_hi, r2_lo, r2_hi, r2_lo, r4_hi, r4_lo);
+
+    // Evaluate Q(r^2) in working precision. We evaluate it in two parts
+    // (even and odd in r^2) to improve instruction level parallelism.
+    Packet q_even = pmadd(q12, r4_hi, q8);
+    Packet q_odd = pmadd(q10, r4_hi, q6);
+    q_even = pmadd(q_even, r4_hi, q4);
+    q_odd = pmadd(q_odd, r4_hi, q2);
+    q_even = pmadd(q_even, r4_hi, q0);
+    Packet q = pmadd(q_odd, r2_hi, q_even);
+
+    // Now evaluate the low order terms of P(x) in double word precision.
+    // In the following, due to the increasing magnitude of the coefficients
+    // and r being constrained to [-0.5, 0.5] we can use fast_twosum instead
+    // of the slower twosum.
+    // Q(r^2) * r^2
+    Packet p_hi, p_lo;
+    twoprod(r2_hi, r2_lo, q, p_hi, p_lo);
+    // Q(r^2) * r^2 + C
+    Packet p1_hi, p1_lo;
+    fast_twosum(C_hi, C_lo, p_hi, p_lo, p1_hi, p1_lo);
+    // (Q(r^2) * r^2 + C) * r^2
+    Packet p2_hi, p2_lo;
+    twoprod(r2_hi, r2_lo, p1_hi, p1_lo, p2_hi, p2_lo);
+    // ((Q(r^2) * r^2 + C) * r^2 + 1)
+    Packet p3_hi, p3_lo;
+    fast_twosum(one, p2_hi, p2_lo, p3_hi, p3_lo);
+
+    // log(z) ~= ((Q(r^2) * r^2 + C) * r^2 + 1) * r
+    twoprod(p3_hi, p3_lo, r_hi, r_lo, log2_x_hi, log2_x_lo);
+  }
+};
+
+// This function computes exp2(x) (i.e. 2**x).
+template <typename Scalar>
+struct fast_accurate_exp2 {
+  template <typename Packet>
+  EIGEN_STRONG_INLINE
+  Packet operator()(const Packet& x) {
+    // TODO(rmlarsen): Add a pexp2 packetop.
+    return pexp(pmul(pset1<Packet>(Scalar(EIGEN_LN2)), x));
+  }
+};
+
+// This specialization uses a faster algorithm to compute exp2(x) for floats
+// in [-0.5;0.5] with a relative accuracy of 1 ulp.
+// The minimax polynomial used was calculated using the Sollya tool.
+// See sollya.org.
+template <>
+struct fast_accurate_exp2<float> {
+  template <typename Packet>
+  EIGEN_STRONG_INLINE
+  Packet operator()(const Packet& x) {
+    // This function approximates exp2(x) by a degree 6 polynomial of the form
+    // Q(x) = 1 + x * (C + x * P(x)), where the degree 4 polynomial P(x) is evaluated in
+    // single precision, and the remaining steps are evaluated with extra precision using
+    // double word arithmetic. C is an extra precise constant stored as a double word.
+    //
+    // The polynomial coefficients were calculated using Sollya commands:
+    // > n = 6;
+    // > f = 2^x;
+    // > interval = [-0.5;0.5];
+    // > p = fpminimax(f,n,[|1,double,single...|],interval,relative,floating);
+
+    const Packet p4 = pset1<Packet>(1.539513905e-4f);
+    const Packet p3 = pset1<Packet>(1.340007293e-3f);
+    const Packet p2 = pset1<Packet>(9.618283249e-3f);
+    const Packet p1 = pset1<Packet>(5.550328270e-2f);
+    const Packet p0 = pset1<Packet>(0.2402264923f);
+
+    const Packet C_hi = pset1<Packet>(0.6931471825f);
+    const Packet C_lo = pset1<Packet>(2.36836577e-08f);
+    const Packet one = pset1<Packet>(1.0f);
+
+    // Evaluate P(x) in working precision.
+    // We evaluate even and odd parts of the polynomial separately
+    // to gain some instruction level parallelism.
+    Packet x2 = pmul(x,x);
+    Packet p_even = pmadd(p4, x2, p2);
+    Packet p_odd = pmadd(p3, x2, p1);
+    p_even = pmadd(p_even, x2, p0);
+    Packet p = pmadd(p_odd, x, p_even);
+
+    // Evaluate the remaining terms of Q(x) with extra precision using
+    // double word arithmetic.
+    Packet p_hi, p_lo;
+    // x * p(x)
+    twoprod(p, x, p_hi, p_lo);
+    // C + x * p(x)
+    Packet q1_hi, q1_lo;
+    twosum(p_hi, p_lo, C_hi, C_lo, q1_hi, q1_lo);
+    // x * (C + x * p(x))
+    Packet q2_hi, q2_lo;
+    twoprod(q1_hi, q1_lo, x, q2_hi, q2_lo);
+    // 1 + x * (C + x * p(x))
+    Packet q3_hi, q3_lo;
+    // Since |q2_hi| <= sqrt(2)-1 < 1, we can use fast_twosum
+    // for adding it to unity here.
+    fast_twosum(one, q2_hi, q3_hi, q3_lo);
+    return padd(q3_hi, padd(q2_lo, q3_lo));
+  }
+};
+
+// in [-0.5;0.5] with a relative accuracy of 1 ulp.
+// The minimax polynomial used was calculated using the Sollya tool.
+// See sollya.org.
+template <>
+struct fast_accurate_exp2<double> {
+  template <typename Packet>
+  EIGEN_STRONG_INLINE
+  Packet operator()(const Packet& x) {
+    // This function approximates exp2(x) by a degree 10 polynomial of the form
+    // Q(x) = 1 + x * (C + x * P(x)), where the degree 8 polynomial P(x) is evaluated in
+    // single precision, and the remaining steps are evaluated with extra precision using
+    // double word arithmetic. C is an extra precise constant stored as a double word.
+    //
+    // The polynomial coefficients were calculated using Sollya commands:
+    // > n = 11;
+    // > f = 2^x;
+    // > interval = [-0.5;0.5];
+    // > p = fpminimax(f,n,[|1,DD,double...|],interval,relative,floating);
+
+    const Packet p9 = pset1<Packet>(4.431642109085495276e-10);
+    const Packet p8 = pset1<Packet>(7.073829923303358410e-9);
+    const Packet p7 = pset1<Packet>(1.017822306737031311e-7);
+    const Packet p6 = pset1<Packet>(1.321543498017646657e-6);
+    const Packet p5 = pset1<Packet>(1.525273342728892877e-5);
+    const Packet p4 = pset1<Packet>(1.540353045780084423e-4);
+    const Packet p3 = pset1<Packet>(1.333355814685869807e-3);
+    const Packet p2 = pset1<Packet>(9.618129107593478832e-3);
+    const Packet p1 = pset1<Packet>(5.550410866481961247e-2);
+    const Packet p0 = pset1<Packet>(0.240226506959101332);
+    const Packet C_hi = pset1<Packet>(0.693147180559945286); 
+    const Packet C_lo = pset1<Packet>(4.81927865669806721e-17);
+    const Packet one = pset1<Packet>(1.0);
+
+    // Evaluate P(x) in working precision.
+    // We evaluate even and odd parts of the polynomial separately
+    // to gain some instruction level parallelism.
+    Packet x2 = pmul(x,x);
+    Packet p_even = pmadd(p8, x2, p6);
+    Packet p_odd = pmadd(p9, x2, p7);
+    p_even = pmadd(p_even, x2, p4);
+    p_odd = pmadd(p_odd, x2, p5);
+    p_even = pmadd(p_even, x2, p2);
+    p_odd = pmadd(p_odd, x2, p3);
+    p_even = pmadd(p_even, x2, p0);
+    p_odd = pmadd(p_odd, x2, p1);
+    Packet p = pmadd(p_odd, x, p_even);
+
+    // Evaluate the remaining terms of Q(x) with extra precision using
+    // double word arithmetic.
+    Packet p_hi, p_lo;
+    // x * p(x)
+    twoprod(p, x, p_hi, p_lo);
+    // C + x * p(x)
+    Packet q1_hi, q1_lo;
+    twosum(p_hi, p_lo, C_hi, C_lo, q1_hi, q1_lo);
+    // x * (C + x * p(x))
+    Packet q2_hi, q2_lo;
+    twoprod(q1_hi, q1_lo, x, q2_hi, q2_lo);
+    // 1 + x * (C + x * p(x))
+    Packet q3_hi, q3_lo;
+    // Since |q2_hi| <= sqrt(2)-1 < 1, we can use fast_twosum
+    // for adding it to unity here.
+    fast_twosum(one, q2_hi, q3_hi, q3_lo);
+    return padd(q3_hi, padd(q2_lo, q3_lo));
+  }
+};
+
+// This function implements the non-trivial case of pow(x,y) where x is
+// positive and y is (possibly) non-integer.
+// Formally, pow(x,y) = exp2(y * log2(x)), where exp2(x) is shorthand for 2^x.
+// TODO(rmlarsen): We should probably add this as a packet up 'ppow', to make it
+// easier to specialize or turn off for specific types and/or backends.x
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  // Split x into exponent e_x and mantissa m_x.
+  Packet e_x;
+  Packet m_x = pfrexp(x, e_x);
+
+  // Adjust m_x to lie in [1/sqrt(2):sqrt(2)] to minimize absolute error in log2(m_x).
+  EIGEN_CONSTEXPR Scalar sqrt_half = Scalar(0.70710678118654752440);
+  const Packet m_x_scale_mask = pcmp_lt(m_x, pset1<Packet>(sqrt_half));
+  m_x = pselect(m_x_scale_mask, pmul(pset1<Packet>(Scalar(2)), m_x), m_x);
+  e_x = pselect(m_x_scale_mask, psub(e_x, pset1<Packet>(Scalar(1))), e_x);
+
+  // Compute log2(m_x) with 6 extra bits of accuracy.
+  Packet rx_hi, rx_lo;
+  accurate_log2<Scalar>()(m_x, rx_hi, rx_lo);
+
+  // Compute the two terms {y * e_x, y * r_x} in f = y * log2(x) with doubled
+  // precision using double word arithmetic.
+  Packet f1_hi, f1_lo, f2_hi, f2_lo;
+  twoprod(e_x, y, f1_hi, f1_lo);
+  twoprod(rx_hi, rx_lo, y, f2_hi, f2_lo);
+  // Sum the two terms in f using double word arithmetic. We know
+  // that |e_x| > |log2(m_x)|, except for the case where e_x==0.
+  // This means that we can use fast_twosum(f1,f2).
+  // In the case e_x == 0, e_x * y = f1 = 0, so we don't lose any
+  // accuracy by violating the assumption of fast_twosum, because
+  // it's a no-op.
+  Packet f_hi, f_lo;
+  fast_twosum(f1_hi, f1_lo, f2_hi, f2_lo, f_hi, f_lo);
+
+  // Split f into integer and fractional parts.
+  Packet n_z, r_z;
+  absolute_split(f_hi, n_z, r_z);
+  r_z = padd(r_z, f_lo);
+  Packet n_r;
+  absolute_split(r_z, n_r, r_z);
+  n_z = padd(n_z, n_r);
+
+  // We now have an accurate split of f = n_z + r_z and can compute
+  //   x^y = 2**{n_z + r_z) = exp2(r_z) * 2**{n_z}.
+  // Since r_z is in [-0.5;0.5], we compute the first factor to high accuracy
+  // using a specialized algorithm. Multiplication by the second factor can
+  // be done exactly using pldexp(), since it is an integer power of 2.
+  const Packet e_r = fast_accurate_exp2<Scalar>()(r_z);
+  return pldexp(e_r, n_z);
+}
+
+// Generic implementation of pow(x,y).
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet generic_pow(const Packet& x, const Packet& y) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+
+  const Packet cst_pos_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
+  const Packet cst_zero = pset1<Packet>(Scalar(0));
+  const Packet cst_one = pset1<Packet>(Scalar(1));
+  const Packet cst_nan = pset1<Packet>(NumTraits<Scalar>::quiet_NaN());
+
+  const Packet abs_x = pabs(x);
+  // Predicates for sign and magnitude of x.
+  const Packet x_is_zero = pcmp_eq(x, cst_zero);
+  const Packet x_is_neg = pcmp_lt(x, cst_zero);
+  const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf);
+  const Packet abs_x_is_one =  pcmp_eq(abs_x, cst_one);
+  const Packet abs_x_is_gt_one = pcmp_lt(cst_one, abs_x);
+  const Packet abs_x_is_lt_one = pcmp_lt(abs_x, cst_one);
+  const Packet x_is_one =  pandnot(abs_x_is_one, x_is_neg);
+  const Packet x_is_neg_one =  pand(abs_x_is_one, x_is_neg);
+  const Packet x_is_nan = pandnot(ptrue(x), pcmp_eq(x, x));
+
+  // Predicates for sign and magnitude of y.
+  const Packet y_is_one = pcmp_eq(y, cst_one);
+  const Packet y_is_zero = pcmp_eq(y, cst_zero);
+  const Packet y_is_neg = pcmp_lt(y, cst_zero);
+  const Packet y_is_pos = pandnot(ptrue(y), por(y_is_zero, y_is_neg));
+  const Packet y_is_nan = pandnot(ptrue(y), pcmp_eq(y, y));
+  const Packet abs_y_is_inf = pcmp_eq(pabs(y), cst_pos_inf);
+  EIGEN_CONSTEXPR Scalar huge_exponent =
+      (NumTraits<Scalar>::max_exponent() * Scalar(EIGEN_LN2)) /
+       NumTraits<Scalar>::epsilon();
+  const Packet abs_y_is_huge = pcmp_le(pset1<Packet>(huge_exponent), pabs(y));
+
+  // Predicates for whether y is integer and/or even.
+  const Packet y_is_int = pcmp_eq(pfloor(y), y);
+  const Packet y_div_2 = pmul(y, pset1<Packet>(Scalar(0.5)));
+  const Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2);
+
+  // Predicates encoding special cases for the value of pow(x,y)
+  const Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf),
+                                                    y_is_int),
+                                            abs_y_is_inf);
+  const Packet pow_is_one = por(por(x_is_one, y_is_zero),
+                                pand(x_is_neg_one,
+                                     por(abs_y_is_inf, pandnot(y_is_even, invalid_negative_x))));
+  const Packet pow_is_nan = por(invalid_negative_x, por(x_is_nan, y_is_nan));
+  const Packet pow_is_zero = por(por(por(pand(x_is_zero, y_is_pos),
+                                         pand(abs_x_is_inf, y_is_neg)),
+                                     pand(pand(abs_x_is_lt_one, abs_y_is_huge),
+                                          y_is_pos)),
+                                 pand(pand(abs_x_is_gt_one, abs_y_is_huge),
+                                      y_is_neg));
+  const Packet pow_is_inf = por(por(por(pand(x_is_zero, y_is_neg),
+                                        pand(abs_x_is_inf, y_is_pos)),
+                                    pand(pand(abs_x_is_lt_one, abs_y_is_huge),
+                                         y_is_neg)),
+                                pand(pand(abs_x_is_gt_one, abs_y_is_huge),
+                                     y_is_pos));
+
+  // General computation of pow(x,y) for positive x or negative x and integer y.
+  const Packet negate_pow_abs = pandnot(x_is_neg, y_is_even);
+  const Packet pow_abs = generic_pow_impl(abs_x, y);
+  return pselect(y_is_one, x,
+                 pselect(pow_is_one, cst_one,
+                         pselect(pow_is_nan, cst_nan,
+                                 pselect(pow_is_inf, cst_pos_inf,
+                                         pselect(pow_is_zero, cst_zero,
+                                                 pselect(negate_pow_abs, pnegate(pow_abs), pow_abs))))));
+}
+
+
+
+/* polevl (modified for Eigen)
+ *
+ *      Evaluate polynomial
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * int N;
+ * Scalar x, y, coef[N+1];
+ *
+ * y = polevl<decltype(x), N>( x, coef);
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Evaluates polynomial of degree N:
+ *
+ *                     2          N
+ * y  =  C  + C x + C x  +...+ C x
+ *        0    1     2          N
+ *
+ * Coefficients are stored in reverse order:
+ *
+ * coef[0] = C  , ..., coef[N] = C  .
+ *            N                   0
+ *
+ *  The function p1evl() assumes that coef[N] = 1.0 and is
+ * omitted from the array.  Its calling arguments are
+ * otherwise the same as polevl().
+ *
+ *
+ * The Eigen implementation is templatized.  For best speed, store
+ * coef as a const array (constexpr), e.g.
+ *
+ * const double coef[] = {1.0, 2.0, 3.0, ...};
+ *
+ */
+template <typename Packet, int N>
+struct ppolevl {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits<Packet>::type coeff[]) {
+    EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return pmadd(ppolevl<Packet, N-1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
+  }
+};
+
+template <typename Packet>
+struct ppolevl<Packet, 0> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits<Packet>::type coeff[]) {
+    EIGEN_UNUSED_VARIABLE(x);
+    return pset1<Packet>(coeff[0]);
+  }
+};
+
+/* chbevl (modified for Eigen)
+ *
+ *     Evaluate Chebyshev series
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * int N;
+ * Scalar x, y, coef[N], chebevl();
+ *
+ * y = chbevl( x, coef, N );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Evaluates the series
+ *
+ *        N-1
+ *         - '
+ *  y  =   >   coef[i] T (x/2)
+ *         -            i
+ *        i=0
+ *
+ * of Chebyshev polynomials Ti at argument x/2.
+ *
+ * Coefficients are stored in reverse order, i.e. the zero
+ * order term is last in the array.  Note N is the number of
+ * coefficients, not the order.
+ *
+ * If coefficients are for the interval a to b, x must
+ * have been transformed to x -> 2(2x - b - a)/(b-a) before
+ * entering the routine.  This maps x from (a, b) to (-1, 1),
+ * over which the Chebyshev polynomials are defined.
+ *
+ * If the coefficients are for the inverted interval, in
+ * which (a, b) is mapped to (1/b, 1/a), the transformation
+ * required is x -> 2(2ab/x - b - a)/(b-a).  If b is infinity,
+ * this becomes x -> 4a/x - 1.
+ *
+ *
+ *
+ * SPEED:
+ *
+ * Taking advantage of the recurrence properties of the
+ * Chebyshev polynomials, the routine requires one more
+ * addition per loop than evaluating a nested polynomial of
+ * the same degree.
+ *
+ */
+
+template <typename Packet, int N>
+struct pchebevl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits<Packet>::type coef[]) {
+    typedef typename unpacket_traits<Packet>::type Scalar;
+    Packet b0 = pset1<Packet>(coef[0]);
+    Packet b1 = pset1<Packet>(static_cast<Scalar>(0.f));
+    Packet b2;
+
+    for (int i = 1; i < N; i++) {
+      b2 = b1;
+      b1 = b0;
+      b0 = psub(pmadd(x, b1, pset1<Packet>(coef[i])), b2);
+    }
+
+    return pmul(pset1<Packet>(static_cast<Scalar>(0.5f)), psub(b0, b2));
+  }
+};
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/src/3rdparty/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
new file mode 100644
index 000000000..177a04e93
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
@@ -0,0 +1,110 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H
+#define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H
+
+namespace Eigen {
+namespace internal {
+
+// Forward declarations of the generic math functions
+// implemented in GenericPacketMathFunctions.h
+// This is needed to workaround a circular dependency.
+
+/***************************************************************************
+ * Some generic implementations to be used by implementors
+***************************************************************************/
+
+/** Default implementation of pfrexp.
+  * It is expected to be called by implementers of template<> pfrexp.
+  */
+template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+Packet pfrexp_generic(const Packet& a, Packet& exponent);
+
+// Extracts the biased exponent value from Packet p, and casts the results to
+// a floating-point Packet type. Used by pfrexp_generic. Override this if
+// there is no unpacket_traits<Packet>::integer_packet.
+template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+Packet pfrexp_generic_get_biased_exponent(const Packet& p);
+
+/** Default implementation of pldexp.
+  * It is expected to be called by implementers of template<> pldexp.
+  */
+template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+Packet pldexp_generic(const Packet& a, const Packet& exponent);
+
+/** \internal \returns log(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog_float(const Packet _x);
+
+/** \internal \returns log2(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog2_float(const Packet _x);
+
+/** \internal \returns log(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog_double(const Packet _x);
+
+/** \internal \returns log2(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog2_double(const Packet _x);
+
+/** \internal \returns log(1 + x) */
+template<typename Packet>
+Packet generic_plog1p(const Packet& x);
+
+/** \internal \returns exp(x)-1 */
+template<typename Packet>
+Packet generic_expm1(const Packet& x);
+
+/** \internal \returns exp(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet pexp_float(const Packet _x);
+
+/** \internal \returns exp(x) for double precision real numbers */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet pexp_double(const Packet _x);
+
+/** \internal \returns sin(x) for single precision float */
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet psin_float(const Packet& x);
+
+/** \internal \returns cos(x) for single precision float */
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet pcos_float(const Packet& x);
+
+/** \internal \returns sqrt(x) for complex types */
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet psqrt_complex(const Packet& a);
+
+template <typename Packet, int N> struct ppolevl;
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/Default/Half.h b/src/3rdparty/eigen/Eigen/src/Core/arch/Default/Half.h
new file mode 100644
index 000000000..9f8e8cc1e
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/Default/Half.h
@@ -0,0 +1,942 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+// The conversion routines are Copyright (c) Fabian Giesen, 2016.
+// The original license follows:
+//
+// Copyright (c) Fabian Giesen, 2016
+// All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted.
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Standard 16-bit float type, mostly useful for GPUs. Defines a new
+// type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with
+// operator overloads such that it behaves basically as an arithmetic
+// type. It will be quite slow on CPUs (so it is recommended to stay
+// in fp32 for CPUs, except for simple parameter conversions, I/O
+// to disk and the likes), but fast on GPUs.
+
+
+#ifndef EIGEN_HALF_H
+#define EIGEN_HALF_H
+
+#include <sstream>
+
+#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+// When compiling with GPU support, the "__half_raw" base class as well as
+// some other routines are defined in the GPU compiler header files
+// (cuda_fp16.h, hip_fp16.h), and they are not tagged constexpr
+// As a consequence, we get compile failures when compiling Eigen with
+// GPU support. Hence the need to disable EIGEN_CONSTEXPR when building
+// Eigen with GPU support
+  #pragma push_macro("EIGEN_CONSTEXPR")
+  #undef EIGEN_CONSTEXPR
+  #define EIGEN_CONSTEXPR
+#endif
+
+#define F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, METHOD)           \
+  template <>                                                       \
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_UNUSED                \
+  PACKET_F16 METHOD<PACKET_F16>(const PACKET_F16& _x) {             \
+    return float2half(METHOD<PACKET_F>(half2float(_x)));            \
+  }
+
+namespace Eigen {
+
+struct half;
+
+namespace half_impl {
+
+// We want to use the __half_raw struct from the HIP header file only during the device compile phase.
+// This is required because of a quirk in the way TensorFlow GPU builds are done.
+// When compiling TensorFlow source code with GPU support, files that
+//  * contain GPU kernels (i.e. *.cu.cc files) are compiled via hipcc
+//  * do not contain GPU kernels ( i.e. *.cc files) are compiled via gcc (typically)
+//
+// Tensorflow uses the Eigen::half type as its FP16 type, and there are functions that
+//  * are defined in a file that gets compiled via hipcc AND
+//  * have Eigen::half as a pass-by-value argument AND
+//  * are called in a file that gets compiled via gcc
+//
+// In the scenario described above the caller and callee will see different versions
+// of the Eigen::half base class __half_raw, and they will be compiled by different compilers
+//
+// There appears to be an ABI mismatch between gcc and clang (which is called by hipcc) that results in
+// the callee getting corrupted values for the Eigen::half argument.
+//
+// Making the host side compile phase of hipcc use the same Eigen::half impl, as the gcc compile, resolves
+// this error, and hence the following convoluted #if condition
+#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
+// Make our own __half_raw definition that is similar to CUDA's.
+struct __half_raw {
+#if (defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE))
+  // Eigen::half can be used as the datatype for shared memory declarations (in Eigen and TF)
+  // The element type for shared memory cannot have non-trivial constructors
+  // and hence the following special casing (which skips the zero-initilization).
+  // Note that this check gets done even in the host compilation phase, and
+  // hence the need for this
+  EIGEN_DEVICE_FUNC __half_raw() {}
+#else
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw() : x(0) {}
+#endif
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(numext::bit_cast<__fp16>(raw)) {
+  }
+  __fp16 x;
+#else
+  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(raw) {}
+  numext::uint16_t x;
+#endif
+};
+
+#elif defined(EIGEN_HAS_HIP_FP16)
+  // Nothing to do here
+  // HIP fp16 header file has a definition for __half_raw
+#elif defined(EIGEN_HAS_CUDA_FP16)
+  #if EIGEN_CUDA_SDK_VER < 90000
+    // In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
+    typedef __half __half_raw;
+  #endif // defined(EIGEN_HAS_CUDA_FP16)
+#elif defined(SYCL_DEVICE_ONLY)
+  typedef cl::sycl::half __half_raw;
+#endif
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
+
+struct half_base : public __half_raw {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base() {}
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {}
+
+#if defined(EIGEN_HAS_GPU_FP16)
+ #if defined(EIGEN_HAS_HIP_FP16)
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); }
+ #elif defined(EIGEN_HAS_CUDA_FP16)
+  #if EIGEN_CUDA_SDK_VER >= 90000
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
+  #endif
+ #endif
+#endif
+};
+
+} // namespace half_impl
+
+// Class definition.
+struct half : public half_impl::half_base {
+
+  // Writing this out as separate #if-else blocks to make the code easier to follow
+  // The same applies to most #if-else blocks in this file
+#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
+  // Use the same base class for the following two scenarios
+  // * when compiling without GPU support enabled
+  // * during host compile phase when compiling with GPU support enabled
+  typedef half_impl::__half_raw __half_raw;
+#elif defined(EIGEN_HAS_HIP_FP16)
+  // Nothing to do here
+  // HIP fp16 header file has a definition for __half_raw
+#elif defined(EIGEN_HAS_CUDA_FP16)
+  // Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so
+  // (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP!  So keeping this within
+  // #if defined(EIGEN_HAS_CUDA_FP16) is needed
+  #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+    typedef half_impl::__half_raw __half_raw;
+  #endif
+#endif
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half() {}
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {}
+
+#if defined(EIGEN_HAS_GPU_FP16)
+ #if defined(EIGEN_HAS_HIP_FP16)
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
+ #elif defined(EIGEN_HAS_CUDA_FP16)
+  #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
+  #endif
+ #endif
+#endif
+
+
+  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(bool b)
+      : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
+  template<class T>
+  explicit EIGEN_DEVICE_FUNC half(T val)
+      : half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(val))) {}
+  explicit EIGEN_DEVICE_FUNC half(float f)
+      : half_impl::half_base(half_impl::float_to_half_rtne(f)) {}
+
+  // Following the convention of numpy, converting between complex and
+  // float will lead to loss of imag value.
+  template<typename RealScalar>
+  explicit EIGEN_DEVICE_FUNC half(std::complex<RealScalar> c)
+      : half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(c.real()))) {}
+
+   EIGEN_DEVICE_FUNC operator float() const {  // NOLINT: Allow implicit conversion to float, because it is lossless.
+    return half_impl::half_to_float(*this);
+  }
+
+#if defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE)
+  EIGEN_DEVICE_FUNC operator __half() const {
+    ::__half_raw hr;
+    hr.x = x;
+    return __half(hr);
+  }
+#endif
+};
+
+} // end namespace Eigen
+
+namespace std {
+template<>
+struct numeric_limits<Eigen::half> {
+  static const bool is_specialized = true;
+  static const bool is_signed = true;
+  static const bool is_integer = false;
+  static const bool is_exact = false;
+  static const bool has_infinity = true;
+  static const bool has_quiet_NaN = true;
+  static const bool has_signaling_NaN = true;
+  static const float_denorm_style has_denorm = denorm_present;
+  static const bool has_denorm_loss = false;
+  static const std::float_round_style round_style = std::round_to_nearest;
+  static const bool is_iec559 = false;
+  static const bool is_bounded = false;
+  static const bool is_modulo = false;
+  static const int digits = 11;
+  static const int digits10 = 3;      // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+  static const int max_digits10 = 5;  // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+  static const int radix = 2;
+  static const int min_exponent = -13;
+  static const int min_exponent10 = -4;
+  static const int max_exponent = 16;
+  static const int max_exponent10 = 4;
+  static const bool traps = true;
+  static const bool tinyness_before = false;
+
+  static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); }
+  static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
+  static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
+  static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); }
+  static Eigen::half round_error() { return Eigen::half(0.5); }
+  static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
+  static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
+  static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7d00); }
+  static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
+};
+
+// If std::numeric_limits<T> is specialized, should also specialize
+// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
+// std::numeric_limits<const volatile T>
+// https://stackoverflow.com/a/16519653/
+template<>
+struct numeric_limits<const Eigen::half> : numeric_limits<Eigen::half> {};
+template<>
+struct numeric_limits<volatile Eigen::half> : numeric_limits<Eigen::half> {};
+template<>
+struct numeric_limits<const volatile Eigen::half> : numeric_limits<Eigen::half> {};
+} // end namespace std
+
+namespace Eigen {
+
+namespace half_impl {
+
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
+     EIGEN_CUDA_ARCH >= 530) ||                                  \
+    (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
+// Note: We deliberatly do *not* define this to 1 even if we have Arm's native
+// fp16 type since GPU halfs are rather different from native CPU halfs.
+// TODO: Rename to something like EIGEN_HAS_NATIVE_GPU_FP16
+#define EIGEN_HAS_NATIVE_FP16
+#endif
+
+// Intrinsics for native fp16 support. Note that on current hardware,
+// these are no faster than fp32 arithmetic (you need to use the half2
+// versions to get the ALU speed increased), but you do save the
+// conversion steps back and forth.
+
+#if defined(EIGEN_HAS_NATIVE_FP16)
+EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
+#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
+  return __hadd(::__half(a), ::__half(b));
+#else
+  return __hadd(a, b);
+#endif
+}
+EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
+  return __hmul(a, b);
+}
+EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
+  return __hsub(a, b);
+}
+EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
+#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
+  return __hdiv(a, b);
+#else
+  float num = __half2float(a);
+  float denom = __half2float(b);
+  return __float2half(num / denom);
+#endif
+}
+EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
+  return __hneg(a);
+}
+EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
+  a = a + b;
+  return a;
+}
+EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) {
+  a = a * b;
+  return a;
+}
+EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) {
+  a = a - b;
+  return a;
+}
+EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
+  a = a / b;
+  return a;
+}
+EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
+  return __heq(a, b);
+}
+EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
+  return __hne(a, b);
+}
+EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
+  return __hlt(a, b);
+}
+EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
+  return __hle(a, b);
+}
+EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
+  return __hgt(a, b);
+}
+EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
+  return __hge(a, b);
+}
+#endif
+
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
+  return half(vaddh_f16(a.x, b.x));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
+  return half(vmulh_f16(a.x, b.x));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
+  return half(vsubh_f16(a.x, b.x));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
+  return half(vdivh_f16(a.x, b.x));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
+  return half(vnegh_f16(a.x));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
+  a = half(vaddh_f16(a.x, b.x));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
+  a = half(vmulh_f16(a.x, b.x));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
+  a = half(vsubh_f16(a.x, b.x));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
+  a = half(vdivh_f16(a.x, b.x));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
+  return vceqh_f16(a.x, b.x);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
+  return !vceqh_f16(a.x, b.x);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
+  return vclth_f16(a.x, b.x);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
+  return vcleh_f16(a.x, b.x);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
+  return vcgth_f16(a.x, b.x);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
+  return vcgeh_f16(a.x, b.x);
+}
+// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
+// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
+// of the functions, while the latter can only deal with one of them.
+#elif !defined(EIGEN_HAS_NATIVE_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for half floats
+
+#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
+// We need to provide emulated *host-side* FP16 operators for clang.
+#pragma push_macro("EIGEN_DEVICE_FUNC")
+#undef EIGEN_DEVICE_FUNC
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_FP16)
+#define EIGEN_DEVICE_FUNC __host__
+#else // both host and device need emulated ops.
+#define EIGEN_DEVICE_FUNC __host__ __device__
+#endif
+#endif
+
+// Definitions for CPUs and older HIP+CUDA, mostly working through conversion
+// to/from fp32.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
+  return half(float(a) + float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
+  return half(float(a) * float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
+  return half(float(a) - float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
+  return half(float(a) / float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
+  half result;
+  result.x = a.x ^ 0x8000;
+  return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
+  a = half(float(a) + float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
+  a = half(float(a) * float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
+  a = half(float(a) - float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
+  a = half(float(a) / float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
+  return numext::equal_strict(float(a),float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
+  return numext::not_equal_strict(float(a), float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
+  return float(a) < float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
+  return float(a) <= float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
+  return float(a) > float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
+  return float(a) >= float(b);
+}
+
+#if defined(__clang__) && defined(__CUDA__)
+#pragma pop_macro("EIGEN_DEVICE_FUNC")
+#endif
+#endif  // Emulate support for half floats
+
+// Division by an index. Do it in full float precision to avoid accuracy
+// issues in converting the denominator to half.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
+  return half(static_cast<float>(a) / static_cast<float>(b));
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator++(half& a) {
+  a += half(1);
+  return a;
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator--(half& a) {
+  a -= half(1);
+  return a;
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator++(half& a, int) {
+  half original_value = a;
+  ++a;
+  return original_value;
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator--(half& a, int) {
+  half original_value = a;
+  --a;
+  return original_value;
+}
+
+// Conversion routines, including fallbacks for the host or older CUDA.
+// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
+// these in hardware. If we need more performance on older/other CPUs, they are
+// also possible to vectorize directly.
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x) {
+  // We cannot simply do a "return __half_raw(x)" here, because __half_raw is union type
+  // in the hip_fp16 header file, and that will trigger a compile error
+  // On the other hand, having anything but a return statement also triggers a compile error
+  // because this is constexpr function.
+  // Fortunately, since we need to disable EIGEN_CONSTEXPR for GPU anyway, we can get out
+  // of this catch22 by having separate bodies for GPU / non GPU
+#if defined(EIGEN_HAS_GPU_FP16)
+   __half_raw h;
+   h.x = x;
+  return h;
+#else
+  return __half_raw(x);
+#endif
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC numext::uint16_t raw_half_as_uint16(const __half_raw& h) {
+  // HIP/CUDA/Default have a member 'x' of type uint16_t.
+  // For ARM64 native half, the member 'x' is of type __fp16, so we need to bit-cast.
+  // For SYCL, cl::sycl::half is _Float16, so cast directly.
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  return numext::bit_cast<numext::uint16_t>(h.x);
+#elif defined(SYCL_DEVICE_ONLY)
+  return numext::bit_cast<numext::uint16_t>(h);
+#else
+  return h.x;
+#endif
+}
+
+union float32_bits {
+  unsigned int u;
+  float f;
+};
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+  __half tmp_ff = __float2half(ff);
+  return *(__half_raw*)&tmp_ff;
+
+#elif defined(EIGEN_HAS_FP16_C)
+  __half_raw h;
+  h.x = _cvtss_sh(ff, 0);
+  return h;
+
+#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  __half_raw h;
+  h.x = static_cast<__fp16>(ff);
+  return h;
+
+#else
+  float32_bits f; f.f = ff;
+
+  const float32_bits f32infty = { 255 << 23 };
+  const float32_bits f16max = { (127 + 16) << 23 };
+  const float32_bits denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
+  unsigned int sign_mask = 0x80000000u;
+  __half_raw o;
+  o.x = static_cast<numext::uint16_t>(0x0u);
+
+  unsigned int sign = f.u & sign_mask;
+  f.u ^= sign;
+
+  // NOTE all the integer compares in this function can be safely
+  // compiled into signed compares since all operands are below
+  // 0x80000000. Important if you want fast straight SSE2 code
+  // (since there's no unsigned PCMPGTD).
+
+  if (f.u >= f16max.u) {  // result is Inf or NaN (all exponent bits set)
+    o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
+  } else {  // (De)normalized number or zero
+    if (f.u < (113 << 23)) {  // resulting FP16 is subnormal or zero
+      // use a magic value to align our 10 mantissa bits at the bottom of
+      // the float. as long as FP addition is round-to-nearest-even this
+      // just works.
+      f.f += denorm_magic.f;
+
+      // and one integer subtract of the bias later, we have our final float!
+      o.x = static_cast<numext::uint16_t>(f.u - denorm_magic.u);
+    } else {
+      unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
+
+      // update exponent, rounding bias part 1
+      // Equivalent to `f.u += ((unsigned int)(15 - 127) << 23) + 0xfff`, but
+      // without arithmetic overflow.
+      f.u += 0xc8000fffU;
+      // rounding bias part 2
+      f.u += mant_odd;
+      // take the bits!
+      o.x = static_cast<numext::uint16_t>(f.u >> 13);
+    }
+  }
+
+  o.x |= static_cast<numext::uint16_t>(sign >> 16);
+  return o;
+#endif
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+  return __half2float(h);
+#elif defined(EIGEN_HAS_FP16_C)
+  return _cvtsh_ss(h.x);
+#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  return static_cast<float>(h.x);
+#else
+  const float32_bits magic = { 113 << 23 };
+  const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
+  float32_bits o;
+
+  o.u = (h.x & 0x7fff) << 13;             // exponent/mantissa bits
+  unsigned int exp = shifted_exp & o.u;   // just the exponent
+  o.u += (127 - 15) << 23;                // exponent adjust
+
+  // handle exponent special cases
+  if (exp == shifted_exp) {     // Inf/NaN?
+    o.u += (128 - 16) << 23;    // extra exp adjust
+  } else if (exp == 0) {        // Zero/Denormal?
+    o.u += 1 << 23;             // extra exp adjust
+    o.f -= magic.f;             // renormalize
+  }
+
+  o.u |= (h.x & 0x8000) << 16;    // sign bit
+  return o.f;
+#endif
+}
+
+// --- standard functions ---
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
+#ifdef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC
+  return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) == 0x7c00;
+#else
+  return (a.x & 0x7fff) == 0x7c00;
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+  return __hisnan(a);
+#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) > 0x7c00;
+#else
+  return (a.x & 0x7fff) > 0x7c00;
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const half& a) {
+  return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a));
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  return half(vabsh_f16(a.x));
+#else
+  half result;
+  result.x = a.x & 0x7FFF;
+  return result;
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+  defined(EIGEN_HIP_DEVICE_COMPILE)
+  return half(hexp(a));
+#else
+   return half(::expf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
+  return half(numext::expm1(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
+#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+  return half(::hlog(a));
+#else
+  return half(::logf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) {
+  return half(numext::log1p(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
+  return half(::log10f(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log2(const half& a) {
+  return half(static_cast<float>(EIGEN_LOG2E) * ::logf(float(a)));
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+  defined(EIGEN_HIP_DEVICE_COMPILE)
+  return half(hsqrt(a));
+#else
+    return half(::sqrtf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) {
+  return half(::powf(float(a), float(b)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) {
+  return half(::sinf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) {
+  return half(::cosf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) {
+  return half(::tanf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
+  return half(::tanhf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half asin(const half& a) {
+  return half(::asinf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half acos(const half& a) {
+  return half(::acosf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
+  defined(EIGEN_HIP_DEVICE_COMPILE)
+  return half(hfloor(a));
+#else
+  return half(::floorf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
+  defined(EIGEN_HIP_DEVICE_COMPILE)
+  return half(hceil(a));
+#else
+  return half(::ceilf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half rint(const half& a) {
+  return half(::rintf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half round(const half& a) {
+  return half(::roundf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half fmod(const half& a, const half& b) {
+  return half(::fmodf(float(a), float(b)));
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+  return __hlt(b, a) ? b : a;
+#else
+  const float f1 = static_cast<float>(a);
+  const float f2 = static_cast<float>(b);
+  return f2 < f1 ? b : a;
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+  return __hlt(a, b) ? b : a;
+#else
+  const float f1 = static_cast<float>(a);
+  const float f2 = static_cast<float>(b);
+  return f1 < f2 ? b : a;
+#endif
+}
+
+#ifndef EIGEN_NO_IO
+EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) {
+  os << static_cast<float>(v);
+  return os;
+}
+#endif
+
+} // end namespace half_impl
+
+// import Eigen::half_impl::half into Eigen namespace
+// using half_impl::half;
+
+namespace internal {
+
+template<>
+struct random_default_impl<half, false, false>
+{
+  static inline half run(const half& x, const half& y)
+  {
+    return x + (y-x) * half(float(std::rand()) / float(RAND_MAX));
+  }
+  static inline half run()
+  {
+    return run(half(-1.f), half(1.f));
+  }
+};
+
+template<> struct is_arithmetic<half> { enum { value = true }; };
+
+} // end namespace internal
+
+template<> struct NumTraits<Eigen::half>
+    : GenericNumTraits<Eigen::half>
+{
+  enum {
+    IsSigned = true,
+    IsInteger = false,
+    IsComplex = false,
+    RequireInitialization = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half epsilon() {
+    return half_impl::raw_uint16_to_half(0x0800);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half dummy_precision() {
+    return half_impl::raw_uint16_to_half(0x211f); //  Eigen::half(1e-2f);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half highest() {
+    return half_impl::raw_uint16_to_half(0x7bff);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half lowest() {
+    return half_impl::raw_uint16_to_half(0xfbff);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half infinity() {
+    return half_impl::raw_uint16_to_half(0x7c00);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
+    return half_impl::raw_uint16_to_half(0x7e00);
+  }
+};
+
+} // end namespace Eigen
+
+#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  #pragma pop_macro("EIGEN_CONSTEXPR")
+#endif
+
+namespace Eigen {
+namespace numext {
+
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)(const Eigen::half& h) {
+  return (half_impl::isnan)(h);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(const Eigen::half& h) {
+  return (half_impl::isinf)(h);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(const Eigen::half& h) {
+  return (half_impl::isfinite)(h);
+}
+
+#endif
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bit_cast<Eigen::half, uint16_t>(const uint16_t& src) {
+  return Eigen::half(Eigen::half_impl::raw_uint16_to_half(src));
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast<uint16_t, Eigen::half>(const Eigen::half& src) {
+  return Eigen::half_impl::raw_half_as_uint16(src);
+}
+
+}  // namespace numext
+}  // namespace Eigen
+
+// Add the missing shfl* intrinsics.
+// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300.
+//   CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
+//
+// HIP and CUDA prior to SDK 9.0 define
+//    __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
+// CUDA since 9.0 deprecates those and instead defines
+//    __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
+//    with native support for __half and __nv_bfloat16
+//
+// Note that the following are __device__ - only functions.
+#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)) \
+    || defined(EIGEN_HIPCC)
+
+#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane, int width=warpSize) {
+  const __half h = var;
+  return static_cast<Eigen::half>(__shfl_sync(mask, h, srcLane, width));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up_sync(unsigned mask, Eigen::half var, unsigned int delta, int width=warpSize) {
+  const __half h = var;
+  return static_cast<Eigen::half>(__shfl_up_sync(mask, h, delta, width));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down_sync(unsigned mask, Eigen::half var, unsigned int delta, int width=warpSize) {
+  const __half h = var;
+  return static_cast<Eigen::half>(__shfl_down_sync(mask, h, delta, width));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen::half var, int laneMask, int width=warpSize) {
+  const __half h = var;
+  return static_cast<Eigen::half>(__shfl_xor_sync(mask, h, laneMask, width));
+}
+
+#else // HIP or CUDA SDK < 9.0
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width=warpSize) {
+  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
+  return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl(ivar, srcLane, width)));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up(Eigen::half var, unsigned int delta, int width=warpSize) {
+  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
+  return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_up(ivar, delta, width)));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down(Eigen::half var, unsigned int delta, int width=warpSize) {
+  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
+  return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_down(ivar, delta, width)));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
+  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
+  return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_xor(ivar, laneMask, width)));
+}
+
+#endif // HIP vs CUDA
+#endif // __shfl*
+
+// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
+#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)) \
+    || defined(EIGEN_HIPCC)
+EIGEN_STRONG_INLINE __device__ Eigen::half __ldg(const Eigen::half* ptr) {
+  return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast<const Eigen::numext::uint16_t*>(ptr)));
+}
+#endif // __ldg
+
+#if EIGEN_HAS_STD_HASH
+namespace std {
+template <>
+struct hash<Eigen::half> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const {
+    return static_cast<std::size_t>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(a));
+  }
+};
+} // end namespace std
+#endif
+
+#endif // EIGEN_HALF_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/Default/Settings.h b/src/3rdparty/eigen/Eigen/src/Core/arch/Default/Settings.h
new file mode 100644
index 000000000..a5c3ada4c
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/Default/Settings.h
@@ -0,0 +1,49 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+/* All the parameters defined in this file can be specialized in the
+ * architecture specific files, and/or by the user.
+ * More to come... */
+
+#ifndef EIGEN_DEFAULT_SETTINGS_H
+#define EIGEN_DEFAULT_SETTINGS_H
+
+/** Defines the maximal loop size to enable meta unrolling of loops.
+  * Note that the value here is expressed in Eigen's own notion of "number of FLOPS",
+  * it does not correspond to the number of iterations or the number of instructions
+  */
+#ifndef EIGEN_UNROLLING_LIMIT
+#define EIGEN_UNROLLING_LIMIT 110
+#endif
+
+/** Defines the threshold between a "small" and a "large" matrix.
+  * This threshold is mainly used to select the proper product implementation.
+  */
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+/** Defines the maximal width of the blocks used in the triangular product and solver
+  * for vectors (level 2 blas xTRMV and xTRSV). The default is 8.
+  */
+#ifndef EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH
+#define EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH 8
+#endif
+
+
+/** Defines the default number of registers available for that architecture.
+  * Currently it must be 8 or 16. Other values will fail.
+  */
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8
+#endif
+
+#endif // EIGEN_DEFAULT_SETTINGS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/Default/TypeCasting.h b/src/3rdparty/eigen/Eigen/src/Core/arch/Default/TypeCasting.h
new file mode 100644
index 000000000..fb8183b78
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/Default/TypeCasting.h
@@ -0,0 +1,120 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2019 Rasmus Munk Larsen <rmlarsen@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERIC_TYPE_CASTING_H
+#define EIGEN_GENERIC_TYPE_CASTING_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<>
+struct scalar_cast_op<float, Eigen::half> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef Eigen::half result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
+    #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+      (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+      return __float2half(a);
+    #else
+      return Eigen::half(a);
+    #endif
+  }
+};
+
+template<>
+struct functor_traits<scalar_cast_op<float, Eigen::half> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
+
+
+template<>
+struct scalar_cast_op<int, Eigen::half> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef Eigen::half result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
+    #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+      (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+      return __float2half(static_cast<float>(a));
+    #else
+      return Eigen::half(static_cast<float>(a));
+    #endif
+  }
+};
+
+template<>
+struct functor_traits<scalar_cast_op<int, Eigen::half> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
+
+
+template<>
+struct scalar_cast_op<Eigen::half, float> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef float result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
+    #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+      (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+      return __half2float(a);
+    #else
+      return static_cast<float>(a);
+    #endif
+  }
+};
+
+template<>
+struct functor_traits<scalar_cast_op<Eigen::half, float> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
+
+
+template<>
+struct scalar_cast_op<float, Eigen::bfloat16> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef Eigen::bfloat16 result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::bfloat16 operator() (const float& a) const {
+    return Eigen::bfloat16(a);
+  }
+};
+
+template<>
+struct functor_traits<scalar_cast_op<float, Eigen::bfloat16> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
+
+
+template<>
+struct scalar_cast_op<int, Eigen::bfloat16> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef Eigen::bfloat16 result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::bfloat16 operator() (const int& a) const {
+    return Eigen::bfloat16(static_cast<float>(a));
+  }
+};
+
+template<>
+struct functor_traits<scalar_cast_op<int, Eigen::bfloat16> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
+
+
+template<>
+struct scalar_cast_op<Eigen::bfloat16, float> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef float result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::bfloat16& a) const {
+    return static_cast<float>(a);
+  }
+};
+
+template<>
+struct functor_traits<scalar_cast_op<Eigen::bfloat16, float> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
+
+
+}
+}
+
+#endif  // EIGEN_GENERIC_TYPE_CASTING_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h b/src/3rdparty/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h
new file mode 100644
index 000000000..d2b3a2568
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h
@@ -0,0 +1,103 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_GPU_H
+#define EIGEN_MATH_FUNCTIONS_GPU_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 plog<float4>(const float4& a)
+{
+  return make_float4(logf(a.x), logf(a.y), logf(a.z), logf(a.w));
+}
+
+template<>  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 plog<double2>(const double2& a)
+{
+  using ::log;
+  return make_double2(log(a.x), log(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 plog1p<float4>(const float4& a)
+{
+  return make_float4(log1pf(a.x), log1pf(a.y), log1pf(a.z), log1pf(a.w));
+}
+
+template<>  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 plog1p<double2>(const double2& a)
+{
+  return make_double2(log1p(a.x), log1p(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pexp<float4>(const float4& a)
+{
+  return make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pexp<double2>(const double2& a)
+{
+  using ::exp;
+  return make_double2(exp(a.x), exp(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pexpm1<float4>(const float4& a)
+{
+  return make_float4(expm1f(a.x), expm1f(a.y), expm1f(a.z), expm1f(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pexpm1<double2>(const double2& a)
+{
+  return make_double2(expm1(a.x), expm1(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 psqrt<float4>(const float4& a)
+{
+  return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 psqrt<double2>(const double2& a)
+{
+  using ::sqrt;
+  return make_double2(sqrt(a.x), sqrt(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 prsqrt<float4>(const float4& a)
+{
+  return make_float4(rsqrtf(a.x), rsqrtf(a.y), rsqrtf(a.z), rsqrtf(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 prsqrt<double2>(const double2& a)
+{
+  return make_double2(rsqrt(a.x), rsqrt(a.y));
+}
+
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATH_FUNCTIONS_GPU_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/GPU/PacketMath.h b/src/3rdparty/eigen/Eigen/src/Core/arch/GPU/PacketMath.h
new file mode 100644
index 000000000..689110ded
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -0,0 +1,1685 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_GPU_H
+#define EIGEN_PACKET_MATH_GPU_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Read-only data cached load available.
+#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350)
+#define EIGEN_GPU_HAS_LDG 1
+#endif
+
+// FP16 math available.
+#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530)
+#define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1
+#endif
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+#define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
+#endif
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
+
+template<> struct is_arithmetic<float4>  { enum { value = true }; };
+template<> struct is_arithmetic<double2> { enum { value = true }; };
+
+template<> struct packet_traits<float> : default_packet_traits
+{
+  typedef float4 type;
+  typedef float4 half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=4,
+    HasHalfPacket = 0,
+
+    HasDiv  = 1,
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasLGamma = 1,
+    HasDiGamma = 1,
+    HasZeta = 1,
+    HasPolygamma = 1,
+    HasErf = 1,
+    HasErfc = 1,
+    HasNdtri = 1,
+    HasBessel = 1,
+    HasIGamma = 1,
+    HasIGammaDerA = 1,
+    HasGammaSampleDerAlpha = 1,
+    HasIGammac = 1,
+    HasBetaInc = 1,
+
+    HasBlend = 0,
+    HasFloor = 1,
+  };
+};
+
+template<> struct packet_traits<double> : default_packet_traits
+{
+  typedef double2 type;
+  typedef double2 half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=2,
+    HasHalfPacket = 0,
+
+    HasDiv  = 1,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasLGamma = 1,
+    HasDiGamma = 1,
+    HasZeta = 1,
+    HasPolygamma = 1,
+    HasErf = 1,
+    HasErfc = 1,
+    HasNdtri = 1,
+    HasBessel = 1,
+    HasIGamma = 1,
+    HasIGammaDerA = 1,
+    HasGammaSampleDerAlpha = 1,
+    HasIGammac = 1,
+    HasBetaInc = 1,
+
+    HasBlend = 0,
+    HasFloor = 1,
+  };
+};
+
+
+template<> struct unpacket_traits<float4>  { typedef float  type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef float4 half; };
+template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef double2 half; };
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float&  from) {
+  return make_float4(from, from, from, from);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
+  return make_double2(from, from);
+}
+
+// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
+// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
+// of the functions, while the latter can only deal with one of them.
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
+namespace {
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a,
+                                                        const float& b) {
+  return __int_as_float(__float_as_int(a) & __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a,
+                                                         const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) &
+                              __double_as_longlong(b));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a,
+                                                       const float& b) {
+  return __int_as_float(__float_as_int(a) | __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a,
+                                                        const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) |
+                              __double_as_longlong(b));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a,
+                                                        const float& b) {
+  return __int_as_float(__float_as_int(a) ^ __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a,
+                                                         const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) ^
+                              __double_as_longlong(b));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a,
+                                                           const float& b) {
+  return __int_as_float(__float_as_int(a) & ~__float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a,
+                                                            const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) &
+                              ~__double_as_longlong(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a,
+                                                    const float& b) {
+  return __int_as_float(a == b ? 0xffffffffu : 0u);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a,
+                                                     const double& b) {
+  return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float lt_mask(const float& a,
+                                                    const float& b) {
+  return __int_as_float(a < b ? 0xffffffffu : 0u);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a,
+                                                     const double& b) {
+  return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull);
+}
+
+}  // namespace
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a,
+                                                          const float4& b) {
+  return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y),
+                     bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(const double2& a,
+                                                            const double2& b) {
+  return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(const float4& a,
+                                                         const float4& b) {
+  return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y),
+                     bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(const double2& a,
+                                                           const double2& b) {
+  return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(const float4& a,
+                                                          const float4& b) {
+  return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y),
+                     bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(const double2& a,
+                                                            const double2& b) {
+  return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(const float4& a,
+                                                             const float4& b) {
+  return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y),
+                     bitwise_andnot(a.z, b.z), bitwise_andnot(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pandnot<double2>(const double2& a, const double2& b) {
+  return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(const float4& a,
+                                                             const float4& b) {
+  return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z),
+                     eq_mask(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_lt<float4>(const float4& a,
+                                                             const float4& b) {
+  return make_float4(lt_mask(a.x, b.x), lt_mask(a.y, b.y), lt_mask(a.z, b.z),
+                     lt_mask(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pcmp_eq<double2>(const double2& a, const double2& b) {
+  return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pcmp_lt<double2>(const double2& a, const double2& b) {
+  return make_double2(lt_mask(a.x, b.x), lt_mask(a.y, b.y));
+}
+#endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
+  return make_float4(a, a+1, a+2, a+3);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
+  return make_double2(a, a+1);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x+b.x, a.y+b.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x-b.x, a.y-b.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
+  return make_float4(-a.x, -a.y, -a.z, -a.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
+  return make_double2(-a.x, -a.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x*b.x, a.y*b.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x/b.x, a.y/b.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
+  return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
+  return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
+  return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
+  return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
+  return *reinterpret_cast<const float4*>(from);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
+  return *reinterpret_cast<const double2*>(from);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
+  return make_float4(from[0], from[1], from[2], from[3]);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
+  return make_double2(from[0], from[1]);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float*   from) {
+  return make_float4(from[0], from[0], from[1], from[1]);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double*  from) {
+  return make_double2(from[0], from[0]);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float*   to, const float4& from) {
+  *reinterpret_cast<float4*>(to) = from;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
+  *reinterpret_cast<double2*>(to) = from;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const float4& from) {
+  to[0] = from.x;
+  to[1] = from.y;
+  to[2] = from.z;
+  to[3] = from.w;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
+  to[0] = from.x;
+  to[1] = from.y;
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  return __ldg((const float4*)from);
+#else
+  return make_float4(from[0], from[1], from[2], from[3]);
+#endif
+}
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  return __ldg((const double2*)from);
+#else
+  return make_double2(from[0], from[1]);
+#endif
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
+#else
+  return make_float4(from[0], from[1], from[2], from[3]);
+#endif
+}
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  return make_double2(__ldg(from+0), __ldg(from+1));
+#else
+  return make_double2(from[0], from[1]);
+#endif
+}
+
+template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
+  return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
+  return make_double2(from[0*stride], from[1*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
+  to[stride*0] = from.x;
+  to[stride*1] = from.y;
+  to[stride*2] = from.z;
+  to[stride*3] = from.w;
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
+  to[stride*0] = from.x;
+  to[stride*1] = from.y;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float  pfirst<float4>(const float4& a) {
+  return a.x;
+}
+template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
+  return a.x;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float  predux<float4>(const float4& a) {
+  return a.x + a.y + a.z + a.w;
+}
+template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
+  return a.x + a.y;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float  predux_max<float4>(const float4& a) {
+  return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
+  return fmax(a.x, a.y);
+}
+
+template<> EIGEN_DEVICE_FUNC inline float  predux_min<float4>(const float4& a) {
+  return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
+  return fmin(a.x, a.y);
+}
+
+template<> EIGEN_DEVICE_FUNC inline float  predux_mul<float4>(const float4& a) {
+  return a.x * a.y * a.z * a.w;
+}
+template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
+  return a.x * a.y;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float4  pabs<float4>(const float4& a) {
+  return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
+  return make_double2(fabs(a.x), fabs(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC inline float4  pfloor<float4>(const float4& a) {
+  return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double2 pfloor<double2>(const double2& a) {
+  return make_double2(floor(a.x), floor(a.y));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<float4,4>& kernel) {
+  float tmp = kernel.packet[0].y;
+  kernel.packet[0].y = kernel.packet[1].x;
+  kernel.packet[1].x = tmp;
+
+  tmp = kernel.packet[0].z;
+  kernel.packet[0].z = kernel.packet[2].x;
+  kernel.packet[2].x = tmp;
+
+  tmp = kernel.packet[0].w;
+  kernel.packet[0].w = kernel.packet[3].x;
+  kernel.packet[3].x = tmp;
+
+  tmp = kernel.packet[1].z;
+  kernel.packet[1].z = kernel.packet[2].y;
+  kernel.packet[2].y = tmp;
+
+  tmp = kernel.packet[1].w;
+  kernel.packet[1].w = kernel.packet[3].y;
+  kernel.packet[3].y = tmp;
+
+  tmp = kernel.packet[2].w;
+  kernel.packet[2].w = kernel.packet[3].z;
+  kernel.packet[3].z = tmp;
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<double2,2>& kernel) {
+  double tmp = kernel.packet[0].y;
+  kernel.packet[0].y = kernel.packet[1].x;
+  kernel.packet[1].x = tmp;
+}
+
+#endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
+
+// Packet4h2 must be defined in the macro without EIGEN_CUDA_ARCH, meaning
+// its corresponding packet_traits<Eigen::half> must be visible on host.
+#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
+
+typedef ulonglong2 Packet4h2;
+template<> struct unpacket_traits<Packet4h2> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h2 half; };
+template<> struct is_arithmetic<Packet4h2> { enum { value = true }; };
+
+template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef half2 half; };
+template<> struct is_arithmetic<half2> { enum { value = true }; };
+
+template<> struct packet_traits<Eigen::half> : default_packet_traits
+{
+  typedef Packet4h2 type;
+  typedef Packet4h2 half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=8,
+    HasHalfPacket = 0,
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasSqrt   = 1,
+    HasRsqrt  = 1,
+    HasExp    = 1,
+    HasExpm1  = 1,
+    HasLog    = 1,
+    HasLog1p  = 1
+  };
+};
+
+namespace {
+// This is equivalent to make_half2, which is undocumented and doesn't seem to always exist.
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 combine_half(const __half& a, const __half& b) {
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+  return __halves2half2(a, b);
+#else
+  // Round-about way since __halves2half2 is a __device__ function.
+  return __floats2half2_rn(__half2float(a), __half2float(b));
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_low(const half2& a) {
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+  return __low2half(a);
+#else
+  return __float2half(__low2float(a));
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_high(const half2& a) {
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+  return __high2half(a);
+#else
+  return __float2half(__high2float(a));
+#endif
+}
+} // namespace
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+  return __half2half2(from);
+#else
+  const float f = __half2float(from);
+  return __floats2half2_rn(f, f);
+#endif
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
+pset1<Packet4h2>(const Eigen::half& from) {
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  p_alias[0] = pset1<half2>(from);
+  p_alias[1] = pset1<half2>(from);
+  p_alias[2] = pset1<half2>(from);
+  p_alias[3] = pset1<half2>(from);
+  return r;
+}
+
+// We now need this visible on both host and device.
+// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
+namespace {
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) {
+  return *reinterpret_cast<const half2*>(from);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) {
+  return combine_half(from[0], from[1]);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half*  from) {
+  return combine_half(from[0], from[0]);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to,
+                                                  const half2& from) {
+  *reinterpret_cast<half2*>(to) = from;
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to,
+                                                   const half2& from) {
+  to[0] = get_half2_low(from);
+  to[1] = get_half2_high(from);
+}
+
+
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(
+    const Eigen::half* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  // Input is guaranteed to be properly aligned.
+  return __ldg(reinterpret_cast<const half2*>(from));
+#else
+  return combine_half(*(from+0), *(from+1));
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(
+    const Eigen::half* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  return __halves2half2(__ldg(from+0), __ldg(from+1));
+#else
+  return combine_half(*(from+0), *(from+1));
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from,
+                                                    Index stride) {
+  return combine_half(from[0*stride], from[1*stride]);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(
+    Eigen::half* to, const half2& from, Index stride) {
+  to[stride*0] = get_half2_low(from);
+  to[stride*1] = get_half2_high(from);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) {
+  return get_half2_low(a);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) {
+  half a1 = get_half2_low(a);
+  half a2 = get_half2_high(a);
+  half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF);
+  half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF);
+  return combine_half(result1, result2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) {
+  half true_half = half_impl::raw_uint16_to_half(0xffffu);
+  return pset1<half2>(true_half);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) {
+  half false_half = half_impl::raw_uint16_to_half(0x0000u);
+  return pset1<half2>(false_half);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<half2,2>& kernel) {
+  __half a1 = get_half2_low(kernel.packet[0]);
+  __half a2 = get_half2_high(kernel.packet[0]);
+  __half b1 = get_half2_low(kernel.packet[1]);
+  __half b2 = get_half2_high(kernel.packet[1]);
+  kernel.packet[0] = combine_half(a1, b1);
+  kernel.packet[1] = combine_half(a2, b2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __halves2half2(a, __hadd(a, __float2half(1.0f)));
+#else
+  float f = __half2float(a) + 1.0f;
+  return combine_half(a, __float2half(f));
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask,
+                                                    const half2& a,
+                                                    const half2& b) {
+  half mask_low = get_half2_low(mask);
+  half mask_high = get_half2_high(mask);
+  half result_low = mask_low == half(0) ? get_half2_low(b) : get_half2_low(a);
+  half result_high = mask_high == half(0) ? get_half2_high(b) : get_half2_high(a);
+  return combine_half(result_low, result_high);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a,
+                                                    const half2& b) {
+  half true_half = half_impl::raw_uint16_to_half(0xffffu);
+  half false_half = half_impl::raw_uint16_to_half(0x0000u);
+  half a1 = get_half2_low(a);
+  half a2 = get_half2_high(a);
+  half b1 = get_half2_low(b);
+  half b2 = get_half2_high(b);
+  half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
+  half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
+  return combine_half(eq1, eq2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a,
+                                                    const half2& b) {
+  half true_half = half_impl::raw_uint16_to_half(0xffffu);
+  half false_half = half_impl::raw_uint16_to_half(0x0000u);
+  half a1 = get_half2_low(a);
+  half a2 = get_half2_high(a);
+  half b1 = get_half2_low(b);
+  half b2 = get_half2_high(b);
+  half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half;
+  half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half;
+  return combine_half(eq1, eq2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a,
+                                                 const half2& b) {
+  half a1 = get_half2_low(a);
+  half a2 = get_half2_high(a);
+  half b1 = get_half2_low(b);
+  half b2 = get_half2_high(b);
+  half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x);
+  half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x);
+  return combine_half(result1, result2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a,
+                                                const half2& b) {
+  half a1 = get_half2_low(a);
+  half a2 = get_half2_high(a);
+  half b1 = get_half2_low(b);
+  half b2 = get_half2_high(b);
+  half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x);
+  half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x);
+  return combine_half(result1, result2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a,
+                                                 const half2& b) {
+  half a1 = get_half2_low(a);
+  half a2 = get_half2_high(a);
+  half b1 = get_half2_low(b);
+  half b2 = get_half2_high(b);
+  half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x);
+  half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x);
+  return combine_half(result1, result2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a,
+                                                    const half2& b) {
+  half a1 = get_half2_low(a);
+  half a2 = get_half2_high(a);
+  half b1 = get_half2_low(b);
+  half b2 = get_half2_high(b);
+  half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x);
+  half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x);
+  return combine_half(result1, result2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a,
+                                                 const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hadd2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 + b1;
+  float r2 = a2 + b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a,
+                                                 const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hsub2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 - b1;
+  float r2 = a2 - b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hneg2(a);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return __floats2half2_rn(-a1, -a2);
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a,
+                                                 const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hmul2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 * b1;
+  float r2 = a2 * b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a,
+                                                  const half2& b,
+                                                  const half2& c) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+   return __hfma2(a, b, c);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float c1 = __low2float(c);
+  float c2 = __high2float(c);
+  float r1 = a1 * b1 + c1;
+  float r2 = a2 * b2 + c2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a,
+                                                 const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __h2div(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 / b1;
+  float r2 = a2 / b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a,
+                                                 const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b);
+  __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b);
+  return combine_half(r1, r2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a,
+                                                 const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b);
+  __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b);
+  return combine_half(r1, r2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hadd(__low2half(a), __high2half(a));
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return Eigen::half(__float2half(a1 + a2));
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  __half first = __low2half(a);
+  __half second = __high2half(a);
+  return __hgt(first, second) ? first : second;
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return a1 > a2 ? get_half2_low(a) : get_half2_high(a);
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  __half first = __low2half(a);
+  __half second = __high2half(a);
+  return __hlt(first, second) ? first : second;
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return a1 < a2 ? get_half2_low(a) : get_half2_high(a);
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hmul(__low2half(a), __high2half(a));
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return Eigen::half(__float2half(a1 * a2));
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = log1pf(a1);
+  float r2 = log1pf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = expm1f(a1);
+  float r2 = expm1f(a2);
+  return __floats2half2_rn(r1, r2);
+}
+
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || \
+  defined(EIGEN_HIP_DEVICE_COMPILE)
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+half2 plog(const half2& a) {
+  return h2log(a);
+}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+half2 pexp(const half2& a) {
+  return h2exp(a);
+}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+half2 psqrt(const half2& a) {
+  return h2sqrt(a);
+}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+half2 prsqrt(const half2& a) {
+  return h2rsqrt(a);
+}
+
+#else
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = logf(a1);
+  float r2 = logf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = expf(a1);
+  float r2 = expf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = sqrtf(a1);
+  float r2 = sqrtf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = rsqrtf(a1);
+  float r2 = rsqrtf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+#endif
+} // namespace
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
+pload<Packet4h2>(const Eigen::half* from) {
+  return *reinterpret_cast<const Packet4h2*>(from);
+}
+
+// unaligned load;
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
+ploadu<Packet4h2>(const Eigen::half* from) {
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  p_alias[0] = ploadu(from + 0);
+  p_alias[1] = ploadu(from + 2);
+  p_alias[2] = ploadu(from + 4);
+  p_alias[3] = ploadu(from + 6);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
+ploaddup<Packet4h2>(const Eigen::half* from) {
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  p_alias[0] = ploaddup(from + 0);
+  p_alias[1] = ploaddup(from + 1);
+  p_alias[2] = ploaddup(from + 2);
+  p_alias[3] = ploaddup(from + 3);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(
+    Eigen::half* to, const Packet4h2& from) {
+  *reinterpret_cast<Packet4h2*>(to) = from;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(
+    Eigen::half* to, const Packet4h2& from) {
+  const half2* from_alias = reinterpret_cast<const half2*>(&from);
+  pstoreu(to + 0,from_alias[0]);
+  pstoreu(to + 2,from_alias[1]);
+  pstoreu(to + 4,from_alias[2]);
+  pstoreu(to + 6,from_alias[3]);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2
+ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  Packet4h2 r;
+  r = __ldg(reinterpret_cast<const Packet4h2*>(from));
+  return r;
+#else
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  r_alias[0] = ploadt_ro_aligned(from + 0);
+  r_alias[1] = ploadt_ro_aligned(from + 2);
+  r_alias[2] = ploadt_ro_aligned(from + 4);
+  r_alias[3] = ploadt_ro_aligned(from + 6);
+  return r;
+#endif
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2
+ploadt_ro<Packet4h2, Unaligned>(const Eigen::half* from) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  r_alias[0] = ploadt_ro_unaligned(from + 0);
+  r_alias[1] = ploadt_ro_unaligned(from + 2);
+  r_alias[2] = ploadt_ro_unaligned(from + 4);
+  r_alias[3] = ploadt_ro_unaligned(from + 6);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
+pgather<Eigen::half, Packet4h2>(const Eigen::half* from, Index stride) {
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  p_alias[0] = combine_half(from[0 * stride], from[1 * stride]);
+  p_alias[1] = combine_half(from[2 * stride], from[3 * stride]);
+  p_alias[2] = combine_half(from[4 * stride], from[5 * stride]);
+  p_alias[3] = combine_half(from[6 * stride], from[7 * stride]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h2>(
+    Eigen::half* to, const Packet4h2& from, Index stride) {
+  const half2* from_alias = reinterpret_cast<const half2*>(&from);
+  pscatter(to + stride * 0, from_alias[0], stride);
+  pscatter(to + stride * 2, from_alias[1], stride);
+  pscatter(to + stride * 4, from_alias[2], stride);
+  pscatter(to + stride * 6, from_alias[3], stride);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h2>(
+    const Packet4h2& a) {
+  return pfirst(*(reinterpret_cast<const half2*>(&a)));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs<Packet4h2>(
+    const Packet4h2& a) {
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  p_alias[0] = pabs(a_alias[0]);
+  p_alias[1] = pabs(a_alias[1]);
+  p_alias[2] = pabs(a_alias[2]);
+  p_alias[3] = pabs(a_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue<Packet4h2>(
+    const Packet4h2& /*a*/) {
+  half true_half = half_impl::raw_uint16_to_half(0xffffu);
+  return pset1<Packet4h2>(true_half);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pzero<Packet4h2>(const Packet4h2& /*a*/) {
+  half false_half = half_impl::raw_uint16_to_half(0x0000u);
+  return pset1<Packet4h2>(false_half);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_double(
+    double* d_row0, double* d_row1, double* d_row2, double* d_row3,
+    double* d_row4, double* d_row5, double* d_row6, double* d_row7) {
+  double d_tmp;
+  d_tmp = d_row0[1];
+  d_row0[1] = d_row4[0];
+  d_row4[0] = d_tmp;
+
+  d_tmp = d_row1[1];
+  d_row1[1] = d_row5[0];
+  d_row5[0] = d_tmp;
+
+  d_tmp = d_row2[1];
+  d_row2[1] = d_row6[0];
+  d_row6[0] = d_tmp;
+
+  d_tmp = d_row3[1];
+  d_row3[1] = d_row7[0];
+  d_row7[0] = d_tmp;
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2(
+    half2* f_row0, half2* f_row1, half2* f_row2, half2* f_row3) {
+  half2 f_tmp;
+  f_tmp = f_row0[1];
+  f_row0[1] = f_row2[0];
+  f_row2[0] = f_tmp;
+
+  f_tmp = f_row1[1];
+  f_row1[1] = f_row3[0];
+  f_row3[0] = f_tmp;
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
+ptranspose_half(half2& f0, half2& f1) {
+  __half a1 = get_half2_low(f0);
+  __half a2 = get_half2_high(f0);
+  __half b1 = get_half2_low(f1);
+  __half b2 = get_half2_high(f1);
+  f0 = combine_half(a1, b1);
+  f1 = combine_half(a2, b2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet4h2,8>& kernel) {
+  double* d_row0 = reinterpret_cast<double*>(&kernel.packet[0]);
+  double* d_row1 = reinterpret_cast<double*>(&kernel.packet[1]);
+  double* d_row2 = reinterpret_cast<double*>(&kernel.packet[2]);
+  double* d_row3 = reinterpret_cast<double*>(&kernel.packet[3]);
+  double* d_row4 = reinterpret_cast<double*>(&kernel.packet[4]);
+  double* d_row5 = reinterpret_cast<double*>(&kernel.packet[5]);
+  double* d_row6 = reinterpret_cast<double*>(&kernel.packet[6]);
+  double* d_row7 = reinterpret_cast<double*>(&kernel.packet[7]);
+  ptranspose_double(d_row0, d_row1, d_row2, d_row3,
+                    d_row4, d_row5, d_row6, d_row7);
+
+
+  half2* f_row0 = reinterpret_cast<half2*>(d_row0);
+  half2* f_row1 = reinterpret_cast<half2*>(d_row1);
+  half2* f_row2 = reinterpret_cast<half2*>(d_row2);
+  half2* f_row3 = reinterpret_cast<half2*>(d_row3);
+  ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
+  ptranspose_half(f_row0[0], f_row1[0]);
+  ptranspose_half(f_row0[1], f_row1[1]);
+  ptranspose_half(f_row2[0], f_row3[0]);
+  ptranspose_half(f_row2[1], f_row3[1]);
+
+  f_row0 = reinterpret_cast<half2*>(d_row0 + 1);
+  f_row1 = reinterpret_cast<half2*>(d_row1 + 1);
+  f_row2 = reinterpret_cast<half2*>(d_row2 + 1);
+  f_row3 = reinterpret_cast<half2*>(d_row3 + 1);
+  ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
+  ptranspose_half(f_row0[0], f_row1[0]);
+  ptranspose_half(f_row0[1], f_row1[1]);
+  ptranspose_half(f_row2[0], f_row3[0]);
+  ptranspose_half(f_row2[1], f_row3[1]);
+
+  f_row0 = reinterpret_cast<half2*>(d_row4);
+  f_row1 = reinterpret_cast<half2*>(d_row5);
+  f_row2 = reinterpret_cast<half2*>(d_row6);
+  f_row3 = reinterpret_cast<half2*>(d_row7);
+  ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
+  ptranspose_half(f_row0[0], f_row1[0]);
+  ptranspose_half(f_row0[1], f_row1[1]);
+  ptranspose_half(f_row2[0], f_row3[0]);
+  ptranspose_half(f_row2[1], f_row3[1]);
+
+  f_row0 = reinterpret_cast<half2*>(d_row4 + 1);
+  f_row1 = reinterpret_cast<half2*>(d_row5 + 1);
+  f_row2 = reinterpret_cast<half2*>(d_row6 + 1);
+  f_row3 = reinterpret_cast<half2*>(d_row7 + 1);
+  ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
+  ptranspose_half(f_row0[0], f_row1[0]);
+  ptranspose_half(f_row0[1], f_row1[1]);
+  ptranspose_half(f_row2[0], f_row3[0]);
+  ptranspose_half(f_row2[1], f_row3[1]);
+
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
+plset<Packet4h2>(const Eigen::half& a) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  p_alias[0] = __halves2half2(a, __hadd(a, __float2half(1.0f)));
+  p_alias[1] = __halves2half2(__hadd(a, __float2half(2.0f)),
+                              __hadd(a, __float2half(3.0f)));
+  p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)),
+                              __hadd(a, __float2half(5.0f)));
+  p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)),
+                              __hadd(a, __float2half(7.0f)));
+  return r;
+#elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+
+  half2 b = pset1<half2>(a);
+  half2 c;
+  half2 half_offset0 = __halves2half2(__float2half(0.0f),__float2half(2.0f));
+  half2 half_offset1 = __halves2half2(__float2half(4.0f),__float2half(6.0f));
+
+  c = __hadd2(b, half_offset0);
+  r_alias[0] = plset(__low2half(c));
+  r_alias[1] = plset(__high2half(c));
+
+  c = __hadd2(b, half_offset1);
+  r_alias[2] = plset(__low2half(c));
+  r_alias[3] = plset(__high2half(c));
+
+  return r;
+
+#else
+  float f = __half2float(a);
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  p_alias[0] = combine_half(a, __float2half(f + 1.0f));
+  p_alias[1] = combine_half(__float2half(f + 2.0f), __float2half(f + 3.0f));
+  p_alias[2] = combine_half(__float2half(f + 4.0f), __float2half(f + 5.0f));
+  p_alias[3] = combine_half(__float2half(f + 6.0f), __float2half(f + 7.0f));
+  return r;
+#endif
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
+pselect<Packet4h2>(const Packet4h2& mask, const Packet4h2& a,
+                   const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* mask_alias = reinterpret_cast<const half2*>(&mask);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pselect(mask_alias[0], a_alias[0], b_alias[0]);
+  r_alias[1] = pselect(mask_alias[1], a_alias[1], b_alias[1]);
+  r_alias[2] = pselect(mask_alias[2], a_alias[2], b_alias[2]);
+  r_alias[3] = pselect(mask_alias[3], a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
+pcmp_eq<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pcmp_eq(a_alias[0], b_alias[0]);
+  r_alias[1] = pcmp_eq(a_alias[1], b_alias[1]);
+  r_alias[2] = pcmp_eq(a_alias[2], b_alias[2]);
+  r_alias[3] = pcmp_eq(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand<Packet4h2>(
+    const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pand(a_alias[0], b_alias[0]);
+  r_alias[1] = pand(a_alias[1], b_alias[1]);
+  r_alias[2] = pand(a_alias[2], b_alias[2]);
+  r_alias[3] = pand(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por<Packet4h2>(
+    const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = por(a_alias[0], b_alias[0]);
+  r_alias[1] = por(a_alias[1], b_alias[1]);
+  r_alias[2] = por(a_alias[2], b_alias[2]);
+  r_alias[3] = por(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor<Packet4h2>(
+    const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pxor(a_alias[0], b_alias[0]);
+  r_alias[1] = pxor(a_alias[1], b_alias[1]);
+  r_alias[2] = pxor(a_alias[2], b_alias[2]);
+  r_alias[3] = pxor(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
+pandnot<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pandnot(a_alias[0], b_alias[0]);
+  r_alias[1] = pandnot(a_alias[1], b_alias[1]);
+  r_alias[2] = pandnot(a_alias[2], b_alias[2]);
+  r_alias[3] = pandnot(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 padd<Packet4h2>(
+    const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = padd(a_alias[0], b_alias[0]);
+  r_alias[1] = padd(a_alias[1], b_alias[1]);
+  r_alias[2] = padd(a_alias[2], b_alias[2]);
+  r_alias[3] = padd(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psub<Packet4h2>(
+    const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = psub(a_alias[0], b_alias[0]);
+  r_alias[1] = psub(a_alias[1], b_alias[1]);
+  r_alias[2] = psub(a_alias[2], b_alias[2]);
+  r_alias[3] = psub(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pnegate(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = pnegate(a_alias[0]);
+  r_alias[1] = pnegate(a_alias[1]);
+  r_alias[2] = pnegate(a_alias[2]);
+  r_alias[3] = pnegate(a_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pconj(const Packet4h2& a) {
+  return a;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul<Packet4h2>(
+    const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pmul(a_alias[0], b_alias[0]);
+  r_alias[1] = pmul(a_alias[1], b_alias[1]);
+  r_alias[2] = pmul(a_alias[2], b_alias[2]);
+  r_alias[3] = pmul(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd<Packet4h2>(
+    const Packet4h2& a, const Packet4h2& b, const Packet4h2& c) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  const half2* c_alias = reinterpret_cast<const half2*>(&c);
+  r_alias[0] = pmadd(a_alias[0], b_alias[0], c_alias[0]);
+  r_alias[1] = pmadd(a_alias[1], b_alias[1], c_alias[1]);
+  r_alias[2] = pmadd(a_alias[2], b_alias[2], c_alias[2]);
+  r_alias[3] = pmadd(a_alias[3], b_alias[3], c_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv<Packet4h2>(
+    const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pdiv(a_alias[0], b_alias[0]);
+  r_alias[1] = pdiv(a_alias[1], b_alias[1]);
+  r_alias[2] = pdiv(a_alias[2], b_alias[2]);
+  r_alias[3] = pdiv(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin<Packet4h2>(
+    const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pmin(a_alias[0], b_alias[0]);
+  r_alias[1] = pmin(a_alias[1], b_alias[1]);
+  r_alias[2] = pmin(a_alias[2], b_alias[2]);
+  r_alias[3] = pmin(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax<Packet4h2>(
+    const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pmax(a_alias[0], b_alias[0]);
+  r_alias[1] = pmax(a_alias[1], b_alias[1]);
+  r_alias[2] = pmax(a_alias[2], b_alias[2]);
+  r_alias[3] = pmax(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<Packet4h2>(
+    const Packet4h2& a) {
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+
+  return predux(a_alias[0]) + predux(a_alias[1]) +
+         predux(a_alias[2]) + predux(a_alias[3]);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(
+    const Packet4h2& a) {
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  half2 m0 = combine_half(predux_max(a_alias[0]),
+                            predux_max(a_alias[1]));
+  half2 m1 = combine_half(predux_max(a_alias[2]),
+                            predux_max(a_alias[3]));
+  __half first  = predux_max(m0);
+  __half second = predux_max(m1);
+#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+  return (__hgt(first, second) ? first : second);
+#else
+  float ffirst  = __half2float(first);
+  float fsecond = __half2float(second);
+  return (ffirst > fsecond)? first: second;
+#endif
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(
+    const Packet4h2& a) {
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  half2 m0 = combine_half(predux_min(a_alias[0]),
+                            predux_min(a_alias[1]));
+  half2 m1 = combine_half(predux_min(a_alias[2]),
+                            predux_min(a_alias[3]));
+  __half first  = predux_min(m0);
+  __half second = predux_min(m1);
+#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+  return (__hlt(first, second) ? first : second);
+#else
+  float ffirst  = __half2float(first);
+  float fsecond = __half2float(second);
+  return (ffirst < fsecond)? first: second;
+#endif
+}
+
+// likely overflow/underflow
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4h2>(
+    const Packet4h2& a) {
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  return predux_mul(pmul(pmul(a_alias[0], a_alias[1]),
+                                       pmul(a_alias[2], a_alias[3])));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
+plog1p<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = plog1p(a_alias[0]);
+  r_alias[1] = plog1p(a_alias[1]);
+  r_alias[2] = plog1p(a_alias[2]);
+  r_alias[3] = plog1p(a_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
+pexpm1<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = pexpm1(a_alias[0]);
+  r_alias[1] = pexpm1(a_alias[1]);
+  r_alias[2] = pexpm1(a_alias[2]);
+  r_alias[3] = pexpm1(a_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = plog(a_alias[0]);
+  r_alias[1] = plog(a_alias[1]);
+  r_alias[2] = plog(a_alias[2]);
+  r_alias[3] = plog(a_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexp<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = pexp(a_alias[0]);
+  r_alias[1] = pexp(a_alias[1]);
+  r_alias[2] = pexp(a_alias[2]);
+  r_alias[3] = pexp(a_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psqrt<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = psqrt(a_alias[0]);
+  r_alias[1] = psqrt(a_alias[1]);
+  r_alias[2] = psqrt(a_alias[2]);
+  r_alias[3] = psqrt(a_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
+prsqrt<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = prsqrt(a_alias[0]);
+  r_alias[1] = prsqrt(a_alias[1]);
+  r_alias[2] = prsqrt(a_alias[2]);
+  r_alias[3] = prsqrt(a_alias[3]);
+  return r;
+}
+
+// The following specialized padd, pmul, pdiv, pmin, pmax, pset1 are needed for
+// the implementation of GPU half reduction.
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a,
+                                                        const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hadd2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 + b1;
+  float r2 = a2 + b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a,
+                                                        const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hmul2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 * b1;
+  float r2 = a2 * b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a,
+                                                        const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __h2div(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 / b1;
+  float r2 = a2 / b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a,
+                                                        const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b);
+  __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b);
+  return combine_half(r1, r2);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a,
+                                                        const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b);
+  __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b);
+  return combine_half(r1, r2);
+}
+
+// #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
+
+#endif // defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
+
+#undef EIGEN_GPU_HAS_LDG
+#undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
+#undef EIGEN_GPU_HAS_FP16_ARITHMETIC
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+
+#endif // EIGEN_PACKET_MATH_GPU_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h b/src/3rdparty/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h
new file mode 100644
index 000000000..754546225
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h
@@ -0,0 +1,80 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_GPU_H
+#define EIGEN_TYPE_CASTING_GPU_H
+
+namespace Eigen {
+
+namespace internal {
+
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+
+
+template <>
+struct type_casting_traits<Eigen::half, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 2
+  };
+};
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) {
+  float2 r1 = __half22float2(a);
+  float2 r2 = __half22float2(b);
+  return make_float4(r1.x, r1.y, r2.x, r2.y);
+}
+
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcast<float4, Packet4h2>(const float4& a, const float4& b) {
+  Packet4h2 r;
+  half2* r_alias=reinterpret_cast<half2*>(&r);
+  r_alias[0]=__floats2half2_rn(a.x,a.y);
+  r_alias[1]=__floats2half2_rn(a.z,a.w);
+  r_alias[2]=__floats2half2_rn(b.x,b.y);
+  r_alias[3]=__floats2half2_rn(b.z,b.w);
+  return r;
+}
+
+template <>
+struct type_casting_traits<float, Eigen::half> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 2,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<Packet4h2, float4>(const Packet4h2& a) {
+  // Simply discard the second half of the input
+  float4 r;
+  const half2* a_alias=reinterpret_cast<const half2*>(&a);
+  float2 r1 = __half22float2(a_alias[0]);
+  float2 r2 = __half22float2(a_alias[1]);
+  r.x=static_cast<float>(r1.x);
+  r.y=static_cast<float>(r1.y);
+  r.z=static_cast<float>(r2.x);
+  r.w=static_cast<float>(r2.y);
+  return r;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
+  // Simply discard the second half of the input
+  return __floats2half2_rn(a.x, a.y);
+}
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_GPU_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h b/src/3rdparty/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h
new file mode 100644
index 000000000..25375a0a4
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h
@@ -0,0 +1,23 @@
+/*
+ * math_constants.h - 
+ *  HIP equivalent of the CUDA header of the same name
+ */
+
+#ifndef __MATH_CONSTANTS_H__
+#define __MATH_CONSTANTS_H__
+
+/* single precision constants */
+
+#define HIPRT_INF_F        __int_as_float(0x7f800000)
+#define HIPRT_NAN_F        __int_as_float(0x7fffffff)
+#define HIPRT_MIN_DENORM_F __int_as_float(0x00000001)
+#define HIPRT_MAX_NORMAL_F __int_as_float(0x7f7fffff)
+#define HIPRT_NEG_ZERO_F   __int_as_float(0x80000000)
+#define HIPRT_ZERO_F       0.0f
+#define HIPRT_ONE_F        1.0f
+
+/* double precision constants */
+#define HIPRT_INF          __hiloint2double(0x7ff00000, 0x00000000)
+#define HIPRT_NAN          __hiloint2double(0xfff80000, 0x00000000)
+
+#endif
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/MSA/Complex.h b/src/3rdparty/eigen/Eigen/src/Core/arch/MSA/Complex.h
new file mode 100644
index 000000000..53dacfa43
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/MSA/Complex.h
@@ -0,0 +1,648 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Wave Computing, Inc.
+// Written by:
+//   Chris Larsen
+//   Alexey Frunze (afrunze@wavecomp.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_MSA_H
+#define EIGEN_COMPLEX_MSA_H
+
+#include <iostream>
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- float ----------
+struct Packet2cf {
+  EIGEN_STRONG_INLINE Packet2cf() {
+  }
+  EIGEN_STRONG_INLINE explicit Packet2cf(const std::complex<float>& a,
+                                         const std::complex<float>& b) {
+    Packet4f t = { std::real(a), std::imag(a), std::real(b), std::imag(b) };
+    v = t;
+  }
+  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {
+  }
+  EIGEN_STRONG_INLINE Packet2cf(const Packet2cf& a) : v(a.v) {
+  }
+  EIGEN_STRONG_INLINE Packet2cf& operator=(const Packet2cf& b) {
+    v = b.v;
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf conjugate(void) const {
+    return Packet2cf((Packet4f)__builtin_msa_bnegi_d((v2u64)v, 63));
+  }
+  EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) {
+    Packet4f v1, v2;
+
+    // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
+    v1 = (Packet4f)__builtin_msa_ilvev_w((v4i32)v, (v4i32)v);
+    // Get the imag values of a | a1_im | a1_im | a2_im | a2_im |
+    v2 = (Packet4f)__builtin_msa_ilvod_w((v4i32)v, (v4i32)v);
+    // Multiply the real a with b
+    v1 = pmul(v1, b.v);
+    // Multiply the imag a with b
+    v2 = pmul(v2, b.v);
+    // Conjugate v2
+    v2 = Packet2cf(v2).conjugate().v;
+    // Swap real/imag elements in v2.
+    v2 = (Packet4f)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(1, 0, 3, 2));
+    // Add and return the result
+    v = padd(v1, v2);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const {
+    return Packet2cf(*this) *= b;
+  }
+  EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) {
+    v = padd(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const {
+    return Packet2cf(*this) += b;
+  }
+  EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) {
+    v = psub(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const {
+    return Packet2cf(*this) -= b;
+  }
+  EIGEN_STRONG_INLINE Packet2cf& operator/=(const Packet2cf& b) {
+    *this *= b.conjugate();
+    Packet4f s = pmul<Packet4f>(b.v, b.v);
+    s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+    v = pdiv(v, s);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator/(const Packet2cf& b) const {
+    return Packet2cf(*this) /= b;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator-(void) const {
+    return Packet2cf(pnegate(v));
+  }
+
+  Packet4f v;
+};
+
+inline std::ostream& operator<<(std::ostream& os, const Packet2cf& value) {
+  os << "[ (" << value.v[0] << ", " << value.v[1]
+     << "i),"
+        "  ("
+     << value.v[2] << ", " << value.v[3] << "i) ]";
+  return os;
+}
+
+template <>
+struct packet_traits<std::complex<float> > : default_packet_traits {
+  typedef Packet2cf type;
+  typedef Packet2cf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket = 0,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasSetLinear = 0,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct unpacket_traits<Packet2cf> {
+  typedef std::complex<float> type;
+  enum { size = 2, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
+  typedef Packet2cf half;
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
+  EIGEN_MSA_DEBUG;
+
+  float f0 = from.real(), f1 = from.imag();
+  Packet4f v0 = { f0, f0, f0, f0 };
+  Packet4f v1 = { f1, f1, f1, f1 };
+  return Packet2cf((Packet4f)__builtin_msa_ilvr_w((Packet4i)v1, (Packet4i)v0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  return a + b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  return a - b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  return -a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a.conjugate();
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  return a * b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet2cf(pand(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet2cf(por(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet2cf(pxor(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet2cf(pandnot(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_MSA_DEBUG;
+
+  return pset1<Packet2cf>(*from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to,
+                                                      const Packet2cf& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_STORE pstore<float>((float*)to, from.v);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to,
+                                                       const Packet2cf& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu<float>((float*)to, from.v);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(
+    const std::complex<float>* from, Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet2cf(from[0 * stride], from[1 * stride]);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to,
+                                                                       const Packet2cf& from,
+                                                                       Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  *to = std::complex<float>(from.v[0], from.v[1]);
+  to += stride;
+  *to = std::complex<float>(from.v[2], from.v[3]);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
+  EIGEN_MSA_DEBUG;
+
+  prefetch(reinterpret_cast<const float*>(addr));
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  return std::complex<float>(a.v[0], a.v[1]);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4f value = (Packet4f)preverse((Packet2d)a.v);
+  value += a.v;
+  return std::complex<float>(value[0], value[1]);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  return std::complex<float>((a.v[0] * a.v[2]) - (a.v[1] * a.v[3]),
+                             (a.v[0] * a.v[3]) + (a.v[1] * a.v[2]));
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  return a / b;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2cf, 2>& value) {
+  os << "[ " << value.packet[0] << ", " << std::endl << "  " << value.packet[1] << " ]";
+  return os;
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4f tmp =
+      (Packet4f)__builtin_msa_ilvl_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
+  kernel.packet[0].v =
+      (Packet4f)__builtin_msa_ilvr_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
+  kernel.packet[1].v = tmp;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
+                                     const Packet2cf& elsePacket) {
+  return (Packet2cf)(Packet4f)pblend<Packet2d>(ifPacket, (Packet2d)thenPacket.v,
+                                               (Packet2d)elsePacket.v);
+}
+
+//---------- double ----------
+
+struct Packet1cd {
+  EIGEN_STRONG_INLINE Packet1cd() {
+  }
+  EIGEN_STRONG_INLINE explicit Packet1cd(const std::complex<double>& a) {
+    v[0] = std::real(a);
+    v[1] = std::imag(a);
+  }
+  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {
+  }
+  EIGEN_STRONG_INLINE Packet1cd(const Packet1cd& a) : v(a.v) {
+  }
+  EIGEN_STRONG_INLINE Packet1cd& operator=(const Packet1cd& b) {
+    v = b.v;
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd conjugate(void) const {
+    static const v2u64 p2ul_CONJ_XOR = { 0x0, 0x8000000000000000 };
+    return (Packet1cd)pxor(v, (Packet2d)p2ul_CONJ_XOR);
+  }
+  EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) {
+    Packet2d v1, v2;
+
+    // Get the real values of a | a1_re | a1_re
+    v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)v, (v2i64)v);
+    // Get the imag values of a | a1_im | a1_im
+    v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)v, (v2i64)v);
+    // Multiply the real a with b
+    v1 = pmul(v1, b.v);
+    // Multiply the imag a with b
+    v2 = pmul(v2, b.v);
+    // Conjugate v2
+    v2 = Packet1cd(v2).conjugate().v;
+    // Swap real/imag elements in v2.
+    v2 = (Packet2d)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+    // Add and return the result
+    v = padd(v1, v2);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const {
+    return Packet1cd(*this) *= b;
+  }
+  EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) {
+    v = padd(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const {
+    return Packet1cd(*this) += b;
+  }
+  EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) {
+    v = psub(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const {
+    return Packet1cd(*this) -= b;
+  }
+  EIGEN_STRONG_INLINE Packet1cd& operator/=(const Packet1cd& b) {
+    *this *= b.conjugate();
+    Packet2d s = pmul<Packet2d>(b.v, b.v);
+    s = padd(s, preverse<Packet2d>(s));
+    v = pdiv(v, s);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator/(const Packet1cd& b) const {
+    return Packet1cd(*this) /= b;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator-(void) const {
+    return Packet1cd(pnegate(v));
+  }
+
+  Packet2d v;
+};
+
+inline std::ostream& operator<<(std::ostream& os, const Packet1cd& value) {
+  os << "[ (" << value.v[0] << ", " << value.v[1] << "i) ]";
+  return os;
+}
+
+template <>
+struct packet_traits<std::complex<double> > : default_packet_traits {
+  typedef Packet1cd type;
+  typedef Packet1cd half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 0,
+    size = 1,
+    HasHalfPacket = 0,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasSetLinear = 0
+  };
+};
+
+template <>
+struct unpacket_traits<Packet1cd> {
+  typedef std::complex<double> type;
+  enum { size = 1, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
+  typedef Packet1cd half;
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet1cd(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  return a + b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  return a - b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
+  EIGEN_MSA_DEBUG;
+
+  return -a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a.conjugate();
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  return a * b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet1cd(pand(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet1cd(por(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet1cd(pxor(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet1cd(pandnot(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_MSA_DEBUG;
+
+  return pset1<Packet1cd>(*from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to,
+                                                       const Packet1cd& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_STORE pstore<double>((double*)to, from.v);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to,
+                                                        const Packet1cd& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu<double>((double*)to, from.v);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
+  EIGEN_MSA_DEBUG;
+
+  prefetch(reinterpret_cast<const double*>(addr));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(
+    const std::complex<double>* from, Index stride __attribute__((unused))) {
+  EIGEN_MSA_DEBUG;
+
+  Packet1cd res;
+  res.v[0] = std::real(from[0]);
+  res.v[1] = std::imag(from[0]);
+  return res;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to,
+                                                                        const Packet1cd& from,
+                                                                        Index stride
+                                                                        __attribute__((unused))) {
+  EIGEN_MSA_DEBUG;
+
+  pstore(to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
+  EIGEN_MSA_DEBUG;
+
+  return std::complex<double>(a.v[0], a.v[1]);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
+  EIGEN_MSA_DEBUG;
+
+  return pfirst(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
+  EIGEN_MSA_DEBUG;
+
+  return pfirst(a);
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  return a / b;
+}
+
+EIGEN_STRONG_INLINE Packet1cd pcplxflip /*<Packet1cd>*/ (const Packet1cd& x) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet1cd, 2>& value) {
+  os << "[ " << value.packet[0] << ", " << std::endl << "  " << value.packet[1] << " ]";
+  return os;
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2d v1, v2;
+
+  v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v);
+  // Get the imag values of a
+  v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v);
+
+  kernel.packet[0].v = v1;
+  kernel.packet[1].v = v2;
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_COMPLEX_MSA_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h b/src/3rdparty/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h
new file mode 100644
index 000000000..f5181b90e
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h
@@ -0,0 +1,387 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// Copyright (C) 2018 Wave Computing, Inc.
+// Written by:
+//   Chris Larsen
+//   Alexey Frunze (afrunze@wavecomp.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* The sin, cos, exp, and log functions of this file come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+/* The tanh function of this file is an adaptation of
+ * template<typename T> T generic_fast_tanh_float(const T&)
+ * from MathFunctionsImpl.h.
+ */
+
+#ifndef EIGEN_MATH_FUNCTIONS_MSA_H
+#define EIGEN_MATH_FUNCTIONS_MSA_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+plog<Packet4f>(const Packet4f& _x) {
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292e-2f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, -1.2420140846e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, +1.4249322787e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, -1.6668057665e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, +2.0000714765e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, -2.4999993993e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, +3.3333331174e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+  static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+  static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
+
+  // Convert negative argument into NAN (quiet negative, to be specific).
+  Packet4f zero = (Packet4f)__builtin_msa_ldi_w(0);
+  Packet4i neg_mask = __builtin_msa_fclt_w(_x, zero);
+  Packet4i zero_mask = __builtin_msa_fceq_w(_x, zero);
+  Packet4f non_neg_x_or_nan = padd(_x, (Packet4f)neg_mask);  // Add 0.0 or NAN.
+  Packet4f x = non_neg_x_or_nan;
+
+  // Extract exponent from x = mantissa * 2**exponent, where 1.0 <= mantissa < 2.0.
+  // N.B. the exponent is one less of what frexpf() would return.
+  Packet4i e_int = __builtin_msa_ftint_s_w(__builtin_msa_flog2_w(x));
+  // Multiply x by 2**(-exponent-1) to get 0.5 <= x < 1.0 as from frexpf().
+  x = __builtin_msa_fexp2_w(x, (Packet4i)__builtin_msa_nori_b((v16u8)e_int, 0));
+
+  /*
+     if (x < SQRTHF) {
+       x = x + x - 1.0;
+     } else {
+       e += 1;
+       x = x - 1.0;
+     }
+  */
+  Packet4f xx = padd(x, x);
+  Packet4i ge_mask = __builtin_msa_fcle_w(p4f_cephes_SQRTHF, x);
+  e_int = psub(e_int, ge_mask);
+  x = (Packet4f)__builtin_msa_bsel_v((v16u8)ge_mask, (v16u8)xx, (v16u8)x);
+  x = psub(x, p4f_1);
+  Packet4f e = __builtin_msa_ffint_s_w(e_int);
+
+  Packet4f x2 = pmul(x, x);
+  Packet4f x3 = pmul(x2, x);
+
+  Packet4f y, y1, y2;
+  y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
+  y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
+  y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
+  y = pmadd(y, x, p4f_cephes_log_p2);
+  y1 = pmadd(y1, x, p4f_cephes_log_p5);
+  y2 = pmadd(y2, x, p4f_cephes_log_p8);
+  y = pmadd(y, x3, y1);
+  y = pmadd(y, x3, y2);
+  y = pmul(y, x3);
+
+  y = pmadd(e, p4f_cephes_log_q1, y);
+  x = __builtin_msa_fmsub_w(x, x2, p4f_half);
+  x = padd(x, y);
+  x = pmadd(e, p4f_cephes_log_q2, x);
+
+  // x is now the logarithm result candidate. We still need to handle the
+  // extreme arguments of zero and positive infinity, though.
+  // N.B. if the argument is +INFINITY, x is NAN because the polynomial terms
+  // contain infinities of both signs (see the coefficients and code above).
+  // INFINITY - INFINITY is NAN.
+
+  // If the argument is +INFINITY, make it the new result candidate.
+  // To achieve that we choose the smaller of the result candidate and the
+  // argument.
+  // This is correct for all finite pairs of values (the logarithm is smaller
+  // than the argument).
+  // This is also correct in the special case when the argument is +INFINITY
+  // and the result candidate is NAN. This is because the fmin.df instruction
+  // prefers non-NANs to NANs.
+  x = __builtin_msa_fmin_w(x, non_neg_x_or_nan);
+
+  // If the argument is zero (including -0.0), the result becomes -INFINITY.
+  Packet4i neg_infs = __builtin_msa_slli_w(zero_mask, 23);
+  x = (Packet4f)__builtin_msa_bsel_v((v16u8)zero_mask, (v16u8)x, (v16u8)neg_infs);
+
+  return x;
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+pexp<Packet4f>(const Packet4f& _x) {
+  // Limiting single-precision pexp's argument to [-128, +128] lets pexp
+  // reach 0 and INFINITY naturally.
+  static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -128.0f);
+  static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, +128.0f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500e-4f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507e-3f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073e-3f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894e-2f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+  static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
+
+  Packet4f x = _x;
+
+  // Clamp x.
+  x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(x, p4f_exp_lo), (v16u8)x,
+                                     (v16u8)p4f_exp_lo);
+  x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_exp_hi, x), (v16u8)x,
+                                     (v16u8)p4f_exp_hi);
+
+  // Round to nearest integer by adding 0.5 (with x's sign) and truncating.
+  Packet4f x2_add = (Packet4f)__builtin_msa_binsli_w((v4u32)p4f_half, (v4u32)x, 0);
+  Packet4f x2 = pmadd(x, p4f_cephes_LOG2EF, x2_add);
+  Packet4i x2_int = __builtin_msa_ftrunc_s_w(x2);
+  Packet4f x2_int_f = __builtin_msa_ffint_s_w(x2_int);
+
+  x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C1);
+  x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C2);
+
+  Packet4f z = pmul(x, x);
+
+  Packet4f y = p4f_cephes_exp_p0;
+  y = pmadd(y, x, p4f_cephes_exp_p1);
+  y = pmadd(y, x, p4f_cephes_exp_p2);
+  y = pmadd(y, x, p4f_cephes_exp_p3);
+  y = pmadd(y, x, p4f_cephes_exp_p4);
+  y = pmadd(y, x, p4f_cephes_exp_p5);
+  y = pmadd(y, z, x);
+  y = padd(y, p4f_1);
+
+  // y *= 2**exponent.
+  y = __builtin_msa_fexp2_w(y, x2_int);
+
+  return y;
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+ptanh<Packet4f>(const Packet4f& _x) {
+  static _EIGEN_DECLARE_CONST_Packet4f(tanh_tiny, 1e-4f);
+  static _EIGEN_DECLARE_CONST_Packet4f(tanh_hi, 9.0f);
+  // The monomial coefficients of the numerator polynomial (odd).
+  static _EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-3f);
+  static _EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-4f);
+  static _EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-5f);
+  static _EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-8f);
+  static _EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f);
+  static _EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f);
+  static _EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f);
+  // The monomial coefficients of the denominator polynomial (even).
+  static _EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-3f);
+  static _EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-3f);
+  static _EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-4f);
+  static _EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-6f);
+
+  Packet4f x = pabs(_x);
+  Packet4i tiny_mask = __builtin_msa_fclt_w(x, p4f_tanh_tiny);
+
+  // Clamp the inputs to the range [-9, 9] since anything outside
+  // this range is -/+1.0f in single-precision.
+  x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_tanh_hi, x), (v16u8)x,
+                                     (v16u8)p4f_tanh_hi);
+
+  // Since the polynomials are odd/even, we need x**2.
+  Packet4f x2 = pmul(x, x);
+
+  // Evaluate the numerator polynomial p.
+  Packet4f p = pmadd(x2, p4f_alpha_13, p4f_alpha_11);
+  p = pmadd(x2, p, p4f_alpha_9);
+  p = pmadd(x2, p, p4f_alpha_7);
+  p = pmadd(x2, p, p4f_alpha_5);
+  p = pmadd(x2, p, p4f_alpha_3);
+  p = pmadd(x2, p, p4f_alpha_1);
+  p = pmul(x, p);
+
+  // Evaluate the denominator polynomial q.
+  Packet4f q = pmadd(x2, p4f_beta_6, p4f_beta_4);
+  q = pmadd(x2, q, p4f_beta_2);
+  q = pmadd(x2, q, p4f_beta_0);
+
+  // Divide the numerator by the denominator.
+  p = pdiv(p, q);
+
+  // Reinstate the sign.
+  p = (Packet4f)__builtin_msa_binsli_w((v4u32)p, (v4u32)_x, 0);
+
+  // When the argument is very small in magnitude it's more accurate to just return it.
+  p = (Packet4f)__builtin_msa_bsel_v((v16u8)tiny_mask, (v16u8)p, (v16u8)_x);
+
+  return p;
+}
+
+template <bool sine>
+Packet4f psincos_inner_msa_float(const Packet4f& _x) {
+  static _EIGEN_DECLARE_CONST_Packet4f(sincos_max_arg, 13176795.0f);  // Approx. (2**24) / (4/Pi).
+  static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1, -0.78515625f);
+  static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
+  static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
+  static _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891e-4f);
+  static _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736e-3f);
+  static _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948e-5f);
+  static _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765e-3f);
+  static _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827e-2f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f);  // 4/Pi.
+  static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+  static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
+
+  Packet4f x = pabs(_x);
+
+  // Translate infinite arguments into NANs.
+  Packet4f zero_or_nan_if_inf = psub(_x, _x);
+  x = padd(x, zero_or_nan_if_inf);
+  // Prevent sin/cos from generating values larger than 1.0 in magnitude
+  // for very large arguments by setting x to 0.0.
+  Packet4i small_or_nan_mask = __builtin_msa_fcult_w(x, p4f_sincos_max_arg);
+  x = pand(x, (Packet4f)small_or_nan_mask);
+
+  // Scale x by 4/Pi to find x's octant.
+  Packet4f y = pmul(x, p4f_cephes_FOPI);
+  // Get the octant. We'll reduce x by this number of octants or by one more than it.
+  Packet4i y_int = __builtin_msa_ftrunc_s_w(y);
+  // x's from even-numbered octants will translate to octant 0: [0, +Pi/4].
+  // x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0].
+  // Adjustment for odd-numbered octants: octant = (octant + 1) & (~1).
+  Packet4i y_int1 = __builtin_msa_addvi_w(y_int, 1);
+  Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0); // bclri = bit-clear
+  y = __builtin_msa_ffint_s_w(y_int2);
+
+  // Compute the sign to apply to the polynomial.
+  Packet4i sign_mask = sine ? pxor(__builtin_msa_slli_w(y_int1, 29), (Packet4i)_x)
+                            : __builtin_msa_slli_w(__builtin_msa_addvi_w(y_int, 3), 29);
+
+  // Get the polynomial selection mask.
+  // We'll calculate both (sin and cos) polynomials and then select from the two.
+  Packet4i poly_mask = __builtin_msa_ceqi_w(__builtin_msa_slli_w(y_int2, 30), 0);
+
+  // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4.
+  // The magic pass: "Extended precision modular arithmetic"
+  // x = ((x - y * DP1) - y * DP2) - y * DP3
+  Packet4f tmp1 = pmul(y, p4f_minus_cephes_DP1);
+  Packet4f tmp2 = pmul(y, p4f_minus_cephes_DP2);
+  Packet4f tmp3 = pmul(y, p4f_minus_cephes_DP3);
+  x = padd(x, tmp1);
+  x = padd(x, tmp2);
+  x = padd(x, tmp3);
+
+  // Evaluate the cos(x) polynomial.
+  y = p4f_coscof_p0;
+  Packet4f z = pmul(x, x);
+  y = pmadd(y, z, p4f_coscof_p1);
+  y = pmadd(y, z, p4f_coscof_p2);
+  y = pmul(y, z);
+  y = pmul(y, z);
+  y = __builtin_msa_fmsub_w(y, z, p4f_half);
+  y = padd(y, p4f_1);
+
+  // Evaluate the sin(x) polynomial.
+  Packet4f y2 = p4f_sincof_p0;
+  y2 = pmadd(y2, z, p4f_sincof_p1);
+  y2 = pmadd(y2, z, p4f_sincof_p2);
+  y2 = pmul(y2, z);
+  y2 = pmadd(y2, x, x);
+
+  // Select the correct result from the two polynomials.
+  y = sine ? (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y, (v16u8)y2)
+           : (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y2, (v16u8)y);
+
+  // Update the sign.
+  sign_mask = pxor(sign_mask, (Packet4i)y);
+  y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0); // binsli = bit-insert-left
+  return y;
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+psin<Packet4f>(const Packet4f& x) {
+  return psincos_inner_msa_float</* sine */ true>(x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+pcos<Packet4f>(const Packet4f& x) {
+  return psincos_inner_msa_float</* sine */ false>(x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d
+pexp<Packet2d>(const Packet2d& _x) {
+  // Limiting double-precision pexp's argument to [-1024, +1024] lets pexp
+  // reach 0 and INFINITY naturally.
+  static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -1024.0);
+  static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, +1024.0);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
+  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
+  static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
+  static _EIGEN_DECLARE_CONST_Packet2d(1, 1.0);
+  static _EIGEN_DECLARE_CONST_Packet2d(2, 2.0);
+
+  Packet2d x = _x;
+
+  // Clamp x.
+  x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(x, p2d_exp_lo), (v16u8)x,
+                                     (v16u8)p2d_exp_lo);
+  x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(p2d_exp_hi, x), (v16u8)x,
+                                     (v16u8)p2d_exp_hi);
+
+  // Round to nearest integer by adding 0.5 (with x's sign) and truncating.
+  Packet2d x2_add = (Packet2d)__builtin_msa_binsli_d((v2u64)p2d_half, (v2u64)x, 0);
+  Packet2d x2 = pmadd(x, p2d_cephes_LOG2EF, x2_add);
+  Packet2l x2_long = __builtin_msa_ftrunc_s_d(x2);
+  Packet2d x2_long_d = __builtin_msa_ffint_s_d(x2_long);
+
+  x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C1);
+  x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C2);
+
+  x2 = pmul(x, x);
+
+  Packet2d px = p2d_cephes_exp_p0;
+  px = pmadd(px, x2, p2d_cephes_exp_p1);
+  px = pmadd(px, x2, p2d_cephes_exp_p2);
+  px = pmul(px, x);
+
+  Packet2d qx = p2d_cephes_exp_q0;
+  qx = pmadd(qx, x2, p2d_cephes_exp_q1);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q2);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q3);
+
+  x = pdiv(px, psub(qx, px));
+  x = pmadd(p2d_2, x, p2d_1);
+
+  // x *= 2**exponent.
+  x = __builtin_msa_fexp2_d(x, x2_long);
+
+  return x;
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_MSA_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/MSA/PacketMath.h b/src/3rdparty/eigen/Eigen/src/Core/arch/MSA/PacketMath.h
new file mode 100644
index 000000000..afe8f3375
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/MSA/PacketMath.h
@@ -0,0 +1,1233 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Wave Computing, Inc.
+// Written by:
+//   Chris Larsen
+//   Alexey Frunze (afrunze@wavecomp.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_MSA_H
+#define EIGEN_PACKET_MATH_MSA_H
+
+#include <iostream>
+#include <string>
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+#endif
+
+#if 0
+#define EIGEN_MSA_DEBUG                                                             \
+  static bool firstTime = true;                                                     \
+  do {                                                                              \
+    if (firstTime) {                                                                \
+      std::cout << __FILE__ << ':' << __LINE__ << ':' << __FUNCTION__ << std::endl; \
+      firstTime = false;                                                            \
+    }                                                                               \
+  } while (0)
+#else
+#define EIGEN_MSA_DEBUG
+#endif
+
+#define EIGEN_MSA_SHF_I8(a, b, c, d) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a))
+
+typedef v4f32 Packet4f;
+typedef v4i32 Packet4i;
+typedef v4u32 Packet4ui;
+
+#define _EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = { X, X, X, X }
+#define _EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = { X, X, X, X }
+#define _EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = { X, X, X, X }
+
+inline std::ostream& operator<<(std::ostream& os, const Packet4f& value) {
+  os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
+  return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const Packet4i& value) {
+  os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
+  return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const Packet4ui& value) {
+  os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
+  return os;
+}
+
+template <>
+struct packet_traits<float> : default_packet_traits {
+  typedef Packet4f type;
+  typedef Packet4f half;  // Packet2f intrinsics not implemented yet
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 0,  // Packet2f intrinsics not implemented yet
+    // FIXME check the Has*
+    HasDiv = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct packet_traits<int32_t> : default_packet_traits {
+  typedef Packet4i type;
+  typedef Packet4i half;  // Packet2i intrinsics not implemented yet
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 0,  // Packet2i intrinsics not implemented yet
+    // FIXME check the Has*
+    HasDiv = 1,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct unpacket_traits<Packet4f> {
+  typedef float type;
+  enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
+  typedef Packet4f half;
+};
+
+template <>
+struct unpacket_traits<Packet4i> {
+  typedef int32_t type;
+  enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
+  typedef Packet4i half;
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4f v = { from, from, from, from };
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fill_w(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float* from) {
+  EIGEN_MSA_DEBUG;
+
+  float f = *from;
+  Packet4f v = { f, f, f, f };
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pload1<Packet4i>(const int32_t* from) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fill_w(*from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fadd_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_addv_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
+  EIGEN_MSA_DEBUG;
+
+  static const Packet4f countdown = { 0.0f, 1.0f, 2.0f, 3.0f };
+  return padd(pset1<Packet4f>(a), countdown);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) {
+  EIGEN_MSA_DEBUG;
+
+  static const Packet4i countdown = { 0, 1, 2, 3 };
+  return padd(pset1<Packet4i>(a), countdown);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fsub_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_subv_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4f)__builtin_msa_bnegi_w((v4u32)a, 31);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_addvi_w((v4i32)__builtin_msa_nori_b((v16u8)a, 0), 1);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fmul_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_mulv_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fdiv_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_div_s_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fmadd_w(c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
+  EIGEN_MSA_DEBUG;
+
+  // Use "asm" construct to avoid __builtin_msa_maddv_w GNU C bug.
+  Packet4i value = c;
+  __asm__("maddv.w %w[value], %w[a], %w[b]\n"
+          // Outputs
+          : [value] "+f"(value)
+          // Inputs
+          : [a] "f"(a), [b] "f"(b));
+  return value;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4f)__builtin_msa_and_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4i)__builtin_msa_and_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4f)__builtin_msa_or_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4i)__builtin_msa_or_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4f)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4i)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  return pand(a, (Packet4f)__builtin_msa_xori_b((v16u8)b, 255));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return pand(a, (Packet4i)__builtin_msa_xori_b((v16u8)b, 255));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  // This prefers numbers to NaNs.
+  return __builtin_msa_fmin_w(a, b);
+#else
+  // This prefers NaNs to numbers.
+  Packet4i aNaN = __builtin_msa_fcun_w(a, a);
+  Packet4i aMinOrNaN = por(__builtin_msa_fclt_w(a, b), aNaN);
+  return (Packet4f)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_min_s_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  // This prefers numbers to NaNs.
+  return __builtin_msa_fmax_w(a, b);
+#else
+  // This prefers NaNs to numbers.
+  Packet4i aNaN = __builtin_msa_fcun_w(a, a);
+  Packet4i aMaxOrNaN = por(__builtin_msa_fclt_w(b, a), aNaN);
+  return (Packet4f)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_max_s_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_LOAD return __builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4i)__builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
+  EIGEN_MSA_DEBUG;
+
+  float f0 = from[0], f1 = from[1];
+  Packet4f v0 = { f0, f0, f0, f0 };
+  Packet4f v1 = { f1, f1, f1, f1 };
+  return (Packet4f)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) {
+  EIGEN_MSA_DEBUG;
+
+  int32_t i0 = from[0], i1 = from[1];
+  Packet4i v0 = { i0, i0, i0, i0 };
+  Packet4i v1 = { i1, i1, i1, i1 };
+  return (Packet4i)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w(from, to, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w(from, to, 0);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  float f = *from;
+  Packet4f v = { f, f, f, f };
+  v[1] = from[stride];
+  v[2] = from[2 * stride];
+  v[3] = from[3 * stride];
+  return v;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  int32_t i = *from;
+  Packet4i v = { i, i, i, i };
+  v[1] = from[stride];
+  v[2] = from[2 * stride];
+  v[3] = from[3 * stride];
+  return v;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from,
+                                                        Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  *to = from[0];
+  to += stride;
+  *to = from[1];
+  to += stride;
+  *to = from[2];
+  to += stride;
+  *to = from[3];
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from,
+                                                          Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  *to = from[0];
+  to += stride;
+  *to = from[1];
+  to += stride;
+  *to = from[2];
+  to += stride;
+  *to = from[3];
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+  EIGEN_MSA_DEBUG;
+
+  __builtin_prefetch(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) {
+  EIGEN_MSA_DEBUG;
+
+  __builtin_prefetch(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4f)__builtin_msa_bclri_w((v4u32)a, 31);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4i zero = __builtin_msa_ldi_w(0);
+  return __builtin_msa_add_a_w(zero, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4f s = padd(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+  s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+  return s[0];
+}
+
+
+template <>
+EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4i s = padd(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+  s = padd(s, __builtin_msa_shf_w(s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+  return s[0];
+}
+
+// Other reduction functions:
+// mul
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4f p = pmul(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+  p = pmul(p, (Packet4f)__builtin_msa_shf_w((v4i32)p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+  return p[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4i p = pmul(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+  p = pmul(p, __builtin_msa_shf_w(p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+  return p[0];
+}
+
+// min
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  // Swap 64-bit halves of a.
+  Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+#if !EIGEN_FAST_MATH
+  // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit
+  // masks of all zeroes/ones in low 64 bits.
+  v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
+  // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes.
+  unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
+#endif
+  // Continue with min computation.
+  Packet4f v = __builtin_msa_fmin_w(a, swapped);
+  v = __builtin_msa_fmin_w(
+      v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+#if !EIGEN_FAST_MATH
+  // Based on the mask select between v and 4 qNaNs.
+  v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
+  v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
+#endif
+  return v[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4i m = pmin(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+  m = pmin(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+  return m[0];
+}
+
+// max
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  // Swap 64-bit halves of a.
+  Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+#if !EIGEN_FAST_MATH
+  // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit
+  // masks of all zeroes/ones in low 64 bits.
+  v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
+  // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes.
+  unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
+#endif
+  // Continue with max computation.
+  Packet4f v = __builtin_msa_fmax_w(a, swapped);
+  v = __builtin_msa_fmax_w(
+      v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+#if !EIGEN_FAST_MATH
+  // Based on the mask select between v and 4 qNaNs.
+  v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
+  v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
+#endif
+  return v[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4i m = pmax(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+  m = pmax(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+  return m[0];
+}
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4f, 4>& value) {
+  os << "[ " << value.packet[0] << "," << std::endl
+     << "  " << value.packet[1] << "," << std::endl
+     << "  " << value.packet[2] << "," << std::endl
+     << "  " << value.packet[3] << " ]";
+  return os;
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
+  EIGEN_MSA_DEBUG;
+
+  v4i32 tmp1, tmp2, tmp3, tmp4;
+
+  tmp1 = __builtin_msa_ilvr_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
+  tmp2 = __builtin_msa_ilvr_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
+  tmp3 = __builtin_msa_ilvl_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
+  tmp4 = __builtin_msa_ilvl_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
+
+  kernel.packet[0] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
+  kernel.packet[1] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
+  kernel.packet[2] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
+  kernel.packet[3] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
+}
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4i, 4>& value) {
+  os << "[ " << value.packet[0] << "," << std::endl
+     << "  " << value.packet[1] << "," << std::endl
+     << "  " << value.packet[2] << "," << std::endl
+     << "  " << value.packet[3] << " ]";
+  return os;
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
+  EIGEN_MSA_DEBUG;
+
+  v4i32 tmp1, tmp2, tmp3, tmp4;
+
+  tmp1 = __builtin_msa_ilvr_w(kernel.packet[1], kernel.packet[0]);
+  tmp2 = __builtin_msa_ilvr_w(kernel.packet[3], kernel.packet[2]);
+  tmp3 = __builtin_msa_ilvl_w(kernel.packet[1], kernel.packet[0]);
+  tmp4 = __builtin_msa_ilvl_w(kernel.packet[3], kernel.packet[2]);
+
+  kernel.packet[0] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
+  kernel.packet[1] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
+  kernel.packet[2] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
+  kernel.packet[3] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fsqrt_w(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  return __builtin_msa_frsqrt_w(a);
+#else
+  Packet4f ones = __builtin_msa_ffint_s_w(__builtin_msa_ldi_w(1));
+  return pdiv(ones, psqrt(a));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
+  Packet4f v = a;
+  int32_t old_mode, new_mode;
+  asm volatile(
+      "cfcmsa  %[old_mode], $1\n"
+      "ori     %[new_mode], %[old_mode], 3\n"  // 3 = round towards -INFINITY.
+      "ctcmsa  $1, %[new_mode]\n"
+      "frint.w %w[v], %w[v]\n"
+      "ctcmsa  $1, %[old_mode]\n"
+      :  // outputs
+      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+      [v] "+f"(v)
+      :  // inputs
+      :  // clobbers
+  );
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
+  Packet4f v = a;
+  int32_t old_mode, new_mode;
+  asm volatile(
+      "cfcmsa  %[old_mode], $1\n"
+      "ori     %[new_mode], %[old_mode], 3\n"
+      "xori    %[new_mode], %[new_mode], 1\n"  // 2 = round towards +INFINITY.
+      "ctcmsa  $1, %[new_mode]\n"
+      "frint.w %w[v], %w[v]\n"
+      "ctcmsa  $1, %[old_mode]\n"
+      :  // outputs
+      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+      [v] "+f"(v)
+      :  // inputs
+      :  // clobbers
+  );
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
+  Packet4f v = a;
+  int32_t old_mode, new_mode;
+  asm volatile(
+      "cfcmsa  %[old_mode], $1\n"
+      "ori     %[new_mode], %[old_mode], 3\n"
+      "xori    %[new_mode], %[new_mode], 3\n"  // 0 = round to nearest, ties to even.
+      "ctcmsa  $1, %[new_mode]\n"
+      "frint.w %w[v], %w[v]\n"
+      "ctcmsa  $1, %[old_mode]\n"
+      :  // outputs
+      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+      [v] "+f"(v)
+      :  // inputs
+      :  // clobbers
+  );
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
+                                    const Packet4f& elsePacket) {
+  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
+                       ifPacket.select[3] };
+  Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
+  return (Packet4f)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
+                                    const Packet4i& elsePacket) {
+  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
+                       ifPacket.select[3] };
+  Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
+  return (Packet4i)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
+}
+
+//---------- double ----------
+
+typedef v2f64 Packet2d;
+typedef v2i64 Packet2l;
+typedef v2u64 Packet2ul;
+
+#define _EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = { X, X }
+#define _EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = { X, X }
+#define _EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = { X, X }
+
+inline std::ostream& operator<<(std::ostream& os, const Packet2d& value) {
+  os << "[ " << value[0] << ", " << value[1] << " ]";
+  return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const Packet2l& value) {
+  os << "[ " << value[0] << ", " << value[1] << " ]";
+  return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const Packet2ul& value) {
+  os << "[ " << value[0] << ", " << value[1] << " ]";
+  return os;
+}
+
+template <>
+struct packet_traits<double> : default_packet_traits {
+  typedef Packet2d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket = 0,
+    // FIXME check the Has*
+    HasDiv = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct unpacket_traits<Packet2d> {
+  typedef double type;
+  enum { size = 2, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
+  typedef Packet2d half;
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2d value = { from, from };
+  return value;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fadd_d(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
+  EIGEN_MSA_DEBUG;
+
+  static const Packet2d countdown = { 0.0, 1.0 };
+  return padd(pset1<Packet2d>(a), countdown);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fsub_d(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet2d)__builtin_msa_bnegi_d((v2u64)a, 63);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fmul_d(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fdiv_d(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fmadd_d(c, a, b);
+}
+
+// Logical Operations are not supported for float, so we have to reinterpret casts using MSA
+// intrinsics
+template <>
+EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet2d)__builtin_msa_and_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet2d)__builtin_msa_or_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet2d)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  return pand(a, (Packet2d)__builtin_msa_xori_b((v16u8)b, 255));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  // This prefers numbers to NaNs.
+  return __builtin_msa_fmin_d(a, b);
+#else
+  // This prefers NaNs to numbers.
+  v2i64 aNaN = __builtin_msa_fcun_d(a, a);
+  v2i64 aMinOrNaN = por(__builtin_msa_fclt_d(a, b), aNaN);
+  return (Packet2d)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  // This prefers numbers to NaNs.
+  return __builtin_msa_fmax_d(a, b);
+#else
+  // This prefers NaNs to numbers.
+  v2i64 aNaN = __builtin_msa_fcun_d(a, a);
+  v2i64 aMaxOrNaN = por(__builtin_msa_fclt_d(b, a), aNaN);
+  return (Packet2d)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2d value = { *from, *from };
+  return value;
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2d value;
+  value[0] = *from;
+  from += stride;
+  value[1] = *from;
+  return value;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from,
+                                                         Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  *to = from[0];
+  to += stride;
+  *to = from[1];
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+  EIGEN_MSA_DEBUG;
+
+  __builtin_prefetch(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet2d)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet2d)__builtin_msa_bclri_d((v2u64)a, 63);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2d s = padd(a, preverse(a));
+  return s[0];
+}
+
+// Other reduction functions:
+// mul
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2d p = pmul(a, preverse(a));
+  return p[0];
+}
+
+// min
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+  Packet2d v = __builtin_msa_fmin_d(a, swapped);
+  return v[0];
+#else
+  double a0 = a[0], a1 = a[1];
+  return ((numext::isnan)(a0) || a0 < a1) ? a0 : a1;
+#endif
+}
+
+// max
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+  Packet2d v = __builtin_msa_fmax_d(a, swapped);
+  return v[0];
+#else
+  double a0 = a[0], a1 = a[1];
+  return ((numext::isnan)(a0) || a0 > a1) ? a0 : a1;
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fsqrt_d(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  return __builtin_msa_frsqrt_d(a);
+#else
+  Packet2d ones = __builtin_msa_ffint_s_d(__builtin_msa_ldi_d(1));
+  return pdiv(ones, psqrt(a));
+#endif
+}
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2d, 2>& value) {
+  os << "[ " << value.packet[0] << "," << std::endl << "  " << value.packet[1] << " ]";
+  return os;
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2d trn1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
+  Packet2d trn2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
+  kernel.packet[0] = trn1;
+  kernel.packet[1] = trn2;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
+  Packet2d v = a;
+  int32_t old_mode, new_mode;
+  asm volatile(
+      "cfcmsa  %[old_mode], $1\n"
+      "ori     %[new_mode], %[old_mode], 3\n"  // 3 = round towards -INFINITY.
+      "ctcmsa  $1, %[new_mode]\n"
+      "frint.d %w[v], %w[v]\n"
+      "ctcmsa  $1, %[old_mode]\n"
+      :  // outputs
+      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+      [v] "+f"(v)
+      :  // inputs
+      :  // clobbers
+  );
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
+  Packet2d v = a;
+  int32_t old_mode, new_mode;
+  asm volatile(
+      "cfcmsa  %[old_mode], $1\n"
+      "ori     %[new_mode], %[old_mode], 3\n"
+      "xori    %[new_mode], %[new_mode], 1\n"  // 2 = round towards +INFINITY.
+      "ctcmsa  $1, %[new_mode]\n"
+      "frint.d %w[v], %w[v]\n"
+      "ctcmsa  $1, %[old_mode]\n"
+      :  // outputs
+      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+      [v] "+f"(v)
+      :  // inputs
+      :  // clobbers
+  );
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
+  Packet2d v = a;
+  int32_t old_mode, new_mode;
+  asm volatile(
+      "cfcmsa  %[old_mode], $1\n"
+      "ori     %[new_mode], %[old_mode], 3\n"
+      "xori    %[new_mode], %[new_mode], 3\n"  // 0 = round to nearest, ties to even.
+      "ctcmsa  $1, %[new_mode]\n"
+      "frint.d %w[v], %w[v]\n"
+      "ctcmsa  $1, %[old_mode]\n"
+      :  // outputs
+      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+      [v] "+f"(v)
+      :  // inputs
+      :  // clobbers
+  );
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
+                                    const Packet2d& elsePacket) {
+  Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
+  Packet2l mask = __builtin_msa_ceqi_d((Packet2l)select, 0);
+  return (Packet2d)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_PACKET_MATH_MSA_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/NEON/Complex.h b/src/3rdparty/eigen/Eigen/src/Core/arch/NEON/Complex.h
new file mode 100644
index 000000000..f40af7f87
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/NEON/Complex.h
@@ -0,0 +1,584 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_NEON_H
+#define EIGEN_COMPLEX_NEON_H
+
+namespace Eigen {
+
+namespace internal {
+
+inline uint32x4_t p4ui_CONJ_XOR()
+{
+// See bug 1325, clang fails to call vld1q_u64.
+#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML
+  uint32x4_t ret = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+  return ret;
+#else
+  static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+  return vld1q_u32( conj_XOR_DATA );
+#endif
+}
+
+inline uint32x2_t p2ui_CONJ_XOR()
+{
+  static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000 };
+  return vld1_u32( conj_XOR_DATA );
+}
+
+//---------- float ----------
+
+struct Packet1cf
+{
+  EIGEN_STRONG_INLINE Packet1cf() {}
+  EIGEN_STRONG_INLINE explicit Packet1cf(const Packet2f& a) : v(a) {}
+  Packet2f v;
+};
+struct Packet2cf
+{
+  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+  Packet4f v;
+};
+
+template<> struct packet_traits<std::complex<float> > : default_packet_traits
+{
+  typedef Packet2cf type;
+  typedef Packet1cf half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket = 1,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasMul       = 1,
+    HasDiv       = 1,
+    HasNegate    = 1,
+    HasAbs       = 0,
+    HasAbs2      = 0,
+    HasMin       = 0,
+    HasMax       = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet1cf>
+{
+  typedef std::complex<float> type;
+  typedef Packet1cf half;
+  typedef Packet2f as_real;
+  enum
+  {
+    size = 1,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet2cf>
+{
+  typedef std::complex<float> type;
+  typedef Packet1cf half;
+  typedef Packet4f as_real;
+  enum
+  {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet1cf pcast<float,Packet1cf>(const float& a)
+{ return Packet1cf(vset_lane_f32(a, vdup_n_f32(0.f), 0)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pcast<Packet2f,Packet2cf>(const Packet2f& a)
+{ return Packet2cf(vreinterpretq_f32_u64(vmovl_u32(vreinterpret_u32_f32(a)))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf pset1<Packet1cf>(const std::complex<float>& from)
+{ return Packet1cf(vld1_f32(reinterpret_cast<const float*>(&from))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
+{
+  const float32x2_t r64 = vld1_f32(reinterpret_cast<const float*>(&from));
+  return Packet2cf(vcombine_f32(r64, r64));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cf padd<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
+{ return Packet1cf(padd<Packet2f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{ return Packet2cf(padd<Packet4f>(a.v, b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf psub<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
+{ return Packet1cf(psub<Packet2f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{ return Packet2cf(psub<Packet4f>(a.v, b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf pnegate(const Packet1cf& a) { return Packet1cf(pnegate<Packet2f>(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate<Packet4f>(a.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf pconj(const Packet1cf& a)
+{
+  const Packet2ui b = vreinterpret_u32_f32(a.v);
+  return Packet1cf(vreinterpret_f32_u32(veor_u32(b, p2ui_CONJ_XOR())));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
+{
+  const Packet4ui b = vreinterpretq_u32_f32(a.v);
+  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR())));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cf pmul<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
+{
+  Packet2f v1, v2;
+
+  // Get the real values of a | a1_re | a1_re |
+  v1 = vdup_lane_f32(a.v, 0);
+  // Get the imag values of a | a1_im | a1_im |
+  v2 = vdup_lane_f32(a.v, 1);
+  // Multiply the real a with b
+  v1 = vmul_f32(v1, b.v);
+  // Multiply the imag a with b
+  v2 = vmul_f32(v2, b.v);
+  // Conjugate v2
+  v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR()));
+  // Swap real/imag elements in v2.
+  v2 = vrev64_f32(v2);
+  // Add and return the result
+  return Packet1cf(vadd_f32(v1, v2));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  Packet4f v1, v2;
+
+  // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
+  v1 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 0), vdup_lane_f32(vget_high_f32(a.v), 0));
+  // Get the imag values of a | a1_im | a1_im | a2_im | a2_im |
+  v2 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 1), vdup_lane_f32(vget_high_f32(a.v), 1));
+  // Multiply the real a with b
+  v1 = vmulq_f32(v1, b.v);
+  // Multiply the imag a with b
+  v2 = vmulq_f32(v2, b.v);
+  // Conjugate v2
+  v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR()));
+  // Swap real/imag elements in v2.
+  v2 = vrev64q_f32(v2);
+  // Add and return the result
+  return Packet2cf(vaddq_f32(v1, v2));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cf pcmp_eq(const Packet1cf& a, const Packet1cf& b)
+{
+  // Compare real and imaginary parts of a and b to get the mask vector:
+  // [re(a[0])==re(b[0]), im(a[0])==im(b[0])]
+  Packet2f eq = pcmp_eq<Packet2f>(a.v, b.v);
+  // Swap real/imag elements in the mask in to get:
+  // [im(a[0])==im(b[0]), re(a[0])==re(b[0])]
+  Packet2f eq_swapped = vrev64_f32(eq);
+  // Return re(a)==re(b) && im(a)==im(b) by computing bitwise AND of eq and eq_swapped
+  return Packet1cf(pand<Packet2f>(eq, eq_swapped));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b)
+{
+  // Compare real and imaginary parts of a and b to get the mask vector:
+  // [re(a[0])==re(b[0]), im(a[0])==im(b[0]), re(a[1])==re(b[1]), im(a[1])==im(b[1])]
+  Packet4f eq = pcmp_eq<Packet4f>(a.v, b.v);
+  // Swap real/imag elements in the mask in to get:
+  // [im(a[0])==im(b[0]), re(a[0])==re(b[0]), im(a[1])==im(b[1]), re(a[1])==re(b[1])]
+  Packet4f eq_swapped = vrev64q_f32(eq);
+  // Return re(a)==re(b) && im(a)==im(b) by computing bitwise AND of eq and eq_swapped
+  return Packet2cf(pand<Packet4f>(eq, eq_swapped));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cf pand<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
+{ return Packet1cf(vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{ return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf por<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
+{ return Packet1cf(vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); }
+template<> EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{ return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf pxor<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
+{ return Packet1cf(vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{ return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf pandnot<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
+{ return Packet1cf(vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{ return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf pload<Packet1cf>(const std::complex<float>* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cf(pload<Packet2f>((const float*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(reinterpret_cast<const float*>(from))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf ploadu<Packet1cf>(const std::complex<float>* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cf(ploadu<Packet2f>((const float*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(reinterpret_cast<const float*>(from))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf ploaddup<Packet1cf>(const std::complex<float>* from)
+{ return pset1<Packet1cf>(*from); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from)
+{ return pset1<Packet2cf>(*from); }
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *to, const Packet1cf& from)
+{ EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *to, const Packet2cf& from)
+{ EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast<float*>(to), from.v); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *to, const Packet1cf& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *to, const Packet2cf& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<float*>(to), from.v); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet1cf pgather<std::complex<float>, Packet1cf>(
+    const std::complex<float>* from, Index stride)
+{
+  const Packet2f tmp = vdup_n_f32(std::real(from[0*stride]));
+  return Packet1cf(vset_lane_f32(std::imag(from[0*stride]), tmp, 1));
+}
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(
+    const std::complex<float>* from, Index stride)
+{
+  Packet4f res = vdupq_n_f32(std::real(from[0*stride]));
+  res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1);
+  res = vsetq_lane_f32(std::real(from[1*stride]), res, 2);
+  res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3);
+  return Packet2cf(res);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet1cf>(
+    std::complex<float>* to, const Packet1cf& from, Index stride)
+{ to[stride*0] = std::complex<float>(vget_lane_f32(from.v, 0), vget_lane_f32(from.v, 1)); }
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(
+    std::complex<float>* to, const Packet2cf& from, Index stride)
+{
+  to[stride*0] = std::complex<float>(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1));
+  to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
+}
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *addr)
+{ EIGEN_ARM_PREFETCH(reinterpret_cast<const float*>(addr)); }
+
+template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet1cf>(const Packet1cf& a)
+{
+  EIGEN_ALIGN16 std::complex<float> x;
+  vst1_f32(reinterpret_cast<float*>(&x), a.v);
+  return x;
+}
+template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
+{
+  EIGEN_ALIGN16 std::complex<float> x[2];
+  vst1q_f32(reinterpret_cast<float*>(x), a.v);
+  return x[0];
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cf preverse(const Packet1cf& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
+{ return Packet2cf(vcombine_f32(vget_high_f32(a.v), vget_low_f32(a.v))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf pcplxflip<Packet1cf>(const Packet1cf& a)
+{ return Packet1cf(vrev64_f32(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a)
+{ return Packet2cf(vrev64q_f32(a.v)); }
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet1cf>(const Packet1cf& a)
+{
+  std::complex<float> s;
+  vst1_f32((float *)&s, a.v);
+  return s;
+}
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> s;
+  vst1_f32(reinterpret_cast<float*>(&s), vadd_f32(vget_low_f32(a.v), vget_high_f32(a.v)));
+  return s;
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet1cf>(const Packet1cf& a)
+{
+  std::complex<float> s;
+  vst1_f32((float *)&s, a.v);
+  return s;
+}
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
+{
+  float32x2_t a1, a2, v1, v2, prod;
+  std::complex<float> s;
+
+  a1 = vget_low_f32(a.v);
+  a2 = vget_high_f32(a.v);
+   // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
+  v1 = vdup_lane_f32(a1, 0);
+  // Get the real values of a | a1_im | a1_im | a2_im | a2_im |
+  v2 = vdup_lane_f32(a1, 1);
+  // Multiply the real a with b
+  v1 = vmul_f32(v1, a2);
+  // Multiply the imag a with b
+  v2 = vmul_f32(v2, a2);
+  // Conjugate v2
+  v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR()));
+  // Swap real/imag elements in v2.
+  v2 = vrev64_f32(v2);
+  // Add v1, v2
+  prod = vadd_f32(v1, v2);
+
+  vst1_f32(reinterpret_cast<float*>(&s), prod);
+
+  return s;
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cf,Packet2f)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+
+template<> EIGEN_STRONG_INLINE Packet1cf pdiv<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
+{
+  // TODO optimize it for NEON
+  Packet1cf res = pmul(a, pconj(b));
+  Packet2f s, rev_s;
+
+  // this computes the norm
+  s = vmul_f32(b.v, b.v);
+  rev_s = vrev64_f32(s);
+
+  return Packet1cf(pdiv<Packet2f>(res.v, vadd_f32(s, rev_s)));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  // TODO optimize it for NEON
+  Packet2cf res = pmul(a,pconj(b));
+  Packet4f s, rev_s;
+
+  // this computes the norm
+  s = vmulq_f32(b.v, b.v);
+  rev_s = vrev64q_f32(s);
+
+  return Packet2cf(pdiv<Packet4f>(res.v, vaddq_f32(s, rev_s)));
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet1cf, 1>& /*kernel*/) {}
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel)
+{
+  Packet4f tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v));
+  kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v));
+  kernel.packet[1].v = tmp;
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cf psqrt<Packet1cf>(const Packet1cf& a) {
+  return psqrt_complex<Packet1cf>(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
+  return psqrt_complex<Packet2cf>(a);
+}
+
+//---------- double ----------
+#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
+
+// See bug 1325, clang fails to call vld1q_u64.
+#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML
+  static uint64x2_t p2ul_CONJ_XOR = {0x0, 0x8000000000000000};
+#else
+  const uint64_t  p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 };
+  static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );
+#endif
+
+struct Packet1cd
+{
+  EIGEN_STRONG_INLINE Packet1cd() {}
+  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
+  Packet2d v;
+};
+
+template<> struct packet_traits<std::complex<double> >  : default_packet_traits
+{
+  typedef Packet1cd type;
+  typedef Packet1cd half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 0,
+    size = 1,
+    HasHalfPacket = 0,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet1cd>
+{
+  typedef std::complex<double> type;
+  typedef Packet1cd half;
+  typedef Packet2d as_real;
+  enum
+  {
+    size=1,
+    alignment=Aligned16,
+    vectorizable=true,
+    masked_load_available=false,
+    masked_store_available=false
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>(reinterpret_cast<const double*>(from))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>(reinterpret_cast<const double*>(from))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
+{
+  /* here we really have to use unaligned loads :( */
+  return ploadu<Packet1cd>(&from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{ return Packet1cd(padd<Packet2d>(a.v, b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{ return Packet1cd(psub<Packet2d>(a.v, b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a)
+{ return Packet1cd(pnegate<Packet2d>(a.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
+{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  Packet2d v1, v2;
+
+  // Get the real values of a
+  v1 = vdupq_lane_f64(vget_low_f64(a.v), 0);
+  // Get the imag values of a
+  v2 = vdupq_lane_f64(vget_high_f64(a.v), 0);
+  // Multiply the real a with b
+  v1 = vmulq_f64(v1, b.v);
+  // Multiply the imag a with b
+  v2 = vmulq_f64(v2, b.v);
+  // Conjugate v2
+  v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR));
+  // Swap real/imag elements in v2.
+  v2 = preverse<Packet2d>(v2);
+  // Add and return the result
+  return Packet1cd(vaddq_f64(v1, v2));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b)
+{
+  // Compare real and imaginary parts of a and b to get the mask vector:
+  // [re(a)==re(b), im(a)==im(b)]
+  Packet2d eq = pcmp_eq<Packet2d>(a.v, b.v);
+  // Swap real/imag elements in the mask in to get:
+  // [im(a)==im(b), re(a)==re(b)]
+  Packet2d eq_swapped = vreinterpretq_f64_u32(vrev64q_u32(vreinterpretq_u32_f64(eq)));
+  // Return re(a)==re(b) & im(a)==im(b) by computing bitwise AND of eq and eq_swapped
+  return Packet1cd(pand<Packet2d>(eq, eq_swapped));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{ return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{ return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{ return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from)
+{ return pset1<Packet1cd>(*from); }
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *to, const Packet1cd& from)
+{ EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast<double*>(to), from.v); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *to, const Packet1cd& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), from.v); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *addr)
+{ EIGEN_ARM_PREFETCH(reinterpret_cast<const double*>(addr)); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(
+    const std::complex<double>* from, Index stride)
+{
+  Packet2d res = pset1<Packet2d>(0.0);
+  res = vsetq_lane_f64(std::real(from[0*stride]), res, 0);
+  res = vsetq_lane_f64(std::imag(from[0*stride]), res, 1);
+  return Packet1cd(res);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(
+    std::complex<double>* to, const Packet1cd& from, Index stride)
+{ to[stride*0] = std::complex<double>(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1)); }
+
+template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
+{
+  EIGEN_ALIGN16 std::complex<double> res;
+  pstore<std::complex<double> >(&res, a);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
+
+template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  // TODO optimize it for NEON
+  Packet1cd res = pmul(a,pconj(b));
+  Packet2d s = pmul<Packet2d>(b.v, b.v);
+  Packet2d rev_s = preverse<Packet2d>(s);
+
+  return Packet1cd(pdiv(res.v, padd<Packet2d>(s,rev_s)));
+}
+
+EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
+{ return Packet1cd(preverse(Packet2d(x.v))); }
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
+{
+  Packet2d tmp = vcombine_f64(vget_high_f64(kernel.packet[0].v), vget_high_f64(kernel.packet[1].v));
+  kernel.packet[0].v = vcombine_f64(vget_low_f64(kernel.packet[0].v), vget_low_f64(kernel.packet[1].v));
+  kernel.packet[1].v = tmp;
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
+  return psqrt_complex<Packet1cd>(a);
+}
+
+#endif // EIGEN_ARCH_ARM64
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_NEON_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h b/src/3rdparty/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
new file mode 100644
index 000000000..116dbb448
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
@@ -0,0 +1,183 @@
+namespace Eigen {
+namespace internal {
+  
+#if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
+
+// Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm.
+// Here we specialize gebp_traits to eliminate these register spills.
+// See #2138.
+template<>
+struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
+ : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
+{
+  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+  { 
+    // This volatile inline ASM both acts as a barrier to prevent reordering,
+    // as well as enforces strict register use.
+    asm volatile(
+      "vmla.f32 %q[r], %q[c], %q[alpha]"
+      : [r] "+w" (r)
+      : [c] "w" (c),
+        [alpha] "w" (alpha)
+      : );
+  }
+
+  template <typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b,
+                                Packet4f& c, Packet4f& /*tmp*/,
+                                const LaneIdType&) const {
+    acc(a, b, c);
+  }
+  
+  template <typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket<Packet4f>& b,
+                                Packet4f& c, Packet4f& tmp,
+                                const LaneIdType& lane) const {
+    madd(a, b.get(lane), c, tmp, lane);
+  }
+};
+
+#endif // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
+
+#if EIGEN_ARCH_ARM64
+
+template<>
+struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
+ : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
+{
+  typedef float RhsPacket;
+  typedef float32x4_t RhsPacketx4;
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+  {
+    dest = *b;
+  }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+  {
+    dest = vld1q_f32(b);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
+  {
+    dest = *b;
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+  {}
+
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+  {
+    loadRhs(b,dest);
+  }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
+  {
+    c = vfmaq_n_f32(c, a, b);
+  }
+
+  // NOTE: Template parameter inference failed when compiled with Android NDK:
+  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
+  { madd_helper<0>(a, b, c); }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
+  { madd_helper<1>(a, b, c); }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
+  { madd_helper<2>(a, b, c); }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
+  { madd_helper<3>(a, b, c); }
+
+ private:
+  template<int LaneID>
+  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
+  {
+    #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
+    // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
+    // vfmaq_laneq_f32 is implemented through a costly dup
+         if(LaneID==0)  asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) :  );
+    else if(LaneID==1)  asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) :  );
+    else if(LaneID==2)  asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) :  );
+    else if(LaneID==3)  asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) :  );
+    #else
+    c = vfmaq_laneq_f32(c, a, b, LaneID);
+    #endif
+  }
+};
+
+
+template<>
+struct gebp_traits <double,double,false,false,Architecture::NEON>
+ : gebp_traits<double,double,false,false,Architecture::Generic>
+{
+  typedef double RhsPacket;
+
+  struct RhsPacketx4 {
+    float64x2_t B_0, B_1;
+  };
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+  {
+    dest = *b;
+  }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+  {
+    dest.B_0 = vld1q_f64(b);
+    dest.B_1 = vld1q_f64(b+2);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
+  {
+    loadRhs(b,dest);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+  {}
+
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+  {
+    loadRhs(b,dest);
+  }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
+  {
+    c = vfmaq_n_f64(c, a, b);
+  }
+
+  // NOTE: Template parameter inference failed when compiled with Android NDK:
+  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
+  { madd_helper<0>(a, b, c); }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
+  { madd_helper<1>(a, b, c); }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
+  { madd_helper<2>(a, b, c); }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
+  { madd_helper<3>(a, b, c); }
+
+ private:
+  template <int LaneID>
+  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
+  {
+    #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
+    // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
+    // vfmaq_laneq_f64 is implemented through a costly dup
+         if(LaneID==0)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
+    else if(LaneID==1)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
+    else if(LaneID==2)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
+    else if(LaneID==3)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
+    #else
+         if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0);
+    else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1);
+    else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0);
+    else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1);
+    #endif
+  }
+};
+
+#endif // EIGEN_ARCH_ARM64
+
+}  // namespace internal
+}  // namespace Eigen
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h b/src/3rdparty/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h
new file mode 100644
index 000000000..fa6615a85
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h
@@ -0,0 +1,75 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_NEON_H
+#define EIGEN_MATH_FUNCTIONS_NEON_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f pexp<Packet2f>(const Packet2f& x)
+{ return pexp_float(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pexp<Packet4f>(const Packet4f& x)
+{ return pexp_float(x); }
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f plog<Packet2f>(const Packet2f& x)
+{ return plog_float(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f plog<Packet4f>(const Packet4f& x)
+{ return plog_float(x); }
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f psin<Packet2f>(const Packet2f& x)
+{ return psin_float(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psin<Packet4f>(const Packet4f& x)
+{ return psin_float(x); }
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f pcos<Packet2f>(const Packet2f& x)
+{ return pcos_float(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pcos<Packet4f>(const Packet4f& x)
+{ return pcos_float(x); }
+
+// Hyperbolic Tangent function.
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f ptanh<Packet2f>(const Packet2f& x)
+{ return internal::generic_fast_tanh_float(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f ptanh<Packet4f>(const Packet4f& x)
+{ return internal::generic_fast_tanh_float(x); }
+
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, psin)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pcos)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, plog)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pexp)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, ptanh)
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pfrexp(const Packet4bf& a, Packet4bf& exponent) {
+  Packet4f fexponent;
+  const Packet4bf out = F32ToBf16(pfrexp<Packet4f>(Bf16ToF32(a), fexponent));
+  exponent = F32ToBf16(fexponent);
+  return out;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pldexp(const Packet4bf& a, const Packet4bf& exponent) {
+  return F32ToBf16(pldexp<Packet4f>(Bf16ToF32(a), Bf16ToF32(exponent)));
+}
+
+//---------- double ----------
+
+#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d pexp<Packet2d>(const Packet2d& x)
+{ return pexp_double(x); }
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d plog<Packet2d>(const Packet2d& x)
+{ return plog_double(x); }
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATH_FUNCTIONS_NEON_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/NEON/PacketMath.h b/src/3rdparty/eigen/Eigen/src/Core/arch/NEON/PacketMath.h
new file mode 100644
index 000000000..34c9bbcdd
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -0,0 +1,4587 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
+// Heavily based on Gael's SSE version.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_NEON_H
+#define EIGEN_PACKET_MATH_NEON_H
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#if EIGEN_ARCH_ARM64
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+#else
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
+#endif
+#endif
+
+#if EIGEN_COMP_MSVC_STRICT
+
+// In MSVC's arm_neon.h header file, all NEON vector types
+// are aliases to the same underlying type __n128.
+// We thus have to wrap them to make them different C++ types.
+// (See also bug 1428)
+typedef eigen_packet_wrapper<float32x2_t,0>  Packet2f;
+typedef eigen_packet_wrapper<float32x4_t,1>  Packet4f;
+typedef eigen_packet_wrapper<int32_t    ,2>  Packet4c;
+typedef eigen_packet_wrapper<int8x8_t   ,3>  Packet8c;
+typedef eigen_packet_wrapper<int8x16_t  ,4>  Packet16c;
+typedef eigen_packet_wrapper<uint32_t   ,5>  Packet4uc;
+typedef eigen_packet_wrapper<uint8x8_t  ,6>  Packet8uc;
+typedef eigen_packet_wrapper<uint8x16_t ,7>  Packet16uc;
+typedef eigen_packet_wrapper<int16x4_t  ,8>  Packet4s;
+typedef eigen_packet_wrapper<int16x8_t  ,9>  Packet8s;
+typedef eigen_packet_wrapper<uint16x4_t ,10> Packet4us;
+typedef eigen_packet_wrapper<uint16x8_t ,11> Packet8us;
+typedef eigen_packet_wrapper<int32x2_t  ,12> Packet2i;
+typedef eigen_packet_wrapper<int32x4_t  ,13> Packet4i;
+typedef eigen_packet_wrapper<uint32x2_t ,14> Packet2ui;
+typedef eigen_packet_wrapper<uint32x4_t ,15> Packet4ui;
+typedef eigen_packet_wrapper<int64x2_t  ,16> Packet2l;
+typedef eigen_packet_wrapper<uint64x2_t ,17> Packet2ul;
+
+#else
+
+typedef float32x2_t                          Packet2f;
+typedef float32x4_t                          Packet4f;
+typedef eigen_packet_wrapper<int32_t    ,2>  Packet4c;
+typedef int8x8_t                             Packet8c;
+typedef int8x16_t                            Packet16c;
+typedef eigen_packet_wrapper<uint32_t   ,5>  Packet4uc;
+typedef uint8x8_t                            Packet8uc;
+typedef uint8x16_t                           Packet16uc;
+typedef int16x4_t                            Packet4s;
+typedef int16x8_t                            Packet8s;
+typedef uint16x4_t                           Packet4us;
+typedef uint16x8_t                           Packet8us;
+typedef int32x2_t                            Packet2i;
+typedef int32x4_t                            Packet4i;
+typedef uint32x2_t                           Packet2ui;
+typedef uint32x4_t                           Packet4ui;
+typedef int64x2_t                            Packet2l;
+typedef uint64x2_t                           Packet2ul;
+
+#endif // EIGEN_COMP_MSVC_STRICT
+
+EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){
+  const float* a = reinterpret_cast<const float*>(&m);
+  Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))};
+  return res;
+}
+
+// fuctionally equivalent to _mm_shuffle_ps in SSE when interleave
+// == false (i.e. shuffle<false>(m, n, mask) equals _mm_shuffle_ps(m, n, mask)),
+// interleave m and n when interleave == true. Currently used in LU/arch/InverseSize4.h
+// to enable a shared implementation for fast inversion of matrices of size 4. 
+template<bool interleave> 
+EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int mask)
+{
+  const float* a = reinterpret_cast<const float*>(&m);
+  const float* b = reinterpret_cast<const float*>(&n);
+  Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
+  return res;
+}
+
+template<> 
+EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f &m, const Packet4f &n, int mask) 
+{
+  const float* a = reinterpret_cast<const float*>(&m);
+  const float* b = reinterpret_cast<const float*>(&n);
+  Packet4f res = {*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
+  return res;
+}
+
+EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {return ((s)<<6|(r)<<4|(q)<<2|(p));}
+
+EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s)
+{ 
+  return shuffle1(a, eigen_neon_shuffle_mask(p, q, r, s));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s)
+{ 
+  return shuffle2<false>(a,b,eigen_neon_shuffle_mask(p, q, r, s));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b)
+{
+  return shuffle2<false>(a,b,eigen_neon_shuffle_mask(0, 1, 0, 1));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b)
+{
+  return shuffle2<false>(b,a,eigen_neon_shuffle_mask(2, 3, 2, 3));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b)
+{
+  return shuffle2<true>(a,b,eigen_neon_shuffle_mask(0, 0, 1, 1));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b)
+{
+  return shuffle2<true>(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3));
+}
+#define vec4f_duplane(a, p) \
+  vdupq_lane_f32(vget_low_f32(a), p)
+
+#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
+  const Packet4f p4f_##NAME = pset1<Packet4f>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+  const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int32_t>(X))
+
+#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
+  const Packet4i p4i_##NAME = pset1<Packet4i>(X)
+
+#if EIGEN_ARCH_ARM64
+  // __builtin_prefetch tends to do nothing on ARM64 compilers because the
+  // prefetch instructions there are too detailed for __builtin_prefetch to map
+  // meaningfully to them.
+  #define EIGEN_ARM_PREFETCH(ADDR)  __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
+#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+  #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
+#elif defined __pld
+  #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
+#elif EIGEN_ARCH_ARM32
+  #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
+#else
+  // by default no explicit prefetching
+  #define EIGEN_ARM_PREFETCH(ADDR)
+#endif
+
+template <>
+struct packet_traits<float> : default_packet_traits
+{
+  typedef Packet4f type;
+  typedef Packet2f half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 1,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasAbsDiff   = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0,
+
+    HasDiv   = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1,
+
+    HasSin  = EIGEN_FAST_MATH,
+    HasCos  = EIGEN_FAST_MATH,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf  = EIGEN_FAST_MATH,
+    HasBessel = 0,  // Issues with accuracy.
+    HasNdtri = 0
+  };
+};
+
+template <>
+struct packet_traits<int8_t> : default_packet_traits
+{
+  typedef Packet16c type;
+  typedef Packet8c half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+    HasHalfPacket = 1,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 1,
+    HasAbsDiff   = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0
+  };
+};
+
+template <>
+struct packet_traits<uint8_t> : default_packet_traits
+{
+  typedef Packet16uc type;
+  typedef Packet8uc half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+    HasHalfPacket = 1,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 0,
+    HasAbs       = 1,
+    HasAbsDiff   = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0,
+
+    HasSqrt = 1
+  };
+};
+
+template <>
+struct packet_traits<int16_t> : default_packet_traits
+{
+  typedef Packet8s type;
+  typedef Packet4s half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 1,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 1,
+    HasAbsDiff   = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0
+  };
+};
+
+template <>
+struct packet_traits<uint16_t> : default_packet_traits
+{
+  typedef Packet8us type;
+  typedef Packet4us half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 1,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 0,
+    HasAbs       = 0,
+    HasAbsDiff   = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0,
+    HasSqrt = 1
+  };
+};
+
+template <>
+struct packet_traits<int32_t> : default_packet_traits
+{
+  typedef Packet4i type;
+  typedef Packet2i half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 1,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasAbsDiff   = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0
+  };
+};
+
+template <>
+struct packet_traits<uint32_t> : default_packet_traits
+{
+  typedef Packet4ui type;
+  typedef Packet2ui half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 1,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 0,
+    HasAbs       = 0,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasAbsDiff   = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0,
+
+    HasSqrt = 1
+  };
+};
+
+template <>
+struct packet_traits<int64_t> : default_packet_traits
+{
+  typedef Packet2l type;
+  typedef Packet2l half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket = 0,
+
+    HasCmp       = 1,
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasAbsDiff   = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0
+  };
+};
+
+template <>
+struct packet_traits<uint64_t> : default_packet_traits
+{
+  typedef Packet2ul type;
+  typedef Packet2ul half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket = 0,
+
+    HasCmp       = 1,
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 0,
+    HasAbs       = 0,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasAbsDiff   = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0
+  };
+};
+
+#if EIGEN_GNUC_AT_MOST(4, 4) && !EIGEN_COMP_LLVM
+// workaround gcc 4.2, 4.3 and 4.4 compilation issue
+EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
+EIGEN_STRONG_INLINE float32x2_t vld1_f32(const float* x) { return ::vld1_f32 ((const float32_t*)x); }
+EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32(const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); }
+EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
+EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
+#endif
+
+template<> struct unpacket_traits<Packet2f>
+{
+  typedef float type;
+  typedef Packet2f half;
+  typedef Packet2i integer_packet;
+  enum
+  {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet4f>
+{
+  typedef float type;
+  typedef Packet2f half;
+  typedef Packet4i integer_packet;
+  enum
+  {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet4c>
+{
+  typedef int8_t type;
+  typedef Packet4c half;
+  enum
+  {
+    size = 4,
+    alignment = Unaligned,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet8c>
+{
+  typedef int8_t type;
+  typedef Packet4c half;
+  enum
+  {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet16c>
+{
+  typedef int8_t type;
+  typedef Packet8c half;
+  enum
+  {
+    size = 16,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet4uc>
+{
+  typedef uint8_t type;
+  typedef Packet4uc half;
+  enum
+  {
+    size = 4,
+    alignment = Unaligned,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet8uc>
+{
+  typedef uint8_t type;
+  typedef Packet4uc half;
+  enum
+  {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet16uc>
+{
+  typedef uint8_t type;
+  typedef Packet8uc half;
+  enum
+  {
+    size = 16,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false};
+};
+template<> struct unpacket_traits<Packet4s>
+{
+  typedef int16_t type;
+  typedef Packet4s half;
+  enum
+  {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet8s>
+{
+  typedef int16_t type;
+  typedef Packet4s half;
+  enum
+  {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet4us>
+{
+  typedef uint16_t type;
+  typedef Packet4us half;
+  enum
+  {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet8us>
+{
+  typedef uint16_t type;
+  typedef Packet4us half;
+  enum
+  {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet2i>
+{
+  typedef int32_t type;
+  typedef Packet2i half;
+  enum
+  {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet4i>
+{
+  typedef int32_t type;
+  typedef Packet2i half;
+  enum
+  {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet2ui>
+{
+  typedef uint32_t type;
+  typedef Packet2ui half;
+  enum
+  {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet4ui>
+{
+  typedef uint32_t type;
+  typedef Packet2ui half;
+  enum
+  {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet2l>
+{
+  typedef int64_t type;
+  typedef Packet2l half;
+  enum
+  {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet2ul>
+{
+  typedef uint64_t type;
+  typedef Packet2ul half;
+  enum
+  {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet2f pset1<Packet2f>(const float& from) { return vdup_n_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4c pset1<Packet4c>(const int8_t& from)
+{ return vget_lane_s32(vreinterpret_s32_s8(vdup_n_s8(from)), 0); }
+template<> EIGEN_STRONG_INLINE Packet8c pset1<Packet8c>(const int8_t& from) { return vdup_n_s8(from); }
+template<> EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const int8_t& from) { return vdupq_n_s8(from); }
+template<> EIGEN_STRONG_INLINE Packet4uc pset1<Packet4uc>(const uint8_t& from)
+{ return vget_lane_u32(vreinterpret_u32_u8(vdup_n_u8(from)), 0); }
+template<> EIGEN_STRONG_INLINE Packet8uc pset1<Packet8uc>(const uint8_t& from) { return vdup_n_u8(from); }
+template<> EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const uint8_t& from) { return vdupq_n_u8(from); }
+template<> EIGEN_STRONG_INLINE Packet4s pset1<Packet4s>(const int16_t& from) { return vdup_n_s16(from); }
+template<> EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const int16_t& from) { return vdupq_n_s16(from); }
+template<> EIGEN_STRONG_INLINE Packet4us pset1<Packet4us>(const uint16_t& from) { return vdup_n_u16(from); }
+template<> EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const uint16_t& from) { return vdupq_n_u16(from); }
+template<> EIGEN_STRONG_INLINE Packet2i pset1<Packet2i>(const int32_t& from) { return vdup_n_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) { return vdupq_n_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet2ui pset1<Packet2ui>(const uint32_t& from) { return vdup_n_u32(from); }
+template<> EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) { return vdupq_n_u32(from); }
+template<> EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) { return vdupq_n_s64(from); }
+template<> EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(const uint64_t& from) { return vdupq_n_u64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pset1frombits<Packet2f>(unsigned int from)
+{ return vreinterpret_f32_u32(vdup_n_u32(from)); }
+template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from)
+{ return vreinterpretq_f32_u32(vdupq_n_u32(from)); }
+
+template<> EIGEN_STRONG_INLINE Packet2f plset<Packet2f>(const float& a)
+{
+  const float c[] = {0.0f,1.0f};
+  return vadd_f32(pset1<Packet2f>(a), vld1_f32(c));
+}
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
+{
+  const float c[] = {0.0f,1.0f,2.0f,3.0f};
+  return vaddq_f32(pset1<Packet4f>(a), vld1q_f32(c));
+}
+template<> EIGEN_STRONG_INLINE Packet4c plset<Packet4c>(const int8_t& a)
+{ return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_u32(vdup_n_u32(0x03020100)), vdup_n_s8(a))), 0); }
+template<> EIGEN_STRONG_INLINE Packet8c plset<Packet8c>(const int8_t& a)
+{
+  const int8_t c[] = {0,1,2,3,4,5,6,7};
+  return vadd_s8(pset1<Packet8c>(a), vld1_s8(c));
+}
+template<> EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const int8_t& a)
+{
+  const int8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+  return vaddq_s8(pset1<Packet16c>(a), vld1q_s8(c));
+}
+template<> EIGEN_STRONG_INLINE Packet4uc plset<Packet4uc>(const uint8_t& a)
+{ return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(0x03020100)), vdup_n_u8(a))), 0); }
+template<> EIGEN_STRONG_INLINE Packet8uc plset<Packet8uc>(const uint8_t& a)
+{
+  const uint8_t c[] = {0,1,2,3,4,5,6,7};
+  return vadd_u8(pset1<Packet8uc>(a), vld1_u8(c));
+}
+template<> EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const uint8_t& a)
+{
+  const uint8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+  return vaddq_u8(pset1<Packet16uc>(a), vld1q_u8(c));
+}
+template<> EIGEN_STRONG_INLINE Packet4s plset<Packet4s>(const int16_t& a)
+{
+  const int16_t c[] = {0,1,2,3};
+  return vadd_s16(pset1<Packet4s>(a), vld1_s16(c));
+}
+template<> EIGEN_STRONG_INLINE Packet4us plset<Packet4us>(const uint16_t& a)
+{
+  const uint16_t c[] = {0,1,2,3};
+  return vadd_u16(pset1<Packet4us>(a), vld1_u16(c));
+}
+template<> EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const int16_t& a)
+{
+  const int16_t c[] = {0,1,2,3,4,5,6,7};
+  return vaddq_s16(pset1<Packet8s>(a), vld1q_s16(c));
+}
+template<> EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const uint16_t& a)
+{
+  const uint16_t c[] = {0,1,2,3,4,5,6,7};
+  return vaddq_u16(pset1<Packet8us>(a), vld1q_u16(c));
+}
+template<> EIGEN_STRONG_INLINE Packet2i plset<Packet2i>(const int32_t& a)
+{
+  const int32_t c[] = {0,1};
+  return vadd_s32(pset1<Packet2i>(a), vld1_s32(c));
+}
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a)
+{
+  const int32_t c[] = {0,1,2,3};
+  return vaddq_s32(pset1<Packet4i>(a), vld1q_s32(c));
+}
+template<> EIGEN_STRONG_INLINE Packet2ui plset<Packet2ui>(const uint32_t& a)
+{
+  const uint32_t c[] = {0,1};
+  return vadd_u32(pset1<Packet2ui>(a), vld1_u32(c));
+}
+template<> EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a)
+{
+  const uint32_t c[] = {0,1,2,3};
+  return vaddq_u32(pset1<Packet4ui>(a), vld1q_u32(c));
+}
+template<> EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a)
+{
+  const int64_t c[] = {0,1};
+  return vaddq_s64(pset1<Packet2l>(a), vld1q_s64(c));
+}
+template<> EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(const uint64_t& a)
+{
+  const uint64_t c[] = {0,1};
+  return vaddq_u64(pset1<Packet2ul>(a), vld1q_u64(c));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f padd<Packet2f>(const Packet2f& a, const Packet2f& b) { return vadd_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4c padd<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c padd<Packet8c>(const Packet8c& a, const Packet8c& b) { return vadd_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) { return vaddq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc padd<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc padd<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vadd_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vaddq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s padd<Packet4s>(const Packet4s& a, const Packet4s& b) { return vadd_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) { return vaddq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us padd<Packet4us>(const Packet4us& a, const Packet4us& b) { return vadd_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) { return vaddq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i padd<Packet2i>(const Packet2i& a, const Packet2i& b) { return vadd_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui padd<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vadd_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vaddq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) { return vaddq_s64(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { return vaddq_u64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f psub<Packet2f>(const Packet2f& a, const Packet2f& b) { return vsub_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4c psub<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vsub_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c psub<Packet8c>(const Packet8c& a, const Packet8c& b) { return vsub_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) { return vsubq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc psub<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vsub_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc psub<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vsub_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vsubq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s psub<Packet4s>(const Packet4s& a, const Packet4s& b) { return vsub_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) { return vsubq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us psub<Packet4us>(const Packet4us& a, const Packet4us& b) { return vsub_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) { return vsubq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i psub<Packet2i>(const Packet2i& a, const Packet2i& b) { return vsub_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui psub<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vsub_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vsubq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) { return vsubq_s64(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { return vsubq_u64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b);
+template<> EIGEN_STRONG_INLINE Packet2f paddsub<Packet2f>(const Packet2f& a, const Packet2f & b) {
+  Packet2f mask = {numext::bit_cast<float>(0x80000000u), 0.0f};
+  return padd(a, pxor(mask, b));
+}
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
+template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  Packet4f mask = {numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f};
+  return padd(a, pxor(mask, b));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pnegate(const Packet2f& a) { return vneg_f32(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); }
+template<> EIGEN_STRONG_INLINE Packet4c pnegate(const Packet4c& a)
+{ return vget_lane_s32(vreinterpret_s32_s8(vneg_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
+template<> EIGEN_STRONG_INLINE Packet8c pnegate(const Packet8c& a) { return vneg_s8(a); }
+template<> EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) { return vnegq_s8(a); }
+template<> EIGEN_STRONG_INLINE Packet4s pnegate(const Packet4s& a) { return vneg_s16(a); }
+template<> EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) { return vnegq_s16(a); }
+template<> EIGEN_STRONG_INLINE Packet2i pnegate(const Packet2i& a) { return vneg_s32(a); }
+template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); }
+template<> EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
+#if EIGEN_ARCH_ARM64
+  return vnegq_s64(a);
+#else
+  return vcombine_s64(
+      vdup_n_s64(-vgetq_lane_s64(a, 0)),
+      vdup_n_s64(-vgetq_lane_s64(a, 1)));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pconj(const Packet2f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4c pconj(const Packet4c& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8c pconj(const Packet8c& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4uc pconj(const Packet4uc& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8uc pconj(const Packet8uc& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4s pconj(const Packet4s& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4us pconj(const Packet4us& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2i pconj(const Packet2i& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2ui pconj(const Packet2ui& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4ui pconj(const Packet4ui& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet2f pmul<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmul_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4c pmul<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vmul_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pmul<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmul_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) { return vmulq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pmul<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vmul_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pmul<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmul_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vmulq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pmul<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmul_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) { return vmulq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pmul<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmul_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) { return vmulq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pmul<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmul_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pmul<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmul_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vmulq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vcombine_s64(
+    vdup_n_s64(vgetq_lane_s64(a, 0)*vgetq_lane_s64(b, 0)),
+    vdup_n_s64(vgetq_lane_s64(a, 1)*vgetq_lane_s64(b, 1)));
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vcombine_u64(
+    vdup_n_u64(vgetq_lane_u64(a, 0)*vgetq_lane_u64(b, 0)),
+    vdup_n_u64(vgetq_lane_u64(a, 1)*vgetq_lane_u64(b, 1)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(const Packet2f& a, const Packet2f& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vdiv_f32(a,b);
+#else
+  Packet2f inv, restep, div;
+
+  // NEON does not offer a divide instruction, we have to do a reciprocal approximation
+  // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
+  // a reciprocal estimate AND a reciprocal step -which saves a few instructions
+  // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
+  // Newton-Raphson and vrecpsq_f32()
+  inv = vrecpe_f32(b);
+
+  // This returns a differential, by which we will have to multiply inv to get a better
+  // approximation of 1/b.
+  restep = vrecps_f32(b, inv);
+  inv = vmul_f32(restep, inv);
+
+  // Finally, multiply a by 1/b and get the wanted result of the division.
+  div = vmul_f32(a, inv);
+
+  return div;
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vdivq_f32(a,b);
+#else
+  Packet4f inv, restep, div;
+
+  // NEON does not offer a divide instruction, we have to do a reciprocal approximation
+  // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
+  // a reciprocal estimate AND a reciprocal step -which saves a few instructions
+  // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
+  // Newton-Raphson and vrecpsq_f32()
+  inv = vrecpeq_f32(b);
+
+  // This returns a differential, by which we will have to multiply inv to get a better
+  // approximation of 1/b.
+  restep = vrecpsq_f32(b, inv);
+  inv = vmulq_f32(restep, inv);
+
+  // Finally, multiply a by 1/b and get the wanted result of the division.
+  div = vmulq_f32(a, inv);
+
+  return div;
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet4c pdiv<Packet4c>(const Packet4c& /*a*/, const Packet4c& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4c>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pdiv<Packet8c>(const Packet8c& /*a*/, const Packet8c& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet8c>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet16c pdiv<Packet16c>(const Packet16c& /*a*/, const Packet16c& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet16c>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet4uc pdiv<Packet4uc>(const Packet4uc& /*a*/, const Packet4uc& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4uc>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pdiv<Packet8uc>(const Packet8uc& /*a*/, const Packet8uc& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet8uc>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(const Packet16uc& /*a*/, const Packet16uc& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet16uc>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet4s pdiv<Packet4s>(const Packet4s& /*a*/, const Packet4s& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4s>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(const Packet8s& /*a*/, const Packet8s& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet8s>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet4us pdiv<Packet4us>(const Packet4us& /*a*/, const Packet4us& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4us>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(const Packet8us& /*a*/, const Packet8us& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet8us>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet2i pdiv<Packet2i>(const Packet2i& /*a*/, const Packet2i& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet2i>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4i>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet2ui pdiv<Packet2ui>(const Packet2ui& /*a*/, const Packet2ui& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet2ui>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(const Packet4ui& /*a*/, const Packet4ui& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4ui>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(const Packet2l& /*a*/, const Packet2l& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet2l>(0LL);
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& /*a*/, const Packet2ul& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet2ul>(0ULL);
+}
+
+
+#ifdef __ARM_FEATURE_FMA
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
+{ return vfmaq_f32(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
+{ return vfma_f32(c,a,b); }
+#else
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
+{
+  return vmlaq_f32(c,a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
+{
+  return vmla_f32(c,a,b);
+}
+#endif
+
+// No FMA instruction for int, so use MLA unconditionally.
+template<> EIGEN_STRONG_INLINE Packet4c pmadd(const Packet4c& a, const Packet4c& b, const Packet4c& c)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vmla_s8(
+      vreinterpret_s8_s32(vdup_n_s32(c)),
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pmadd(const Packet8c& a, const Packet8c& b, const Packet8c& c)
+{ return vmla_s8(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c)
+{ return vmlaq_s8(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pmadd(const Packet4uc& a, const Packet4uc& b, const Packet4uc& c)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vmla_u8(
+      vreinterpret_u8_u32(vdup_n_u32(c)),
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pmadd(const Packet8uc& a, const Packet8uc& b, const Packet8uc& c)
+{ return vmla_u8(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c)
+{ return vmlaq_u8(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pmadd(const Packet4s& a, const Packet4s& b, const Packet4s& c)
+{ return vmla_s16(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c)
+{ return vmlaq_s16(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pmadd(const Packet4us& a, const Packet4us& b, const Packet4us& c)
+{ return vmla_u16(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c)
+{ return vmlaq_u16(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pmadd(const Packet2i& a, const Packet2i& b, const Packet2i& c)
+{ return vmla_s32(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c)
+{ return vmlaq_s32(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pmadd(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c)
+{ return vmla_u32(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c)
+{ return vmlaq_u32(c,a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pabsdiff<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vabd_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(const Packet4f& a, const Packet4f& b)
+{ return vabdq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4c pabsdiff<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vabd_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pabsdiff<Packet8c>(const Packet8c& a, const Packet8c& b)
+{ return vabd_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(const Packet16c& a, const Packet16c& b)
+{ return vabdq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pabsdiff<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vabd_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pabsdiff<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
+{ return vabd_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
+{ return vabdq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pabsdiff<Packet4s>(const Packet4s& a, const Packet4s& b)
+{ return vabd_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(const Packet8s& a, const Packet8s& b)
+{ return vabdq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pabsdiff<Packet4us>(const Packet4us& a, const Packet4us& b)
+{ return vabd_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(const Packet8us& a, const Packet8us& b)
+{ return vabdq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pabsdiff<Packet2i>(const Packet2i& a, const Packet2i& b)
+{ return vabd_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(const Packet4i& a, const Packet4i& b)
+{ return vabdq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pabsdiff<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
+{ return vabd_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
+{ return vabdq_u32(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pmin<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmin_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
+
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
+template<> EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) { return vminnmq_f32(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2f pmin<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) { return vminnm_f32(a, b); }
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) { return pmin<Packet4f>(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pmin<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) { return pmin<Packet2f>(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4c pmin<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vmin_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pmin<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmin_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) { return vminq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pmin<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vmin_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pmin<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmin_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vminq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pmin<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmin_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) { return vminq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pmin<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmin_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) { return vminq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pmin<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmin_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pmin<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmin_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vminq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vcombine_s64(
+      vdup_n_s64((std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
+      vdup_n_s64((std::min)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vcombine_u64(
+      vdup_n_u64((std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
+      vdup_n_u64((std::min)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pmax<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmax_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
+
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
+template<> EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxnmq_f32(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2f pmax<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) { return vmaxnm_f32(a, b); }
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) { return pmax<Packet4f>(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pmax<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) { return pmax<Packet2f>(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4c pmax<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vmax_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pmax<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmax_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) { return vmaxq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pmax<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vmax_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pmax<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmax_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vmaxq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pmax<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmax_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) { return vmaxq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pmax<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmax_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) { return vmaxq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pmax<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmax_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pmax<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmax_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vmaxq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vcombine_s64(
+      vdup_n_s64((std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
+      vdup_n_s64((std::max)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vcombine_u64(
+      vdup_n_u64((std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
+      vdup_n_u64((std::max)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pcmp_le<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vreinterpret_f32_u32(vcle_f32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b)
+{ return vreinterpretq_f32_u32(vcleq_f32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4c pcmp_le<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_u8(vcle_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pcmp_le<Packet8c>(const Packet8c& a, const Packet8c& b)
+{ return vreinterpret_s8_u8(vcle_s8(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(const Packet16c& a, const Packet16c& b)
+{ return vreinterpretq_s8_u8(vcleq_s8(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4uc pcmp_le<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vcle_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pcmp_le<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
+{ return vcle_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
+{ return vcleq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pcmp_le<Packet4s>(const Packet4s& a, const Packet4s& b)
+{ return vreinterpret_s16_u16(vcle_s16(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(const Packet8s& a, const Packet8s& b)
+{ return vreinterpretq_s16_u16(vcleq_s16(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4us pcmp_le<Packet4us>(const Packet4us& a, const Packet4us& b)
+{ return vcle_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(const Packet8us& a, const Packet8us& b)
+{ return vcleq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pcmp_le<Packet2i>(const Packet2i& a, const Packet2i& b)
+{ return vreinterpret_s32_u32(vcle_s32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(const Packet4i& a, const Packet4i& b)
+{ return vreinterpretq_s32_u32(vcleq_s32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet2ui pcmp_le<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
+{ return vcle_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
+{ return vcleq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pcmp_le<Packet2l>(const Packet2l& a, const Packet2l& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vreinterpretq_s64_u64(vcleq_s64(a,b));
+#else
+  return vcombine_s64(
+      vdup_n_s64(vgetq_lane_s64(a, 0) <= vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
+      vdup_n_s64(vgetq_lane_s64(a, 1) <= vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pcmp_le<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vcleq_u64(a,b);
+#else
+  return vcombine_u64(
+      vdup_n_u64(vgetq_lane_u64(a, 0) <= vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
+      vdup_n_u64(vgetq_lane_u64(a, 1) <= vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vreinterpret_f32_u32(vclt_f32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b)
+{ return vreinterpretq_f32_u32(vcltq_f32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4c pcmp_lt<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_u8(vclt_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pcmp_lt<Packet8c>(const Packet8c& a, const Packet8c& b)
+{ return vreinterpret_s8_u8(vclt_s8(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(const Packet16c& a, const Packet16c& b)
+{ return vreinterpretq_s8_u8(vcltq_s8(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4uc pcmp_lt<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vclt_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pcmp_lt<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
+{ return vclt_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
+{ return vcltq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pcmp_lt<Packet4s>(const Packet4s& a, const Packet4s& b)
+{ return vreinterpret_s16_u16(vclt_s16(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(const Packet8s& a, const Packet8s& b)
+{ return vreinterpretq_s16_u16(vcltq_s16(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4us pcmp_lt<Packet4us>(const Packet4us& a, const Packet4us& b)
+{ return vclt_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(const Packet8us& a, const Packet8us& b)
+{ return vcltq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pcmp_lt<Packet2i>(const Packet2i& a, const Packet2i& b)
+{ return vreinterpret_s32_u32(vclt_s32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(const Packet4i& a, const Packet4i& b)
+{ return vreinterpretq_s32_u32(vcltq_s32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet2ui pcmp_lt<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
+{ return vclt_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
+{ return vcltq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pcmp_lt<Packet2l>(const Packet2l& a, const Packet2l& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vreinterpretq_s64_u64(vcltq_s64(a,b));
+#else
+  return vcombine_s64(
+      vdup_n_s64(vgetq_lane_s64(a, 0) < vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
+      vdup_n_s64(vgetq_lane_s64(a, 1) < vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pcmp_lt<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vcltq_u64(a,b);
+#else
+  return vcombine_u64(
+      vdup_n_u64(vgetq_lane_u64(a, 0) < vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
+      vdup_n_u64(vgetq_lane_u64(a, 1) < vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pcmp_eq<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vreinterpret_f32_u32(vceq_f32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b)
+{ return vreinterpretq_f32_u32(vceqq_f32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4c pcmp_eq<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_u8(vceq_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pcmp_eq<Packet8c>(const Packet8c& a, const Packet8c& b)
+{ return vreinterpret_s8_u8(vceq_s8(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(const Packet16c& a, const Packet16c& b)
+{ return vreinterpretq_s8_u8(vceqq_s8(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4uc pcmp_eq<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vceq_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pcmp_eq<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
+{ return vceq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
+{ return vceqq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pcmp_eq<Packet4s>(const Packet4s& a, const Packet4s& b)
+{ return vreinterpret_s16_u16(vceq_s16(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(const Packet8s& a, const Packet8s& b)
+{ return vreinterpretq_s16_u16(vceqq_s16(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4us pcmp_eq<Packet4us>(const Packet4us& a, const Packet4us& b)
+{ return vceq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(const Packet8us& a, const Packet8us& b)
+{ return vceqq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pcmp_eq<Packet2i>(const Packet2i& a, const Packet2i& b)
+{ return vreinterpret_s32_u32(vceq_s32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(const Packet4i& a, const Packet4i& b)
+{ return vreinterpretq_s32_u32(vceqq_s32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet2ui pcmp_eq<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
+{ return vceq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
+{ return vceqq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pcmp_eq<Packet2l>(const Packet2l& a, const Packet2l& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vreinterpretq_s64_u64(vceqq_s64(a,b));
+#else
+  return vcombine_s64(
+      vdup_n_s64(vgetq_lane_s64(a, 0) == vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
+      vdup_n_s64(vgetq_lane_s64(a, 1) == vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pcmp_eq<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vceqq_u64(a,b);
+#else
+  return vcombine_u64(
+      vdup_n_u64(vgetq_lane_u64(a, 0) == vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
+      vdup_n_u64(vgetq_lane_u64(a, 1) == vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt_or_nan<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vreinterpret_f32_u32(vmvn_u32(vcge_f32(a,b))); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b)
+{ return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a,b))); }
+
+// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
+template<> EIGEN_STRONG_INLINE Packet2f pand<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
+{ return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
+template<> EIGEN_STRONG_INLINE Packet4c pand<Packet4c>(const Packet4c& a, const Packet4c& b)
+{ return a & b; }
+template<> EIGEN_STRONG_INLINE Packet8c pand<Packet8c>(const Packet8c& a, const Packet8c& b)
+{ return vand_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(const Packet16c& a, const Packet16c& b)
+{ return vandq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pand<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{ return a & b; }
+template<> EIGEN_STRONG_INLINE Packet8uc pand<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
+{ return vand_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
+{ return vandq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pand<Packet4s>(const Packet4s& a, const Packet4s& b) { return vand_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(const Packet8s& a, const Packet8s& b) { return vandq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pand<Packet4us>(const Packet4us& a, const Packet4us& b)
+{ return vand_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b)
+{ return vandq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pand<Packet2i>(const Packet2i& a, const Packet2i& b) { return vand_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pand<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
+{ return vand_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
+{ return vandq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) { return vandq_s64(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
+{ return vandq_u64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f por<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
+{ return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
+template<> EIGEN_STRONG_INLINE Packet4c por<Packet4c>(const Packet4c& a, const Packet4c& b)
+{ return a | b; }
+template<> EIGEN_STRONG_INLINE Packet8c por<Packet8c>(const Packet8c& a, const Packet8c& b) { return vorr_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c por<Packet16c>(const Packet16c& a, const Packet16c& b)
+{ return vorrq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc por<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{ return a | b; }
+template<> EIGEN_STRONG_INLINE Packet8uc por<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
+{ return vorr_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
+{ return vorrq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s por<Packet4s>(const Packet4s& a, const Packet4s& b)
+{ return vorr_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b)
+{ return vorrq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us por<Packet4us>(const Packet4us& a, const Packet4us& b)
+{ return vorr_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b)
+{ return vorrq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i por<Packet2i>(const Packet2i& a, const Packet2i& b) { return vorr_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui por<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
+{ return vorr_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
+{ return vorrq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b)
+{ return vorrq_s64(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
+{ return vorrq_u64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
+{ return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
+template<> EIGEN_STRONG_INLINE Packet4c pxor<Packet4c>(const Packet4c& a, const Packet4c& b)
+{ return a ^ b; }
+template<> EIGEN_STRONG_INLINE Packet8c pxor<Packet8c>(const Packet8c& a, const Packet8c& b)
+{ return veor_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(const Packet16c& a, const Packet16c& b)
+{ return veorq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pxor<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{ return a ^ b; }
+template<> EIGEN_STRONG_INLINE Packet8uc pxor<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
+{ return veor_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
+{ return veorq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pxor<Packet4s>(const Packet4s& a, const Packet4s& b) { return veor_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(const Packet8s& a, const Packet8s& b) { return veorq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pxor<Packet4us>(const Packet4us& a, const Packet4us& b)
+{ return veor_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b)
+{ return veorq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pxor<Packet2i>(const Packet2i& a, const Packet2i& b) { return veor_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pxor<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
+{ return veor_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
+{ return veorq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b)
+{ return veorq_s64(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
+{ return veorq_u64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pandnot<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
+{ return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
+template<> EIGEN_STRONG_INLINE Packet4c pandnot<Packet4c>(const Packet4c& a, const Packet4c& b)
+{ return a & ~b; }
+template<> EIGEN_STRONG_INLINE Packet8c pandnot<Packet8c>(const Packet8c& a, const Packet8c& b) { return vbic_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(const Packet16c& a, const Packet16c& b) { return vbicq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pandnot<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{ return a & ~b; }
+template<> EIGEN_STRONG_INLINE Packet8uc pandnot<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
+{ return vbic_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
+{ return vbicq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pandnot<Packet4s>(const Packet4s& a, const Packet4s& b)
+{ return vbic_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(const Packet8s& a, const Packet8s& b)
+{ return vbicq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pandnot<Packet4us>(const Packet4us& a, const Packet4us& b)
+{ return vbic_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(const Packet8us& a, const Packet8us& b)
+{ return vbicq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pandnot<Packet2i>(const Packet2i& a, const Packet2i& b)
+{ return vbic_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b)
+{ return vbicq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pandnot<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
+{ return vbic_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
+{ return vbicq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b)
+{ return vbicq_s64(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
+{ return vbicq_u64(a,b); }
+
+
+template<int N> EIGEN_STRONG_INLINE Packet4c parithmetic_shift_right(Packet4c& a)
+{ return vget_lane_s32(vreinterpret_s32_s8(vshr_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); }
+template<int N> EIGEN_STRONG_INLINE Packet8c parithmetic_shift_right(Packet8c a) { return vshr_n_s8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(Packet16c a) { return vshrq_n_s8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4uc parithmetic_shift_right(Packet4uc& a)
+{ return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); }
+template<int N> EIGEN_STRONG_INLINE Packet8uc parithmetic_shift_right(Packet8uc a) { return vshr_n_u8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4s parithmetic_shift_right(Packet4s a) { return vshr_n_s16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) { return vshrq_n_s16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4us parithmetic_shift_right(Packet4us a) { return vshr_n_u16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(Packet8us a) { return vshrq_n_u16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2i parithmetic_shift_right(Packet2i a) { return vshr_n_s32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) { return vshrq_n_s32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2ui parithmetic_shift_right(Packet2ui a) { return vshr_n_u32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(Packet2l a) { return vshrq_n_s64(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); }
+
+template<int N> EIGEN_STRONG_INLINE Packet4c plogical_shift_right(Packet4c& a)
+{ return vget_lane_s32(vreinterpret_s32_u8(vshr_n_u8(vreinterpret_u8_s32(vdup_n_s32(a)), N)), 0); }
+template<int N> EIGEN_STRONG_INLINE Packet8c plogical_shift_right(Packet8c a)
+{ return vreinterpret_s8_u8(vshr_n_u8(vreinterpret_u8_s8(a),N)); }
+template<int N> EIGEN_STRONG_INLINE Packet16c plogical_shift_right(Packet16c a)
+{ return vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(a),N)); }
+template<int N> EIGEN_STRONG_INLINE Packet4uc plogical_shift_right(Packet4uc& a)
+{ return vget_lane_u32(vreinterpret_u32_s8(vshr_n_s8(vreinterpret_s8_u32(vdup_n_u32(a)), N)), 0); }
+template<int N> EIGEN_STRONG_INLINE Packet8uc plogical_shift_right(Packet8uc a) { return vshr_n_u8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4s plogical_shift_right(Packet4s a)
+{ return vreinterpret_s16_u16(vshr_n_u16(vreinterpret_u16_s16(a),N)); }
+template<int N> EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a)
+{ return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(a),N)); }
+template<int N> EIGEN_STRONG_INLINE Packet4us plogical_shift_right(Packet4us a) { return vshr_n_u16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_right(Packet8us a) { return vshrq_n_u16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2i plogical_shift_right(Packet2i a)
+{ return vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(a),N)); }
+template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a)
+{ return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a),N)); }
+template<int N> EIGEN_STRONG_INLINE Packet2ui plogical_shift_right(Packet2ui a) { return vshr_n_u32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2l plogical_shift_right(Packet2l a)
+{ return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a),N)); }
+template<int N> EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); }
+
+template<int N> EIGEN_STRONG_INLINE Packet4c plogical_shift_left(Packet4c& a)
+{ return vget_lane_s32(vreinterpret_s32_s8(vshl_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); }
+template<int N> EIGEN_STRONG_INLINE Packet8c plogical_shift_left(Packet8c a) { return vshl_n_s8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet16c plogical_shift_left(Packet16c a) { return vshlq_n_s8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4uc plogical_shift_left(Packet4uc& a)
+{ return vget_lane_u32(vreinterpret_u32_u8(vshl_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); }
+template<int N> EIGEN_STRONG_INLINE Packet8uc plogical_shift_left(Packet8uc a) { return vshl_n_u8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(Packet16uc a) { return vshlq_n_u8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4s plogical_shift_left(Packet4s a) { return vshl_n_s16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) { return vshlq_n_s16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4us plogical_shift_left(Packet4us a) { return vshl_n_u16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_left(Packet8us a) { return vshlq_n_u16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2i plogical_shift_left(Packet2i a) { return vshl_n_s32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) { return vshlq_n_s32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2ui plogical_shift_left(Packet2ui a) { return vshl_n_u32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(Packet4ui a) { return vshlq_n_u32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2l plogical_shift_left(Packet2l a) { return vshlq_n_s64(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) { return vshlq_n_u64(a,N); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pload<Packet2f>(const float* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4c pload<Packet4c>(const int8_t* from)
+{
+  Packet4c res;
+  memcpy(static_cast<void*>(&res), from, sizeof(Packet4c));
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet8c pload<Packet8c>(const int8_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(from); }
+template<> EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(from); }
+template<> EIGEN_STRONG_INLINE Packet4uc pload<Packet4uc>(const uint8_t* from)
+{
+  Packet4uc res;
+  memcpy(static_cast<void*>(&res), from, sizeof(Packet4c));
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pload<Packet8uc>(const uint8_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(from); }
+template<> EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(from); }
+template<> EIGEN_STRONG_INLINE Packet4s pload<Packet4s>(const int16_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(from); }
+template<> EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(from); }
+template<> EIGEN_STRONG_INLINE Packet4us pload<Packet4us>(const uint16_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(from); }
+template<> EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(from); }
+template<> EIGEN_STRONG_INLINE Packet2i pload<Packet2i>(const int32_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet2ui pload<Packet2ui>(const uint32_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(from); }
+template<> EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(from); }
+template<> EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(from); }
+template<> EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2f ploadu<Packet2f>(const float* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4c ploadu<Packet4c>(const int8_t* from)
+{
+  Packet4c res;
+  memcpy(static_cast<void*>(&res), from, sizeof(Packet4c));
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet8c ploadu<Packet8c>(const int8_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s8(from); }
+template<> EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const int8_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s8(from); }
+template<> EIGEN_STRONG_INLINE Packet4uc ploadu<Packet4uc>(const uint8_t* from)
+{
+  Packet4uc res;
+  memcpy(static_cast<void*>(&res), from, sizeof(Packet4c));
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet8uc ploadu<Packet8uc>(const uint8_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u8(from); }
+template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const uint8_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u8(from); }
+template<> EIGEN_STRONG_INLINE Packet4s ploadu<Packet4s>(const int16_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s16(from); }
+template<> EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const int16_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s16(from); }
+template<> EIGEN_STRONG_INLINE Packet4us ploadu<Packet4us>(const uint16_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u16(from); }
+template<> EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const uint16_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u16(from); }
+template<> EIGEN_STRONG_INLINE Packet2i ploadu<Packet2i>(const int32_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet2ui ploadu<Packet2ui>(const uint32_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u32(from); }
+template<> EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u32(from); }
+template<> EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s64(from); }
+template<> EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(const uint64_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2f ploaddup<Packet2f>(const float* from)
+{ return vld1_dup_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
+{ return vcombine_f32(vld1_dup_f32(from), vld1_dup_f32(from+1)); }
+template<> EIGEN_STRONG_INLINE Packet4c ploaddup<Packet4c>(const int8_t* from)
+{
+  const int8x8_t a = vreinterpret_s8_s32(vdup_n_s32(pload<Packet4c>(from)));
+  return vget_lane_s32(vreinterpret_s32_s8(vzip_s8(a,a).val[0]), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c ploaddup<Packet8c>(const int8_t* from)
+{
+  const int8x8_t a = vld1_s8(from);
+  return vzip_s8(a,a).val[0];
+}
+template<> EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const int8_t* from)
+{
+  const int8x8_t a = vld1_s8(from);
+  const int8x8x2_t b = vzip_s8(a,a);
+  return vcombine_s8(b.val[0], b.val[1]);
+}
+template<> EIGEN_STRONG_INLINE Packet4uc ploaddup<Packet4uc>(const uint8_t* from)
+{
+  const uint8x8_t a = vreinterpret_u8_u32(vdup_n_u32(pload<Packet4uc>(from)));
+  return vget_lane_u32(vreinterpret_u32_u8(vzip_u8(a,a).val[0]), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc ploaddup<Packet8uc>(const uint8_t* from)
+{
+  const uint8x8_t a = vld1_u8(from);
+  return vzip_u8(a,a).val[0];
+}
+template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const uint8_t* from)
+{
+  const uint8x8_t a = vld1_u8(from);
+  const uint8x8x2_t b = vzip_u8(a,a);
+  return vcombine_u8(b.val[0], b.val[1]);
+}
+template<> EIGEN_STRONG_INLINE Packet4s ploaddup<Packet4s>(const int16_t* from)
+{
+  return vreinterpret_s16_u32(vzip_u32(vreinterpret_u32_s16(vld1_dup_s16(from)),
+      vreinterpret_u32_s16(vld1_dup_s16(from+1))).val[0]);
+}
+template<> EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const int16_t* from)
+{
+  const int16x4_t a = vld1_s16(from);
+  const int16x4x2_t b = vzip_s16(a,a);
+  return vcombine_s16(b.val[0], b.val[1]);
+}
+template<> EIGEN_STRONG_INLINE Packet4us ploaddup<Packet4us>(const uint16_t* from)
+{
+  return vreinterpret_u16_u32(vzip_u32(vreinterpret_u32_u16(vld1_dup_u16(from)),
+      vreinterpret_u32_u16(vld1_dup_u16(from+1))).val[0]);
+}
+template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const uint16_t* from)
+{
+  const uint16x4_t a = vld1_u16(from);
+  const uint16x4x2_t b = vzip_u16(a,a);
+  return vcombine_u16(b.val[0], b.val[1]);
+}
+template<> EIGEN_STRONG_INLINE Packet2i ploaddup<Packet2i>(const int32_t* from)
+{ return vld1_dup_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from)
+{ return vcombine_s32(vld1_dup_s32(from), vld1_dup_s32(from+1)); }
+template<> EIGEN_STRONG_INLINE Packet2ui ploaddup<Packet2ui>(const uint32_t* from)
+{ return vld1_dup_u32(from); }
+template<> EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from)
+{ return vcombine_u32(vld1_dup_u32(from), vld1_dup_u32(from+1)); }
+template<> EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from)
+{ return vld1q_dup_s64(from); }
+template<> EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(const uint64_t* from)
+{ return vld1q_dup_u64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(const float* from) { return vld1q_dup_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4c ploadquad<Packet4c>(const int8_t* from)
+{ return vget_lane_s32(vreinterpret_s32_s8(vld1_dup_s8(from)), 0); }
+template<> EIGEN_STRONG_INLINE Packet8c ploadquad<Packet8c>(const int8_t* from)
+{
+  return vreinterpret_s8_u32(vzip_u32(
+      vreinterpret_u32_s8(vld1_dup_s8(from)),
+      vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]);
+}
+template<> EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const int8_t* from)
+{
+  const int8x8_t a = vreinterpret_s8_u32(vzip_u32(
+      vreinterpret_u32_s8(vld1_dup_s8(from)),
+      vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]);
+  const int8x8_t b = vreinterpret_s8_u32(vzip_u32(
+      vreinterpret_u32_s8(vld1_dup_s8(from+2)),
+      vreinterpret_u32_s8(vld1_dup_s8(from+3))).val[0]);
+  return vcombine_s8(a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet4uc ploadquad<Packet4uc>(const uint8_t* from)
+{ return vget_lane_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), 0); }
+template<> EIGEN_STRONG_INLINE Packet8uc ploadquad<Packet8uc>(const uint8_t* from)
+{
+  return vreinterpret_u8_u32(vzip_u32(
+      vreinterpret_u32_u8(vld1_dup_u8(from)),
+      vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]);
+}
+template<> EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const uint8_t* from)
+{
+  const uint8x8_t a = vreinterpret_u8_u32(vzip_u32(
+      vreinterpret_u32_u8(vld1_dup_u8(from)),
+      vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]);
+  const uint8x8_t b = vreinterpret_u8_u32(vzip_u32(
+      vreinterpret_u32_u8(vld1_dup_u8(from+2)),
+      vreinterpret_u32_u8(vld1_dup_u8(from+3))).val[0]);
+  return vcombine_u8(a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const int16_t* from)
+{ return vcombine_s16(vld1_dup_s16(from), vld1_dup_s16(from+1)); }
+template<> EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const uint16_t* from)
+{ return vcombine_u16(vld1_dup_u16(from), vld1_dup_u16(from+1)); }
+template<> EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(const int32_t* from) { return vld1q_dup_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(const uint32_t* from) { return vld1q_dup_u32(from); }
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet2f& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1_f32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet4c& from)
+{ memcpy(to, &from, sizeof(from)); }
+template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet8c& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1_s8(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet4uc& from)
+{ memcpy(to, &from, sizeof(from)); }
+template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet8uc& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1_u8(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet4s& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1_s16(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet4us& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1_u16(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet2i& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1_s32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet2ui& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1_u32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(to,from); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet2f& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1_f32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet4c& from)
+{ memcpy(to, &from, sizeof(from)); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet8c& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s8(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet16c& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s8(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet4uc& from)
+{ memcpy(to, &from, sizeof(from)); }
+template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet8uc& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u8(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet16uc& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u8(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet4s& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s16(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet8s& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s16(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet4us& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet8us& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u16(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet2i& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet2ui& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s64(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet2ul& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u64(to,from); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pgather<float, Packet2f>(const float* from, Index stride)
+{
+  Packet2f res = vld1_dup_f32(from);
+  res = vld1_lane_f32(from + 1*stride, res, 1);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+  Packet4f res = vld1q_dup_f32(from);
+  res = vld1q_lane_f32(from + 1*stride, res, 1);
+  res = vld1q_lane_f32(from + 2*stride, res, 2);
+  res = vld1q_lane_f32(from + 3*stride, res, 3);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pgather<int8_t, Packet4c>(const int8_t* from, Index stride)
+{
+  Packet4c res;
+  for (int i = 0; i != 4; i++)
+    reinterpret_cast<int8_t*>(&res)[i] = *(from + i * stride);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pgather<int8_t, Packet8c>(const int8_t* from, Index stride)
+{
+  Packet8c res = vld1_dup_s8(from);
+  res = vld1_lane_s8(from + 1*stride, res, 1);
+  res = vld1_lane_s8(from + 2*stride, res, 2);
+  res = vld1_lane_s8(from + 3*stride, res, 3);
+  res = vld1_lane_s8(from + 4*stride, res, 4);
+  res = vld1_lane_s8(from + 5*stride, res, 5);
+  res = vld1_lane_s8(from + 6*stride, res, 6);
+  res = vld1_lane_s8(from + 7*stride, res, 7);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(const int8_t* from, Index stride)
+{
+  Packet16c res = vld1q_dup_s8(from);
+  res = vld1q_lane_s8(from + 1*stride, res, 1);
+  res = vld1q_lane_s8(from + 2*stride, res, 2);
+  res = vld1q_lane_s8(from + 3*stride, res, 3);
+  res = vld1q_lane_s8(from + 4*stride, res, 4);
+  res = vld1q_lane_s8(from + 5*stride, res, 5);
+  res = vld1q_lane_s8(from + 6*stride, res, 6);
+  res = vld1q_lane_s8(from + 7*stride, res, 7);
+  res = vld1q_lane_s8(from + 8*stride, res, 8);
+  res = vld1q_lane_s8(from + 9*stride, res, 9);
+  res = vld1q_lane_s8(from + 10*stride, res, 10);
+  res = vld1q_lane_s8(from + 11*stride, res, 11);
+  res = vld1q_lane_s8(from + 12*stride, res, 12);
+  res = vld1q_lane_s8(from + 13*stride, res, 13);
+  res = vld1q_lane_s8(from + 14*stride, res, 14);
+  res = vld1q_lane_s8(from + 15*stride, res, 15);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pgather<uint8_t, Packet4uc>(const uint8_t* from, Index stride)
+{
+  Packet4uc res;
+  for (int i = 0; i != 4; i++)
+    reinterpret_cast<uint8_t*>(&res)[i] = *(from + i * stride);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pgather<uint8_t, Packet8uc>(const uint8_t* from, Index stride)
+{
+  Packet8uc res = vld1_dup_u8(from);
+  res = vld1_lane_u8(from + 1*stride, res, 1);
+  res = vld1_lane_u8(from + 2*stride, res, 2);
+  res = vld1_lane_u8(from + 3*stride, res, 3);
+  res = vld1_lane_u8(from + 4*stride, res, 4);
+  res = vld1_lane_u8(from + 5*stride, res, 5);
+  res = vld1_lane_u8(from + 6*stride, res, 6);
+  res = vld1_lane_u8(from + 7*stride, res, 7);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(const uint8_t* from, Index stride)
+{
+  Packet16uc res = vld1q_dup_u8(from);
+  res = vld1q_lane_u8(from + 1*stride, res, 1);
+  res = vld1q_lane_u8(from + 2*stride, res, 2);
+  res = vld1q_lane_u8(from + 3*stride, res, 3);
+  res = vld1q_lane_u8(from + 4*stride, res, 4);
+  res = vld1q_lane_u8(from + 5*stride, res, 5);
+  res = vld1q_lane_u8(from + 6*stride, res, 6);
+  res = vld1q_lane_u8(from + 7*stride, res, 7);
+  res = vld1q_lane_u8(from + 8*stride, res, 8);
+  res = vld1q_lane_u8(from + 9*stride, res, 9);
+  res = vld1q_lane_u8(from + 10*stride, res, 10);
+  res = vld1q_lane_u8(from + 11*stride, res, 11);
+  res = vld1q_lane_u8(from + 12*stride, res, 12);
+  res = vld1q_lane_u8(from + 13*stride, res, 13);
+  res = vld1q_lane_u8(from + 14*stride, res, 14);
+  res = vld1q_lane_u8(from + 15*stride, res, 15);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pgather<int16_t, Packet4s>(const int16_t* from, Index stride)
+{
+  Packet4s res = vld1_dup_s16(from);
+  res = vld1_lane_s16(from + 1*stride, res, 1);
+  res = vld1_lane_s16(from + 2*stride, res, 2);
+  res = vld1_lane_s16(from + 3*stride, res, 3);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(const int16_t* from, Index stride)
+{
+  Packet8s res = vld1q_dup_s16(from);
+  res = vld1q_lane_s16(from + 1*stride, res, 1);
+  res = vld1q_lane_s16(from + 2*stride, res, 2);
+  res = vld1q_lane_s16(from + 3*stride, res, 3);
+  res = vld1q_lane_s16(from + 4*stride, res, 4);
+  res = vld1q_lane_s16(from + 5*stride, res, 5);
+  res = vld1q_lane_s16(from + 6*stride, res, 6);
+  res = vld1q_lane_s16(from + 7*stride, res, 7);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pgather<uint16_t, Packet4us>(const uint16_t* from, Index stride)
+{
+  Packet4us res = vld1_dup_u16(from);
+  res = vld1_lane_u16(from + 1*stride, res, 1);
+  res = vld1_lane_u16(from + 2*stride, res, 2);
+  res = vld1_lane_u16(from + 3*stride, res, 3);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(const uint16_t* from, Index stride)
+{
+  Packet8us res = vld1q_dup_u16(from);
+  res = vld1q_lane_u16(from + 1*stride, res, 1);
+  res = vld1q_lane_u16(from + 2*stride, res, 2);
+  res = vld1q_lane_u16(from + 3*stride, res, 3);
+  res = vld1q_lane_u16(from + 4*stride, res, 4);
+  res = vld1q_lane_u16(from + 5*stride, res, 5);
+  res = vld1q_lane_u16(from + 6*stride, res, 6);
+  res = vld1q_lane_u16(from + 7*stride, res, 7);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pgather<int32_t, Packet2i>(const int32_t* from, Index stride)
+{
+  Packet2i res = vld1_dup_s32(from);
+  res = vld1_lane_s32(from + 1*stride, res, 1);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride)
+{
+  Packet4i res = vld1q_dup_s32(from);
+  res = vld1q_lane_s32(from + 1*stride, res, 1);
+  res = vld1q_lane_s32(from + 2*stride, res, 2);
+  res = vld1q_lane_s32(from + 3*stride, res, 3);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pgather<uint32_t, Packet2ui>(const uint32_t* from, Index stride)
+{
+  Packet2ui res = vld1_dup_u32(from);
+  res = vld1_lane_u32(from + 1*stride, res, 1);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride)
+{
+  Packet4ui res = vld1q_dup_u32(from);
+  res = vld1q_lane_u32(from + 1*stride, res, 1);
+  res = vld1q_lane_u32(from + 2*stride, res, 2);
+  res = vld1q_lane_u32(from + 3*stride, res, 3);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride)
+{
+  Packet2l res = vld1q_dup_s64(from);
+  res = vld1q_lane_s64(from + 1*stride, res, 1);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(const uint64_t* from, Index stride)
+{
+  Packet2ul res = vld1q_dup_u64(from);
+  res = vld1q_lane_u64(from + 1*stride, res, 1);
+  return res;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet2f>(float* to, const Packet2f& from, Index stride)
+{
+  vst1_lane_f32(to + stride*0, from, 0);
+  vst1_lane_f32(to + stride*1, from, 1);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+  vst1q_lane_f32(to + stride*0, from, 0);
+  vst1q_lane_f32(to + stride*1, from, 1);
+  vst1q_lane_f32(to + stride*2, from, 2);
+  vst1q_lane_f32(to + stride*3, from, 3);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet4c>(int8_t* to, const Packet4c& from, Index stride)
+{
+  for (int i = 0; i != 4; i++)
+    *(to + i * stride) = reinterpret_cast<const int8_t*>(&from)[i];
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet8c>(int8_t* to, const Packet8c& from, Index stride)
+{
+  vst1_lane_s8(to + stride*0, from, 0);
+  vst1_lane_s8(to + stride*1, from, 1);
+  vst1_lane_s8(to + stride*2, from, 2);
+  vst1_lane_s8(to + stride*3, from, 3);
+  vst1_lane_s8(to + stride*4, from, 4);
+  vst1_lane_s8(to + stride*5, from, 5);
+  vst1_lane_s8(to + stride*6, from, 6);
+  vst1_lane_s8(to + stride*7, from, 7);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet16c>(int8_t* to, const Packet16c& from, Index stride)
+{
+  vst1q_lane_s8(to + stride*0, from, 0);
+  vst1q_lane_s8(to + stride*1, from, 1);
+  vst1q_lane_s8(to + stride*2, from, 2);
+  vst1q_lane_s8(to + stride*3, from, 3);
+  vst1q_lane_s8(to + stride*4, from, 4);
+  vst1q_lane_s8(to + stride*5, from, 5);
+  vst1q_lane_s8(to + stride*6, from, 6);
+  vst1q_lane_s8(to + stride*7, from, 7);
+  vst1q_lane_s8(to + stride*8, from, 8);
+  vst1q_lane_s8(to + stride*9, from, 9);
+  vst1q_lane_s8(to + stride*10, from, 10);
+  vst1q_lane_s8(to + stride*11, from, 11);
+  vst1q_lane_s8(to + stride*12, from, 12);
+  vst1q_lane_s8(to + stride*13, from, 13);
+  vst1q_lane_s8(to + stride*14, from, 14);
+  vst1q_lane_s8(to + stride*15, from, 15);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet4uc>(uint8_t* to, const Packet4uc& from, Index stride)
+{
+  for (int i = 0; i != 4; i++)
+    *(to + i * stride) = reinterpret_cast<const uint8_t*>(&from)[i];
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet8uc>(uint8_t* to, const Packet8uc& from, Index stride)
+{
+  vst1_lane_u8(to + stride*0, from, 0);
+  vst1_lane_u8(to + stride*1, from, 1);
+  vst1_lane_u8(to + stride*2, from, 2);
+  vst1_lane_u8(to + stride*3, from, 3);
+  vst1_lane_u8(to + stride*4, from, 4);
+  vst1_lane_u8(to + stride*5, from, 5);
+  vst1_lane_u8(to + stride*6, from, 6);
+  vst1_lane_u8(to + stride*7, from, 7);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet16uc>(uint8_t* to, const Packet16uc& from, Index stride)
+{
+  vst1q_lane_u8(to + stride*0, from, 0);
+  vst1q_lane_u8(to + stride*1, from, 1);
+  vst1q_lane_u8(to + stride*2, from, 2);
+  vst1q_lane_u8(to + stride*3, from, 3);
+  vst1q_lane_u8(to + stride*4, from, 4);
+  vst1q_lane_u8(to + stride*5, from, 5);
+  vst1q_lane_u8(to + stride*6, from, 6);
+  vst1q_lane_u8(to + stride*7, from, 7);
+  vst1q_lane_u8(to + stride*8, from, 8);
+  vst1q_lane_u8(to + stride*9, from, 9);
+  vst1q_lane_u8(to + stride*10, from, 10);
+  vst1q_lane_u8(to + stride*11, from, 11);
+  vst1q_lane_u8(to + stride*12, from, 12);
+  vst1q_lane_u8(to + stride*13, from, 13);
+  vst1q_lane_u8(to + stride*14, from, 14);
+  vst1q_lane_u8(to + stride*15, from, 15);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet4s>(int16_t* to, const Packet4s& from, Index stride)
+{
+  vst1_lane_s16(to + stride*0, from, 0);
+  vst1_lane_s16(to + stride*1, from, 1);
+  vst1_lane_s16(to + stride*2, from, 2);
+  vst1_lane_s16(to + stride*3, from, 3);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet8s>(int16_t* to, const Packet8s& from, Index stride)
+{
+  vst1q_lane_s16(to + stride*0, from, 0);
+  vst1q_lane_s16(to + stride*1, from, 1);
+  vst1q_lane_s16(to + stride*2, from, 2);
+  vst1q_lane_s16(to + stride*3, from, 3);
+  vst1q_lane_s16(to + stride*4, from, 4);
+  vst1q_lane_s16(to + stride*5, from, 5);
+  vst1q_lane_s16(to + stride*6, from, 6);
+  vst1q_lane_s16(to + stride*7, from, 7);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet4us>(uint16_t* to, const Packet4us& from, Index stride)
+{
+  vst1_lane_u16(to + stride*0, from, 0);
+  vst1_lane_u16(to + stride*1, from, 1);
+  vst1_lane_u16(to + stride*2, from, 2);
+  vst1_lane_u16(to + stride*3, from, 3);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet8us>(uint16_t* to, const Packet8us& from, Index stride)
+{
+  vst1q_lane_u16(to + stride*0, from, 0);
+  vst1q_lane_u16(to + stride*1, from, 1);
+  vst1q_lane_u16(to + stride*2, from, 2);
+  vst1q_lane_u16(to + stride*3, from, 3);
+  vst1q_lane_u16(to + stride*4, from, 4);
+  vst1q_lane_u16(to + stride*5, from, 5);
+  vst1q_lane_u16(to + stride*6, from, 6);
+  vst1q_lane_u16(to + stride*7, from, 7);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet2i>(int32_t* to, const Packet2i& from, Index stride)
+{
+  vst1_lane_s32(to + stride*0, from, 0);
+  vst1_lane_s32(to + stride*1, from, 1);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride)
+{
+  vst1q_lane_s32(to + stride*0, from, 0);
+  vst1q_lane_s32(to + stride*1, from, 1);
+  vst1q_lane_s32(to + stride*2, from, 2);
+  vst1q_lane_s32(to + stride*3, from, 3);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet2ui>(uint32_t* to, const Packet2ui& from, Index stride)
+{
+  vst1_lane_u32(to + stride*0, from, 0);
+  vst1_lane_u32(to + stride*1, from, 1);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride)
+{
+  vst1q_lane_u32(to + stride*0, from, 0);
+  vst1q_lane_u32(to + stride*1, from, 1);
+  vst1q_lane_u32(to + stride*2, from, 2);
+  vst1q_lane_u32(to + stride*3, from, 3);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index stride)
+{
+  vst1q_lane_s64(to + stride*0, from, 0);
+  vst1q_lane_s64(to + stride*1, from, 1);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint64_t, Packet2ul>(uint64_t* to, const Packet2ul& from, Index stride)
+{
+  vst1q_lane_u64(to + stride*0, from, 0);
+  vst1q_lane_u64(to + stride*1, from, 1);
+}
+
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<int8_t>(const int8_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<uint8_t>(const uint8_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<int16_t>(const int16_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<uint16_t>(const uint16_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<uint64_t>(const uint64_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+
+template<> EIGEN_STRONG_INLINE float pfirst<Packet2f>(const Packet2f& a) { return vget_lane_f32(a,0); }
+template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return vgetq_lane_f32(a,0); }
+template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet4c>(const Packet4c& a) { return static_cast<int8_t>(a & 0xff); }
+template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet8c>(const Packet8c& a) { return vget_lane_s8(a,0); }
+template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(const Packet16c& a) { return vgetq_lane_s8(a,0); }
+template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet4uc>(const Packet4uc& a) { return static_cast<uint8_t>(a & 0xff); }
+template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet8uc>(const Packet8uc& a) { return vget_lane_u8(a,0); }
+template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(const Packet16uc& a) { return vgetq_lane_u8(a,0); }
+template<> EIGEN_STRONG_INLINE int16_t pfirst<Packet4s>(const Packet4s& a) { return vget_lane_s16(a,0); }
+template<> EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(const Packet8s& a) { return vgetq_lane_s16(a,0); }
+template<> EIGEN_STRONG_INLINE uint16_t pfirst<Packet4us>(const Packet4us& a) { return vget_lane_u16(a,0); }
+template<> EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(const Packet8us& a) { return vgetq_lane_u16(a,0); }
+template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet2i>(const Packet2i& a) { return vget_lane_s32(a,0); }
+template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { return vgetq_lane_s32(a,0); }
+template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet2ui>(const Packet2ui& a) { return vget_lane_u32(a,0); }
+template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) { return vgetq_lane_u32(a,0); }
+template<> EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) { return vgetq_lane_s64(a,0); }
+template<> EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(const Packet2ul& a) { return vgetq_lane_u64(a,0); }
+
+template<> EIGEN_STRONG_INLINE Packet2f preverse(const Packet2f& a) { return vrev64_f32(a); }
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{
+  const float32x4_t a_r64 = vrev64q_f32(a);
+  return vcombine_f32(vget_high_f32(a_r64), vget_low_f32(a_r64));
+}
+template<> EIGEN_STRONG_INLINE Packet4c preverse(const Packet4c& a)
+{ return vget_lane_s32(vreinterpret_s32_s8(vrev64_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
+template<> EIGEN_STRONG_INLINE Packet8c preverse(const Packet8c& a) { return vrev64_s8(a); }
+template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a)
+{
+  const int8x16_t a_r64 = vrev64q_s8(a);
+  return vcombine_s8(vget_high_s8(a_r64), vget_low_s8(a_r64));
+}
+template<> EIGEN_STRONG_INLINE Packet4uc preverse(const Packet4uc& a)
+{ return vget_lane_u32(vreinterpret_u32_u8(vrev64_u8(vreinterpret_u8_u32(vdup_n_u32(a)))), 0); }
+template<> EIGEN_STRONG_INLINE Packet8uc preverse(const Packet8uc& a) { return vrev64_u8(a); }
+template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a)
+{
+  const uint8x16_t a_r64 = vrev64q_u8(a);
+  return vcombine_u8(vget_high_u8(a_r64), vget_low_u8(a_r64));
+}
+template<> EIGEN_STRONG_INLINE Packet4s preverse(const Packet4s& a) { return vrev64_s16(a); }
+template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a)
+{
+  const int16x8_t a_r64 = vrev64q_s16(a);
+  return vcombine_s16(vget_high_s16(a_r64), vget_low_s16(a_r64));
+}
+template<> EIGEN_STRONG_INLINE Packet4us preverse(const Packet4us& a) { return vrev64_u16(a); }
+template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a)
+{
+  const uint16x8_t a_r64 = vrev64q_u16(a);
+  return vcombine_u16(vget_high_u16(a_r64), vget_low_u16(a_r64));
+}
+template<> EIGEN_STRONG_INLINE Packet2i preverse(const Packet2i& a) { return vrev64_s32(a); }
+template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
+{
+  const int32x4_t a_r64 = vrev64q_s32(a);
+  return vcombine_s32(vget_high_s32(a_r64), vget_low_s32(a_r64));
+}
+template<> EIGEN_STRONG_INLINE Packet2ui preverse(const Packet2ui& a) { return vrev64_u32(a); }
+template<> EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a)
+{
+  const uint32x4_t a_r64 = vrev64q_u32(a);
+  return vcombine_u32(vget_high_u32(a_r64), vget_low_u32(a_r64));
+}
+template<> EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a)
+{ return vcombine_s64(vget_high_s64(a), vget_low_s64(a)); }
+template<> EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a)
+{ return vcombine_u64(vget_high_u64(a), vget_low_u64(a)); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pabs(const Packet2f& a) { return vabs_f32(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
+template<> EIGEN_STRONG_INLINE Packet4c pabs<Packet4c>(const Packet4c& a)
+{ return vget_lane_s32(vreinterpret_s32_s8(vabs_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
+template<> EIGEN_STRONG_INLINE Packet8c pabs(const Packet8c& a) { return vabs_s8(a); }
+template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vabsq_s8(a); }
+template<> EIGEN_STRONG_INLINE Packet4uc pabs(const Packet4uc& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8uc pabs(const Packet8uc& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4s pabs(const Packet4s& a) { return vabs_s16(a); }
+template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vabsq_s16(a); }
+template<> EIGEN_STRONG_INLINE Packet4us pabs(const Packet4us& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2i pabs(const Packet2i& a) { return vabs_s32(a); }
+template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
+template<> EIGEN_STRONG_INLINE Packet2ui pabs(const Packet2ui& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
+#if EIGEN_ARCH_ARM64
+  return vabsq_s64(a);
+#else
+  return vcombine_s64(
+      vdup_n_s64((std::abs)(vgetq_lane_s64(a, 0))),
+      vdup_n_s64((std::abs)(vgetq_lane_s64(a, 1))));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet2f pfrexp<Packet2f>(const Packet2f& a, Packet2f& exponent)
+{ return pfrexp_generic(a,exponent); }
+template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent)
+{ return pfrexp_generic(a,exponent); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pldexp<Packet2f>(const Packet2f& a, const Packet2f& exponent)
+{ return pldexp_generic(a,exponent); }
+template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent)
+{ return pldexp_generic(a,exponent); }
+
+template<> EIGEN_STRONG_INLINE float predux<Packet2f>(const Packet2f& a) { return vget_lane_f32(vpadd_f32(a,a), 0); }
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+  const float32x2_t sum = vadd_f32(vget_low_f32(a), vget_high_f32(a));
+  return vget_lane_f32(vpadd_f32(sum, sum), 0);
+}
+template<> EIGEN_STRONG_INLINE int8_t predux<Packet4c>(const Packet4c& a)
+{
+  const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
+  int8x8_t sum = vpadd_s8(a_dup, a_dup);
+  sum = vpadd_s8(sum, sum);
+  return vget_lane_s8(sum, 0);
+}
+template<> EIGEN_STRONG_INLINE int8_t predux<Packet8c>(const Packet8c& a)
+{
+  int8x8_t sum = vpadd_s8(a,a);
+  sum = vpadd_s8(sum, sum);
+  sum = vpadd_s8(sum, sum);
+  return vget_lane_s8(sum, 0);
+}
+template<> EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a)
+{
+  int8x8_t sum = vadd_s8(vget_low_s8(a), vget_high_s8(a));
+  sum = vpadd_s8(sum, sum);
+  sum = vpadd_s8(sum, sum);
+  sum = vpadd_s8(sum, sum);
+  return vget_lane_s8(sum, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux<Packet4uc>(const Packet4uc& a)
+{
+  const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
+  uint8x8_t sum = vpadd_u8(a_dup, a_dup);
+  sum = vpadd_u8(sum, sum);
+  return vget_lane_u8(sum, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(const Packet8uc& a)
+{
+  uint8x8_t sum = vpadd_u8(a,a);
+  sum = vpadd_u8(sum, sum);
+  sum = vpadd_u8(sum, sum);
+  return vget_lane_u8(sum, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a)
+{
+  uint8x8_t sum = vadd_u8(vget_low_u8(a), vget_high_u8(a));
+  sum = vpadd_u8(sum, sum);
+  sum = vpadd_u8(sum, sum);
+  sum = vpadd_u8(sum, sum);
+  return vget_lane_u8(sum, 0);
+}
+template<> EIGEN_STRONG_INLINE int16_t predux<Packet4s>(const Packet4s& a)
+{
+  const int16x4_t sum = vpadd_s16(a,a);
+  return vget_lane_s16(vpadd_s16(sum, sum), 0);
+}
+template<> EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a)
+{
+  int16x4_t sum = vadd_s16(vget_low_s16(a), vget_high_s16(a));
+  sum = vpadd_s16(sum, sum);
+  sum = vpadd_s16(sum, sum);
+  return vget_lane_s16(sum, 0);
+}
+template<> EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(const Packet4us& a)
+{
+  const uint16x4_t sum = vpadd_u16(a,a);
+  return vget_lane_u16(vpadd_u16(sum, sum), 0);
+}
+template<> EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a)
+{
+  uint16x4_t sum = vadd_u16(vget_low_u16(a), vget_high_u16(a));
+  sum = vpadd_u16(sum, sum);
+  sum = vpadd_u16(sum, sum);
+  return vget_lane_u16(sum, 0);
+}
+template<> EIGEN_STRONG_INLINE int32_t predux<Packet2i>(const Packet2i& a) { return vget_lane_s32(vpadd_s32(a,a), 0); }
+template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a)
+{
+  const int32x2_t sum = vadd_s32(vget_low_s32(a), vget_high_s32(a));
+  return vget_lane_s32(vpadd_s32(sum, sum), 0);
+}
+template<> EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(const Packet2ui& a) { return vget_lane_u32(vpadd_u32(a,a), 0); }
+template<> EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a)
+{
+  const uint32x2_t sum = vadd_u32(vget_low_u32(a), vget_high_u32(a));
+  return vget_lane_u32(vpadd_u32(sum, sum), 0);
+}
+template<> EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a)
+{ return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); }
+template<> EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a)
+{ return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c& a)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a,
+      vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half_dowto4(const Packet16c& a)
+{ return vadd_s8(vget_high_s8(a), vget_low_s8(a)); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half_dowto4(const Packet8uc& a)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a,
+      vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half_dowto4(const Packet16uc& a)
+{ return vadd_u8(vget_high_u8(a), vget_low_u8(a)); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half_dowto4(const Packet8s& a)
+{ return vadd_s16(vget_high_s16(a), vget_low_s16(a)); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(const Packet8us& a)
+{ return vadd_u16(vget_high_u16(a), vget_low_u16(a)); }
+
+// Other reduction functions:
+// mul
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet2f>(const Packet2f& a)
+{ return vget_lane_f32(a, 0) * vget_lane_f32(a, 1); }
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{ return predux_mul(vmul_f32(vget_low_f32(a), vget_high_f32(a))); }
+template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet4c>(const Packet4c& a)
+{
+  int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a));
+  prod = vmul_s8(prod, vrev16_s8(prod));
+  return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 2);
+}
+template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet8c>(const Packet8c& a)
+{
+  int8x8_t prod = vmul_s8(a, vrev16_s8(a));
+  prod = vmul_s8(prod, vrev32_s8(prod));
+  return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4);
+}
+template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a)
+{ return predux_mul(vmul_s8(vget_low_s8(a), vget_high_s8(a))); }
+template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet4uc>(const Packet4uc& a)
+{
+  uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a));
+  prod = vmul_u8(prod, vrev16_u8(prod));
+  return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 2);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet8uc>(const Packet8uc& a)
+{
+  uint8x8_t prod = vmul_u8(a, vrev16_u8(a));
+  prod = vmul_u8(prod, vrev32_u8(prod));
+  return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a)
+{ return predux_mul(vmul_u8(vget_low_u8(a), vget_high_u8(a))); }
+template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet4s>(const Packet4s& a)
+{
+  const int16x4_t prod = vmul_s16(a, vrev32_s16(a));
+  return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
+}
+template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(const Packet8s& a)
+{
+  int16x4_t prod;
+
+  // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
+  prod = vmul_s16(vget_low_s16(a), vget_high_s16(a));
+  // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8|
+  prod = vmul_s16(prod, vrev32_s16(prod));
+  // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
+  return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
+}
+template<> EIGEN_STRONG_INLINE uint16_t predux_mul<Packet4us>(const Packet4us& a)
+{
+  const uint16x4_t prod = vmul_u16(a, vrev32_u16(a));
+  return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
+}
+template<> EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a)
+{
+  uint16x4_t prod;
+
+  // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
+  prod = vmul_u16(vget_low_u16(a), vget_high_u16(a));
+  // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8|
+  prod = vmul_u16(prod, vrev32_u16(prod));
+  // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
+  return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
+}
+template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet2i>(const Packet2i& a)
+{ return vget_lane_s32(a, 0) * vget_lane_s32(a, 1); }
+template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
+{ return predux_mul(vmul_s32(vget_low_s32(a), vget_high_s32(a))); }
+template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet2ui>(const Packet2ui& a)
+{ return vget_lane_u32(a, 0) * vget_lane_u32(a, 1); }
+template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a)
+{ return predux_mul(vmul_u32(vget_low_u32(a), vget_high_u32(a))); }
+template<> EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a)
+{ return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1); }
+template<> EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a)
+{ return vgetq_lane_u64(a, 0) * vgetq_lane_u64(a, 1); }
+
+// min
+template<> EIGEN_STRONG_INLINE float predux_min<Packet2f>(const Packet2f& a)
+{ return vget_lane_f32(vpmin_f32(a,a), 0); }
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+  const float32x2_t min = vmin_f32(vget_low_f32(a), vget_high_f32(a));
+  return vget_lane_f32(vpmin_f32(min, min), 0);
+}
+template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet4c>(const Packet4c& a)
+{
+  const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
+  int8x8_t min = vpmin_s8(a_dup, a_dup);
+  min = vpmin_s8(min, min);
+  return vget_lane_s8(min, 0);
+}
+template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(const Packet8c& a)
+{
+  int8x8_t min = vpmin_s8(a,a);
+  min = vpmin_s8(min, min);
+  min = vpmin_s8(min, min);
+  return vget_lane_s8(min, 0);
+}
+template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a)
+{
+  int8x8_t min = vmin_s8(vget_low_s8(a), vget_high_s8(a));
+  min = vpmin_s8(min, min);
+  min = vpmin_s8(min, min);
+  min = vpmin_s8(min, min);
+  return vget_lane_s8(min, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet4uc>(const Packet4uc& a)
+{
+  const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
+  uint8x8_t min = vpmin_u8(a_dup, a_dup);
+  min = vpmin_u8(min, min);
+  return vget_lane_u8(min, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(const Packet8uc& a)
+{
+  uint8x8_t min = vpmin_u8(a,a);
+  min = vpmin_u8(min, min);
+  min = vpmin_u8(min, min);
+  return vget_lane_u8(min, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a)
+{
+  uint8x8_t min = vmin_u8(vget_low_u8(a), vget_high_u8(a));
+  min = vpmin_u8(min, min);
+  min = vpmin_u8(min, min);
+  min = vpmin_u8(min, min);
+  return vget_lane_u8(min, 0);
+}
+template<> EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(const Packet4s& a)
+{
+  const int16x4_t min = vpmin_s16(a,a);
+  return vget_lane_s16(vpmin_s16(min, min), 0);
+}
+template<> EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a)
+{
+  int16x4_t min = vmin_s16(vget_low_s16(a), vget_high_s16(a));
+  min = vpmin_s16(min, min);
+  min = vpmin_s16(min, min);
+  return vget_lane_s16(min, 0);
+}
+template<> EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(const Packet4us& a)
+{
+  const uint16x4_t min = vpmin_u16(a,a);
+  return vget_lane_u16(vpmin_u16(min, min), 0);
+}
+template<> EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a)
+{
+  uint16x4_t min = vmin_u16(vget_low_u16(a), vget_high_u16(a));
+  min = vpmin_u16(min, min);
+  min = vpmin_u16(min, min);
+  return vget_lane_u16(min, 0);
+}
+template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(const Packet2i& a)
+{ return vget_lane_s32(vpmin_s32(a,a), 0); }
+template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a)
+{
+  const int32x2_t min = vmin_s32(vget_low_s32(a), vget_high_s32(a));
+  return vget_lane_s32(vpmin_s32(min, min), 0);
+}
+template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(const Packet2ui& a)
+{ return vget_lane_u32(vpmin_u32(a,a), 0); }
+template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a)
+{
+  const uint32x2_t min = vmin_u32(vget_low_u32(a), vget_high_u32(a));
+  return vget_lane_u32(vpmin_u32(min, min), 0);
+}
+template<> EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(const Packet2l& a)
+{ return (std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); }
+template<> EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(const Packet2ul& a)
+{ return (std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); }
+
+// max
+template<> EIGEN_STRONG_INLINE float predux_max<Packet2f>(const Packet2f& a)
+{ return vget_lane_f32(vpmax_f32(a,a), 0); }
+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+  const float32x2_t max = vmax_f32(vget_low_f32(a), vget_high_f32(a));
+  return vget_lane_f32(vpmax_f32(max, max), 0);
+}
+template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet4c>(const Packet4c& a)
+{
+  const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
+  int8x8_t max = vpmax_s8(a_dup, a_dup);
+  max = vpmax_s8(max, max);
+  return vget_lane_s8(max, 0);
+}
+template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(const Packet8c& a)
+{
+  int8x8_t max = vpmax_s8(a,a);
+  max = vpmax_s8(max, max);
+  max = vpmax_s8(max, max);
+  return vget_lane_s8(max, 0);
+}
+template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a)
+{
+  int8x8_t max = vmax_s8(vget_low_s8(a), vget_high_s8(a));
+  max = vpmax_s8(max, max);
+  max = vpmax_s8(max, max);
+  max = vpmax_s8(max, max);
+  return vget_lane_s8(max, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet4uc>(const Packet4uc& a)
+{
+  const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
+  uint8x8_t max = vpmax_u8(a_dup, a_dup);
+  max = vpmax_u8(max, max);
+  return vget_lane_u8(max, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(const Packet8uc& a)
+{
+  uint8x8_t max = vpmax_u8(a,a);
+  max = vpmax_u8(max, max);
+  max = vpmax_u8(max, max);
+  return vget_lane_u8(max, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a)
+{
+  uint8x8_t max = vmax_u8(vget_low_u8(a), vget_high_u8(a));
+  max = vpmax_u8(max, max);
+  max = vpmax_u8(max, max);
+  max = vpmax_u8(max, max);
+  return vget_lane_u8(max, 0);
+}
+template<> EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(const Packet4s& a)
+{
+  const int16x4_t max = vpmax_s16(a,a);
+  return vget_lane_s16(vpmax_s16(max, max), 0);
+}
+template<> EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a)
+{
+  int16x4_t max = vmax_s16(vget_low_s16(a), vget_high_s16(a));
+  max = vpmax_s16(max, max);
+  max = vpmax_s16(max, max);
+  return vget_lane_s16(max, 0);
+}
+template<> EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(const Packet4us& a)
+{
+  const uint16x4_t max = vpmax_u16(a,a);
+  return vget_lane_u16(vpmax_u16(max, max), 0);
+}
+template<> EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a)
+{
+  uint16x4_t max = vmax_u16(vget_low_u16(a), vget_high_u16(a));
+  max = vpmax_u16(max, max);
+  max = vpmax_u16(max, max);
+  return vget_lane_u16(max, 0);
+}
+template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(const Packet2i& a)
+{ return vget_lane_s32(vpmax_s32(a,a), 0); }
+template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
+{
+  const int32x2_t max = vmax_s32(vget_low_s32(a), vget_high_s32(a));
+  return vget_lane_s32(vpmax_s32(max, max), 0);
+}
+template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(const Packet2ui& a)
+{ return vget_lane_u32(vpmax_u32(a,a), 0); }
+template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a)
+{
+  const uint32x2_t max = vmax_u32(vget_low_u32(a), vget_high_u32(a));
+  return vget_lane_u32(vpmax_u32(max, max), 0);
+}
+template<> EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(const Packet2l& a)
+{ return (std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); }
+template<> EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(const Packet2ul& a)
+{ return (std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); }
+
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
+{
+  uint32x2_t tmp = vorr_u32(vget_low_u32( vreinterpretq_u32_f32(x)),
+                            vget_high_u32(vreinterpretq_u32_f32(x)));
+  return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
+}
+
+// Helpers for ptranspose.
+namespace detail {
+  
+template<typename Packet>
+void zip_in_place(Packet& p1, Packet& p2);
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet2f>(Packet2f& p1, Packet2f& p2) {
+  const float32x2x2_t tmp = vzip_f32(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4f>(Packet4f& p1, Packet4f& p2) {
+  const float32x4x2_t tmp = vzipq_f32(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet8c>(Packet8c& p1, Packet8c& p2) {
+  const int8x8x2_t tmp = vzip_s8(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet16c>(Packet16c& p1, Packet16c& p2) {
+  const int8x16x2_t tmp = vzipq_s8(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet8uc>(Packet8uc& p1, Packet8uc& p2) {
+  const uint8x8x2_t tmp = vzip_u8(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet16uc>(Packet16uc& p1, Packet16uc& p2) {
+  const uint8x16x2_t tmp = vzipq_u8(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet2i>(Packet2i& p1, Packet2i& p2) {
+  const int32x2x2_t tmp = vzip_s32(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4i>(Packet4i& p1, Packet4i& p2) {
+  const int32x4x2_t tmp = vzipq_s32(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet2ui>(Packet2ui& p1, Packet2ui& p2) {
+  const uint32x2x2_t tmp = vzip_u32(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4ui>(Packet4ui& p1, Packet4ui& p2) {
+  const uint32x4x2_t tmp = vzipq_u32(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4s>(Packet4s& p1, Packet4s& p2) {
+  const int16x4x2_t tmp = vzip_s16(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet8s>(Packet8s& p1, Packet8s& p2) {
+  const int16x8x2_t tmp = vzipq_s16(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4us>(Packet4us& p1, Packet4us& p2) {
+  const uint16x4x2_t tmp = vzip_u16(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet8us>(Packet8us& p1, Packet8us& p2) {
+  const uint16x8x2_t tmp = vzipq_u16(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<typename Packet>
+EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 2>& kernel) {
+  zip_in_place(kernel.packet[0], kernel.packet[1]);
+}
+
+template<typename Packet>
+EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 4>& kernel) {
+  zip_in_place(kernel.packet[0], kernel.packet[2]);
+  zip_in_place(kernel.packet[1], kernel.packet[3]);
+  zip_in_place(kernel.packet[0], kernel.packet[1]);
+  zip_in_place(kernel.packet[2], kernel.packet[3]);
+}
+
+template<typename Packet>
+EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 8>& kernel) {
+  zip_in_place(kernel.packet[0], kernel.packet[4]);
+  zip_in_place(kernel.packet[1], kernel.packet[5]);
+  zip_in_place(kernel.packet[2], kernel.packet[6]);
+  zip_in_place(kernel.packet[3], kernel.packet[7]);
+
+  zip_in_place(kernel.packet[0], kernel.packet[2]);
+  zip_in_place(kernel.packet[1], kernel.packet[3]);
+  zip_in_place(kernel.packet[4], kernel.packet[6]);
+  zip_in_place(kernel.packet[5], kernel.packet[7]);
+  
+  zip_in_place(kernel.packet[0], kernel.packet[1]);
+  zip_in_place(kernel.packet[2], kernel.packet[3]);
+  zip_in_place(kernel.packet[4], kernel.packet[5]);
+  zip_in_place(kernel.packet[6], kernel.packet[7]);
+}
+
+template<typename Packet>
+EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 16>& kernel) {
+  EIGEN_UNROLL_LOOP
+  for (int i=0; i<4; ++i) {
+    const int m = (1 << i);
+    EIGEN_UNROLL_LOOP
+    for (int j=0; j<m; ++j) {
+      const int n = (1 << (3-i));
+      EIGEN_UNROLL_LOOP
+      for (int k=0; k<n; ++k) {
+        const int idx = 2*j*n+k;
+        zip_in_place(kernel.packet[idx], kernel.packet[idx + n]);
+      }
+    }
+  }
+}
+
+} // namespace detail
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2f, 2>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4c, 4>& kernel)
+{
+  const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], vdup_n_s32(kernel.packet[0]), 1));
+  const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1));
+
+  const int8x8x2_t zip8 = vzip_s8(a,b);
+  const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1]));
+
+  kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0);
+  kernel.packet[1] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 1);
+  kernel.packet[2] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 0);
+  kernel.packet[3] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 1);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8c, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8c, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4uc, 4>& kernel)
+{
+  const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1));
+  const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 1));
+
+  const uint8x8x2_t zip8 = vzip_u8(a,b);
+  const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1]));
+
+  kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0);
+  kernel.packet[1] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 1);
+  kernel.packet[2] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 0);
+  kernel.packet[3] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 1);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8uc, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8uc, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4s, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4us, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2i, 2>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
+    detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ui, 2>& kernel) {
+  detail::zip_in_place(kernel.packet[0], kernel.packet[1]);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet2l, 2>& kernel)
+{
+#if EIGEN_ARCH_ARM64
+  const int64x2_t tmp1 = vzip1q_s64(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[1] = vzip2q_s64(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[0] = tmp1;
+#else
+  const int64x1_t tmp[2][2] = {
+    { vget_low_s64(kernel.packet[0]), vget_high_s64(kernel.packet[0]) },
+    { vget_low_s64(kernel.packet[1]), vget_high_s64(kernel.packet[1]) }
+  };
+
+  kernel.packet[0] = vcombine_s64(tmp[0][0], tmp[1][0]);
+  kernel.packet[1] = vcombine_s64(tmp[0][1], tmp[1][1]);
+#endif
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet2ul, 2>& kernel)
+{
+#if EIGEN_ARCH_ARM64
+  const uint64x2_t tmp1 = vzip1q_u64(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[1] = vzip2q_u64(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[0] = tmp1;
+#else
+  const uint64x1_t tmp[2][2] = {
+    { vget_low_u64(kernel.packet[0]), vget_high_u64(kernel.packet[0]) },
+    { vget_low_u64(kernel.packet[1]), vget_high_u64(kernel.packet[1]) }
+  };
+
+  kernel.packet[0] = vcombine_u64(tmp[0][0], tmp[1][0]);
+  kernel.packet[1] = vcombine_u64(tmp[0][1], tmp[1][1]);
+#endif
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pselect( const Packet2f& mask, const Packet2f& a, const Packet2f& b)
+{ return vbsl_f32(vreinterpret_u32_f32(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b)
+{ return vbslq_f32(vreinterpretq_u32_f32(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pselect(const Packet8c& mask, const Packet8c& a, const Packet8c& b)
+{ return vbsl_s8(vreinterpret_u8_s8(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b)
+{ return vbslq_s8(vreinterpretq_u8_s8(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pselect(const Packet8uc& mask, const Packet8uc& a, const Packet8uc& b)
+{ return vbsl_u8(mask, a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a, const Packet16uc& b)
+{ return vbslq_u8(mask, a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pselect(const Packet4s& mask, const Packet4s& a, const Packet4s& b)
+{ return vbsl_s16(vreinterpret_u16_s16(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b)
+{ return vbslq_s16(vreinterpretq_u16_s16(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pselect(const Packet4us& mask, const Packet4us& a, const Packet4us& b)
+{ return vbsl_u16(mask, a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b)
+{ return vbslq_u16(mask, a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pselect(const Packet2i& mask, const Packet2i& a, const Packet2i& b)
+{ return vbsl_s32(vreinterpret_u32_s32(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b)
+{ return vbslq_s32(vreinterpretq_u32_s32(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pselect(const Packet2ui& mask, const Packet2ui& a, const Packet2ui& b)
+{ return vbsl_u32(mask, a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b)
+{ return vbslq_u32(mask, a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b)
+{ return vbslq_s64(vreinterpretq_u64_s64(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b)
+{ return vbslq_u64(mask, a, b); }
+
+// Use armv8 rounding intinsics if available.
+#if EIGEN_ARCH_ARMV8
+template<> EIGEN_STRONG_INLINE Packet2f print<Packet2f>(const Packet2f& a)
+{ return vrndn_f32(a); }
+
+template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
+{ return vrndnq_f32(a); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a)
+{ return vrndm_f32(a); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
+{ return vrndmq_f32(a); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a)
+{ return vrndp_f32(a); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
+{ return vrndpq_f32(a); }
+
+#else
+
+template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
+  // Adds and subtracts signum(a) * 2^23 to force rounding.
+  const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23));
+  const Packet4f abs_a = pabs(a);
+  Packet4f r = padd(abs_a, limit);
+  // Don't compile-away addition and subtraction.
+  EIGEN_OPTIMIZATION_BARRIER(r);
+  r = psub(r, limit);
+  // If greater than limit, simply return a.  Otherwise, account for sign.
+  r = pselect(pcmp_lt(abs_a, limit),
+              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) {
+  // Adds and subtracts signum(a) * 2^23 to force rounding.
+  const Packet2f limit = pset1<Packet2f>(static_cast<float>(1<<23));
+  const Packet2f abs_a = pabs(a);
+  Packet2f r = padd(abs_a, limit);
+  // Don't compile-away addition and subtraction.
+  EIGEN_OPTIMIZATION_BARRIER(r);
+  r = psub(r, limit);
+  // If greater than limit, simply return a.  Otherwise, account for sign.
+  r = pselect(pcmp_lt(abs_a, limit),
+              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
+{
+  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
+  Packet4f tmp  = print<Packet4f>(a);
+  // If greater, subtract one.
+  Packet4f mask = pcmp_lt(a, tmp);
+  mask = pand(mask, cst_1);
+  return psub(tmp, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a)
+{
+  const Packet2f cst_1 = pset1<Packet2f>(1.0f);
+  Packet2f tmp  = print<Packet2f>(a);
+  // If greater, subtract one.
+  Packet2f mask = pcmp_lt(a, tmp);
+  mask = pand(mask, cst_1);
+  return psub(tmp, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
+{
+  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
+  Packet4f tmp  = print<Packet4f>(a);
+  // If smaller, add one.
+  Packet4f mask = pcmp_lt(tmp, a);
+  mask = pand(mask, cst_1);
+  return padd(tmp, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a)
+{
+  const Packet2f cst_1 = pset1<Packet2f>(1.0);
+  Packet2f tmp  = print<Packet2f>(a);
+  // If smaller, add one.
+  Packet2f mask = pcmp_lt(tmp, a);
+  mask = pand(mask, cst_1);
+  return padd(tmp, mask);
+}
+
+#endif
+
+/**
+ * Computes the integer square root
+ * @remarks The calculation is performed using an algorithm which iterates through each binary digit of the result
+ *   and tests whether setting that digit to 1 would cause the square of the value to be greater than the argument
+ *   value. The algorithm is described in detail here: http://ww1.microchip.com/downloads/en/AppNotes/91040a.pdf .
+ */
+template<> EIGEN_STRONG_INLINE Packet4uc psqrt(const Packet4uc& a) {
+  uint8x8_t x = vreinterpret_u8_u32(vdup_n_u32(a));
+  uint8x8_t res = vdup_n_u8(0);
+  uint8x8_t add = vdup_n_u8(0x8);
+  for (int i = 0; i < 4; i++)
+  {
+    const uint8x8_t temp = vorr_u8(res, add);
+    res = vbsl_u8(vcge_u8(x, vmul_u8(temp, temp)), temp, res);
+    add = vshr_n_u8(add, 1);
+  }
+  return vget_lane_u32(vreinterpret_u32_u8(res), 0);
+}
+/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
+template<> EIGEN_STRONG_INLINE Packet8uc psqrt(const Packet8uc& a) {
+  uint8x8_t res = vdup_n_u8(0);
+  uint8x8_t add = vdup_n_u8(0x8);
+  for (int i = 0; i < 4; i++)
+  {
+    const uint8x8_t temp = vorr_u8(res, add);
+    res = vbsl_u8(vcge_u8(a, vmul_u8(temp, temp)), temp, res);
+    add = vshr_n_u8(add, 1);
+  }
+  return res;
+}
+/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
+template<> EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) {
+  uint8x16_t res = vdupq_n_u8(0);
+  uint8x16_t add = vdupq_n_u8(0x8);
+  for (int i = 0; i < 4; i++)
+  {
+    const uint8x16_t temp = vorrq_u8(res, add);
+    res = vbslq_u8(vcgeq_u8(a, vmulq_u8(temp, temp)), temp, res);
+    add = vshrq_n_u8(add, 1);
+  }
+  return res;
+}
+/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
+template<> EIGEN_STRONG_INLINE Packet4us psqrt(const Packet4us& a) {
+  uint16x4_t res = vdup_n_u16(0);
+  uint16x4_t add = vdup_n_u16(0x80);
+  for (int i = 0; i < 8; i++)
+  {
+    const uint16x4_t temp = vorr_u16(res, add);
+    res = vbsl_u16(vcge_u16(a, vmul_u16(temp, temp)), temp, res);
+    add = vshr_n_u16(add, 1);
+  }
+  return res;
+}
+/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
+template<> EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) {
+  uint16x8_t res = vdupq_n_u16(0);
+  uint16x8_t add = vdupq_n_u16(0x80);
+  for (int i = 0; i < 8; i++)
+  {
+    const uint16x8_t temp = vorrq_u16(res, add);
+    res = vbslq_u16(vcgeq_u16(a, vmulq_u16(temp, temp)), temp, res);
+    add = vshrq_n_u16(add, 1);
+  }
+  return res;
+}
+/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
+template<> EIGEN_STRONG_INLINE Packet2ui psqrt(const Packet2ui& a) {
+  uint32x2_t res = vdup_n_u32(0);
+  uint32x2_t add = vdup_n_u32(0x8000);
+  for (int i = 0; i < 16; i++)
+  {
+    const uint32x2_t temp = vorr_u32(res, add);
+    res = vbsl_u32(vcge_u32(a, vmul_u32(temp, temp)), temp, res);
+    add = vshr_n_u32(add, 1);
+  }
+  return res;
+}
+/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
+template<> EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {
+  uint32x4_t res = vdupq_n_u32(0);
+  uint32x4_t add = vdupq_n_u32(0x8000);
+  for (int i = 0; i < 16; i++)
+  {
+    const uint32x4_t temp = vorrq_u32(res, add);
+    res = vbslq_u32(vcgeq_u32(a, vmulq_u32(temp, temp)), temp, res);
+    add = vshrq_n_u32(add, 1);
+  }
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
+  // Compute approximate reciprocal sqrt.
+  Packet4f x = vrsqrteq_f32(a);
+  // Do Newton iterations for 1/sqrt(x).
+  x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, x), x), x);
+  x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, x), x), x);
+  const Packet4f infinity = pset1<Packet4f>(NumTraits<float>::infinity());
+  return pselect(pcmp_eq(a, pzero(a)), infinity, x);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f prsqrt(const Packet2f& a) {
+  // Compute approximate reciprocal sqrt.
+  Packet2f x = vrsqrte_f32(a);
+  // Do Newton iterations for 1/sqrt(x).
+  x = vmul_f32(vrsqrts_f32(vmul_f32(a, x), x), x);
+  x = vmul_f32(vrsqrts_f32(vmul_f32(a, x), x), x);
+  const Packet2f infinity = pset1<Packet2f>(NumTraits<float>::infinity());
+  return pselect(pcmp_eq(a, pzero(a)), infinity, x);
+}
+
+// Unfortunately vsqrt_f32 is only available for A64.
+#if EIGEN_ARCH_ARM64
+template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& _x){return vsqrtq_f32(_x);}
+template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& _x){return vsqrt_f32(_x); }
+#else
+template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
+  const Packet4f infinity = pset1<Packet4f>(NumTraits<float>::infinity());
+  const Packet4f is_zero_or_inf = por(pcmp_eq(a, pzero(a)), pcmp_eq(a, infinity));
+  return pselect(is_zero_or_inf, a, pmul(a, prsqrt(a)));
+}
+template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
+  const Packet2f infinity = pset1<Packet2f>(NumTraits<float>::infinity());
+  const Packet2f is_zero_or_inf = por(pcmp_eq(a, pzero(a)), pcmp_eq(a, infinity));
+  return pselect(is_zero_or_inf, a, pmul(a, prsqrt(a)));
+}
+#endif
+
+//---------- bfloat16 ----------
+// TODO: Add support for native armv8.6-a bfloat16_t
+
+// TODO: Guard if we have native bfloat16 support
+typedef eigen_packet_wrapper<uint16x4_t, 19> Packet4bf;
+
+template<> struct is_arithmetic<Packet4bf> { enum { value = true }; };
+
+template<> struct packet_traits<bfloat16> : default_packet_traits
+{
+  typedef Packet4bf type;
+  typedef Packet4bf half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 0,
+
+    HasCmp       = 1,
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasAbsDiff   = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0,
+    HasDiv       = 1,
+    HasFloor     = 1,
+    HasCeil      = 1,
+    HasRint      = 1,
+
+    HasSin  = EIGEN_FAST_MATH,
+    HasCos  = EIGEN_FAST_MATH,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 0,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf  = EIGEN_FAST_MATH,
+    HasBessel = 0,  // Issues with accuracy.
+    HasNdtri = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet4bf>
+{
+  typedef bfloat16 type;
+  typedef Packet4bf half;
+  enum
+  {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+namespace detail {  
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4bf>(Packet4bf& p1, Packet4bf& p2) {
+  const uint16x4x2_t tmp = vzip_u16(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+} // namespace detail
+
+EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p)
+{
+  // See the scalar implemention in BFloat16.h for a comprehensible explanation
+  // of this fast rounding algorithm
+  Packet4ui input = reinterpret_cast<Packet4ui>(p);
+
+  // lsb = (input >> 16) & 1
+  Packet4ui lsb =  vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1));
+
+  // rounding_bias = 0x7fff + lsb
+  Packet4ui rounding_bias = vaddq_u32(lsb, vdupq_n_u32(0x7fff));
+
+  // input += rounding_bias
+  input = vaddq_u32(input, rounding_bias);
+
+  // input = input >> 16
+  input = vshrq_n_u32(input, 16);
+
+  // Replace float-nans by bfloat16-nans, that is 0x7fc0
+  const Packet4ui bf16_nan = vdupq_n_u32(0x7fc0);
+  const Packet4ui mask = vceqq_f32(p, p);
+  input = vbslq_u32(mask, input, bf16_nan);
+
+  // output = static_cast<uint16_t>(input)
+  return vmovn_u32(input);
+}
+
+EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p)
+{
+  return reinterpret_cast<Packet4f>(vshlq_n_u32(vmovl_u16(p), 16));
+}
+
+EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) {
+  return vmovn_u32(vreinterpretq_u32_f32(p));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pset1<Packet4bf>(const bfloat16& from) {
+  return pset1<Packet4us>(from.value);
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 pfirst<Packet4bf>(const Packet4bf& from) {
+  return bfloat16_impl::raw_uint16_to_bfloat16(static_cast<uint16_t>(pfirst<Packet4us>(from)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(const bfloat16* from)
+{
+  return pload<Packet4us>(reinterpret_cast<const uint16_t*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf ploadu<Packet4bf>(const bfloat16* from)
+{
+  return ploadu<Packet4us>(reinterpret_cast<const uint16_t*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet4bf& from)
+{
+  EIGEN_DEBUG_ALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet4bf& from)
+{
+  EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf ploaddup<Packet4bf>(const bfloat16* from)
+{
+  return ploaddup<Packet4us>(reinterpret_cast<const uint16_t*>(from));
+}
+
+template <> EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) {
+  return F32ToBf16(pabs<Packet4f>(Bf16ToF32(a)));
+}
+
+template <> EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNumbers, Packet4bf>(const Packet4bf &a,
+                                                                            const Packet4bf &b)
+{
+  return F32ToBf16(pmin<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+template <> EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNaN, Packet4bf>(const Packet4bf &a,
+                                                                        const Packet4bf &b)
+{
+  return F32ToBf16(pmin<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <> EIGEN_STRONG_INLINE Packet4bf pmin<Packet4bf>(const Packet4bf &a,
+                                                          const Packet4bf &b)
+{
+  return F32ToBf16(pmin<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <> EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNumbers, Packet4bf>(const Packet4bf &a,
+                                                                            const Packet4bf &b)
+{
+  return F32ToBf16(pmax<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+template <> EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNaN, Packet4bf>(const Packet4bf &a,
+                                                                        const Packet4bf &b)
+{
+  return F32ToBf16(pmax<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <> EIGEN_STRONG_INLINE Packet4bf pmax<Packet4bf>(const Packet4bf &a,
+                                                          const Packet4bf &b)
+{
+  return F32ToBf16(pmax<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf plset<Packet4bf>(const bfloat16& a)
+{
+  return F32ToBf16(plset<Packet4f>(static_cast<float>(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a,const Packet4bf& b) {
+  return por<Packet4us>(a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a,const Packet4bf& b) {
+  return pxor<Packet4us>(a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a,const Packet4bf& b) {
+  return pand<Packet4us>(a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pandnot(const Packet4bf& a,const Packet4bf& b) {
+  return pandnot<Packet4us>(a, b);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a,
+                                                      const Packet4bf& b)
+{
+  return pselect<Packet4us>(mask, a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf print<Packet4bf>(const Packet4bf& a)
+{
+  return F32ToBf16(print<Packet4f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pfloor<Packet4bf>(const Packet4bf& a)
+{
+  return F32ToBf16(pfloor<Packet4f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pceil<Packet4bf>(const Packet4bf& a)
+{
+  return F32ToBf16(pceil<Packet4f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pconj(const Packet4bf& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet4bf padd<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32ToBf16(padd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf psub<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32ToBf16(psub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pmul<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32ToBf16(pmul<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pdiv<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32ToBf16(pdiv<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<>
+EIGEN_STRONG_INLINE Packet4bf pgather<bfloat16, Packet4bf>(const bfloat16* from, Index stride)
+{
+  return pgather<uint16_t, Packet4us>(reinterpret_cast<const uint16_t*>(from), stride);
+}
+
+template<>
+EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet4bf>(bfloat16* to, const Packet4bf& from, Index stride)
+{
+  pscatter<uint16_t, Packet4us>(reinterpret_cast<uint16_t*>(to), from, stride);
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet4bf>(const Packet4bf& a)
+{
+  return static_cast<bfloat16>(predux<Packet4f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet4bf>(const Packet4bf& a)
+{
+  return static_cast<bfloat16>(predux_max<Packet4f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet4bf>(const Packet4bf& a)
+{
+  return static_cast<bfloat16>(predux_min<Packet4f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet4bf>(const Packet4bf& a)
+{
+  return static_cast<bfloat16>(predux_mul<Packet4f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf preverse<Packet4bf>(const Packet4bf& a)
+{
+  return preverse<Packet4us>(a);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4bf, 4>& kernel)
+{
+  detail::ptranspose_impl(kernel);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pabsdiff<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
+{
+  return F32ToBf16(pabsdiff<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pcmp_eq<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
+{
+  return F32MaskToBf16Mask(pcmp_eq<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
+{
+  return F32MaskToBf16Mask(pcmp_lt<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt_or_nan<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
+{
+  return F32MaskToBf16Mask(pcmp_lt_or_nan<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pcmp_le<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
+{
+  return F32MaskToBf16Mask(pcmp_le<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pnegate<Packet4bf>(const Packet4bf& a)
+{
+  return pxor<Packet4us>(a, pset1<Packet4us>(static_cast<uint16_t>(0x8000)));
+}
+
+//---------- double ----------
+
+// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrisics for double.
+// Confirmed at least with __apple_build_version__ = 6000054.
+#ifdef __apple_build_version__
+// Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed.
+// https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with
+// major toolchain updates.
+#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000)
+#else
+#define EIGEN_APPLE_DOUBLE_NEON_BUG 0
+#endif
+
+#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
+
+// Bug 907: workaround missing declarations of the following two functions in the ADK
+// Defining these functions as templates ensures that if these intrinsics are
+// already defined in arm_neon.h, then our workaround doesn't cause a conflict
+// and has lower priority in overload resolution.
+template <typename T> uint64x2_t vreinterpretq_u64_f64(T a) { return (uint64x2_t) a; }
+
+template <typename T> float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2_t) a; }
+
+typedef float64x2_t Packet2d;
+typedef float64x1_t Packet1d;
+
+// fuctionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask))
+// Currently used in LU/arch/InverseSize4.h to enable a shared implementation
+// for fast inversion of matrices of size 4.
+EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask)
+{
+  const double* a = reinterpret_cast<const double*>(&m);
+  const double* b = reinterpret_cast<const double*>(&n);
+  Packet2d res = {*(a + (mask & 1)), *(b + ((mask >> 1) & 1))};
+  return res;
+}
+
+EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask)
+{
+  return shuffle(a, b, mask);
+}
+EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a,const Packet2d& b)
+{
+  return shuffle(a, b, 0);
+}
+EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a,const Packet2d& b)
+{
+  return shuffle(a, b, 3);
+}
+#define vec2d_duplane(a, p) \
+  vdupq_laneq_f64(a, p)
+
+template<> struct packet_traits<double>  : default_packet_traits
+{
+  typedef Packet2d type;
+  typedef Packet2d half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket = 0,
+
+    HasCmp       = 1,
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasAbsDiff   = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0,
+
+    HasDiv   = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1,
+
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasTanh = 0,
+    HasErf  = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet2d>
+{
+  typedef double type;
+  typedef Packet2d half;
+  typedef Packet2l integer_packet;
+  enum
+  {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) { return vdupq_n_f64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
+{
+  const double c[] = {0.0,1.0};
+  return vaddq_f64(pset1<Packet2d>(a), vld1q_f64(c));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& );
+template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b){
+  const Packet2d mask = {numext::bit_cast<double>(0x8000000000000000ull),0.0};
+  return padd(a, pxor(mask, b));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
+
+#ifdef __ARM_FEATURE_FMA
+// See bug 936. See above comment about FMA for float.
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c)
+{ return vfmaq_f64(c,a,b); }
+#else
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c)
+{ return vmlaq_f64(c,a,b); }
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }
+
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
+template<> EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) { return vminnmq_f64(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxnmq_f64(a, b); }
+
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) { return pmin<Packet2d>(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); }
+
+
+template<> EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) { return pmax<Packet2d>(a, b); }
+
+// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
+template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
+{ return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
+
+template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
+{ return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b)
+{ return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b)
+{ return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b)
+{ return vreinterpretq_f64_u64(vcleq_f64(a,b)); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b)
+{ return vreinterpretq_f64_u64(vcltq_f64(a,b)); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b)
+{ return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_u64(vcgeq_f64(a,b)))); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b)
+{ return vreinterpretq_f64_u64(vceqq_f64(a,b)); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) { return vld1q_dup_f64(from); }
+template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to,from); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to,from); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride)
+{
+  Packet2d res = pset1<Packet2d>(0.0);
+  res = vld1q_lane_f64(from + 0*stride, res, 0);
+  res = vld1q_lane_f64(from + 1*stride, res, 1);
+  return res;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
+{
+  vst1q_lane_f64(to + stride*0, from, 0);
+  vst1q_lane_f64(to + stride*1, from, 1);
+}
+
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ARM_PREFETCH(addr); }
+
+// FIXME only store the 2 first elements ?
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a,0); }
+
+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
+{ return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
+
+#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
+// workaround ICE, see bug 907
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
+{ return (vget_low_f64(a) + vget_high_f64(a))[0]; }
+#else
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
+{ return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
+#endif
+
+// Other reduction functions:
+// mul
+#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
+{ return (vget_low_f64(a) * vget_high_f64(a))[0]; }
+#else
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
+{ return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
+#endif
+
+// min
+template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
+{ return vgetq_lane_f64(vpminq_f64(a,a), 0); }
+
+// max
+template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
+{ return vgetq_lane_f64(vpmaxq_f64(a,a), 0); }
+
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet2d, 2>& kernel)
+{
+  const float64x2_t tmp1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
+  const float64x2_t tmp2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
+
+  kernel.packet[0] = tmp1;
+  kernel.packet[1] = tmp2;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect( const Packet2d& mask, const Packet2d& a, const Packet2d& b)
+{ return vbslq_f64(vreinterpretq_u64_f64(mask), a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a)
+{ return vrndnq_f64(a); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
+{ return vrndmq_f64(a); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
+{ return vrndpq_f64(a); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent)
+{ return pldexp_generic(a, exponent); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent)
+{ return pfrexp_generic(a,exponent); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from)
+{ return vreinterpretq_f64_u64(vdupq_n_u64(from)); }
+
+template<> EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
+  // Compute approximate reciprocal sqrt.
+  Packet2d x = vrsqrteq_f64(a);
+  // Do Newton iterations for 1/sqrt(x).
+  x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x);
+  x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x);
+  x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x);
+  const Packet2d infinity = pset1<Packet2d>(NumTraits<double>::infinity());
+  return pselect(pcmp_eq(a, pzero(a)), infinity, x);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrtq_f64(_x); }
+
+#endif // EIGEN_ARCH_ARM64
+
+// Do we have an fp16 types and supporting Neon intrinsics?
+#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+typedef float16x4_t Packet4hf;
+typedef float16x8_t Packet8hf;
+
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {
+  typedef Packet8hf type;
+  typedef Packet4hf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 1,
+
+    HasCmp = 1,
+    HasCast = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasAbsDiff = 0,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasInsert = 1,
+    HasReduxp = 1,
+    HasDiv = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1,
+    HasSin = 0,
+    HasCos = 0,
+    HasLog = 0,
+    HasExp = 0,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasErf = EIGEN_FAST_MATH,
+    HasBessel = 0,  // Issues with accuracy.
+    HasNdtri = 0
+  };
+};
+
+template <>
+struct unpacket_traits<Packet4hf> {
+  typedef Eigen::half type;
+  typedef Packet4hf half;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+struct unpacket_traits<Packet8hf> {
+  typedef Eigen::half type;
+  typedef Packet4hf half;
+  enum {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf predux_half_dowto4<Packet8hf>(const Packet8hf& a) {
+  return vadd_f16(vget_low_f16(a), vget_high_f16(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pset1<Packet8hf>(const Eigen::half& from) {
+  return vdupq_n_f16(from.x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pset1<Packet4hf>(const Eigen::half& from) {
+  return vdup_n_f16(from.x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf plset<Packet8hf>(const Eigen::half& a) {
+  const float16_t f[] = {0, 1, 2, 3, 4, 5, 6, 7};
+  Packet8hf countdown = vld1q_f16(f);
+  return vaddq_f16(pset1<Packet8hf>(a), countdown);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf plset<Packet4hf>(const Eigen::half& a) {
+  const float16_t f[] = {0, 1, 2, 3};
+  Packet4hf countdown = vld1_f16(f);
+  return vadd_f16(pset1<Packet4hf>(a), countdown);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf padd<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vaddq_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf padd<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vadd_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf psub<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vsubq_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf psub<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vsub_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pnegate(const Packet8hf& a) {
+  return vnegq_f16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pnegate(const Packet4hf& a) {
+  return vneg_f16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pconj(const Packet8hf& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pconj(const Packet4hf& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmul<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vmulq_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmul<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vmul_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pdiv<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vdivq_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pdiv<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vdiv_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmadd(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
+  return vfmaq_f16(c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmadd(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
+  return vfma_f16(c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmin<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vminq_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmin<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vmin_f16(a, b);
+}
+
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
+template<> EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return vminnm_f16(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return vminnmq_f16(a, b); }
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return pmin<Packet4hf>(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return pmin<Packet8hf>(a, b); }
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmax<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vmaxq_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmax<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vmax_f16(a, b);
+}
+
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
+template<> EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return vmaxnm_f16(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return vmaxnmq_f16(a, b); }
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return pmax<Packet4hf>(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return pmax<Packet8hf>(a, b); }
+
+#define EIGEN_MAKE_ARM_FP16_CMP_8(name)                                               \
+  template <>                                                                         \
+  EIGEN_STRONG_INLINE Packet8hf pcmp_##name(const Packet8hf& a, const Packet8hf& b) { \
+    return vreinterpretq_f16_u16(vc##name##q_f16(a, b));                              \
+  }
+
+#define EIGEN_MAKE_ARM_FP16_CMP_4(name)                                               \
+  template <>                                                                         \
+  EIGEN_STRONG_INLINE Packet4hf pcmp_##name(const Packet4hf& a, const Packet4hf& b) { \
+    return vreinterpret_f16_u16(vc##name##_f16(a, b));                                \
+  }
+
+EIGEN_MAKE_ARM_FP16_CMP_8(eq)
+EIGEN_MAKE_ARM_FP16_CMP_8(lt)
+EIGEN_MAKE_ARM_FP16_CMP_8(le)
+
+EIGEN_MAKE_ARM_FP16_CMP_4(eq)
+EIGEN_MAKE_ARM_FP16_CMP_4(lt)
+EIGEN_MAKE_ARM_FP16_CMP_4(le)
+
+#undef EIGEN_MAKE_ARM_FP16_CMP_8
+#undef EIGEN_MAKE_ARM_FP16_CMP_4
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pcmp_lt_or_nan<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vreinterpretq_f16_u16(vmvnq_u16(vcgeq_f16(a, b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pcmp_lt_or_nan<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vreinterpret_f16_u16(vmvn_u16(vcge_f16(a, b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf print<Packet8hf>(const Packet8hf& a)
+{ return vrndnq_f16(a); }
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf print<Packet4hf>(const Packet4hf& a)
+{ return vrndn_f16(a); }
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pfloor<Packet8hf>(const Packet8hf& a)
+{ return vrndmq_f16(a); }
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pfloor<Packet4hf>(const Packet4hf& a)
+{ return vrndm_f16(a); }
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pceil<Packet8hf>(const Packet8hf& a)
+{ return vrndpq_f16(a); }
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pceil<Packet4hf>(const Packet4hf& a)
+{ return vrndp_f16(a); }
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf psqrt<Packet8hf>(const Packet8hf& a) {
+  return vsqrtq_f16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf psqrt<Packet4hf>(const Packet4hf& a) {
+  return vsqrt_f16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pand<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vreinterpretq_f16_u16(vandq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pand<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf por<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vreinterpretq_f16_u16(vorrq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf por<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vreinterpret_f16_u16(vorr_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pxor<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vreinterpretq_f16_u16(veorq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pxor<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vreinterpret_f16_u16(veor_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pandnot<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pandnot<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vreinterpret_f16_u16(vbic_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pload<Packet8hf>(const Eigen::half* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f16(reinterpret_cast<const float16_t*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pload<Packet4hf>(const Eigen::half* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_f16(reinterpret_cast<const float16_t*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf ploadu<Packet8hf>(const Eigen::half* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f16(reinterpret_cast<const float16_t*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf ploadu<Packet4hf>(const Eigen::half* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f16(reinterpret_cast<const float16_t*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf ploaddup<Packet8hf>(const Eigen::half* from) {
+  Packet8hf packet;
+  packet[0] = from[0].x;
+  packet[1] = from[0].x;
+  packet[2] = from[1].x;
+  packet[3] = from[1].x;
+  packet[4] = from[2].x;
+  packet[5] = from[2].x;
+  packet[6] = from[3].x;
+  packet[7] = from[3].x;
+  return packet;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf ploaddup<Packet4hf>(const Eigen::half* from) {
+  float16x4_t packet;
+  float16_t* tmp;
+  tmp = (float16_t*)&packet;
+  tmp[0] = from[0].x;
+  tmp[1] = from[0].x;
+  tmp[2] = from[1].x;
+  tmp[3] = from[1].x;
+  return packet;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf ploadquad<Packet8hf>(const Eigen::half* from) {
+  Packet4hf lo, hi;
+  lo = vld1_dup_f16(reinterpret_cast<const float16_t*>(from));
+  hi = vld1_dup_f16(reinterpret_cast<const float16_t*>(from+1));
+  return vcombine_f16(lo, hi);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertfirst(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 0); }
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertfirst(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 0); }
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pselect(const Packet8hf& mask, const Packet8hf& a, const Packet8hf& b) {
+  return vbslq_f16(vreinterpretq_u16_f16(mask), a, b);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pselect(const Packet4hf& mask, const Packet4hf& a, const Packet4hf& b) {
+  return vbsl_f16(vreinterpret_u16_f16(mask), a, b);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertlast(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 7); }
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 3); }
+
+template <>
+EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_f16(reinterpret_cast<float16_t*>(to), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4hf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1_f16(reinterpret_cast<float16_t*>(to), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1q_f16(reinterpret_cast<float16_t*>(to), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4hf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1_f16(reinterpret_cast<float16_t*>(to), from);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pgather<Eigen::half, Packet8hf>(const Eigen::half* from, Index stride) {
+  Packet8hf res = pset1<Packet8hf>(Eigen::half(0.f));
+  res = vsetq_lane_f16(from[0 * stride].x, res, 0);
+  res = vsetq_lane_f16(from[1 * stride].x, res, 1);
+  res = vsetq_lane_f16(from[2 * stride].x, res, 2);
+  res = vsetq_lane_f16(from[3 * stride].x, res, 3);
+  res = vsetq_lane_f16(from[4 * stride].x, res, 4);
+  res = vsetq_lane_f16(from[5 * stride].x, res, 5);
+  res = vsetq_lane_f16(from[6 * stride].x, res, 6);
+  res = vsetq_lane_f16(from[7 * stride].x, res, 7);
+  return res;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pgather<Eigen::half, Packet4hf>(const Eigen::half* from, Index stride) {
+  Packet4hf res = pset1<Packet4hf>(Eigen::half(0.f));
+  res = vset_lane_f16(from[0 * stride].x, res, 0);
+  res = vset_lane_f16(from[1 * stride].x, res, 1);
+  res = vset_lane_f16(from[2 * stride].x, res, 2);
+  res = vset_lane_f16(from[3 * stride].x, res, 3);
+  return res;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8hf>(Eigen::half* to, const Packet8hf& from, Index stride) {
+  to[stride * 0].x = vgetq_lane_f16(from, 0);
+  to[stride * 1].x = vgetq_lane_f16(from, 1);
+  to[stride * 2].x = vgetq_lane_f16(from, 2);
+  to[stride * 3].x = vgetq_lane_f16(from, 3);
+  to[stride * 4].x = vgetq_lane_f16(from, 4);
+  to[stride * 5].x = vgetq_lane_f16(from, 5);
+  to[stride * 6].x = vgetq_lane_f16(from, 6);
+  to[stride * 7].x = vgetq_lane_f16(from, 7);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4hf>(Eigen::half* to, const Packet4hf& from, Index stride) {
+  to[stride * 0].x = vget_lane_f16(from, 0);
+  to[stride * 1].x = vget_lane_f16(from, 1);
+  to[stride * 2].x = vget_lane_f16(from, 2);
+  to[stride * 3].x = vget_lane_f16(from, 3);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<Eigen::half>(const Eigen::half* addr) {
+  EIGEN_ARM_PREFETCH(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8hf>(const Packet8hf& a) {
+  float16_t x[8];
+  vst1q_f16(x, a);
+  Eigen::half h;
+  h.x = x[0];
+  return h;
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4hf>(const Packet4hf& a) {
+  float16_t x[4];
+  vst1_f16(x, a);
+  Eigen::half h;
+  h.x = x[0];
+  return h;
+}
+
+template<> EIGEN_STRONG_INLINE Packet8hf preverse(const Packet8hf& a) {
+  float16x4_t a_lo, a_hi;
+  Packet8hf a_r64;
+
+  a_r64 = vrev64q_f16(a);
+  a_lo = vget_low_f16(a_r64);
+  a_hi = vget_high_f16(a_r64);
+  return vcombine_f16(a_hi, a_lo);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf preverse<Packet4hf>(const Packet4hf& a) {
+  return vrev64_f16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pabs<Packet8hf>(const Packet8hf& a) {
+  return vabsq_f16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pabs<Packet4hf>(const Packet4hf& a) {
+  return vabs_f16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux<Packet8hf>(const Packet8hf& a) {
+  float16x4_t a_lo, a_hi, sum;
+
+  a_lo = vget_low_f16(a);
+  a_hi = vget_high_f16(a);
+  sum = vpadd_f16(a_lo, a_hi);
+  sum = vpadd_f16(sum, sum);
+  sum = vpadd_f16(sum, sum);
+
+  Eigen::half h;
+  h.x = vget_lane_f16(sum, 0);
+  return h;
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux<Packet4hf>(const Packet4hf& a) {
+  float16x4_t sum;
+
+  sum = vpadd_f16(a, a);
+  sum = vpadd_f16(sum, sum);
+  Eigen::half h;
+  h.x = vget_lane_f16(sum, 0);
+  return h;
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8hf>(const Packet8hf& a) {
+  float16x4_t a_lo, a_hi, prod;
+
+  a_lo = vget_low_f16(a);
+  a_hi = vget_high_f16(a);
+  prod = vmul_f16(a_lo, a_hi);
+  prod = vmul_f16(prod, vrev64_f16(prod));
+
+  Eigen::half h;
+  h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
+  return h;
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4hf>(const Packet4hf& a) {
+  float16x4_t prod;
+  prod = vmul_f16(a, vrev64_f16(a));
+  Eigen::half h;
+  h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
+  return h;
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8hf>(const Packet8hf& a) {
+  float16x4_t a_lo, a_hi, min;
+
+  a_lo = vget_low_f16(a);
+  a_hi = vget_high_f16(a);
+  min = vpmin_f16(a_lo, a_hi);
+  min = vpmin_f16(min, min);
+  min = vpmin_f16(min, min);
+
+  Eigen::half h;
+  h.x = vget_lane_f16(min, 0);
+  return h;
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4hf>(const Packet4hf& a) {
+  Packet4hf tmp;
+  tmp = vpmin_f16(a, a);
+  tmp = vpmin_f16(tmp, tmp);
+  Eigen::half h;
+  h.x = vget_lane_f16(tmp, 0);
+  return h;
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8hf>(const Packet8hf& a) {
+  float16x4_t a_lo, a_hi, max;
+
+  a_lo = vget_low_f16(a);
+  a_hi = vget_high_f16(a);
+  max = vpmax_f16(a_lo, a_hi);
+  max = vpmax_f16(max, max);
+  max = vpmax_f16(max, max);
+
+  Eigen::half h;
+  h.x = vget_lane_f16(max, 0);
+  return h;
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4hf>(const Packet4hf& a) {
+  Packet4hf tmp;
+  tmp = vpmax_f16(a, a);
+  tmp = vpmax_f16(tmp, tmp);
+  Eigen::half h;
+  h.x = vget_lane_f16(tmp, 0);
+  return h;
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 4>& kernel)
+{
+  const float16x8x2_t zip16_1 = vzipq_f16(kernel.packet[0], kernel.packet[1]);
+  const float16x8x2_t zip16_2 = vzipq_f16(kernel.packet[2], kernel.packet[3]);
+
+  const float32x4x2_t zip32_1 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[0]), vreinterpretq_f32_f16(zip16_2.val[0]));
+  const float32x4x2_t zip32_2 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[1]), vreinterpretq_f32_f16(zip16_2.val[1]));
+
+  kernel.packet[0] = vreinterpretq_f16_f32(zip32_1.val[0]);
+  kernel.packet[1] = vreinterpretq_f16_f32(zip32_1.val[1]);
+  kernel.packet[2] = vreinterpretq_f16_f32(zip32_2.val[0]);
+  kernel.packet[3] = vreinterpretq_f16_f32(zip32_2.val[1]);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4hf, 4>& kernel) {
+  EIGEN_ALIGN16 float16x4x4_t tmp_x4;
+  float16_t* tmp = (float16_t*)&kernel;
+  tmp_x4 = vld4_f16(tmp);
+
+  kernel.packet[0] = tmp_x4.val[0];
+  kernel.packet[1] = tmp_x4.val[1];
+  kernel.packet[2] = tmp_x4.val[2];
+  kernel.packet[3] = tmp_x4.val[3];
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 8>& kernel) {
+  float16x8x2_t T_1[4];
+
+  T_1[0] = vuzpq_f16(kernel.packet[0], kernel.packet[1]);
+  T_1[1] = vuzpq_f16(kernel.packet[2], kernel.packet[3]);
+  T_1[2] = vuzpq_f16(kernel.packet[4], kernel.packet[5]);
+  T_1[3] = vuzpq_f16(kernel.packet[6], kernel.packet[7]);
+
+  float16x8x2_t T_2[4];
+  T_2[0] = vuzpq_f16(T_1[0].val[0], T_1[1].val[0]);
+  T_2[1] = vuzpq_f16(T_1[0].val[1], T_1[1].val[1]);
+  T_2[2] = vuzpq_f16(T_1[2].val[0], T_1[3].val[0]);
+  T_2[3] = vuzpq_f16(T_1[2].val[1], T_1[3].val[1]);
+
+  float16x8x2_t T_3[4];
+  T_3[0] = vuzpq_f16(T_2[0].val[0], T_2[2].val[0]);
+  T_3[1] = vuzpq_f16(T_2[0].val[1], T_2[2].val[1]);
+  T_3[2] = vuzpq_f16(T_2[1].val[0], T_2[3].val[0]);
+  T_3[3] = vuzpq_f16(T_2[1].val[1], T_2[3].val[1]);
+
+  kernel.packet[0] = T_3[0].val[0];
+  kernel.packet[1] = T_3[2].val[0];
+  kernel.packet[2] = T_3[1].val[0];
+  kernel.packet[3] = T_3[3].val[0];
+  kernel.packet[4] = T_3[0].val[1];
+  kernel.packet[5] = T_3[2].val[1];
+  kernel.packet[6] = T_3[1].val[1];
+  kernel.packet[7] = T_3[3].val[1];
+}
+#endif // end EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PACKET_MATH_NEON_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h b/src/3rdparty/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h
new file mode 100644
index 000000000..54f97336e
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h
@@ -0,0 +1,1419 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@google.com>
+// Copyright (C) 2020 Antonio Sanchez <cantonios@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_NEON_H
+#define EIGEN_TYPE_CASTING_NEON_H
+
+namespace Eigen {
+
+namespace internal {
+
+//==============================================================================
+// pcast, SrcType = float
+//==============================================================================
+template <>
+struct type_casting_traits<float, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4f, Packet4f>(const Packet4f& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pcast<Packet2f, Packet2f>(const Packet2f& a) {
+  return a;
+}
+
+template <>
+struct type_casting_traits<float, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+struct type_casting_traits<float, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+// If float64 exists, first convert to that to keep as much precision as possible.
+#if EIGEN_ARCH_ARM64
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet4f, Packet2l>(const Packet4f& a) {
+  // Discard second half of input.
+  return vcvtq_s64_f64(vcvt_f64_f32(vget_low_f32(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet4f, Packet2ul>(const Packet4f& a) {
+  // Discard second half of input.
+  return vcvtq_u64_f64(vcvt_f64_f32(vget_low_f32(a)));
+}
+#else
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet4f, Packet2l>(const Packet4f& a) {
+  // Discard second half of input.
+  return vmovl_s32(vget_low_s32(vcvtq_s32_f32(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet4f, Packet2ul>(const Packet4f& a) {
+  // Discard second half of input.
+  return vmovl_u32(vget_low_u32(vcvtq_u32_f32(a)));
+}
+#endif  // EIGEN_ARCH_ARM64
+
+template <>
+struct type_casting_traits<float, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+  return vcvtq_s32_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcast<Packet2f, Packet2i>(const Packet2f& a) {
+  return vcvt_s32_f32(a);
+}
+
+template <>
+struct type_casting_traits<float, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet4f, Packet4ui>(const Packet4f& a) {
+  return vcvtq_u32_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcast<Packet2f, Packet2ui>(const Packet2f& a) {
+  return vcvt_u32_f32(a);
+}
+
+template <>
+struct type_casting_traits<float, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet4f, Packet8s>(const Packet4f& a, const Packet4f& b) {
+  return vcombine_s16(vmovn_s32(vcvtq_s32_f32(a)), vmovn_s32(vcvtq_s32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet2f, Packet4s>(const Packet2f& a, const Packet2f& b) {
+  return vmovn_s32(vcombine_s32(vcvt_s32_f32(a), vcvt_s32_f32(b)));
+}
+
+template <>
+struct type_casting_traits<float, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet4f, Packet8us>(const Packet4f& a, const Packet4f& b) {
+  return vcombine_u16(vmovn_u32(vcvtq_u32_f32(a)), vmovn_u32(vcvtq_u32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet2f, Packet4us>(const Packet2f& a, const Packet2f& b) {
+  return vmovn_u32(vcombine_u32(vcvt_u32_f32(a), vcvt_u32_f32(b)));
+}
+
+template <>
+struct type_casting_traits<float, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet4f, Packet16c>(const Packet4f& a, const Packet4f& b, const Packet4f& c,
+                                                         const Packet4f& d) {
+  const int16x8_t ab_s16 = pcast<Packet4f, Packet8s>(a, b);
+  const int16x8_t cd_s16 = pcast<Packet4f, Packet8s>(c, d);
+  return vcombine_s8(vmovn_s16(ab_s16), vmovn_s16(cd_s16));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet2f, Packet8c>(const Packet2f& a, const Packet2f& b, const Packet2f& c,
+                                                       const Packet2f& d) {
+  const int16x4_t ab_s16 = pcast<Packet2f, Packet4s>(a, b);
+  const int16x4_t cd_s16 = pcast<Packet2f, Packet4s>(c, d);
+  return vmovn_s16(vcombine_s16(ab_s16, cd_s16));
+}
+
+template <>
+struct type_casting_traits<float, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet4f, Packet16uc>(const Packet4f& a, const Packet4f& b, const Packet4f& c,
+                                                           const Packet4f& d) {
+  const uint16x8_t ab_u16 = pcast<Packet4f, Packet8us>(a, b);
+  const uint16x8_t cd_u16 = pcast<Packet4f, Packet8us>(c, d);
+  return vcombine_u8(vmovn_u16(ab_u16), vmovn_u16(cd_u16));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet2f, Packet8uc>(const Packet2f& a, const Packet2f& b, const Packet2f& c,
+                                                         const Packet2f& d) {
+  const uint16x4_t ab_u16 = pcast<Packet2f, Packet4us>(a, b);
+  const uint16x4_t cd_u16 = pcast<Packet2f, Packet4us>(c, d);
+  return vmovn_u16(vcombine_u16(ab_u16, cd_u16));
+}
+
+//==============================================================================
+// pcast, SrcType = int8_t
+//==============================================================================
+template <>
+struct type_casting_traits<numext::int8_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet16c, Packet4f>(const Packet16c& a) {
+  // Discard all but first 4 bytes.
+  return vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pcast<Packet8c, Packet2f>(const Packet8c& a) {
+  // Discard all but first 2 bytes.
+  return vcvt_f32_s32(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(a)))));
+}
+
+template <>
+struct type_casting_traits<numext::int8_t, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet16c, Packet2l>(const Packet16c& a) {
+  // Discard all but first two bytes.
+  return vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a))))));
+}
+
+template <>
+struct type_casting_traits<numext::int8_t, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet16c, Packet2ul>(const Packet16c& a) {
+  return vreinterpretq_u64_s64(pcast<Packet16c, Packet2l>(a));
+}
+
+template <>
+struct type_casting_traits<numext::int8_t, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet16c, Packet4i>(const Packet16c& a) {
+  // Discard all but first 4 bytes.
+  return vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcast<Packet8c, Packet2i>(const Packet8c& a) {
+  // Discard all but first 2 bytes.
+  return vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(a))));
+}
+
+template <>
+struct type_casting_traits<numext::int8_t, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet16c, Packet4ui>(const Packet16c& a) {
+  return vreinterpretq_u32_s32(pcast<Packet16c, Packet4i>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcast<Packet8c, Packet2ui>(const Packet8c& a) {
+  return vreinterpret_u32_s32(pcast<Packet8c, Packet2i>(a));
+}
+
+template <>
+struct type_casting_traits<numext::int8_t, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet16c, Packet8s>(const Packet16c& a) {
+  // Discard second half of input.
+  return vmovl_s8(vget_low_s8(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet8c, Packet4s>(const Packet8c& a) {
+  // Discard second half of input.
+  return vget_low_s16(vmovl_s8(a));
+}
+
+template <>
+struct type_casting_traits<numext::int8_t, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet16c, Packet8us>(const Packet16c& a) {
+  return vreinterpretq_u16_s16(pcast<Packet16c, Packet8s>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet8c, Packet4us>(const Packet8c& a) {
+  return vreinterpret_u16_s16(pcast<Packet8c, Packet4s>(a));
+}
+
+template <>
+struct type_casting_traits<numext::int8_t, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet16c, Packet16c>(const Packet16c& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet8c, Packet8c>(const Packet8c& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pcast<Packet4c, Packet4c>(const Packet4c& a) {
+  return a;
+}
+
+template <>
+struct type_casting_traits<numext::int8_t, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet16c, Packet16uc>(const Packet16c& a) {
+  return vreinterpretq_u8_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet8c, Packet8uc>(const Packet8c& a) {
+  return vreinterpret_u8_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pcast<Packet4c, Packet4uc>(const Packet4c& a) {
+  return static_cast<Packet4uc>(a);
+}
+
+//==============================================================================
+// pcast, SrcType = uint8_t
+//==============================================================================
+template <>
+struct type_casting_traits<numext::uint8_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet16uc, Packet4f>(const Packet16uc& a) {
+  // Discard all but first 4 bytes.
+  return vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a)))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pcast<Packet8uc, Packet2f>(const Packet8uc& a) {
+  // Discard all but first 2 bytes.
+  return vcvt_f32_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(a)))));
+}
+
+template <>
+struct type_casting_traits<numext::uint8_t, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet16uc, Packet2ul>(const Packet16uc& a) {
+  // Discard all but first two bytes.
+  return vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))));
+}
+
+template <>
+struct type_casting_traits<numext::uint8_t, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet16uc, Packet2l>(const Packet16uc& a) {
+  return vreinterpretq_s64_u64(pcast<Packet16uc, Packet2ul>(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint8_t, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet16uc, Packet4ui>(const Packet16uc& a) {
+  // Discard all but first 4 bytes.
+  return vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcast<Packet8uc, Packet2ui>(const Packet8uc& a) {
+  // Discard all but first 2 bytes.
+  return vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(a))));
+}
+
+template <>
+struct type_casting_traits<numext::uint8_t, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet16uc, Packet4i>(const Packet16uc& a) {
+  return vreinterpretq_s32_u32(pcast<Packet16uc, Packet4ui>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcast<Packet8uc, Packet2i>(const Packet8uc& a) {
+  return vreinterpret_s32_u32(pcast<Packet8uc, Packet2ui>(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint8_t, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet16uc, Packet8us>(const Packet16uc& a) {
+  // Discard second half of input.
+  return vmovl_u8(vget_low_u8(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet8uc, Packet4us>(const Packet8uc& a) {
+  // Discard second half of input.
+  return vget_low_u16(vmovl_u8(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint8_t, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet16uc, Packet8s>(const Packet16uc& a) {
+  return vreinterpretq_s16_u16(pcast<Packet16uc, Packet8us>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet8uc, Packet4s>(const Packet8uc& a) {
+  return vreinterpret_s16_u16(pcast<Packet8uc, Packet4us>(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint8_t, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet16uc, Packet16uc>(const Packet16uc& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet8uc, Packet8uc>(const Packet8uc& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pcast<Packet4uc, Packet4uc>(const Packet4uc& a) {
+  return a;
+}
+
+template <>
+struct type_casting_traits<numext::uint8_t, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet16uc, Packet16c>(const Packet16uc& a) {
+  return vreinterpretq_s8_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet8uc, Packet8c>(const Packet8uc& a) {
+  return vreinterpret_s8_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pcast<Packet4uc, Packet4c>(const Packet4uc& a) {
+  return static_cast<Packet4c>(a);
+}
+
+//==============================================================================
+// pcast, SrcType = int16_t
+//==============================================================================
+template <>
+struct type_casting_traits<numext::int16_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet8s, Packet4f>(const Packet8s& a) {
+  // Discard second half of input.
+  return vcvtq_f32_s32(vmovl_s16(vget_low_s16(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pcast<Packet4s, Packet2f>(const Packet4s& a) {
+  // Discard second half of input.
+  return vcvt_f32_s32(vget_low_s32(vmovl_s16(a)));
+}
+
+template <>
+struct type_casting_traits<numext::int16_t, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet8s, Packet2l>(const Packet8s& a) {
+  // Discard all but first two values.
+  return vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(a))));
+}
+
+template <>
+struct type_casting_traits<numext::int16_t, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet8s, Packet2ul>(const Packet8s& a) {
+  return vreinterpretq_u64_s64(pcast<Packet8s, Packet2l>(a));
+}
+
+template <>
+struct type_casting_traits<numext::int16_t, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet8s, Packet4i>(const Packet8s& a) {
+  // Discard second half of input.
+  return vmovl_s16(vget_low_s16(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcast<Packet4s, Packet2i>(const Packet4s& a) {
+  // Discard second half of input.
+  return vget_low_s32(vmovl_s16(a));
+}
+
+template <>
+struct type_casting_traits<numext::int16_t, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet8s, Packet4ui>(const Packet8s& a) {
+  return vreinterpretq_u32_s32(pcast<Packet8s, Packet4i>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcast<Packet4s, Packet2ui>(const Packet4s& a) {
+  return vreinterpret_u32_s32(pcast<Packet4s, Packet2i>(a));
+}
+
+template <>
+struct type_casting_traits<numext::int16_t, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet8s, Packet8s>(const Packet8s& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet4s, Packet4s>(const Packet4s& a) {
+  return a;
+}
+
+template <>
+struct type_casting_traits<numext::int16_t, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet8s, Packet8us>(const Packet8s& a) {
+  return vreinterpretq_u16_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet4s, Packet4us>(const Packet4s& a) {
+  return vreinterpret_u16_s16(a);
+}
+
+template <>
+struct type_casting_traits<numext::int16_t, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet8s, Packet16c>(const Packet8s& a, const Packet8s& b) {
+  return vcombine_s8(vmovn_s16(a), vmovn_s16(b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet4s, Packet8c>(const Packet4s& a, const Packet4s& b) {
+  return vmovn_s16(vcombine_s16(a, b));
+}
+
+template <>
+struct type_casting_traits<numext::int16_t, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet8s, Packet16uc>(const Packet8s& a, const Packet8s& b) {
+  return vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(a)), vmovn_u16(vreinterpretq_u16_s16(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet4s, Packet8uc>(const Packet4s& a, const Packet4s& b) {
+  return vmovn_u16(vcombine_u16(vreinterpret_u16_s16(a), vreinterpret_u16_s16(b)));
+}
+
+//==============================================================================
+// pcast, SrcType = uint16_t
+//==============================================================================
+template <>
+struct type_casting_traits<numext::uint16_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet8us, Packet4f>(const Packet8us& a) {
+  // Discard second half of input.
+  return vcvtq_f32_u32(vmovl_u16(vget_low_u16(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pcast<Packet4us, Packet2f>(const Packet4us& a) {
+  // Discard second half of input.
+  return vcvt_f32_u32(vget_low_u32(vmovl_u16(a)));
+}
+
+template <>
+struct type_casting_traits<numext::uint16_t, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet8us, Packet2ul>(const Packet8us& a) {
+  // Discard all but first two values.
+  return vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(a))));
+}
+
+template <>
+struct type_casting_traits<numext::uint16_t, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet8us, Packet2l>(const Packet8us& a) {
+  return vreinterpretq_s64_u64(pcast<Packet8us, Packet2ul>(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint16_t, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet8us, Packet4ui>(const Packet8us& a) {
+  // Discard second half of input.
+  return vmovl_u16(vget_low_u16(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcast<Packet4us, Packet2ui>(const Packet4us& a) {
+  // Discard second half of input.
+  return vget_low_u32(vmovl_u16(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint16_t, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet8us, Packet4i>(const Packet8us& a) {
+  return vreinterpretq_s32_u32(pcast<Packet8us, Packet4ui>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcast<Packet4us, Packet2i>(const Packet4us& a) {
+  return vreinterpret_s32_u32(pcast<Packet4us, Packet2ui>(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint16_t, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet8us, Packet8us>(const Packet8us& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet4us, Packet4us>(const Packet4us& a) {
+  return a;
+}
+
+template <>
+struct type_casting_traits<numext::uint16_t, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet8us, Packet8s>(const Packet8us& a) {
+  return vreinterpretq_s16_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet4us, Packet4s>(const Packet4us& a) {
+  return vreinterpret_s16_u16(a);
+}
+
+template <>
+struct type_casting_traits<numext::uint16_t, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet8us, Packet16uc>(const Packet8us& a, const Packet8us& b) {
+  return vcombine_u8(vmovn_u16(a), vmovn_u16(b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet4us, Packet8uc>(const Packet4us& a, const Packet4us& b) {
+  return vmovn_u16(vcombine_u16(a, b));
+}
+
+template <>
+struct type_casting_traits<numext::uint16_t, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet8us, Packet16c>(const Packet8us& a, const Packet8us& b) {
+  return vreinterpretq_s8_u8(pcast<Packet8us, Packet16uc>(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet4us, Packet8c>(const Packet4us& a, const Packet4us& b) {
+  return vreinterpret_s8_u8(pcast<Packet4us, Packet8uc>(a, b));
+}
+
+//==============================================================================
+// pcast, SrcType = int32_t
+//==============================================================================
+template <>
+struct type_casting_traits<numext::int32_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+  return vcvtq_f32_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pcast<Packet2i, Packet2f>(const Packet2i& a) {
+  return vcvt_f32_s32(a);
+}
+
+template <>
+struct type_casting_traits<numext::int32_t, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet4i, Packet2l>(const Packet4i& a) {
+  // Discard second half of input.
+  return vmovl_s32(vget_low_s32(a));
+}
+
+template <>
+struct type_casting_traits<numext::int32_t, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet4i, Packet2ul>(const Packet4i& a) {
+  return vreinterpretq_u64_s64(pcast<Packet4i, Packet2l>(a));
+}
+
+template <>
+struct type_casting_traits<numext::int32_t, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet4i, Packet4i>(const Packet4i& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcast<Packet2i, Packet2i>(const Packet2i& a) {
+  return a;
+}
+
+template <>
+struct type_casting_traits<numext::int32_t, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet4i, Packet4ui>(const Packet4i& a) {
+  return vreinterpretq_u32_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcast<Packet2i, Packet2ui>(const Packet2i& a) {
+  return vreinterpret_u32_s32(a);
+}
+
+template <>
+struct type_casting_traits<numext::int32_t, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet4i, Packet8s>(const Packet4i& a, const Packet4i& b) {
+  return vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet2i, Packet4s>(const Packet2i& a, const Packet2i& b) {
+  return vmovn_s32(vcombine_s32(a, b));
+}
+
+template <>
+struct type_casting_traits<numext::int32_t, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet4i, Packet8us>(const Packet4i& a, const Packet4i& b) {
+  return vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(a)), vmovn_u32(vreinterpretq_u32_s32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet2i, Packet4us>(const Packet2i& a, const Packet2i& b) {
+  return vmovn_u32(vreinterpretq_u32_s32(vcombine_s32(a, b)));
+}
+
+template <>
+struct type_casting_traits<numext::int32_t, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet4i, Packet16c>(const Packet4i& a, const Packet4i& b, const Packet4i& c,
+                                                         const Packet4i& d) {
+  const int16x8_t ab_s16 = pcast<Packet4i, Packet8s>(a, b);
+  const int16x8_t cd_s16 = pcast<Packet4i, Packet8s>(c, d);
+  return vcombine_s8(vmovn_s16(ab_s16), vmovn_s16(cd_s16));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet2i, Packet8c>(const Packet2i& a, const Packet2i& b, const Packet2i& c,
+                                                       const Packet2i& d) {
+  const int16x4_t ab_s16 = vmovn_s32(vcombine_s32(a, b));
+  const int16x4_t cd_s16 = vmovn_s32(vcombine_s32(c, d));
+  return vmovn_s16(vcombine_s16(ab_s16, cd_s16));
+}
+
+template <>
+struct type_casting_traits<numext::int32_t, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet4i, Packet16uc>(const Packet4i& a, const Packet4i& b, const Packet4i& c,
+                                                           const Packet4i& d) {
+  const uint16x8_t ab_u16 = pcast<Packet4i, Packet8us>(a, b);
+  const uint16x8_t cd_u16 = pcast<Packet4i, Packet8us>(c, d);
+  return vcombine_u8(vmovn_u16(ab_u16), vmovn_u16(cd_u16));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet2i, Packet8uc>(const Packet2i& a, const Packet2i& b, const Packet2i& c,
+                                                         const Packet2i& d) {
+  const uint16x4_t ab_u16 = pcast<Packet2i, Packet4us>(a, b);
+  const uint16x4_t cd_u16 = pcast<Packet2i, Packet4us>(c, d);
+  return vmovn_u16(vcombine_u16(ab_u16, cd_u16));
+}
+
+//==============================================================================
+// pcast, SrcType = uint32_t
+//==============================================================================
+template <>
+struct type_casting_traits<numext::uint32_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4ui, Packet4f>(const Packet4ui& a) {
+  return vcvtq_f32_u32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pcast<Packet2ui, Packet2f>(const Packet2ui& a) {
+  return vcvt_f32_u32(a);
+}
+
+template <>
+struct type_casting_traits<numext::uint32_t, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet4ui, Packet2ul>(const Packet4ui& a) {
+  // Discard second half of input.
+  return vmovl_u32(vget_low_u32(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint32_t, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet4ui, Packet2l>(const Packet4ui& a) {
+  return vreinterpretq_s64_u64(pcast<Packet4ui, Packet2ul>(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint32_t, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet4ui, Packet4ui>(const Packet4ui& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcast<Packet2ui, Packet2ui>(const Packet2ui& a) {
+  return a;
+}
+
+template <>
+struct type_casting_traits<numext::uint32_t, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet4ui, Packet4i>(const Packet4ui& a) {
+  return vreinterpretq_s32_u32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcast<Packet2ui, Packet2i>(const Packet2ui& a) {
+  return vreinterpret_s32_u32(a);
+}
+
+template <>
+struct type_casting_traits<numext::uint32_t, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet4ui, Packet8us>(const Packet4ui& a, const Packet4ui& b) {
+  return vcombine_u16(vmovn_u32(a), vmovn_u32(b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet2ui, Packet4us>(const Packet2ui& a, const Packet2ui& b) {
+  return vmovn_u32(vcombine_u32(a, b));
+}
+
+template <>
+struct type_casting_traits<numext::uint32_t, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet4ui, Packet8s>(const Packet4ui& a, const Packet4ui& b) {
+  return vreinterpretq_s16_u16(pcast<Packet4ui, Packet8us>(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet2ui, Packet4s>(const Packet2ui& a, const Packet2ui& b) {
+  return vreinterpret_s16_u16(pcast<Packet2ui, Packet4us>(a, b));
+}
+
+template <>
+struct type_casting_traits<numext::uint32_t, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet4ui, Packet16uc>(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c,
+                                                            const Packet4ui& d) {
+  const uint16x8_t ab_u16 = vcombine_u16(vmovn_u32(a), vmovn_u32(b));
+  const uint16x8_t cd_u16 = vcombine_u16(vmovn_u32(c), vmovn_u32(d));
+  return vcombine_u8(vmovn_u16(ab_u16), vmovn_u16(cd_u16));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet2ui, Packet8uc>(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c,
+                                                          const Packet2ui& d) {
+  const uint16x4_t ab_u16 = vmovn_u32(vcombine_u32(a, b));
+  const uint16x4_t cd_u16 = vmovn_u32(vcombine_u32(c, d));
+  return vmovn_u16(vcombine_u16(ab_u16, cd_u16));
+}
+
+template <>
+struct type_casting_traits<numext::uint32_t, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet4ui, Packet16c>(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c,
+                                                          const Packet4ui& d) {
+  return vreinterpretq_s8_u8(pcast<Packet4ui, Packet16uc>(a, b, c, d));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet2ui, Packet8c>(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c,
+                                                        const Packet2ui& d) {
+  return vreinterpret_s8_u8(pcast<Packet2ui, Packet8uc>(a, b, c, d));
+}
+
+//==============================================================================
+// pcast, SrcType = int64_t
+//==============================================================================
+template <>
+struct type_casting_traits<numext::int64_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet2l, Packet4f>(const Packet2l& a, const Packet2l& b) {
+  return vcvtq_f32_s32(vcombine_s32(vmovn_s64(a), vmovn_s64(b)));
+}
+
+template <>
+struct type_casting_traits<numext::int64_t, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet2l, Packet2l>(const Packet2l& a) {
+  return a;
+}
+
+template <>
+struct type_casting_traits<numext::int64_t, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet2l, Packet2ul>(const Packet2l& a) {
+  return vreinterpretq_u64_s64(a);
+}
+
+template <>
+struct type_casting_traits<numext::int64_t, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet2l, Packet4i>(const Packet2l& a, const Packet2l& b) {
+  return vcombine_s32(vmovn_s64(a), vmovn_s64(b));
+}
+
+template <>
+struct type_casting_traits<numext::int64_t, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet2l, Packet4ui>(const Packet2l& a, const Packet2l& b) {
+  return vcombine_u32(vmovn_u64(vreinterpretq_u64_s64(a)), vmovn_u64(vreinterpretq_u64_s64(b)));
+}
+
+template <>
+struct type_casting_traits<numext::int64_t, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet2l, Packet8s>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
+                                                       const Packet2l& d) {
+  const int32x4_t ab_s32 = pcast<Packet2l, Packet4i>(a, b);
+  const int32x4_t cd_s32 = pcast<Packet2l, Packet4i>(c, d);
+  return vcombine_s16(vmovn_s32(ab_s32), vmovn_s32(cd_s32));
+}
+
+template <>
+struct type_casting_traits<numext::int64_t, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet2l, Packet8us>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
+                                                         const Packet2l& d) {
+  const uint32x4_t ab_u32 = pcast<Packet2l, Packet4ui>(a, b);
+  const uint32x4_t cd_u32 = pcast<Packet2l, Packet4ui>(c, d);
+  return vcombine_u16(vmovn_u32(ab_u32), vmovn_u32(cd_u32));
+}
+
+template <>
+struct type_casting_traits<numext::int64_t, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet2l, Packet16c>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
+                                                         const Packet2l& d, const Packet2l& e, const Packet2l& f,
+                                                         const Packet2l& g, const Packet2l& h) {
+  const int16x8_t abcd_s16 = pcast<Packet2l, Packet8s>(a, b, c, d);
+  const int16x8_t efgh_s16 = pcast<Packet2l, Packet8s>(e, f, g, h);
+  return vcombine_s8(vmovn_s16(abcd_s16), vmovn_s16(efgh_s16));
+}
+
+template <>
+struct type_casting_traits<numext::int64_t, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet2l, Packet16uc>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
+                                                           const Packet2l& d, const Packet2l& e, const Packet2l& f,
+                                                           const Packet2l& g, const Packet2l& h) {
+  const uint16x8_t abcd_u16 = pcast<Packet2l, Packet8us>(a, b, c, d);
+  const uint16x8_t efgh_u16 = pcast<Packet2l, Packet8us>(e, f, g, h);
+  return vcombine_u8(vmovn_u16(abcd_u16), vmovn_u16(efgh_u16));
+}
+
+//==============================================================================
+// pcast, SrcType = uint64_t
+//==============================================================================
+template <>
+struct type_casting_traits<numext::uint64_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet2ul, Packet4f>(const Packet2ul& a, const Packet2ul& b) {
+  return vcvtq_f32_u32(vcombine_u32(vmovn_u64(a), vmovn_u64(b)));
+}
+
+template <>
+struct type_casting_traits<numext::uint64_t, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet2ul, Packet2ul>(const Packet2ul& a) {
+  return a;
+}
+
+template <>
+struct type_casting_traits<numext::uint64_t, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet2ul, Packet2l>(const Packet2ul& a) {
+  return vreinterpretq_s64_u64(a);
+}
+
+template <>
+struct type_casting_traits<numext::uint64_t, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet2ul, Packet4ui>(const Packet2ul& a, const Packet2ul& b) {
+  return vcombine_u32(vmovn_u64(a), vmovn_u64(b));
+}
+
+template <>
+struct type_casting_traits<numext::uint64_t, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet2ul, Packet4i>(const Packet2ul& a, const Packet2ul& b) {
+  return vreinterpretq_s32_u32(pcast<Packet2ul, Packet4ui>(a, b));
+}
+
+template <>
+struct type_casting_traits<numext::uint64_t, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet2ul, Packet8us>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
+                                                          const Packet2ul& d) {
+  const uint16x4_t ab_u16 = vmovn_u32(vcombine_u32(vmovn_u64(a), vmovn_u64(b)));
+  const uint16x4_t cd_u16 = vmovn_u32(vcombine_u32(vmovn_u64(c), vmovn_u64(d)));
+  return vcombine_u16(ab_u16, cd_u16);
+}
+
+template <>
+struct type_casting_traits<numext::uint64_t, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet2ul, Packet8s>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
+                                                        const Packet2ul& d) {
+  return vreinterpretq_s16_u16(pcast<Packet2ul, Packet8us>(a, b, c, d));
+}
+
+template <>
+struct type_casting_traits<numext::uint64_t, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet2ul, Packet16uc>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
+                                                            const Packet2ul& d, const Packet2ul& e, const Packet2ul& f,
+                                                            const Packet2ul& g, const Packet2ul& h) {
+  const uint16x8_t abcd_u16 = pcast<Packet2ul, Packet8us>(a, b, c, d);
+  const uint16x8_t efgh_u16 = pcast<Packet2ul, Packet8us>(e, f, g, h);
+  return vcombine_u8(vmovn_u16(abcd_u16), vmovn_u16(efgh_u16));
+}
+
+template <>
+struct type_casting_traits<numext::uint64_t, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet2ul, Packet16c>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
+                                                          const Packet2ul& d, const Packet2ul& e, const Packet2ul& f,
+                                                          const Packet2ul& g, const Packet2ul& h) {
+  return vreinterpretq_s8_u8(pcast<Packet2ul, Packet16uc>(a, b, c, d, e, f, g, h));
+}
+
+//==============================================================================
+// preinterpret
+//==============================================================================
+template <>
+EIGEN_STRONG_INLINE Packet2f preinterpret<Packet2f, Packet2i>(const Packet2i& a) {
+  return vreinterpret_f32_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f preinterpret<Packet2f, Packet2ui>(const Packet2ui& a) {
+  return vreinterpret_f32_u32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4i>(const Packet4i& a) {
+  return vreinterpretq_f32_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4ui>(const Packet4ui& a) {
+  return vreinterpretq_f32_u32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4c preinterpret<Packet4c, Packet4uc>(const Packet4uc& a) {
+  return static_cast<Packet4c>(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c preinterpret<Packet8c, Packet8uc>(const Packet8uc& a) {
+  return vreinterpret_s8_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c preinterpret<Packet16c, Packet16uc>(const Packet16uc& a) {
+  return vreinterpretq_s8_u8(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4uc preinterpret<Packet4uc, Packet4c>(const Packet4c& a) {
+  return static_cast<Packet4uc>(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc preinterpret<Packet8uc, Packet8c>(const Packet8c& a) {
+  return vreinterpret_u8_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc preinterpret<Packet16uc, Packet16c>(const Packet16c& a) {
+  return vreinterpretq_u8_s8(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4s preinterpret<Packet4s, Packet4us>(const Packet4us& a) {
+  return vreinterpret_s16_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s preinterpret<Packet8s, Packet8us>(const Packet8us& a) {
+  return vreinterpretq_s16_u16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4us preinterpret<Packet4us, Packet4s>(const Packet4s& a) {
+  return vreinterpret_u16_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us preinterpret<Packet8us, Packet8s>(const Packet8s& a) {
+  return vreinterpretq_u16_s16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2i preinterpret<Packet2i, Packet2f>(const Packet2f& a) {
+  return vreinterpret_s32_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i preinterpret<Packet2i, Packet2ui>(const Packet2ui& a) {
+  return vreinterpret_s32_u32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4f>(const Packet4f& a) {
+  return vreinterpretq_s32_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4ui>(const Packet4ui& a) {
+  return vreinterpretq_s32_u32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2ui preinterpret<Packet2ui, Packet2f>(const Packet2f& a) {
+  return vreinterpret_u32_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui preinterpret<Packet2ui, Packet2i>(const Packet2i& a) {
+  return vreinterpret_u32_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4f>(const Packet4f& a) {
+  return vreinterpretq_u32_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4i>(const Packet4i& a) {
+  return vreinterpretq_u32_s32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet2ul>(const Packet2ul& a) {
+  return vreinterpretq_s64_u64(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2l>(const Packet2l& a) {
+  return vreinterpretq_u64_s64(a);
+}
+
+#if EIGEN_ARCH_ARM64
+
+//==============================================================================
+// pcast/preinterpret, Double
+//==============================================================================
+
+template <>
+struct type_casting_traits<double, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet2d, Packet2d>(const Packet2d& a) {
+  return a;
+}
+
+template <>
+struct type_casting_traits<double, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
+  return vcombine_f32(vcvt_f32_f64(a), vcvt_f32_f64(b));
+}
+
+template <>
+struct type_casting_traits<double, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet2d, Packet2l>(const Packet2d& a) {
+  return vcvtq_s64_f64(a);
+}
+
+template <>
+struct type_casting_traits<double, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet2d, Packet2ul>(const Packet2d& a) {
+  return vcvtq_u64_f64(a);
+}
+
+template <>
+struct type_casting_traits<double, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet2d, Packet4i>(const Packet2d& a, const Packet2d& b) {
+  return vcombine_s32(vmovn_s64(vcvtq_s64_f64(a)), vmovn_s64(vcvtq_s64_f64(b)));
+}
+
+template <>
+struct type_casting_traits<double, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet2d, Packet4ui>(const Packet2d& a, const Packet2d& b) {
+  return vcombine_u32(vmovn_u64(vcvtq_u64_f64(a)), vmovn_u64(vcvtq_u64_f64(b)));
+}
+
+template <>
+struct type_casting_traits<double, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet2d, Packet8s>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
+                                                       const Packet2d& d) {
+  const int32x4_t ab_s32 = pcast<Packet2d, Packet4i>(a, b);
+  const int32x4_t cd_s32 = pcast<Packet2d, Packet4i>(c, d);
+  return vcombine_s16(vmovn_s32(ab_s32), vmovn_s32(cd_s32));
+}
+
+template <>
+struct type_casting_traits<double, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet2d, Packet8us>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
+                                                         const Packet2d& d) {
+  const uint32x4_t ab_u32 = pcast<Packet2d, Packet4ui>(a, b);
+  const uint32x4_t cd_u32 = pcast<Packet2d, Packet4ui>(c, d);
+  return vcombine_u16(vmovn_u32(ab_u32), vmovn_u32(cd_u32));
+}
+
+template <>
+struct type_casting_traits<double, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet2d, Packet16c>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
+                                                         const Packet2d& d, const Packet2d& e, const Packet2d& f,
+                                                         const Packet2d& g, const Packet2d& h) {
+  const int16x8_t abcd_s16 = pcast<Packet2d, Packet8s>(a, b, c, d);
+  const int16x8_t efgh_s16 = pcast<Packet2d, Packet8s>(e, f, g, h);
+  return vcombine_s8(vmovn_s16(abcd_s16), vmovn_s16(efgh_s16));
+}
+
+template <>
+struct type_casting_traits<double, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet2d, Packet16uc>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
+                                                           const Packet2d& d, const Packet2d& e, const Packet2d& f,
+                                                           const Packet2d& g, const Packet2d& h) {
+  const uint16x8_t abcd_u16 = pcast<Packet2d, Packet8us>(a, b, c, d);
+  const uint16x8_t efgh_u16 = pcast<Packet2d, Packet8us>(e, f, g, h);
+  return vcombine_u8(vmovn_u16(abcd_u16), vmovn_u16(efgh_u16));
+}
+
+template <>
+struct type_casting_traits<float, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
+  // Discard second-half of input.
+  return vcvt_f64_f32(vget_low_f32(a));
+}
+
+template <>
+struct type_casting_traits<numext::int8_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet16c, Packet2d>(const Packet16c& a) {
+  // Discard all but first two values.
+  return vcvt_f64_f32(pcast<Packet8c, Packet2f>(vget_low_s8(a)));
+}
+
+template <>
+struct type_casting_traits<numext::uint8_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet16uc, Packet2d>(const Packet16uc& a) {
+  // Discard all but first two values.
+  return vcvt_f64_f32(pcast<Packet8uc, Packet2f>(vget_low_u8(a)));
+}
+
+template <>
+struct type_casting_traits<numext::int16_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet8s, Packet2d>(const Packet8s& a) {
+  // Discard all but first two values.
+  return vcvt_f64_f32(pcast<Packet4s, Packet2f>(vget_low_s16(a)));
+}
+
+template <>
+struct type_casting_traits<numext::uint16_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet8us, Packet2d>(const Packet8us& a) {
+  // Discard all but first two values.
+  return vcvt_f64_f32(pcast<Packet4us, Packet2f>(vget_low_u16(a)));
+}
+
+template <>
+struct type_casting_traits<numext::int32_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet4i, Packet2d>(const Packet4i& a) {
+  // Discard second half of input.
+  return vcvtq_f64_s64(vmovl_s32(vget_low_s32(a)));
+}
+
+template <>
+struct type_casting_traits<numext::uint32_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet4ui, Packet2d>(const Packet4ui& a) {
+  // Discard second half of input.
+  return vcvtq_f64_u64(vmovl_u32(vget_low_u32(a)));
+}
+
+template <>
+struct type_casting_traits<numext::int64_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet2l, Packet2d>(const Packet2l& a) {
+  return vcvtq_f64_s64(a);
+}
+
+template <>
+struct type_casting_traits<numext::uint64_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet2ul, Packet2d>(const Packet2ul& a) {
+  return vcvtq_f64_u64(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2l>(const Packet2l& a) {
+  return vreinterpretq_f64_s64(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2ul>(const Packet2ul& a) {
+  return vreinterpretq_f64_u64(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet2d>(const Packet2d& a) {
+  return vreinterpretq_s64_f64(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2d>(const Packet2d& a) {
+  return vreinterpretq_u64_f64(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4i>(const Packet4i& a) {
+  return vreinterpretq_f64_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet2d>(const Packet2d& a) {
+  return vreinterpretq_s32_f64(a);
+}
+
+#endif  // EIGEN_ARCH_ARM64
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_TYPE_CASTING_NEON_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/SSE/Complex.h b/src/3rdparty/eigen/Eigen/src/Core/arch/SSE/Complex.h
new file mode 100644
index 000000000..8fe22da46
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/SSE/Complex.h
@@ -0,0 +1,351 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_SSE_H
+#define EIGEN_COMPLEX_SSE_H
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- float ----------
+struct Packet2cf
+{
+  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {}
+  Packet4f v;
+};
+
+// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
+template<> struct packet_traits<std::complex<float> >  : default_packet_traits
+{
+  typedef Packet2cf type;
+  typedef Packet2cf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket = 0,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasSqrt   = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0,
+    HasBlend  = 1
+  };
+};
+#endif
+
+template<> struct unpacket_traits<Packet2cf> {
+  typedef std::complex<float> type;
+  typedef Packet2cf half;
+  typedef Packet4f as_real;
+  enum {
+    size=2,
+    alignment=Aligned16,
+    vectorizable=true,
+    masked_load_available=false,
+    masked_store_available=false
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a)
+{
+  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
+  return Packet2cf(_mm_xor_ps(a.v,mask));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
+{
+  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
+  return Packet2cf(_mm_xor_ps(a.v,mask));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  #ifdef EIGEN_VECTORIZE_SSE3
+  return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v),
+                                 _mm_mul_ps(_mm_movehdup_ps(a.v),
+                                            vec4f_swizzle1(b.v, 1, 0, 3, 2))));
+//   return Packet2cf(_mm_addsub_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
+//                                  _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
+//                                             vec4f_swizzle1(b.v, 1, 0, 3, 2))));
+  #else
+  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x00000000,0x80000000,0x00000000));
+  return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
+                              _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
+                                                    vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
+  #endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ptrue  <Packet2cf>(const Packet2cf& a) { return Packet2cf(ptrue(Packet4f(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(b.v,a.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(&numext::real_ref(*from))); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(&numext::real_ref(*from))); }
+
+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
+{
+  Packet2cf res;
+#ifdef EIGEN_VECTORIZE_SSE3
+  res.v = _mm_castpd_ps(_mm_loaddup_pd(reinterpret_cast<double const*>(&from)));
+#else
+  res.v = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<double const*>(&from)));
+  res.v = _mm_movelh_ps(res.v, res.v);
+#endif
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v)); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v)); }
+
+
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+{
+  return Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]),
+                              std::imag(from[0*stride]), std::real(from[0*stride])));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
+{
+  to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1)));
+  to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
+}
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+
+template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
+{
+  #if EIGEN_GNUC_AT_MOST(4,3)
+  // Workaround gcc 4.2 ICE - this is not performance wise ideal, but who cares...
+  // This workaround also fix invalid code generation with gcc 4.3
+  EIGEN_ALIGN16 std::complex<float> res[2];
+  _mm_store_ps((float*)res, a.v);
+  return res[0];
+  #else
+  std::complex<float> res;
+  _mm_storel_pi((__m64*)&res, a.v);
+  return res;
+  #endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v))))); }
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
+{
+  return pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v,a.v))));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
+{
+  return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v))));
+}
+
+EIGEN_STRONG_INLINE Packet2cf pcplxflip/* <Packet2cf> */(const Packet2cf& x)
+{
+  return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2));
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+
+template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  // TODO optimize it for SSE3 and 4
+  Packet2cf res = pmul(a, pconj(b));
+  __m128 s = _mm_mul_ps(b.v,b.v);
+  return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,vec4f_swizzle1(s, 1, 0, 3, 2))));
+}
+
+
+
+//---------- double ----------
+struct Packet1cd
+{
+  EIGEN_STRONG_INLINE Packet1cd() {}
+  EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {}
+  Packet2d v;
+};
+
+// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
+template<> struct packet_traits<std::complex<double> >  : default_packet_traits
+{
+  typedef Packet1cd type;
+  typedef Packet1cd half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 0,
+    size = 1,
+    HasHalfPacket = 0,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasSqrt   = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+#endif
+
+template<> struct unpacket_traits<Packet1cd> {
+  typedef std::complex<double> type;
+  typedef Packet1cd half;
+  typedef Packet2d as_real;
+  enum {
+    size=1,
+    alignment=Aligned16,
+    vectorizable=true,
+    masked_load_available=false,
+    masked_store_available=false
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
+{
+  const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
+  return Packet1cd(_mm_xor_pd(a.v,mask));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  #ifdef EIGEN_VECTORIZE_SSE3
+  return Packet1cd(_mm_addsub_pd(_mm_mul_pd(_mm_movedup_pd(a.v), b.v),
+                                 _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
+                                            vec2d_swizzle1(b.v, 1, 0))));
+  #else
+  const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
+  return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
+                              _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
+                                                    vec2d_swizzle1(b.v, 1, 0)), mask)));
+  #endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd ptrue  <Packet1cd>(const Packet1cd& a) { return Packet1cd(ptrue(Packet2d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(b.v,a.v)); }
+
+// FIXME force unaligned load, this is a temporary fix
+template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
+{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
+
+// FIXME force unaligned store, this is a temporary fix
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+
+template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
+{
+  EIGEN_ALIGN16 double res[2];
+  _mm_store_pd(res, a.v);
+  return std::complex<double>(res[0],res[1]);
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
+{
+  return pfirst(a);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
+{
+  return pfirst(a);
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
+
+template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  // TODO optimize it for SSE3 and 4
+  Packet1cd res = pmul(a,pconj(b));
+  __m128d s = _mm_mul_pd(b.v,b.v);
+  return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1))));
+}
+
+EIGEN_STRONG_INLINE Packet1cd pcplxflip/* <Packet1cd> */(const Packet1cd& x)
+{
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2cf,2>& kernel) {
+  __m128d w1 = _mm_castps_pd(kernel.packet[0].v);
+  __m128d w2 = _mm_castps_pd(kernel.packet[1].v);
+
+  __m128 tmp = _mm_castpd_ps(_mm_unpackhi_pd(w1, w2));
+  kernel.packet[0].v = _mm_castpd_ps(_mm_unpacklo_pd(w1, w2));
+  kernel.packet[1].v = tmp;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b)
+{
+  __m128 eq = _mm_cmpeq_ps(a.v, b.v);
+  return Packet2cf(pand<Packet4f>(eq, vec4f_swizzle1(eq, 1, 0, 3, 2)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b)
+{
+  __m128d eq = _mm_cmpeq_pd(a.v, b.v);
+  return Packet1cd(pand<Packet2d>(eq, vec2d_swizzle1(eq, 1, 0)));
+}
+
+template<>  EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+  __m128d result = pblend<Packet2d>(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
+  return Packet2cf(_mm_castpd_ps(result));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
+  return psqrt_complex<Packet1cd>(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
+  return psqrt_complex<Packet2cf>(a);
+}
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_SSE_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h b/src/3rdparty/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h
new file mode 100644
index 000000000..8736d0d6b
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -0,0 +1,199 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* The sin and cos and functions of this file come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+#ifndef EIGEN_MATH_FUNCTIONS_SSE_H
+#define EIGEN_MATH_FUNCTIONS_SSE_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f plog<Packet4f>(const Packet4f& _x) {
+  return plog_float(_x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d plog<Packet2d>(const Packet2d& _x) {
+  return plog_double(_x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f plog2<Packet4f>(const Packet4f& _x) {
+  return plog2_float(_x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d plog2<Packet2d>(const Packet2d& _x) {
+  return plog2_double(_x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f plog1p<Packet4f>(const Packet4f& _x) {
+  return generic_plog1p(_x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pexpm1<Packet4f>(const Packet4f& _x) {
+  return generic_expm1(_x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pexp<Packet4f>(const Packet4f& _x)
+{
+  return pexp_float(_x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d pexp<Packet2d>(const Packet2d& x)
+{
+  return pexp_double(x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psin<Packet4f>(const Packet4f& _x)
+{
+  return psin_float(_x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pcos<Packet4f>(const Packet4f& _x)
+{
+  return pcos_float(_x);
+}
+
+#if EIGEN_FAST_MATH
+
+// Functions for sqrt.
+// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
+// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
+// exact solution. It does not handle +inf, or denormalized numbers correctly.
+// The main advantage of this approach is not just speed, but also the fact that
+// it can be inlined and pipelined with other computations, further reducing its
+// effective latency. This is similar to Quake3's fast inverse square root.
+// For detail see here: http://www.beyond3d.com/content/articles/8/
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psqrt<Packet4f>(const Packet4f& _x)
+{
+  Packet4f minus_half_x = pmul(_x, pset1<Packet4f>(-0.5f));
+  Packet4f denormal_mask = pandnot(
+      pcmp_lt(_x, pset1<Packet4f>((std::numeric_limits<float>::min)())),
+      pcmp_lt(_x, pzero(_x)));
+
+  // Compute approximate reciprocal sqrt.
+  Packet4f x = _mm_rsqrt_ps(_x);
+  // Do a single step of Newton's iteration.
+  x = pmul(x, pmadd(minus_half_x, pmul(x,x), pset1<Packet4f>(1.5f)));
+  // Flush results for denormals to zero.
+  return pandnot(pmul(_x,x), denormal_mask);
+}
+
+#else
+
+template<>EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
+
+#endif
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet16b psqrt<Packet16b>(const Packet16b& x) { return x; }
+
+#if EIGEN_FAST_MATH
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& _x) {
+  _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f);
+  _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f);
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000u);
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000u);
+
+  Packet4f neg_half = pmul(_x, p4f_minus_half);
+
+  // Identity infinite, zero, negative and denormal arguments.
+  Packet4f lt_min_mask = _mm_cmplt_ps(_x, p4f_flt_min);
+  Packet4f inf_mask = _mm_cmpeq_ps(_x, p4f_inf);
+  Packet4f not_normal_finite_mask = _mm_or_ps(lt_min_mask, inf_mask);
+
+  // Compute an approximate result using the rsqrt intrinsic.
+  Packet4f y_approx = _mm_rsqrt_ps(_x);
+
+  // Do a single step of Newton-Raphson iteration to improve the approximation.
+  // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
+  // It is essential to evaluate the inner term like this because forming
+  // y_n^2 may over- or underflow.
+  Packet4f y_newton = pmul(
+      y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p4f_one_point_five));
+
+  // Select the result of the Newton-Raphson step for positive normal arguments.
+  // For other arguments, choose the output of the intrinsic. This will
+  // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if
+  // x is zero or a positive denormalized float (equivalent to flushing positive
+  // denormalized inputs to zero).
+  return pselect<Packet4f>(not_normal_finite_mask, y_approx, y_newton);
+}
+
+#else
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& x) {
+  // Unfortunately we can't use the much faster mm_rsqrt_ps since it only provides an approximation.
+  return _mm_div_ps(pset1<Packet4f>(1.0f), _mm_sqrt_ps(x));
+}
+
+#endif
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d prsqrt<Packet2d>(const Packet2d& x) {
+  return _mm_div_pd(pset1<Packet2d>(1.0), _mm_sqrt_pd(x));
+}
+
+// Hyperbolic Tangent function.
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+ptanh<Packet4f>(const Packet4f& x) {
+  return internal::generic_fast_tanh_float(x);
+}
+
+} // end namespace internal
+
+namespace numext {
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float sqrt(const float &x)
+{
+  return internal::pfirst(internal::Packet4f(_mm_sqrt_ss(_mm_set_ss(x))));
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double sqrt(const double &x)
+{
+#if EIGEN_COMP_GNUC_STRICT
+  // This works around a GCC bug generating poor code for _mm_sqrt_pd
+  // See https://gitlab.com/libeigen/eigen/commit/8dca9f97e38970
+  return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x))));
+#else
+  return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x))));
+#endif
+}
+
+} // end namespace numex
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATH_FUNCTIONS_SSE_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/SSE/PacketMath.h b/src/3rdparty/eigen/Eigen/src/Core/arch/SSE/PacketMath.h
new file mode 100644
index 000000000..db102c73a
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -0,0 +1,1505 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_SSE_H
+#define EIGEN_PACKET_MATH_SSE_H
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#if !defined(EIGEN_VECTORIZE_AVX) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
+// 32 bits =>  8 registers
+// 64 bits => 16 registers
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
+#endif
+
+#ifdef EIGEN_VECTORIZE_FMA
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+#endif
+
+#if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)) || EIGEN_OS_QNX
+// With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot
+// have overloads for both types without linking error.
+// One solution is to increase ABI version using -fabi-version=4 (or greater).
+// Otherwise, we workaround this inconvenience by wrapping 128bit types into the following helper
+// structure:
+typedef eigen_packet_wrapper<__m128>  Packet4f;
+typedef eigen_packet_wrapper<__m128d> Packet2d;
+#else
+typedef __m128  Packet4f;
+typedef __m128d Packet2d;
+#endif
+
+typedef eigen_packet_wrapper<__m128i, 0> Packet4i;
+typedef eigen_packet_wrapper<__m128i, 1> Packet16b;
+
+template<> struct is_arithmetic<__m128>  { enum { value = true }; };
+template<> struct is_arithmetic<__m128i> { enum { value = true }; };
+template<> struct is_arithmetic<__m128d> { enum { value = true }; };
+template<> struct is_arithmetic<Packet4i>  { enum { value = true }; };
+template<> struct is_arithmetic<Packet16b>  { enum { value = true }; };
+
+template<int p, int q, int r, int s>
+struct shuffle_mask{
+ enum { mask = (s)<<6|(r)<<4|(q)<<2|(p) };
+};
+
+// TODO: change the implementation of all swizzle* ops from macro to template,
+#define vec4f_swizzle1(v,p,q,r,s) \
+  Packet4f(_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), (shuffle_mask<p,q,r,s>::mask))))
+
+#define vec4i_swizzle1(v,p,q,r,s) \
+  Packet4i(_mm_shuffle_epi32( v, (shuffle_mask<p,q,r,s>::mask)))
+
+#define vec2d_swizzle1(v,p,q) \
+  Packet2d(_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), (shuffle_mask<2*p,2*p+1,2*q,2*q+1>::mask))))
+
+#define vec4f_swizzle2(a,b,p,q,r,s) \
+  Packet4f(_mm_shuffle_ps( (a), (b), (shuffle_mask<p,q,r,s>::mask)))
+
+#define vec4i_swizzle2(a,b,p,q,r,s) \
+  Packet4i(_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), (shuffle_mask<p,q,r,s>::mask)))))
+
+EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b)
+{
+  return Packet4f(_mm_movelh_ps(a,b));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b)
+{
+  return Packet4f(_mm_movehl_ps(a,b));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b)
+{
+  return Packet4f(_mm_unpacklo_ps(a,b));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b)
+{
+  return Packet4f(_mm_unpackhi_ps(a,b));
+}
+#define vec4f_duplane(a,p) \
+  vec4f_swizzle2(a,a,p,p,p,p)
+
+#define vec2d_swizzle2(a,b,mask) \
+  Packet2d(_mm_shuffle_pd(a,b,mask))
+
+EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b)
+{
+  return Packet2d(_mm_unpacklo_pd(a,b));
+}
+EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b)
+{
+  return Packet2d(_mm_unpackhi_pd(a,b));
+}
+#define vec2d_duplane(a,p) \
+  vec2d_swizzle2(a,a,(p<<1)|p)
+
+#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
+  const Packet4f p4f_##NAME = pset1<Packet4f>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
+  const Packet2d p2d_##NAME = pset1<Packet2d>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+  const Packet4f p4f_##NAME = pset1frombits<Packet4f>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
+  const Packet4i p4i_##NAME = pset1<Packet4i>(X)
+
+
+// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
+template <>
+struct packet_traits<float> : default_packet_traits {
+  typedef Packet4f type;
+  typedef Packet4f half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 0,
+
+    HasCmp  = 1,
+    HasDiv = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
+    HasNdtri = 1,
+    HasExp = 1,
+    HasBessel = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasBlend = 1,
+    HasCeil = 1,
+    HasFloor = 1,
+#ifdef EIGEN_VECTORIZE_SSE4_1
+    HasRound = 1,
+#endif
+    HasRint = 1
+  };
+};
+template <>
+struct packet_traits<double> : default_packet_traits {
+  typedef Packet2d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=2,
+    HasHalfPacket = 0,
+
+    HasCmp  = 1,
+    HasDiv  = 1,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasBlend = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+#ifdef EIGEN_VECTORIZE_SSE4_1
+    HasRound = 1,
+#endif
+    HasRint = 1
+  };
+};
+#endif
+template<> struct packet_traits<int>    : default_packet_traits
+{
+  typedef Packet4i type;
+  typedef Packet4i half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=4,
+
+    HasShift = 1,
+    HasBlend = 1
+  };
+};
+
+template<> struct packet_traits<bool> : default_packet_traits
+{
+  typedef Packet16b type;
+  typedef Packet16b half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    HasHalfPacket = 0,
+    size=16,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 0,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 0,
+    HasAbs2      = 0,
+    HasMin       = 0,
+    HasMax       = 0,
+    HasConj      = 0,
+    HasSqrt      = 1
+  };
+};
+
+template<> struct unpacket_traits<Packet4f> {
+  typedef float     type;
+  typedef Packet4f  half;
+  typedef Packet4i  integer_packet;
+  enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+template<> struct unpacket_traits<Packet2d> {
+  typedef double    type;
+  typedef Packet2d  half;
+  enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+template<> struct unpacket_traits<Packet4i> {
+  typedef int       type;
+  typedef Packet4i  half;
+  enum {size=4, alignment=Aligned16, vectorizable=false, masked_load_available=false, masked_store_available=false};
+};
+template<> struct unpacket_traits<Packet16b> {
+  typedef bool       type;
+  typedef Packet16b  half;
+  enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+
+#ifndef EIGEN_VECTORIZE_AVX
+template<> struct scalar_div_cost<float,true> { enum { value = 7 }; };
+template<> struct scalar_div_cost<double,true> { enum { value = 8 }; };
+#endif
+
+#if EIGEN_COMP_MSVC==1500
+// Workaround MSVC 9 internal compiler error.
+// TODO: It has been detected with win64 builds (amd64), so let's check whether it also happens in 32bits+SSE mode
+// TODO: let's check whether there does not exist a better fix, like adding a pset0() function. (it crashed on pset1(0)).
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set_ps(from,from,from,from); }
+template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); }
+template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set_epi32(from,from,from,from); }
+#else
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set_ps1(from); }
+template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set1_epi32(from); }
+#endif
+template<> EIGEN_STRONG_INLINE Packet16b pset1<Packet16b>(const bool&    from) { return _mm_set1_epi8(static_cast<char>(from)); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) { return _mm_castsi128_ps(pset1<Packet4i>(from)); }
+template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) { return _mm_castsi128_pd(_mm_set1_epi64x(from)); }
+
+template<> EIGEN_STRONG_INLINE Packet4f peven_mask(const Packet4f& /*a*/) { return _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1)); }
+template<> EIGEN_STRONG_INLINE Packet4i peven_mask(const Packet4i& /*a*/) { return _mm_set_epi32(0, -1, 0, -1); }
+template<> EIGEN_STRONG_INLINE Packet2d peven_mask(const Packet2d& /*a*/) { return _mm_castsi128_pd(_mm_set_epi32(0, 0, -1, -1)); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) { return _mm_setzero_ps(); }
+template<> EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) { return _mm_setzero_pd(); }
+template<> EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) { return _mm_setzero_si128(); }
+
+// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
+// However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203)
+// Using inline assembly is also not an option because then gcc fails to reorder properly the instructions.
+// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply.
+// Also note that with AVX, we want it to generate a vbroadcastss.
+#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)
+template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
+  return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
+}
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
+
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet16b padd<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16b psub<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
+template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+#ifdef EIGEN_VECTORIZE_SSE3
+  return _mm_addsub_ps(a,b);
+#else
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x0,0x80000000,0x0));
+  return padd(a, pxor(mask, b));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& );
+template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) 
+{
+#ifdef EIGEN_VECTORIZE_SSE3  
+  return _mm_addsub_pd(a,b); 
+#else
+  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x0)); 
+  return padd(a, pxor(mask, b));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
+{
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
+  return _mm_xor_ps(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
+{
+  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000));
+  return _mm_xor_pd(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
+{
+  return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16b pnegate(const Packet16b& a)
+{
+  return psub(pset1<Packet16b>(false), a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_mullo_epi32(a,b);
+#else
+  // this version is slightly faster than 4 scalar products
+  return vec4i_swizzle1(
+            vec4i_swizzle2(
+              _mm_mul_epu32(a,b),
+              _mm_mul_epu32(vec4i_swizzle1(a,1,0,3,2),
+                            vec4i_swizzle1(b,1,0,3,2)),
+              0,2,0,2),
+            0,2,1,3);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet16b pmul<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
+
+// for some weird raisons, it has to be overloaded for packet of integers
+template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
+#ifdef EIGEN_VECTORIZE_FMA
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
+#endif
+
+#ifdef EIGEN_VECTORIZE_SSE4_1
+template<> EIGEN_DEVICE_FUNC inline Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
+  return _mm_blendv_ps(b,a,mask);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
+  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b),_mm_castsi128_ps(a),_mm_castsi128_ps(mask)));
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {  return _mm_blendv_pd(b,a,mask); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) {
+  return _mm_blendv_epi8(b,a,mask);
+}
+#else
+template<> EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) {
+  Packet16b a_part = _mm_and_si128(mask, a);
+  Packet16b b_part = _mm_andnot_si128(mask, b);
+  return _mm_or_si128(a_part, b_part);
+}
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); }
+template<> EIGEN_STRONG_INLINE Packet16b ptrue<Packet16b>(const Packet16b& a) { return _mm_cmpeq_epi8(a, a); }
+template<> EIGEN_STRONG_INLINE Packet4f
+ptrue<Packet4f>(const Packet4f& a) {
+  Packet4i b = _mm_castps_si128(a);
+  return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b));
+}
+template<> EIGEN_STRONG_INLINE Packet2d
+ptrue<Packet2d>(const Packet2d& a) {
+  Packet4i b = _mm_castpd_si128(a);
+  return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b));
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16b pand<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16b por<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16b pxor<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); }
+template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); }
+template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return _mm_cmple_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return _mm_cmplt_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) { return _mm_cmpnge_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return _mm_cmplt_epi32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) { return _mm_cmpeq_epi8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return por(pcmp_lt(a,b), pcmp_eq(a,b)); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+  // There appears to be a bug in GCC, by which the optimizer may
+  // flip the argument order in calls to _mm_min_ps, so we have to
+  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  #ifdef EIGEN_VECTORIZE_AVX
+  Packet4f res;
+  asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  #else
+  Packet4f res = b;
+  asm("minps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+  #endif
+  return res;
+#else
+  // Arguments are reversed to match NaN propagation behavior of std::min.
+  return _mm_min_ps(b, a);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+  // There appears to be a bug in GCC, by which the optimizer may
+  // flip the argument order in calls to _mm_min_pd, so we have to
+  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  #ifdef EIGEN_VECTORIZE_AVX
+  Packet2d res;
+  asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  #else
+  Packet2d res = b;
+  asm("minpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+  #endif
+  return res;
+#else
+  // Arguments are reversed to match NaN propagation behavior of std::min.
+  return _mm_min_pd(b, a);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_min_epi32(a,b);
+#else
+  // after some bench, this version *is* faster than a scalar implementation
+  Packet4i mask = _mm_cmplt_epi32(a,b);
+  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
+#endif
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+  // There appears to be a bug in GCC, by which the optimizer may
+  // flip the argument order in calls to _mm_max_ps, so we have to
+  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  #ifdef EIGEN_VECTORIZE_AVX
+  Packet4f res;
+  asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  #else
+  Packet4f res = b;
+  asm("maxps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+  #endif
+  return res;
+#else
+  // Arguments are reversed to match NaN propagation behavior of std::max.
+  return _mm_max_ps(b, a);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+  // There appears to be a bug in GCC, by which the optimizer may
+  // flip the argument order in calls to _mm_max_pd, so we have to
+  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  #ifdef EIGEN_VECTORIZE_AVX
+  Packet2d res;
+  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  #else
+  Packet2d res = b;
+  asm("maxpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+  #endif
+  return res;
+#else
+  // Arguments are reversed to match NaN propagation behavior of std::max.
+  return _mm_max_pd(b, a);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_max_epi32(a,b);
+#else
+  // after some bench, this version *is* faster than a scalar implementation
+  Packet4i mask = _mm_cmpgt_epi32(a,b);
+  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
+#endif
+}
+
+template <typename Packet, typename Op>
+EIGEN_STRONG_INLINE Packet pminmax_propagate_numbers(const Packet& a, const Packet& b, Op op) {
+  // In this implementation, we take advantage of the fact that pmin/pmax for SSE
+  // always return a if either a or b is NaN.
+  Packet not_nan_mask_a = pcmp_eq(a, a);
+  Packet m = op(a, b);
+  return pselect<Packet>(not_nan_mask_a, m, b);
+}
+
+template <typename Packet, typename Op>
+EIGEN_STRONG_INLINE Packet pminmax_propagate_nan(const Packet& a, const Packet& b, Op op) {
+  // In this implementation, we take advantage of the fact that pmin/pmax for SSE
+  // always return a if either a or b is NaN.
+  Packet not_nan_mask_a = pcmp_eq(a, a);
+  Packet m = op(b, a);
+  return pselect<Packet>(not_nan_mask_a, m, a);
+}
+
+// Add specializations for min/max with prescribed NaN progation.
+template<>
+EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pminmax_propagate_numbers(a, b, pmin<Packet4f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return pminmax_propagate_numbers(a, b, pmin<Packet2d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pminmax_propagate_numbers(a, b, pmax<Packet4f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return pminmax_propagate_numbers(a, b, pmax<Packet2d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pminmax_propagate_nan(a, b, pmin<Packet4f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return pminmax_propagate_nan(a, b, pmin<Packet2d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pminmax_propagate_nan(a, b, pmax<Packet4f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return pminmax_propagate_nan(a, b, pmax<Packet2d>);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) { return _mm_srai_epi32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right   (const Packet4i& a) { return _mm_srli_epi32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left    (const Packet4i& a) { return _mm_slli_epi32(a,N); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
+{
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
+  return _mm_and_ps(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
+{
+  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
+  return _mm_and_pd(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
+{
+  #ifdef EIGEN_VECTORIZE_SSSE3
+  return _mm_abs_epi32(a);
+  #else
+  Packet4i aux = _mm_srai_epi32(a,31);
+  return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
+  #endif
+}
+
+#ifdef EIGEN_VECTORIZE_SSE4_1
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
+{
+  // Unfortunatly _mm_round_ps doesn't have a rounding mode to implement numext::round.
+  const Packet4f mask = pset1frombits<Packet4f>(0x80000000u);
+  const Packet4f prev0dot5 = pset1frombits<Packet4f>(0x3EFFFFFFu);
+  return _mm_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a)
+{
+  const Packet2d mask = _mm_castsi128_pd(_mm_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull));
+  const Packet2d prev0dot5 = _mm_castsi128_pd(_mm_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull));
+  return _mm_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
+template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return _mm_ceil_ps(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return _mm_ceil_pd(a); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); }
+#else
+template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
+  // Adds and subtracts signum(a) * 2^23 to force rounding.
+  const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23));
+  const Packet4f abs_a = pabs(a);
+  Packet4f r = padd(abs_a, limit);
+  // Don't compile-away addition and subtraction.
+  EIGEN_OPTIMIZATION_BARRIER(r);
+  r = psub(r, limit);
+  // If greater than limit, simply return a.  Otherwise, account for sign.
+  r = pselect(pcmp_lt(abs_a, limit),
+              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) {
+  // Adds and subtracts signum(a) * 2^52 to force rounding.
+  const Packet2d limit = pset1<Packet2d>(static_cast<double>(1ull<<52));
+  const Packet2d abs_a = pabs(a);
+  Packet2d r = padd(abs_a, limit);
+  // Don't compile-away addition and subtraction.
+  EIGEN_OPTIMIZATION_BARRIER(r);
+  r = psub(r, limit);
+  // If greater than limit, simply return a.  Otherwise, account for sign.
+  r = pselect(pcmp_lt(abs_a, limit),
+              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
+{
+  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
+  Packet4f tmp  = print<Packet4f>(a);
+  // If greater, subtract one.
+  Packet4f mask = _mm_cmpgt_ps(tmp, a);
+  mask = pand(mask, cst_1);
+  return psub(tmp, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
+{
+  const Packet2d cst_1 = pset1<Packet2d>(1.0);
+  Packet2d tmp  = print<Packet2d>(a);
+  // If greater, subtract one.
+  Packet2d mask = _mm_cmpgt_pd(tmp, a);
+  mask = pand(mask, cst_1);
+  return psub(tmp, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
+{
+  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
+  Packet4f tmp  = print<Packet4f>(a);
+  // If smaller, add one.
+  Packet4f mask = _mm_cmplt_ps(tmp, a);
+  mask = pand(mask, cst_1);
+  return padd(tmp, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
+{
+  const Packet2d cst_1 = pset1<Packet2d>(1.0);
+  Packet2d tmp  = print<Packet2d>(a);
+  // If smaller, add one.
+  Packet2d mask = _mm_cmplt_pd(tmp, a);
+  mask = pand(mask, cst_1);
+  return padd(tmp, mask);
+}
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
+template<> EIGEN_STRONG_INLINE Packet16b pload<Packet16b>(const bool*     from) { EIGEN_DEBUG_ALIGNED_LOAD return  _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
+
+#if EIGEN_COMP_MSVC
+  template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*  from) {
+    EIGEN_DEBUG_UNALIGNED_LOAD
+    #if (EIGEN_COMP_MSVC==1600)
+    // NOTE Some version of MSVC10 generates bad code when using _mm_loadu_ps
+    // (i.e., it does not generate an unaligned load!!
+    __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from));
+    res = _mm_loadh_pi(res, (const __m64*)(from+2));
+    return res;
+    #else
+    return _mm_loadu_ps(from);
+    #endif
+  }
+#else
+// NOTE: with the code below, MSVC's compiler crashes!
+
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return _mm_loadu_ps(from);
+}
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return _mm_loadu_pd(from);
+}
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
+}
+template<> EIGEN_STRONG_INLINE Packet16b ploadu<Packet16b>(const bool*     from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
+{
+  return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
+}
+template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*  from)
+{ return pset1<Packet2d>(from[0]); }
+template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
+{
+  Packet4i tmp;
+  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
+  return vec4i_swizzle1(tmp, 0, 0, 1, 1);
+}
+
+// Loads 8 bools from memory and returns the packet
+// {b0, b0, b1, b1, b2, b2, b3, b3, b4, b4, b5, b5, b6, b6, b7, b7}
+template<> EIGEN_STRONG_INLINE Packet16b ploaddup<Packet16b>(const bool*     from)
+{
+  __m128i tmp = _mm_castpd_si128(pload1<Packet2d>(reinterpret_cast<const double*>(from)));
+  return  _mm_unpacklo_epi8(tmp, tmp);
+}
+
+// Loads 4 bools from memory and returns the packet
+// {b0, b0  b0, b0, b1, b1, b1, b1, b2, b2, b2, b2, b3, b3, b3, b3}
+template<> EIGEN_STRONG_INLINE Packet16b
+ploadquad<Packet16b>(const bool* from) {
+  __m128i tmp = _mm_castps_si128(pload1<Packet4f>(reinterpret_cast<const float*>(from)));
+  tmp = _mm_unpacklo_epi8(tmp, tmp);
+  return  _mm_unpacklo_epi16(tmp, tmp);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
+template<> EIGEN_STRONG_INLINE void pstore<bool>(bool*     to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<bool>(bool*     to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+ return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
+{
+ return _mm_set_pd(from[1*stride], from[0*stride]);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
+{
+ return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet16b pgather<bool, Packet16b>(const bool* from, Index stride)
+{
+  return _mm_set_epi8(from[15*stride], from[14*stride], from[13*stride], from[12*stride],
+                      from[11*stride], from[10*stride], from[9*stride], from[8*stride],
+                      from[7*stride], from[6*stride], from[5*stride], from[4*stride],
+                      from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+  to[stride*0] = _mm_cvtss_f32(from);
+  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
+  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
+  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
+{
+  to[stride*0] = _mm_cvtsd_f64(from);
+  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
+{
+  to[stride*0] = _mm_cvtsi128_si32(from);
+  to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
+  to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
+  to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<bool, Packet16b>(bool* to, const Packet16b& from, Index stride)
+{
+  to[4*stride*0] = _mm_cvtsi128_si32(from);
+  to[4*stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
+  to[4*stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
+  to[4*stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
+}
+
+
+// some compilers might be tempted to perform multiple moves instead of using a vector path.
+template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
+{
+  Packet4f pa = _mm_set_ss(a);
+  pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0)));
+}
+// some compilers might be tempted to perform multiple moves instead of using a vector path.
+template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
+{
+  Packet2d pa = _mm_set_sd(a);
+  pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
+}
+
+#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
+typedef const void * SsePrefetchPtrType;
+#else
+typedef const char * SsePrefetchPtrType;
+#endif
+
+#ifndef EIGEN_VECTORIZE_AVX
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+#endif
+
+#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
+// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
+// Direct of the struct members fixed bug #62.
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
+#elif EIGEN_COMP_MSVC_STRICT
+// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
+#else
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
+#endif
+template<> EIGEN_STRONG_INLINE bool   pfirst<Packet16b>(const Packet16b& a) { int x = _mm_cvtsi128_si32(a); return static_cast<bool>(x & 1); }
+
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return _mm_shuffle_ps(a,a,0x1B); }
+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return _mm_shuffle_pd(a,a,0x1); }
+template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return _mm_shuffle_epi32(a,0x1B); }
+template<> EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) {
+#ifdef EIGEN_VECTORIZE_SSSE3
+  __m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  return _mm_shuffle_epi8(a, mask);
+#else
+  Packet16b tmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
+  tmp = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
+  return _mm_or_si128(_mm_slli_epi16(tmp, 8), _mm_srli_epi16(tmp, 8));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+  return pfrexp_generic(a,exponent);
+}
+
+// Extract exponent without existence of Packet2l.
+template<>
+EIGEN_STRONG_INLINE  
+Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
+  const Packet2d cst_exp_mask  = pset1frombits<Packet2d>(static_cast<uint64_t>(0x7ff0000000000000ull));
+  __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(pand(a, cst_exp_mask)), 52);
+  return _mm_cvtepi32_pd(vec4i_swizzle1(a_expo, 0, 2, 1, 3));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+  return pldexp_generic(a,exponent);
+}
+
+// We specialize pldexp here, since the generic implementation uses Packet2l, which is not well
+// supported by SSE, and has more range than is needed for exponents.
+template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+  // Clamp exponent to [-2099, 2099]
+  const Packet2d max_exponent = pset1<Packet2d>(2099.0);
+  const Packet2d e = pmin(pmax(exponent, pnegate(max_exponent)), max_exponent);
+  
+  // Convert e to integer and swizzle to low-order bits.
+  const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3);
+  
+  // Split 2^e into four factors and multiply:
+  const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
+  Packet4i b = parithmetic_shift_right<2>(ei);  // floor(e/4)
+  Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52));  // 2^b
+  Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
+  b = psub(psub(psub(ei, b), b), b);  // e - 3b
+  c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52));  // 2^(e - 3b)
+  out = pmul(out, c);  // a * 2^e
+  return out;
+}
+
+// with AVX, the default implementations based on pload1 are faster
+#ifndef __AVX__
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  a3 = pload<Packet4f>(a);
+  a0 = vec4f_swizzle1(a3, 0,0,0,0);
+  a1 = vec4f_swizzle1(a3, 1,1,1,1);
+  a2 = vec4f_swizzle1(a3, 2,2,2,2);
+  a3 = vec4f_swizzle1(a3, 3,3,3,3);
+}
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet2d>(const double *a,
+                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
+{
+#ifdef EIGEN_VECTORIZE_SSE3
+  a0 = _mm_loaddup_pd(a+0);
+  a1 = _mm_loaddup_pd(a+1);
+  a2 = _mm_loaddup_pd(a+2);
+  a3 = _mm_loaddup_pd(a+3);
+#else
+  a1 = pload<Packet2d>(a);
+  a0 = vec2d_swizzle1(a1, 0,0);
+  a1 = vec2d_swizzle1(a1, 1,1);
+  a3 = pload<Packet2d>(a+2);
+  a2 = vec2d_swizzle1(a3, 0,0);
+  a3 = vec2d_swizzle1(a3, 1,1);
+#endif
+}
+#endif
+
+EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
+{
+  vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
+  vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
+  vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
+  vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
+}
+
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+  // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures
+  // (from Nehalem to Haswell)
+// #ifdef EIGEN_VECTORIZE_SSE3
+//   Packet4f tmp = _mm_add_ps(a, vec4f_swizzle1(a,2,3,2,3));
+//   return pfirst<Packet4f>(_mm_hadd_ps(tmp, tmp));
+// #else
+  Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
+  return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+// #endif
+}
+
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
+{
+  // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures
+  // (from Nehalem to Haswell)
+// #ifdef EIGEN_VECTORIZE_SSE3
+//   return pfirst<Packet2d>(_mm_hadd_pd(a, a));
+// #else
+  return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
+// #endif
+}
+
+#ifdef EIGEN_VECTORIZE_SSSE3
+template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
+{
+  Packet4i tmp0 = _mm_hadd_epi32(a,a);
+  return pfirst<Packet4i>(_mm_hadd_epi32(tmp0,tmp0));
+}
+
+#else
+template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
+{
+  Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
+  return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
+}
+#endif
+
+template<> EIGEN_STRONG_INLINE bool predux<Packet16b>(const Packet16b& a) {
+  Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a,a));
+  return (pfirst(tmp) != 0) || (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) != 0);
+}
+
+// Other reduction functions:
+
+
+// mul
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{
+  Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
+  return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
+{
+  return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
+}
+template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
+{
+  // after some experiments, it is seems this is the fastest way to implement it
+  // for GCC (eg., reusing pmul is very slow !)
+  // TODO try to call _mm_mul_epu32 directly
+  EIGEN_ALIGN16 int aux[4];
+  pstore(aux, a);
+  return  (aux[0] * aux[1]) * (aux[2] * aux[3]);
+}
+
+template<> EIGEN_STRONG_INLINE bool predux_mul<Packet16b>(const Packet16b& a) {
+  Packet4i tmp = _mm_and_si128(a, _mm_unpackhi_epi64(a,a));
+  return ((pfirst<Packet4i>(tmp) == 0x01010101) &&
+          (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) == 0x01010101));
+}
+
+// min
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+  Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
+  return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
+{
+  return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
+}
+template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
+  return pfirst<Packet4i>(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
+#else
+  // after some experiments, it is seems this is the fastest way to implement it
+  // for GCC (eg., it does not like using std::min after the pstore !!)
+  EIGEN_ALIGN16 int aux[4];
+  pstore(aux, a);
+  int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
+  int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
+  return aux0<aux2 ? aux0 : aux2;
+#endif // EIGEN_VECTORIZE_SSE4_1
+}
+
+// max
+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+  Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
+  return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
+{
+  return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
+}
+template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
+  return pfirst<Packet4i>(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
+#else
+  // after some experiments, it is seems this is the fastest way to implement it
+  // for GCC (eg., it does not like using std::min after the pstore !!)
+  EIGEN_ALIGN16 int aux[4];
+  pstore(aux, a);
+  int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
+  int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
+  return aux0>aux2 ? aux0 : aux2;
+#endif // EIGEN_VECTORIZE_SSE4_1
+}
+
+// not needed yet
+// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x)
+// {
+//   return _mm_movemask_ps(x) == 0xF;
+// }
+
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
+{
+  return _mm_movemask_ps(x) != 0x0;
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2d,2>& kernel) {
+  __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[1] = tmp;
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4i,4>& kernel) {
+  __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
+  __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
+  __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
+  __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
+
+  kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
+  kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
+  kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
+  kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet16b,4>& kernel) {
+  __m128i T0 =  _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
+  __m128i T1 =  _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
+  __m128i T2 =  _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
+  __m128i T3 =  _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
+  kernel.packet[0] = _mm_unpacklo_epi16(T0, T2);
+  kernel.packet[1] = _mm_unpackhi_epi16(T0, T2);
+  kernel.packet[2] = _mm_unpacklo_epi16(T1, T3);
+  kernel.packet[3] = _mm_unpackhi_epi16(T1, T3);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet16b,16>& kernel) {
+  // If we number the elements in the input thus:
+  // kernel.packet[ 0] = {00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 0a, 0b, 0c, 0d, 0e, 0f}
+  // kernel.packet[ 1] = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1a, 1b, 1c, 1d, 1e, 1f}
+  // ...
+  // kernel.packet[15] = {f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, fa, fb, fc, fd, fe, ff},
+  //
+  // the desired output is:
+  // kernel.packet[ 0] = {00, 10, 20, 30, 40, 50, 60, 70, 80, 90, a0, b0, c0, d0, e0, f0}
+  // kernel.packet[ 1] = {01, 11, 21, 31, 41, 51, 61, 71, 81, 91, a1, b1, c1, d1, e1, f1}
+  // ...
+  // kernel.packet[15] = {0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, af, bf, cf, df, ef, ff},
+  __m128i t0 =  _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  __m128i t1 =  _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]); // 08 18 09 19 0a 1a 0b 1b 0c 1c 0d 1d 0e 1e 0f 1f
+  __m128i t2 =  _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]); // 20 30 21 31 22 32 ...                     27 37
+  __m128i t3 =  _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]); // 28 38 29 39 2a 3a ...                     2f 3f
+  __m128i t4 =  _mm_unpacklo_epi8(kernel.packet[4], kernel.packet[5]); // 40 50 41 51 42 52                         47 57
+  __m128i t5 =  _mm_unpackhi_epi8(kernel.packet[4], kernel.packet[5]); // 48 58 49 59 4a 5a
+  __m128i t6 =  _mm_unpacklo_epi8(kernel.packet[6], kernel.packet[7]);
+  __m128i t7 =  _mm_unpackhi_epi8(kernel.packet[6], kernel.packet[7]);
+  __m128i t8 =  _mm_unpacklo_epi8(kernel.packet[8], kernel.packet[9]);
+  __m128i t9 =  _mm_unpackhi_epi8(kernel.packet[8], kernel.packet[9]);
+  __m128i ta =  _mm_unpacklo_epi8(kernel.packet[10], kernel.packet[11]);
+  __m128i tb =  _mm_unpackhi_epi8(kernel.packet[10], kernel.packet[11]);
+  __m128i tc =  _mm_unpacklo_epi8(kernel.packet[12], kernel.packet[13]);
+  __m128i td =  _mm_unpackhi_epi8(kernel.packet[12], kernel.packet[13]);
+  __m128i te =  _mm_unpacklo_epi8(kernel.packet[14], kernel.packet[15]);
+  __m128i tf =  _mm_unpackhi_epi8(kernel.packet[14], kernel.packet[15]);
+
+  __m128i s0 =  _mm_unpacklo_epi16(t0, t2); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  __m128i s1 =  _mm_unpackhi_epi16(t0, t2); // 04 14 24 34
+  __m128i s2 =  _mm_unpacklo_epi16(t1, t3); // 08 18 28 38 ...
+  __m128i s3 =  _mm_unpackhi_epi16(t1, t3); // 0c 1c 2c 3c ...
+  __m128i s4 =  _mm_unpacklo_epi16(t4, t6); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+  __m128i s5 =  _mm_unpackhi_epi16(t4, t6); // 44 54 64 74 ...
+  __m128i s6 =  _mm_unpacklo_epi16(t5, t7);
+  __m128i s7 =  _mm_unpackhi_epi16(t5, t7);
+  __m128i s8 =  _mm_unpacklo_epi16(t8, ta);
+  __m128i s9 =  _mm_unpackhi_epi16(t8, ta);
+  __m128i sa =  _mm_unpacklo_epi16(t9, tb);
+  __m128i sb =  _mm_unpackhi_epi16(t9, tb);
+  __m128i sc =  _mm_unpacklo_epi16(tc, te);
+  __m128i sd =  _mm_unpackhi_epi16(tc, te);
+  __m128i se =  _mm_unpacklo_epi16(td, tf);
+  __m128i sf =  _mm_unpackhi_epi16(td, tf);
+
+  __m128i u0 =  _mm_unpacklo_epi32(s0, s4); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+  __m128i u1 =  _mm_unpackhi_epi32(s0, s4); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+  __m128i u2 =  _mm_unpacklo_epi32(s1, s5);
+  __m128i u3 =  _mm_unpackhi_epi32(s1, s5);
+  __m128i u4 =  _mm_unpacklo_epi32(s2, s6);
+  __m128i u5 =  _mm_unpackhi_epi32(s2, s6);
+  __m128i u6 =  _mm_unpacklo_epi32(s3, s7);
+  __m128i u7 =  _mm_unpackhi_epi32(s3, s7);
+  __m128i u8 =  _mm_unpacklo_epi32(s8, sc);
+  __m128i u9 =  _mm_unpackhi_epi32(s8, sc);
+  __m128i ua =  _mm_unpacklo_epi32(s9, sd);
+  __m128i ub =  _mm_unpackhi_epi32(s9, sd);
+  __m128i uc =  _mm_unpacklo_epi32(sa, se);
+  __m128i ud =  _mm_unpackhi_epi32(sa, se);
+  __m128i ue =  _mm_unpacklo_epi32(sb, sf);
+  __m128i uf =  _mm_unpackhi_epi32(sb, sf);
+
+  kernel.packet[0]  = _mm_unpacklo_epi64(u0, u8);
+  kernel.packet[1]  = _mm_unpackhi_epi64(u0, u8);
+  kernel.packet[2]  = _mm_unpacklo_epi64(u1, u9);
+  kernel.packet[3]  = _mm_unpackhi_epi64(u1, u9);
+  kernel.packet[4]  = _mm_unpacklo_epi64(u2, ua);
+  kernel.packet[5]  = _mm_unpackhi_epi64(u2, ua);
+  kernel.packet[6]  = _mm_unpacklo_epi64(u3, ub);
+  kernel.packet[7]  = _mm_unpackhi_epi64(u3, ub);
+  kernel.packet[8]  = _mm_unpacklo_epi64(u4, uc);
+  kernel.packet[9]  = _mm_unpackhi_epi64(u4, uc);
+  kernel.packet[10] = _mm_unpacklo_epi64(u5, ud);
+  kernel.packet[11] = _mm_unpackhi_epi64(u5, ud);
+  kernel.packet[12] = _mm_unpacklo_epi64(u6, ue);
+  kernel.packet[13] = _mm_unpackhi_epi64(u6, ue);
+  kernel.packet[14] = _mm_unpacklo_epi64(u7, uf);
+  kernel.packet[15] = _mm_unpackhi_epi64(u7, uf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
+  __m128i false_mask = _mm_cmpeq_epi32(select, zero);
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blendv_epi8(thenPacket, elsePacket, false_mask);
+#else
+  return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+  const __m128 zero = _mm_setzero_ps();
+  const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
+  __m128 false_mask = _mm_cmpeq_ps(select, zero);
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blendv_ps(thenPacket, elsePacket, false_mask);
+#else
+  return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
+  const __m128d zero = _mm_setzero_pd();
+  const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);
+  __m128d false_mask = _mm_cmpeq_pd(select, zero);
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blendv_pd(thenPacket, elsePacket, false_mask);
+#else
+  return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket));
+#endif
+}
+
+// Scalar path for pmadd with FMA to ensure consistency with vectorized path.
+#ifdef EIGEN_VECTORIZE_FMA
+template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
+  return ::fmaf(a,b,c);
+}
+template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
+  return ::fma(a,b,c);
+}
+#endif
+
+
+// Packet math for Eigen::half
+// Disable the following code since it's broken on too many platforms / compilers.
+//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
+#if 0
+
+typedef struct {
+  __m64 x;
+} Packet4h;
+
+
+template<> struct is_arithmetic<Packet4h> { enum { value = true }; };
+
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {
+  typedef Packet4h type;
+  // There is no half-size packet for Packet4h.
+  typedef Packet4h half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 0,
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 0,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasConj   = 0,
+    HasSetLinear = 0,
+    HasSqrt = 0,
+    HasRsqrt = 0,
+    HasExp = 0,
+    HasLog = 0,
+    HasBlend = 0
+  };
+};
+
+
+template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h half; };
+
+template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
+  Packet4h result;
+  result.x = _mm_set1_pi16(from.x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
+  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+  Eigen::half h[4];
+
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+  h[0] = ha + hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  h[1] = ha + hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  h[2] = ha + hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  h[3] = ha + hb;
+  Packet4h result;
+  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h psub<Packet4h>(const Packet4h& a, const Packet4h& b) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+  Eigen::half h[4];
+
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+  h[0] = ha - hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  h[1] = ha - hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  h[2] = ha - hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  h[3] = ha - hb;
+  Packet4h result;
+  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+  Eigen::half h[4];
+
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+  h[0] = ha * hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  h[1] = ha * hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  h[2] = ha * hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  h[3] = ha * hb;
+  Packet4h result;
+  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pdiv<Packet4h>(const Packet4h& a, const Packet4h& b) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+  Eigen::half h[4];
+
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+  h[0] = ha / hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  h[1] = ha / hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  h[2] = ha / hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  h[3] = ha / hb;
+  Packet4h result;
+  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
+  Packet4h result;
+  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
+  Packet4h result;
+  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
+  __int64_t r = _mm_cvtm64_si64(from.x);
+  *(reinterpret_cast<__int64_t*>(to)) = r;
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
+  __int64_t r = _mm_cvtm64_si64(from.x);
+  *(reinterpret_cast<__int64_t*>(to)) = r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h
+ploadquad<Packet4h>(const Eigen::half* from) {
+  return pset1<Packet4h>(*from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride)
+{
+  Packet4h result;
+  result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride)
+{
+  __int64_t a = _mm_cvtm64_si64(from.x);
+  to[stride*0].x = static_cast<unsigned short>(a);
+  to[stride*1].x = static_cast<unsigned short>(a >> 16);
+  to[stride*2].x = static_cast<unsigned short>(a >> 32);
+  to[stride*3].x = static_cast<unsigned short>(a >> 48);
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet4h,4>& kernel) {
+  __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
+  __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
+  __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
+  __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);
+
+  kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);
+  kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);
+  kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);
+  kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);
+}
+
+#endif
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
+// PGI++ does not define the following intrinsics in C++ mode.
+static inline __m128  _mm_castpd_ps   (__m128d x) { return reinterpret_cast<__m128&>(x);  }
+static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
+static inline __m128d _mm_castps_pd   (__m128  x) { return reinterpret_cast<__m128d&>(x); }
+static inline __m128i _mm_castps_si128(__m128  x) { return reinterpret_cast<__m128i&>(x); }
+static inline __m128  _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x);  }
+static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
+#endif
+
+#endif // EIGEN_PACKET_MATH_SSE_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h b/src/3rdparty/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h
new file mode 100644
index 000000000..d2a0037e0
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -0,0 +1,142 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_SSE_H
+#define EIGEN_TYPE_CASTING_SSE_H
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_VECTORIZE_AVX
+template <>
+struct type_casting_traits<float, int> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template <>
+struct type_casting_traits<int, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template <>
+struct type_casting_traits<double, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 2,
+    TgtCoeffRatio = 1
+  };
+};
+
+template <>
+struct type_casting_traits<float, double> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 2
+  };
+};
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+  return _mm_cvttps_epi32(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+  return _mm_cvtepi32_ps(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
+  return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
+  // Simply discard the second half of the input
+  return _mm_cvtps_pd(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
+  return _mm_castps_si128(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
+  return _mm_castsi128_ps(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d,Packet4i>(const Packet4i& a) {
+  return _mm_castsi128_pd(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet2d>(const Packet2d& a) {
+  return _mm_castpd_si128(a);
+}
+
+// Disable the following code since it's broken on too many platforms / compilers.
+//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
+#if 0
+
+template <>
+struct type_casting_traits<Eigen::half, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4h, Packet4f>(const Packet4h& a) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  Eigen::half h = raw_uint16_to_half(static_cast<unsigned short>(a64));
+  float f1 = static_cast<float>(h);
+  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  float f2 = static_cast<float>(h);
+  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  float f3 = static_cast<float>(h);
+  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  float f4 = static_cast<float>(h);
+  return _mm_set_ps(f4, f3, f2, f1);
+}
+
+template <>
+struct type_casting_traits<float, Eigen::half> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f& a) {
+  EIGEN_ALIGN16 float aux[4];
+  pstore(aux, a);
+  Eigen::half h0(aux[0]);
+  Eigen::half h1(aux[1]);
+  Eigen::half h2(aux[2]);
+  Eigen::half h3(aux[3]);
+
+  Packet4h result;
+  result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x);
+  return result;
+}
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_SSE_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h b/src/3rdparty/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h
new file mode 100644
index 000000000..b139ea2e4
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h
@@ -0,0 +1,44 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020, Arm Limited and Contributors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_SVE_H
+#define EIGEN_MATH_FUNCTIONS_SVE_H
+
+namespace Eigen {
+namespace internal {
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf pexp<PacketXf>(const PacketXf& x) {
+  return pexp_float(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf plog<PacketXf>(const PacketXf& x) {
+  return plog_float(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf psin<PacketXf>(const PacketXf& x) {
+  return psin_float(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf pcos<PacketXf>(const PacketXf& x) {
+  return pcos_float(x);
+}
+
+// Hyperbolic Tangent function.
+template <>
+EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf ptanh<PacketXf>(const PacketXf& x) {
+  return internal::generic_fast_tanh_float(x);
+}
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_SVE_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/SVE/PacketMath.h b/src/3rdparty/eigen/Eigen/src/Core/arch/SVE/PacketMath.h
new file mode 100644
index 000000000..9060b372f
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/SVE/PacketMath.h
@@ -0,0 +1,752 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020, Arm Limited and Contributors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_SVE_H
+#define EIGEN_PACKET_MATH_SVE_H
+
+namespace Eigen
+{
+namespace internal
+{
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+
+template <typename Scalar, int SVEVectorLength>
+struct sve_packet_size_selector {
+  enum { size = SVEVectorLength / (sizeof(Scalar) * CHAR_BIT) };
+};
+
+/********************************* int32 **************************************/
+typedef svint32_t PacketXi __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL)));
+
+template <>
+struct packet_traits<numext::int32_t> : default_packet_traits {
+  typedef PacketXi type;
+  typedef PacketXi half;  // Half not implemented yet
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = sve_packet_size_selector<numext::int32_t, EIGEN_ARM64_SVE_VL>::size,
+    HasHalfPacket = 0,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0  // Not implemented in SVE
+  };
+};
+
+template <>
+struct unpacket_traits<PacketXi> {
+  typedef numext::int32_t type;
+  typedef PacketXi half;  // Half not yet implemented
+  enum {
+    size = sve_packet_size_selector<numext::int32_t, EIGEN_ARM64_SVE_VL>::size,
+    alignment = Aligned64,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<numext::int32_t>(const numext::int32_t* addr)
+{
+  svprfw(svptrue_b32(), addr, SV_PLDL1KEEP);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pset1<PacketXi>(const numext::int32_t& from)
+{
+  return svdup_n_s32(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi plset<PacketXi>(const numext::int32_t& a)
+{
+  numext::int32_t c[packet_traits<numext::int32_t>::size];
+  for (int i = 0; i < packet_traits<numext::int32_t>::size; i++) c[i] = i;
+  return svadd_s32_z(svptrue_b32(), pset1<PacketXi>(a), svld1_s32(svptrue_b32(), c));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi padd<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svadd_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi psub<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svsub_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pnegate(const PacketXi& a)
+{
+  return svneg_s32_z(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pconj(const PacketXi& a)
+{
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pmul<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svmul_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pdiv<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svdiv_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c)
+{
+  return svmla_s32_z(svptrue_b32(), c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pmin<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svmin_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pmax<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svmax_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pcmp_le<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svdup_n_s32_z(svcmplt_s32(svptrue_b32(), a, b), 0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pcmp_lt<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svdup_n_s32_z(svcmplt_s32(svptrue_b32(), a, b), 0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pcmp_eq<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svdup_n_s32_z(svcmpeq_s32(svptrue_b32(), a, b), 0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi ptrue<PacketXi>(const PacketXi& /*a*/)
+{
+  return svdup_n_s32_z(svptrue_b32(), 0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pzero<PacketXi>(const PacketXi& /*a*/)
+{
+  return svdup_n_s32_z(svptrue_b32(), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pand<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svand_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi por<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svorr_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pxor<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return sveor_s32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pandnot<PacketXi>(const PacketXi& a, const PacketXi& b)
+{
+  return svbic_s32_z(svptrue_b32(), a, b);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE PacketXi parithmetic_shift_right(PacketXi a)
+{
+  return svasrd_n_s32_z(svptrue_b32(), a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE PacketXi plogical_shift_right(PacketXi a)
+{
+  return svreinterpret_s32_u32(svlsr_u32_z(svptrue_b32(), svreinterpret_u32_s32(a), svdup_n_u32_z(svptrue_b32(), N)));
+}
+
+template <int N>
+EIGEN_STRONG_INLINE PacketXi plogical_shift_left(PacketXi a)
+{
+  return svlsl_s32_z(svptrue_b32(), a, svdup_n_u32_z(svptrue_b32(), N));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pload<PacketXi>(const numext::int32_t* from)
+{
+  EIGEN_DEBUG_ALIGNED_LOAD return svld1_s32(svptrue_b32(), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi ploadu<PacketXi>(const numext::int32_t* from)
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD return svld1_s32(svptrue_b32(), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi ploaddup<PacketXi>(const numext::int32_t* from)
+{
+  svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
+  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
+  return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi ploadquad<PacketXi>(const numext::int32_t* from)
+{
+  svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
+  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
+  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a0, a0, a1, a1, a1, a1, ...}
+  return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<numext::int32_t>(numext::int32_t* to, const PacketXi& from)
+{
+  EIGEN_DEBUG_ALIGNED_STORE svst1_s32(svptrue_b32(), to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<numext::int32_t>(numext::int32_t* to, const PacketXi& from)
+{
+  EIGEN_DEBUG_UNALIGNED_STORE svst1_s32(svptrue_b32(), to, from);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline PacketXi pgather<numext::int32_t, PacketXi>(const numext::int32_t* from, Index stride)
+{
+  // Indice format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
+  svint32_t indices = svindex_s32(0, stride);
+  return svld1_gather_s32index_s32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<numext::int32_t, PacketXi>(numext::int32_t* to, const PacketXi& from, Index stride)
+{
+  // Indice format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
+  svint32_t indices = svindex_s32(0, stride);
+  svst1_scatter_s32index_s32(svptrue_b32(), to, indices, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t pfirst<PacketXi>(const PacketXi& a)
+{
+  // svlasta returns the first element if all predicate bits are 0
+  return svlasta_s32(svpfalse_b(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi preverse(const PacketXi& a)
+{
+  return svrev_s32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pabs(const PacketXi& a)
+{
+  return svabs_s32_z(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux<PacketXi>(const PacketXi& a)
+{
+  return static_cast<numext::int32_t>(svaddv_s32(svptrue_b32(), a));
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux_mul<PacketXi>(const PacketXi& a)
+{
+  EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0),
+                      EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
+
+  // Multiply the vector by its reverse
+  svint32_t prod = svmul_s32_z(svptrue_b32(), a, svrev_s32(a));
+  svint32_t half_prod;
+
+  // Extract the high half of the vector. Depending on the VL more reductions need to be done
+  if (EIGEN_ARM64_SVE_VL >= 2048) {
+    half_prod = svtbl_s32(prod, svindex_u32(32, 1));
+    prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
+  }
+  if (EIGEN_ARM64_SVE_VL >= 1024) {
+    half_prod = svtbl_s32(prod, svindex_u32(16, 1));
+    prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
+  }
+  if (EIGEN_ARM64_SVE_VL >= 512) {
+    half_prod = svtbl_s32(prod, svindex_u32(8, 1));
+    prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
+  }
+  if (EIGEN_ARM64_SVE_VL >= 256) {
+    half_prod = svtbl_s32(prod, svindex_u32(4, 1));
+    prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
+  }
+  // Last reduction
+  half_prod = svtbl_s32(prod, svindex_u32(2, 1));
+  prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
+
+  // The reduction is done to the first element.
+  return pfirst<PacketXi>(prod);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux_min<PacketXi>(const PacketXi& a)
+{
+  return svminv_s32(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux_max<PacketXi>(const PacketXi& a)
+{
+  return svmaxv_s32(svptrue_b32(), a);
+}
+
+template <int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXi, N>& kernel) {
+  int buffer[packet_traits<numext::int32_t>::size * N] = {0};
+  int i = 0;
+
+  PacketXi stride_index = svindex_s32(0, N);
+
+  for (i = 0; i < N; i++) {
+    svst1_scatter_s32index_s32(svptrue_b32(), buffer + i, stride_index, kernel.packet[i]);
+  }
+  for (i = 0; i < N; i++) {
+    kernel.packet[i] = svld1_s32(svptrue_b32(), buffer + i * packet_traits<numext::int32_t>::size);
+  }
+}
+
+/********************************* float32 ************************************/
+
+typedef svfloat32_t PacketXf __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL)));
+
+template <>
+struct packet_traits<float> : default_packet_traits {
+  typedef PacketXf type;
+  typedef PacketXf half;
+
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = sve_packet_size_selector<float, EIGEN_ARM64_SVE_VL>::size,
+    HasHalfPacket = 0,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0,  // Not implemented in SVE
+
+    HasDiv = 1,
+    HasFloor = 1,
+
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 0,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH
+  };
+};
+
+template <>
+struct unpacket_traits<PacketXf> {
+  typedef float type;
+  typedef PacketXf half;  // Half not yet implemented
+  typedef PacketXi integer_packet;
+
+  enum {
+    size = sve_packet_size_selector<float, EIGEN_ARM64_SVE_VL>::size,
+    alignment = Aligned64,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pset1<PacketXf>(const float& from)
+{
+  return svdup_n_f32(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pset1frombits<PacketXf>(numext::uint32_t from)
+{
+  return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), from));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(const float& a)
+{
+  float c[packet_traits<float>::size];
+  for (int i = 0; i < packet_traits<float>::size; i++) c[i] = i;
+  return svadd_f32_z(svptrue_b32(), pset1<PacketXf>(a), svld1_f32(svptrue_b32(), c));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf padd<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svadd_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf psub<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svsub_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pnegate(const PacketXf& a)
+{
+  return svneg_f32_z(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pconj(const PacketXf& a)
+{
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmul<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svmul_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pdiv<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svdiv_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c)
+{
+  return svmla_f32_z(svptrue_b32(), c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmin<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svmin_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmin<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return pmin<PacketXf>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmin<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svminnm_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmax<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svmax_f32_z(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmax<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return pmax<PacketXf>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmax<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svmaxnm_f32_z(svptrue_b32(), a, b);
+}
+
+// Float comparisons in SVE return svbool (predicate). Use svdup to set active
+// lanes to 1 (0xffffffffu) and inactive lanes to 0.
+template <>
+EIGEN_STRONG_INLINE PacketXf pcmp_le<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svreinterpret_f32_u32(svdup_n_u32_z(svcmplt_f32(svptrue_b32(), a, b), 0xffffffffu));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pcmp_lt<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svreinterpret_f32_u32(svdup_n_u32_z(svcmplt_f32(svptrue_b32(), a, b), 0xffffffffu));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pcmp_eq<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svreinterpret_f32_u32(svdup_n_u32_z(svcmpeq_f32(svptrue_b32(), a, b), 0xffffffffu));
+}
+
+// Do a predicate inverse (svnot_b_z) on the predicate resulted from the
+// greater/equal comparison (svcmpge_f32). Then fill a float vector with the
+// active elements.
+template <>
+EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svreinterpret_f32_u32(svdup_n_u32_z(svnot_b_z(svptrue_b32(), svcmpge_f32(svptrue_b32(), a, b)), 0xffffffffu));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pfloor<PacketXf>(const PacketXf& a)
+{
+  return svrintm_f32_z(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ptrue<PacketXf>(const PacketXf& /*a*/)
+{
+  return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), 0xffffffffu));
+}
+
+// Logical Operations are not supported for float, so reinterpret casts
+template <>
+EIGEN_STRONG_INLINE PacketXf pand<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svreinterpret_f32_u32(svand_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf por<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svreinterpret_f32_u32(svorr_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pxor<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svreinterpret_f32_u32(sveor_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pandnot<PacketXf>(const PacketXf& a, const PacketXf& b)
+{
+  return svreinterpret_f32_u32(svbic_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pload<PacketXf>(const float* from)
+{
+  EIGEN_DEBUG_ALIGNED_LOAD return svld1_f32(svptrue_b32(), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ploadu<PacketXf>(const float* from)
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD return svld1_f32(svptrue_b32(), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(const float* from)
+{
+  svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
+  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
+  return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(const float* from)
+{
+  svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
+  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
+  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a0, a0, a1, a1, a1, a1, ...}
+  return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const PacketXf& from)
+{
+  EIGEN_DEBUG_ALIGNED_STORE svst1_f32(svptrue_b32(), to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const PacketXf& from)
+{
+  EIGEN_DEBUG_UNALIGNED_STORE svst1_f32(svptrue_b32(), to, from);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline PacketXf pgather<float, PacketXf>(const float* from, Index stride)
+{
+  // Indice format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
+  svint32_t indices = svindex_s32(0, stride);
+  return svld1_gather_s32index_f32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, PacketXf>(float* to, const PacketXf& from, Index stride)
+{
+  // Indice format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
+  svint32_t indices = svindex_s32(0, stride);
+  svst1_scatter_s32index_f32(svptrue_b32(), to, indices, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE float pfirst<PacketXf>(const PacketXf& a)
+{
+  // svlasta returns the first element if all predicate bits are 0
+  return svlasta_f32(svpfalse_b(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf preverse(const PacketXf& a)
+{
+  return svrev_f32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pabs(const PacketXf& a)
+{
+  return svabs_f32_z(svptrue_b32(), a);
+}
+
+// TODO(tellenbach): Should this go into MathFunctions.h? If so, change for 
+// all vector extensions and the generic version.
+template <>
+EIGEN_STRONG_INLINE PacketXf pfrexp<PacketXf>(const PacketXf& a, PacketXf& exponent)
+{
+  return pfrexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a)
+{
+  return svaddv_f32(svptrue_b32(), a);
+}
+
+// Other reduction functions:
+// mul
+// Only works for SVE Vls multiple of 128
+template <>
+EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a)
+{
+  EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0),
+                      EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
+  // Multiply the vector by its reverse
+  svfloat32_t prod = svmul_f32_z(svptrue_b32(), a, svrev_f32(a));
+  svfloat32_t half_prod;
+
+  // Extract the high half of the vector. Depending on the VL more reductions need to be done
+  if (EIGEN_ARM64_SVE_VL >= 2048) {
+    half_prod = svtbl_f32(prod, svindex_u32(32, 1));
+    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
+  }
+  if (EIGEN_ARM64_SVE_VL >= 1024) {
+    half_prod = svtbl_f32(prod, svindex_u32(16, 1));
+    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
+  }
+  if (EIGEN_ARM64_SVE_VL >= 512) {
+    half_prod = svtbl_f32(prod, svindex_u32(8, 1));
+    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
+  }
+  if (EIGEN_ARM64_SVE_VL >= 256) {
+    half_prod = svtbl_f32(prod, svindex_u32(4, 1));
+    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
+  }
+  // Last reduction
+  half_prod = svtbl_f32(prod, svindex_u32(2, 1));
+  prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
+
+  // The reduction is done to the first element.
+  return pfirst<PacketXf>(prod);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<PacketXf>(const PacketXf& a)
+{
+  return svminv_f32(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<PacketXf>(const PacketXf& a)
+{
+  return svmaxv_f32(svptrue_b32(), a);
+}
+
+template<int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXf, N>& kernel)
+{
+  float buffer[packet_traits<float>::size * N] = {0};
+  int i = 0;
+
+  PacketXi stride_index = svindex_s32(0, N);
+
+  for (i = 0; i < N; i++) {
+    svst1_scatter_s32index_f32(svptrue_b32(), buffer + i, stride_index, kernel.packet[i]);
+  }
+
+  for (i = 0; i < N; i++) {
+    kernel.packet[i] = svld1_f32(svptrue_b32(), buffer + i * packet_traits<float>::size);
+  }
+}
+
+template<>
+EIGEN_STRONG_INLINE PacketXf pldexp<PacketXf>(const PacketXf& a, const PacketXf& exponent)
+{
+  return pldexp_generic(a, exponent);
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_PACKET_MATH_SVE_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h b/src/3rdparty/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h
new file mode 100644
index 000000000..7ba5d9cd1
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h
@@ -0,0 +1,49 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020, Arm Limited and Contributors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_SVE_H
+#define EIGEN_TYPE_CASTING_SVE_H
+
+namespace Eigen {
+namespace internal {
+
+template <>
+struct type_casting_traits<float, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+struct type_casting_traits<numext::int32_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pcast<PacketXi, PacketXf>(const PacketXi& a) {
+  return svcvt_f32_s32_z(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pcast<PacketXf, PacketXi>(const PacketXf& a) {
+  return svcvt_s32_f32_z(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf preinterpret<PacketXf, PacketXi>(const PacketXi& a) {
+  return svreinterpret_f32_s32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi preinterpret<PacketXi, PacketXf>(const PacketXf& a) {
+  return svreinterpret_s32_f32(a);
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_SVE_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/src/3rdparty/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h
new file mode 100644
index 000000000..10856ff5e
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h
@@ -0,0 +1,232 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * InteropHeaders.h
+ *
+ * \brief:
+ *  InteropHeaders
+ *
+ *****************************************************************/
+
+#ifndef EIGEN_INTEROP_HEADERS_SYCL_H
+#define EIGEN_INTEROP_HEADERS_SYCL_H
+
+namespace Eigen {
+
+#if !defined(EIGEN_DONT_VECTORIZE_SYCL)
+
+namespace internal {
+
+template <int has_blend, int lengths>
+struct sycl_packet_traits : default_packet_traits {
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = lengths,
+    HasHalfPacket = 0,
+    HasDiv = 1,
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasSin = 1,
+    HasCos = 1,
+    HasTan = 1,
+    HasASin = 1,
+    HasACos = 1,
+    HasATan = 1,
+    HasSinh = 1,
+    HasCosh = 1,
+    HasTanh = 1,
+    HasLGamma = 0,
+    HasDiGamma = 0,
+    HasZeta = 0,
+    HasPolygamma = 0,
+    HasErf = 0,
+    HasErfc = 0,
+    HasNdtri = 0,
+    HasIGamma = 0,
+    HasIGammac = 0,
+    HasBetaInc = 0,
+    HasBlend = has_blend,
+    // This flag is used to indicate whether packet comparison is supported.
+    // pcmp_eq, pcmp_lt and pcmp_le should be defined for it to be true.
+    HasCmp = 1,
+    HasMax = 1,
+    HasMin = 1,
+    HasMul = 1,
+    HasAdd = 1,
+    HasFloor = 1,
+    HasRound = 1,
+    HasRint = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
+    HasCeil = 1,
+  };
+};
+
+#ifdef SYCL_DEVICE_ONLY
+#define SYCL_PACKET_TRAITS(packet_type, has_blend, unpacket_type, lengths) \
+  template <>                                                              \
+  struct packet_traits<unpacket_type>                                      \
+      : sycl_packet_traits<has_blend, lengths> {                           \
+    typedef packet_type type;                                              \
+    typedef packet_type half;                                              \
+  };
+
+SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4)
+SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, const float, 4)
+SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, double, 2)
+SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, const double, 2)
+#undef SYCL_PACKET_TRAITS
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#define SYCL_ARITHMETIC(packet_type)  \
+  template <>                         \
+  struct is_arithmetic<packet_type> { \
+    enum { value = true };            \
+  };
+SYCL_ARITHMETIC(cl::sycl::cl_float4)
+SYCL_ARITHMETIC(cl::sycl::cl_double2)
+#undef SYCL_ARITHMETIC
+
+#define SYCL_UNPACKET_TRAITS(packet_type, unpacket_type, lengths)        \
+  template <>                                                            \
+  struct unpacket_traits<packet_type> {                                  \
+    typedef unpacket_type type;                                          \
+    enum { size = lengths, vectorizable = true, alignment = Aligned16 }; \
+    typedef packet_type half;                                            \
+  };
+SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4)
+SYCL_UNPACKET_TRAITS(cl::sycl::cl_double2, double, 2)
+
+#undef SYCL_UNPACKET_TRAITS
+#endif
+
+}  // end namespace internal
+
+#endif
+
+namespace TensorSycl {
+namespace internal {
+
+template <typename PacketReturnType, int PacketSize>
+struct PacketWrapper;
+// This function should never get called on the device
+#ifndef SYCL_DEVICE_ONLY
+template <typename PacketReturnType, int PacketSize>
+struct PacketWrapper {
+  typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type
+      Scalar;
+  template <typename Index>
+  EIGEN_DEVICE_FUNC static Scalar scalarize(Index, PacketReturnType &) {
+    eigen_assert(false && "THERE IS NO PACKETIZE VERSION FOR  THE CHOSEN TYPE");
+    abort();
+  }
+  EIGEN_DEVICE_FUNC static PacketReturnType convert_to_packet_type(Scalar in,
+                                                                   Scalar) {
+    return ::Eigen::internal::template plset<PacketReturnType>(in);
+  }
+  EIGEN_DEVICE_FUNC static void set_packet(PacketReturnType, Scalar *) {
+    eigen_assert(false && "THERE IS NO PACKETIZE VERSION FOR  THE CHOSEN TYPE");
+    abort();
+  }
+};
+
+#elif defined(SYCL_DEVICE_ONLY)
+template <typename PacketReturnType>
+struct PacketWrapper<PacketReturnType, 4> {
+  typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type
+      Scalar;
+  template <typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType &in) {
+    switch (index) {
+      case 0:
+        return in.x();
+      case 1:
+        return in.y();
+      case 2:
+        return in.z();
+      case 3:
+        return in.w();
+      default:
+      //INDEX MUST BE BETWEEN 0 and 3.There is no abort function in SYCL kernel. so we cannot use abort here. 
+      // The code will never reach here
+      __builtin_unreachable();
+    }
+    __builtin_unreachable();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(
+      Scalar in, Scalar other) {
+    return PacketReturnType(in, other, other, other);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) {
+    lhs = PacketReturnType(rhs[0], rhs[1], rhs[2], rhs[3]);
+  }
+};
+
+template <typename PacketReturnType>
+struct PacketWrapper<PacketReturnType, 1> {
+  typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type
+      Scalar;
+  template <typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index, PacketReturnType &in) {
+    return in;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(Scalar in,
+                                                                   Scalar) {
+    return PacketReturnType(in);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) {
+    lhs = rhs[0];
+  }
+};
+
+template <typename PacketReturnType>
+struct PacketWrapper<PacketReturnType, 2> {
+  typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type
+      Scalar;
+  template <typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType &in) {
+    switch (index) {
+      case 0:
+        return in.x();
+      case 1:
+        return in.y();
+      default:
+        //INDEX MUST BE BETWEEN 0 and 1.There is no abort function in SYCL kernel. so we cannot use abort here. 
+      // The code will never reach here
+        __builtin_unreachable();
+    }
+    __builtin_unreachable();
+  }
+  
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(
+      Scalar in, Scalar other) {
+    return PacketReturnType(in, other);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) {
+    lhs = PacketReturnType(rhs[0], rhs[1]);
+  }
+};
+
+#endif
+
+}  // end namespace internal
+}  // end namespace TensorSycl
+}  // end namespace Eigen
+
+#endif  // EIGEN_INTEROP_HEADERS_SYCL_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h b/src/3rdparty/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h
new file mode 100644
index 000000000..2ab0f2a76
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h
@@ -0,0 +1,301 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * MathFunctions.h
+ *
+ * \brief:
+ *  MathFunctions
+ *
+ *****************************************************************/
+
+#ifndef EIGEN_MATH_FUNCTIONS_SYCL_H
+#define EIGEN_MATH_FUNCTIONS_SYCL_H
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(SYCL_DEVICE_ONLY)
+#define SYCL_PLOG(packet_type)                                         \
+  template <>                                                          \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog<packet_type>( \
+      const packet_type& a) {                                          \
+    return cl::sycl::log(a);                                           \
+  }
+
+SYCL_PLOG(cl::sycl::cl_float4)
+SYCL_PLOG(cl::sycl::cl_double2)
+#undef SYCL_PLOG
+
+#define SYCL_PLOG1P(packet_type)                                         \
+  template <>                                                            \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog1p<packet_type>( \
+      const packet_type& a) {                                            \
+    return cl::sycl::log1p(a);                                           \
+  }
+
+SYCL_PLOG1P(cl::sycl::cl_float4)
+SYCL_PLOG1P(cl::sycl::cl_double2)
+#undef SYCL_PLOG1P
+
+#define SYCL_PLOG10(packet_type)                                         \
+  template <>                                                            \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog10<packet_type>( \
+      const packet_type& a) {                                            \
+    return cl::sycl::log10(a);                                           \
+  }
+
+SYCL_PLOG10(cl::sycl::cl_float4)
+SYCL_PLOG10(cl::sycl::cl_double2)
+#undef SYCL_PLOG10
+
+#define SYCL_PEXP(packet_type)                                         \
+  template <>                                                          \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pexp<packet_type>( \
+      const packet_type& a) {                                          \
+    return cl::sycl::exp(a);                                           \
+  }
+
+SYCL_PEXP(cl::sycl::cl_float4)
+SYCL_PEXP(cl::sycl::cl_float)
+SYCL_PEXP(cl::sycl::cl_double2)
+#undef SYCL_PEXP
+
+#define SYCL_PEXPM1(packet_type)                                         \
+  template <>                                                            \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pexpm1<packet_type>( \
+      const packet_type& a) {                                            \
+    return cl::sycl::expm1(a);                                           \
+  }
+
+SYCL_PEXPM1(cl::sycl::cl_float4)
+SYCL_PEXPM1(cl::sycl::cl_double2)
+#undef SYCL_PEXPM1
+
+#define SYCL_PSQRT(packet_type)                                         \
+  template <>                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psqrt<packet_type>( \
+      const packet_type& a) {                                           \
+    return cl::sycl::sqrt(a);                                           \
+  }
+
+SYCL_PSQRT(cl::sycl::cl_float4)
+SYCL_PSQRT(cl::sycl::cl_double2)
+#undef SYCL_PSQRT
+
+#define SYCL_PRSQRT(packet_type)                                         \
+  template <>                                                            \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type prsqrt<packet_type>( \
+      const packet_type& a) {                                            \
+    return cl::sycl::rsqrt(a);                                           \
+  }
+
+SYCL_PRSQRT(cl::sycl::cl_float4)
+SYCL_PRSQRT(cl::sycl::cl_double2)
+#undef SYCL_PRSQRT
+
+/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
+#define SYCL_PSIN(packet_type)                                         \
+  template <>                                                          \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psin<packet_type>( \
+      const packet_type& a) {                                          \
+    return cl::sycl::sin(a);                                           \
+  }
+
+SYCL_PSIN(cl::sycl::cl_float4)
+SYCL_PSIN(cl::sycl::cl_double2)
+#undef SYCL_PSIN
+
+/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
+#define SYCL_PCOS(packet_type)                                         \
+  template <>                                                          \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pcos<packet_type>( \
+      const packet_type& a) {                                          \
+    return cl::sycl::cos(a);                                           \
+  }
+
+SYCL_PCOS(cl::sycl::cl_float4)
+SYCL_PCOS(cl::sycl::cl_double2)
+#undef SYCL_PCOS
+
+/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
+#define SYCL_PTAN(packet_type)                                         \
+  template <>                                                          \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ptan<packet_type>( \
+      const packet_type& a) {                                          \
+    return cl::sycl::tan(a);                                           \
+  }
+
+SYCL_PTAN(cl::sycl::cl_float4)
+SYCL_PTAN(cl::sycl::cl_double2)
+#undef SYCL_PTAN
+
+/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
+#define SYCL_PASIN(packet_type)                                         \
+  template <>                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pasin<packet_type>( \
+      const packet_type& a) {                                           \
+    return cl::sycl::asin(a);                                           \
+  }
+
+SYCL_PASIN(cl::sycl::cl_float4)
+SYCL_PASIN(cl::sycl::cl_double2)
+#undef SYCL_PASIN
+
+/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
+#define SYCL_PACOS(packet_type)                                         \
+  template <>                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pacos<packet_type>( \
+      const packet_type& a) {                                           \
+    return cl::sycl::acos(a);                                           \
+  }
+
+SYCL_PACOS(cl::sycl::cl_float4)
+SYCL_PACOS(cl::sycl::cl_double2)
+#undef SYCL_PACOS
+
+/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
+#define SYCL_PATAN(packet_type)                                         \
+  template <>                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type patan<packet_type>( \
+      const packet_type& a) {                                           \
+    return cl::sycl::atan(a);                                           \
+  }
+
+SYCL_PATAN(cl::sycl::cl_float4)
+SYCL_PATAN(cl::sycl::cl_double2)
+#undef SYCL_PATAN
+
+/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
+#define SYCL_PSINH(packet_type)                                         \
+  template <>                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psinh<packet_type>( \
+      const packet_type& a) {                                           \
+    return cl::sycl::sinh(a);                                           \
+  }
+
+SYCL_PSINH(cl::sycl::cl_float4)
+SYCL_PSINH(cl::sycl::cl_double2)
+#undef SYCL_PSINH
+
+/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
+#define SYCL_PCOSH(packet_type)                                         \
+  template <>                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pcosh<packet_type>( \
+      const packet_type& a) {                                           \
+    return cl::sycl::cosh(a);                                           \
+  }
+
+SYCL_PCOSH(cl::sycl::cl_float4)
+SYCL_PCOSH(cl::sycl::cl_double2)
+#undef SYCL_PCOSH
+
+/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
+#define SYCL_PTANH(packet_type)                                         \
+  template <>                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ptanh<packet_type>( \
+      const packet_type& a) {                                           \
+    return cl::sycl::tanh(a);                                           \
+  }
+
+SYCL_PTANH(cl::sycl::cl_float4)
+SYCL_PTANH(cl::sycl::cl_double2)
+#undef SYCL_PTANH
+
+#define SYCL_PCEIL(packet_type)                                         \
+  template <>                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pceil<packet_type>( \
+      const packet_type& a) {                                           \
+    return cl::sycl::ceil(a);                                           \
+  }
+
+SYCL_PCEIL(cl::sycl::cl_float4)
+SYCL_PCEIL(cl::sycl::cl_double2)
+#undef SYCL_PCEIL
+
+#define SYCL_PROUND(packet_type)                                         \
+  template <>                                                            \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pround<packet_type>( \
+      const packet_type& a) {                                            \
+    return cl::sycl::round(a);                                           \
+  }
+
+SYCL_PROUND(cl::sycl::cl_float4)
+SYCL_PROUND(cl::sycl::cl_double2)
+#undef SYCL_PROUND
+
+#define SYCL_PRINT(packet_type)                                         \
+  template <>                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type print<packet_type>( \
+      const packet_type& a) {                                           \
+    return cl::sycl::rint(a);                                           \
+  }
+
+SYCL_PRINT(cl::sycl::cl_float4)
+SYCL_PRINT(cl::sycl::cl_double2)
+#undef SYCL_PRINT
+
+#define SYCL_FLOOR(packet_type)                                          \
+  template <>                                                            \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pfloor<packet_type>( \
+      const packet_type& a) {                                            \
+    return cl::sycl::floor(a);                                           \
+  }
+
+SYCL_FLOOR(cl::sycl::cl_float4)
+SYCL_FLOOR(cl::sycl::cl_double2)
+#undef SYCL_FLOOR
+
+#define SYCL_PMIN(packet_type, expr)                                   \
+  template <>                                                          \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pmin<packet_type>( \
+      const packet_type& a, const packet_type& b) {                    \
+    return expr;                                                       \
+  }
+
+SYCL_PMIN(cl::sycl::cl_float4, cl::sycl::fmin(a, b))
+SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
+#undef SYCL_PMIN
+
+#define SYCL_PMAX(packet_type, expr)                                   \
+  template <>                                                          \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pmax<packet_type>( \
+      const packet_type& a, const packet_type& b) {                    \
+    return expr;                                                       \
+  }
+
+SYCL_PMAX(cl::sycl::cl_float4, cl::sycl::fmax(a, b))
+SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
+#undef SYCL_PMAX
+
+#define SYCL_PLDEXP(packet_type)                                             \
+  template <>                                                                \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pldexp(                  \
+      const packet_type& a, const packet_type& exponent) {                   \
+    return cl::sycl::ldexp(                                                  \
+        a, exponent.template convert<cl::sycl::cl_int,                       \
+                                     cl::sycl::rounding_mode::automatic>()); \
+  }
+
+SYCL_PLDEXP(cl::sycl::cl_float4)
+SYCL_PLDEXP(cl::sycl::cl_double2)
+#undef SYCL_PLDEXP
+
+#endif
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_SYCL_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h b/src/3rdparty/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h
new file mode 100644
index 000000000..87badc076
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h
@@ -0,0 +1,670 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * PacketMath.h
+ *
+ * \brief:
+ *  PacketMath
+ *
+ *****************************************************************/
+
+#ifndef EIGEN_PACKET_MATH_SYCL_H
+#define EIGEN_PACKET_MATH_SYCL_H
+#include <type_traits>
+namespace Eigen {
+
+namespace internal {
+#ifdef SYCL_DEVICE_ONLY
+
+#define SYCL_PLOADT_RO(address_space_target)                                 \
+  template <typename packet_type, int Alignment>                             \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt_ro(               \
+      typename cl::sycl::multi_ptr<                                          \
+          const typename unpacket_traits<packet_type>::type,                 \
+          cl::sycl::access::address_space::address_space_target>::pointer_t  \
+          from) {                                                            \
+    typedef typename unpacket_traits<packet_type>::type scalar;              \
+    typedef cl::sycl::multi_ptr<                                             \
+        scalar, cl::sycl::access::address_space::address_space_target>       \
+        multi_ptr;                                                           \
+    auto res = packet_type(                                                  \
+        static_cast<typename unpacket_traits<packet_type>::type>(0));        \
+    res.load(0, multi_ptr(const_cast<typename multi_ptr::pointer_t>(from))); \
+    return res;                                                              \
+  }
+
+SYCL_PLOADT_RO(global_space)
+SYCL_PLOADT_RO(local_space)
+#undef SYCL_PLOADT_RO
+#endif
+
+template <typename packet_type, int Alignment, typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
+ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess<
+          cl::sycl::access::mode::read_write, T>& from) {
+  return ploadt_ro<packet_type, Alignment>(from.get_pointer());
+}
+
+#ifdef SYCL_DEVICE_ONLY
+#define SYCL_PLOAD(address_space_target, Alignment, AlignedType)            \
+  template <typename packet_type>                                           \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType(     \
+      typename cl::sycl::multi_ptr<                                         \
+          const typename unpacket_traits<packet_type>::type,                \
+          cl::sycl::access::address_space::address_space_target>::pointer_t \
+          from) {                                                           \
+    return ploadt_ro<packet_type, Alignment>(from);                         \
+  }
+
+// global space
+SYCL_PLOAD(global_space, Unaligned, u)
+SYCL_PLOAD(global_space, Aligned, )
+// local space
+SYCL_PLOAD(local_space, Unaligned, u)
+SYCL_PLOAD(local_space, Aligned, )
+
+#undef SYCL_PLOAD
+#endif
+
+#define SYCL_PLOAD(Alignment, AlignedType)                              \
+  template <typename packet_type>                                       \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \
+      const Eigen::TensorSycl::internal::RangeAccess<                   \
+          cl::sycl::access::mode::read_write,                           \
+          typename unpacket_traits<packet_type>::type>                  \
+          from) {                                                       \
+    return ploadt_ro<packet_type, Alignment>(from);                     \
+  }
+SYCL_PLOAD(Unaligned, u)
+SYCL_PLOAD(Aligned, )
+#undef SYCL_PLOAD
+
+#ifdef SYCL_DEVICE_ONLY
+/** \internal \returns a packet version of \a *from.
+ * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
+#define SYCL_PLOADT(address_space_target)                                   \
+  template <typename packet_type, int Alignment>                            \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt(                 \
+      typename cl::sycl::multi_ptr<                                         \
+          const typename unpacket_traits<packet_type>::type,                \
+          cl::sycl::access::address_space::address_space_target>::pointer_t \
+          from) {                                                           \
+    if (Alignment >= unpacket_traits<packet_type>::alignment)               \
+      return pload<packet_type>(from);                                      \
+    else                                                                    \
+      return ploadu<packet_type>(from);                                     \
+  }
+
+// global space
+SYCL_PLOADT(global_space)
+// local space
+SYCL_PLOADT(local_space)
+#undef SYCL_PLOADT
+#endif
+
+template <typename packet_type, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
+ploadt(const Eigen::TensorSycl::internal::RangeAccess<
+       cl::sycl::access::mode::read_write,
+       typename unpacket_traits<packet_type>::type>& from) {
+  return ploadt<packet_type, Alignment>(from.get_pointer());
+}
+#ifdef SYCL_DEVICE_ONLY
+
+// private_space
+#define SYCL_PLOADT_RO_SPECIAL(packet_type, Alignment)                 \
+  template <>                                                          \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type                    \
+  ploadt_ro<packet_type, Alignment>(                                   \
+      const typename unpacket_traits<packet_type>::type* from) {       \
+    typedef typename unpacket_traits<packet_type>::type scalar;        \
+    auto res = packet_type(static_cast<scalar>(0));                    \
+    res.template load<cl::sycl::access::address_space::private_space>( \
+        0, const_cast<scalar*>(from));                                 \
+    return res;                                                        \
+  }
+
+SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Aligned)
+SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Aligned)
+SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Unaligned)
+SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Unaligned)
+
+#define SYCL_PLOAD_SPECIAL(packet_type, alignment_type)                    \
+  template <>                                                              \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##alignment_type( \
+      const typename unpacket_traits<packet_type>::type* from) {           \
+    typedef typename unpacket_traits<packet_type>::type scalar;            \
+    auto res = packet_type(static_cast<scalar>(0));                        \
+    res.template load<cl::sycl::access::address_space::private_space>(     \
+        0, const_cast<scalar*>(from));                                     \
+    return res;                                                            \
+  }
+SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, )
+SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, )
+SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, u)
+SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, u)
+
+#undef SYCL_PLOAD_SPECIAL
+
+#define SYCL_PSTORE(scalar, packet_type, address_space_target, alignment)   \
+  template <>                                                               \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment(             \
+      typename cl::sycl::multi_ptr<                                         \
+          scalar,                                                           \
+          cl::sycl::access::address_space::address_space_target>::pointer_t \
+          to,                                                               \
+      const packet_type& from) {                                            \
+    typedef cl::sycl::multi_ptr<                                            \
+        scalar, cl::sycl::access::address_space::address_space_target>      \
+        multi_ptr;                                                          \
+    from.store(0, multi_ptr(to));                                           \
+  }
+
+// global space
+SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, )
+SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, u)
+SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, )
+SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, u)
+SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, )
+SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, u)
+SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, )
+SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, u)
+
+SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, )
+SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, u)
+SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, )
+SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, u)
+#undef SYCL_PSTORE
+
+#define SYCL_PSTORE_T(address_space_target)                                 \
+  template <typename scalar, typename packet_type, int Alignment>           \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(                       \
+      typename cl::sycl::multi_ptr<                                         \
+          scalar,                                                           \
+          cl::sycl::access::address_space::address_space_target>::pointer_t \
+          to,                                                               \
+      const packet_type& from) {                                            \
+    if (Alignment)                                                          \
+      pstore(to, from);                                                     \
+    else                                                                    \
+      pstoreu(to, from);                                                    \
+  }
+
+SYCL_PSTORE_T(global_space)
+
+SYCL_PSTORE_T(local_space)
+
+#undef SYCL_PSTORE_T
+
+#define SYCL_PSET1(packet_type)                                         \
+  template <>                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1<packet_type>( \
+      const typename unpacket_traits<packet_type>::type& from) {        \
+    return packet_type(from);                                           \
+  }
+
+// global space
+SYCL_PSET1(cl::sycl::cl_float4)
+SYCL_PSET1(cl::sycl::cl_double2)
+
+#undef SYCL_PSET1
+
+template <typename packet_type>
+struct get_base_packet {
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type
+  get_ploaddup(sycl_multi_pointer) {}
+
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type
+  get_pgather(sycl_multi_pointer, Index) {}
+};
+
+template <>
+struct get_base_packet<cl::sycl::cl_float4> {
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_ploaddup(
+      sycl_multi_pointer from) {
+    return cl::sycl::cl_float4(from[0], from[0], from[1], from[1]);
+  }
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_pgather(
+      sycl_multi_pointer from, Index stride) {
+    return cl::sycl::cl_float4(from[0 * stride], from[1 * stride],
+                               from[2 * stride], from[3 * stride]);
+  }
+
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(
+      sycl_multi_pointer to, const cl::sycl::cl_float4& from, Index stride) {
+    auto tmp = stride;
+    to[0] = from.x();
+    to[tmp] = from.y();
+    to[tmp += stride] = from.z();
+    to[tmp += stride] = from.w();
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 set_plset(
+      const float& a) {
+    return cl::sycl::cl_float4(static_cast<float>(a), static_cast<float>(a + 1),
+                               static_cast<float>(a + 2),
+                               static_cast<float>(a + 3));
+  }
+};
+
+template <>
+struct get_base_packet<cl::sycl::cl_double2> {
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2
+  get_ploaddup(const sycl_multi_pointer from) {
+    return cl::sycl::cl_double2(from[0], from[0]);
+  }
+
+  template <typename sycl_multi_pointer, typename Index>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_pgather(
+      const sycl_multi_pointer from, Index stride) {
+    return cl::sycl::cl_double2(from[0 * stride], from[1 * stride]);
+  }
+
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(
+      sycl_multi_pointer to, const cl::sycl::cl_double2& from, Index stride) {
+    to[0] = from.x();
+    to[stride] = from.y();
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 set_plset(
+      const double& a) {
+    return cl::sycl::cl_double2(static_cast<double>(a),
+                                static_cast<double>(a + 1));
+  }
+};
+
+#define SYCL_PLOAD_DUP(address_space_target)                                \
+  template <typename packet_type>                                           \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup(               \
+      typename cl::sycl::multi_ptr<                                         \
+          const typename unpacket_traits<packet_type>::type,                \
+          cl::sycl::access::address_space::address_space_target>::pointer_t \
+          from) {                                                           \
+    return get_base_packet<packet_type>::get_ploaddup(from);                \
+  }
+
+// global space
+SYCL_PLOAD_DUP(global_space)
+// local_space
+SYCL_PLOAD_DUP(local_space)
+#undef SYCL_PLOAD_DUP
+
+#define SYCL_PLOAD_DUP_SPECILIZE(packet_type)                              \
+  template <>                                                              \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup<packet_type>( \
+      const typename unpacket_traits<packet_type>::type* from) {           \
+    return get_base_packet<packet_type>::get_ploaddup(from);               \
+  }
+
+SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_float4)
+SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2)
+
+#undef SYCL_PLOAD_DUP_SPECILIZE
+
+#define SYCL_PLSET(packet_type)                                         \
+  template <>                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type plset<packet_type>( \
+      const typename unpacket_traits<packet_type>::type& a) {           \
+    return get_base_packet<packet_type>::set_plset(a);                  \
+  }
+
+SYCL_PLSET(cl::sycl::cl_float4)
+SYCL_PLSET(cl::sycl::cl_double2)
+
+#undef SYCL_PLSET
+
+#define SYCL_PGATHER(address_space_target)                                  \
+  template <typename Scalar, typename packet_type>                          \
+  EIGEN_DEVICE_FUNC inline packet_type pgather(                             \
+      typename cl::sycl::multi_ptr<                                         \
+          const typename unpacket_traits<packet_type>::type,                \
+          cl::sycl::access::address_space::address_space_target>::pointer_t \
+          from,                                                             \
+      Index stride) {                                                       \
+    return get_base_packet<packet_type>::get_pgather(from, stride);         \
+  }
+
+// global space
+SYCL_PGATHER(global_space)
+// local space
+SYCL_PGATHER(local_space)
+
+#undef SYCL_PGATHER
+
+#define SYCL_PGATHER_SPECILIZE(scalar, packet_type)                            \
+  template <>                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type                            \
+  pgather<scalar, packet_type>(                                                \
+      const typename unpacket_traits<packet_type>::type* from, Index stride) { \
+    return get_base_packet<packet_type>::get_pgather(from, stride);            \
+  }
+
+SYCL_PGATHER_SPECILIZE(float, cl::sycl::cl_float4)
+SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
+
+#undef SYCL_PGATHER_SPECILIZE
+
+#define SYCL_PSCATTER(address_space_target)                                 \
+  template <typename Scalar, typename packet_type>                          \
+  EIGEN_DEVICE_FUNC inline void pscatter(                                   \
+      typename cl::sycl::multi_ptr<                                         \
+          typename unpacket_traits<packet_type>::type,                      \
+          cl::sycl::access::address_space::address_space_target>::pointer_t \
+          to,                                                               \
+      const packet_type& from, Index stride) {                              \
+    get_base_packet<packet_type>::set_pscatter(to, from, stride);           \
+  }
+
+// global space
+SYCL_PSCATTER(global_space)
+// local space
+SYCL_PSCATTER(local_space)
+
+#undef SYCL_PSCATTER
+
+#define SYCL_PSCATTER_SPECILIZE(scalar, packet_type)                        \
+  template <>                                                               \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<scalar, packet_type>( \
+      typename unpacket_traits<packet_type>::type * to,                     \
+      const packet_type& from, Index stride) {                              \
+    get_base_packet<packet_type>::set_pscatter(to, from, stride);           \
+  }
+
+SYCL_PSCATTER_SPECILIZE(float, cl::sycl::cl_float4)
+SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2)
+
+#undef SYCL_PSCATTER_SPECILIZE
+
+#define SYCL_PMAD(packet_type)                                            \
+  template <>                                                             \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pmadd(                \
+      const packet_type& a, const packet_type& b, const packet_type& c) { \
+    return cl::sycl::mad(a, b, c);                                        \
+  }
+
+SYCL_PMAD(cl::sycl::cl_float4)
+SYCL_PMAD(cl::sycl::cl_double2)
+#undef SYCL_PMAD
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst<cl::sycl::cl_float4>(
+    const cl::sycl::cl_float4& a) {
+  return a.x();
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double pfirst<cl::sycl::cl_double2>(
+    const cl::sycl::cl_double2& a) {
+  return a.x();
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux<cl::sycl::cl_float4>(
+    const cl::sycl::cl_float4& a) {
+  return a.x() + a.y() + a.z() + a.w();
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux<cl::sycl::cl_double2>(
+    const cl::sycl::cl_double2& a) {
+  return a.x() + a.y();
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max<cl::sycl::cl_float4>(
+    const cl::sycl::cl_float4& a) {
+  return cl::sycl::fmax(cl::sycl::fmax(a.x(), a.y()),
+                        cl::sycl::fmax(a.z(), a.w()));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_max<cl::sycl::cl_double2>(
+    const cl::sycl::cl_double2& a) {
+  return cl::sycl::fmax(a.x(), a.y());
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min<cl::sycl::cl_float4>(
+    const cl::sycl::cl_float4& a) {
+  return cl::sycl::fmin(cl::sycl::fmin(a.x(), a.y()),
+                        cl::sycl::fmin(a.z(), a.w()));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_min<cl::sycl::cl_double2>(
+    const cl::sycl::cl_double2& a) {
+  return cl::sycl::fmin(a.x(), a.y());
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul<cl::sycl::cl_float4>(
+    const cl::sycl::cl_float4& a) {
+  return a.x() * a.y() * a.z() * a.w();
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_mul<cl::sycl::cl_double2>(
+    const cl::sycl::cl_double2& a) {
+  return a.x() * a.y();
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
+pabs<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
+  return cl::sycl::cl_float4(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()),
+                             cl::sycl::fabs(a.z()), cl::sycl::fabs(a.w()));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2
+pabs<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
+  return cl::sycl::cl_double2(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()));
+}
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_le(const Packet &a,
+                                                          const Packet &b) {
+  return ((a <= b)
+              .template convert<typename unpacket_traits<Packet>::type,
+                                cl::sycl::rounding_mode::automatic>());
+}
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_lt(const Packet &a,
+                                                          const Packet &b) {
+  return ((a < b)
+              .template convert<typename unpacket_traits<Packet>::type,
+                                cl::sycl::rounding_mode::automatic>());
+}
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_eq(const Packet &a,
+                                                          const Packet &b) {
+  return ((a == b)
+              .template convert<typename unpacket_traits<Packet>::type,
+                                cl::sycl::rounding_mode::automatic>());
+}
+
+#define SYCL_PCMP(OP, TYPE)                                                    \
+  template <>                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TYPE pcmp_##OP<TYPE>(const TYPE &a,    \
+                                                             const TYPE &b) {  \
+    return sycl_pcmp_##OP<TYPE>(a, b);                                         \
+  }
+
+SYCL_PCMP(le, cl::sycl::cl_float4)
+SYCL_PCMP(lt, cl::sycl::cl_float4)
+SYCL_PCMP(eq, cl::sycl::cl_float4)
+SYCL_PCMP(le, cl::sycl::cl_double2)
+SYCL_PCMP(lt, cl::sycl::cl_double2)
+SYCL_PCMP(eq, cl::sycl::cl_double2)
+#undef SYCL_PCMP
+
+template <typename T> struct convert_to_integer;
+
+template <> struct convert_to_integer<float> {
+  using type = std::int32_t;
+  using packet_type = cl::sycl::cl_int4;
+};
+template <> struct convert_to_integer<double> {
+  using type = std::int64_t;
+  using packet_type = cl::sycl::cl_long2;
+};
+
+template <typename PacketIn>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename convert_to_integer<
+    typename unpacket_traits<PacketIn>::type>::packet_type
+vector_as_int(const PacketIn &p) {
+  return (
+      p.template convert<typename convert_to_integer<
+                             typename unpacket_traits<PacketIn>::type>::type,
+                         cl::sycl::rounding_mode::automatic>());
+}
+
+template <typename packetOut, typename PacketIn>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packetOut
+convert_vector(const PacketIn &p) {
+  return (p.template convert<typename unpacket_traits<packetOut>::type,
+                             cl::sycl::rounding_mode::automatic>());
+}
+
+#define SYCL_PAND(TYPE)                                                        \
+  template <>                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE pand<TYPE>(const TYPE &a,         \
+                                                        const TYPE &b) {       \
+    return convert_vector<TYPE>(vector_as_int(a) & vector_as_int(b));          \
+  }
+SYCL_PAND(cl::sycl::cl_float4)
+SYCL_PAND(cl::sycl::cl_double2)
+#undef SYCL_PAND
+
+#define SYCL_POR(TYPE)                                                         \
+  template <>                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE por<TYPE>(const TYPE &a,          \
+                                                       const TYPE &b) {        \
+    return convert_vector<TYPE>(vector_as_int(a) | vector_as_int(b));          \
+  }
+
+SYCL_POR(cl::sycl::cl_float4)
+SYCL_POR(cl::sycl::cl_double2)
+#undef SYCL_POR
+
+#define SYCL_PXOR(TYPE)                                                        \
+  template <>                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE pxor<TYPE>(const TYPE &a,         \
+                                                        const TYPE &b) {       \
+    return convert_vector<TYPE>(vector_as_int(a) ^ vector_as_int(b));          \
+  }
+
+SYCL_PXOR(cl::sycl::cl_float4)
+SYCL_PXOR(cl::sycl::cl_double2)
+#undef SYCL_PXOR
+
+#define SYCL_PANDNOT(TYPE)                                                     \
+  template <>                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE pandnot<TYPE>(const TYPE &a,      \
+                                                           const TYPE &b) {    \
+    return convert_vector<TYPE>(vector_as_int(a) & (~vector_as_int(b)));       \
+  }
+SYCL_PANDNOT(cl::sycl::cl_float4)
+SYCL_PANDNOT(cl::sycl::cl_double2)
+#undef SYCL_PANDNOT
+
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
+    PacketBlock<cl::sycl::cl_float4, 4>& kernel) {
+  float tmp = kernel.packet[0].y();
+  kernel.packet[0].y() = kernel.packet[1].x();
+  kernel.packet[1].x() = tmp;
+
+  tmp = kernel.packet[0].z();
+  kernel.packet[0].z() = kernel.packet[2].x();
+  kernel.packet[2].x() = tmp;
+
+  tmp = kernel.packet[0].w();
+  kernel.packet[0].w() = kernel.packet[3].x();
+  kernel.packet[3].x() = tmp;
+
+  tmp = kernel.packet[1].z();
+  kernel.packet[1].z() = kernel.packet[2].y();
+  kernel.packet[2].y() = tmp;
+
+  tmp = kernel.packet[1].w();
+  kernel.packet[1].w() = kernel.packet[3].y();
+  kernel.packet[3].y() = tmp;
+
+  tmp = kernel.packet[2].w();
+  kernel.packet[2].w() = kernel.packet[3].z();
+  kernel.packet[3].z() = tmp;
+}
+
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
+    PacketBlock<cl::sycl::cl_double2, 2>& kernel) {
+  double tmp = kernel.packet[0].y();
+  kernel.packet[0].y() = kernel.packet[1].x();
+  kernel.packet[1].x() = tmp;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pblend(
+    const Selector<unpacket_traits<cl::sycl::cl_float4>::size>& ifPacket,
+    const cl::sycl::cl_float4& thenPacket,
+    const cl::sycl::cl_float4& elsePacket) {
+  cl::sycl::cl_int4 condition(
+      ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1,
+      ifPacket.select[2] ? 0 : -1, ifPacket.select[3] ? 0 : -1);
+  return cl::sycl::select(thenPacket, elsePacket, condition);
+}
+
+template <>
+inline cl::sycl::cl_double2 pblend(
+    const Selector<unpacket_traits<cl::sycl::cl_double2>::size>& ifPacket,
+    const cl::sycl::cl_double2& thenPacket,
+    const cl::sycl::cl_double2& elsePacket) {
+  cl::sycl::cl_long2 condition(ifPacket.select[0] ? 0 : -1,
+                               ifPacket.select[1] ? 0 : -1);
+  return cl::sycl::select(thenPacket, elsePacket, condition);
+}
+#endif  // SYCL_DEVICE_ONLY
+
+#define SYCL_PSTORE(alignment)                                  \
+  template <typename packet_type>                               \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
+      const Eigen::TensorSycl::internal::RangeAccess<           \
+          cl::sycl::access::mode::read_write,                   \
+          typename unpacket_traits<packet_type>::type>& to,     \
+      const packet_type& from) {                                \
+    pstore##alignment(to.get_pointer(), from);                  \
+  }
+
+// global space
+SYCL_PSTORE()
+SYCL_PSTORE(u)
+
+#undef SYCL_PSTORE
+
+template <typename scalar, typename packet_type, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(
+    Eigen::TensorSycl::internal::RangeAccess<
+        cl::sycl::access::mode::read_write,
+        typename unpacket_traits<packet_type>::type>
+        to,
+    const packet_type& from) {
+  pstoret<scalar, packet_type, Alignment>(to.get_pointer(), from);
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_PACKET_MATH_SYCL_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h b/src/3rdparty/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h
new file mode 100644
index 000000000..f81e59db5
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h
@@ -0,0 +1,694 @@
+/***************************************************************************
+ *  Copyright (C) 2017 Codeplay Software Limited
+ *  This Source Code Form is subject to the terms of the Mozilla
+ *  Public License v. 2.0. If a copy of the MPL was not distributed
+ *  with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ *
+ *  SyclMemoryModel.h
+ *
+ *  Description:
+ *    Interface for SYCL buffers to behave as a non-dereferenceable pointer
+ *    Interface for Placeholder accessor to behave as a pointer on both host
+ *    and device
+ *
+ * Authors:
+ *
+ *    Ruyman Reyes   Codeplay Software Ltd.
+ *    Mehdi Goli     Codeplay Software Ltd.
+ *    Vanya Yaneva   Codeplay Software Ltd.
+ *
+ **************************************************************************/
+
+#if defined(EIGEN_USE_SYCL) && \
+    !defined(EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H)
+#define EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H
+
+#include <CL/sycl.hpp>
+#ifdef EIGEN_EXCEPTIONS
+#include <stdexcept>
+#endif
+#include <cstddef>
+#include <queue>
+#include <set>
+#include <unordered_map>
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+
+using sycl_acc_target = cl::sycl::access::target;
+using sycl_acc_mode = cl::sycl::access::mode;
+
+/**
+ * Default values for template arguments
+ */
+using buffer_data_type_t = uint8_t;
+const sycl_acc_target default_acc_target = sycl_acc_target::global_buffer;
+const sycl_acc_mode default_acc_mode = sycl_acc_mode::read_write;
+
+/**
+ * PointerMapper
+ *  Associates fake pointers with buffers.
+ *
+ */
+class PointerMapper {
+ public:
+  using base_ptr_t = std::intptr_t;
+
+  /* Structure of a virtual pointer
+   *
+   * |================================================|
+   * |               POINTER ADDRESS                  |
+   * |================================================|
+   */
+  struct virtual_pointer_t {
+    /* Type for the pointers
+     */
+    base_ptr_t m_contents;
+
+    /** Conversions from virtual_pointer_t to
+     * void * should just reinterpret_cast the integer number
+     */
+    operator void *() const { return reinterpret_cast<void *>(m_contents); }
+
+    /**
+     * Convert back to the integer number.
+     */
+    operator base_ptr_t() const { return m_contents; }
+
+    /**
+     * Add a certain value to the pointer to create a
+     * new pointer to that offset
+     */
+    virtual_pointer_t operator+(size_t off) { return m_contents + off; }
+
+    /* Numerical order for sorting pointers in containers. */
+    bool operator<(virtual_pointer_t rhs) const {
+      return (static_cast<base_ptr_t>(m_contents) <
+              static_cast<base_ptr_t>(rhs.m_contents));
+    }
+
+    bool operator>(virtual_pointer_t rhs) const {
+      return (static_cast<base_ptr_t>(m_contents) >
+              static_cast<base_ptr_t>(rhs.m_contents));
+    }
+
+    /**
+     * Numerical order for sorting pointers in containers
+     */
+    bool operator==(virtual_pointer_t rhs) const {
+      return (static_cast<base_ptr_t>(m_contents) ==
+              static_cast<base_ptr_t>(rhs.m_contents));
+    }
+
+    /**
+     * Simple forward to the equality overload.
+     */
+    bool operator!=(virtual_pointer_t rhs) const {
+      return !(this->operator==(rhs));
+    }
+
+    /**
+     * Converts a void * into a virtual pointer structure.
+     * Note that this will only work if the void * was
+     * already a virtual_pointer_t, but we have no way of
+     * checking
+     */
+    virtual_pointer_t(const void *ptr)
+        : m_contents(reinterpret_cast<base_ptr_t>(ptr)){};
+
+    /**
+     * Creates a virtual_pointer_t from the given integer
+     * number
+     */
+    virtual_pointer_t(base_ptr_t u) : m_contents(u){};
+  };
+
+  /* Definition of a null pointer
+   */
+  const virtual_pointer_t null_virtual_ptr = nullptr;
+
+  /**
+   * Whether if a pointer is null or not.
+   * A pointer is nullptr if the value is of null_virtual_ptr
+   */
+  static inline bool is_nullptr(virtual_pointer_t ptr) {
+    return (static_cast<void *>(ptr) == nullptr);
+  }
+
+  /* basic type for all buffers
+   */
+  using buffer_t = cl::sycl::buffer_mem;
+
+  /**
+   * Node that stores information about a device allocation.
+   * Nodes are sorted by size to organise a free list of nodes
+   * that can be recovered.
+   */
+  struct pMapNode_t {
+    buffer_t m_buffer;
+    size_t m_size;
+    bool m_free;
+
+    pMapNode_t(buffer_t b, size_t size, bool f)
+        : m_buffer{b}, m_size{size}, m_free{f} {
+      m_buffer.set_final_data(nullptr);
+    }
+
+    bool operator<=(const pMapNode_t &rhs) { return (m_size <= rhs.m_size); }
+  };
+
+  /** Storage of the pointer / buffer tree
+   */
+  using pointerMap_t = std::map<virtual_pointer_t, pMapNode_t>;
+
+  /**
+   * Obtain the insertion point in the pointer map for
+   * a pointer of the given size.
+   * \param requiredSize Size attemted to reclaim
+   */
+  typename pointerMap_t::iterator get_insertion_point(size_t requiredSize) {
+    typename pointerMap_t::iterator retVal;
+    bool reuse = false;
+    if (!m_freeList.empty()) {
+      // try to re-use an existing block
+      for (auto freeElem : m_freeList) {
+        if (freeElem->second.m_size >= requiredSize) {
+          retVal = freeElem;
+          reuse = true;
+          // Element is not going to be free anymore
+          m_freeList.erase(freeElem);
+          break;
+        }
+      }
+    }
+    if (!reuse) {
+      retVal = std::prev(m_pointerMap.end());
+    }
+    return retVal;
+  }
+
+  /**
+   * Returns an iterator to the node that stores the information
+   * of the given virtual pointer from the given pointer map structure.
+   * If pointer is not found, throws std::out_of_range.
+   * If the pointer map structure is empty, throws std::out_of_range
+   *
+   * \param pMap the pointerMap_t structure storing all the pointers
+   * \param virtual_pointer_ptr The virtual pointer to obtain the node of
+   * \throws std::out:of_range if the pointer is not found or pMap is empty
+   */
+  typename pointerMap_t::iterator get_node(const virtual_pointer_t ptr) {
+    if (this->count() == 0) {
+      m_pointerMap.clear();
+      EIGEN_THROW_X(std::out_of_range("There are no pointers allocated\n"));
+
+    }
+    if (is_nullptr(ptr)) {
+      m_pointerMap.clear();
+      EIGEN_THROW_X(std::out_of_range("Cannot access null pointer\n"));
+    }
+    // The previous element to the lower bound is the node that
+    // holds this memory address
+    auto node = m_pointerMap.lower_bound(ptr);
+    // If the value of the pointer is not the one of the node
+    // then we return the previous one
+    if (node == std::end(m_pointerMap)) {
+      --node;
+    } else if (node->first != ptr) {
+      if (node == std::begin(m_pointerMap)) {
+        m_pointerMap.clear();
+        EIGEN_THROW_X(
+            std::out_of_range("The pointer is not registered in the map\n"));
+
+      }
+      --node;
+    }
+
+    return node;
+  }
+
+  /* get_buffer.
+   * Returns a buffer from the map using the pointer address
+   */
+  template <typename buffer_data_type = buffer_data_type_t>
+  cl::sycl::buffer<buffer_data_type, 1> get_buffer(
+      const virtual_pointer_t ptr) {
+    using sycl_buffer_t = cl::sycl::buffer<buffer_data_type, 1>;
+
+    // get_node() returns a `buffer_mem`, so we need to cast it to a `buffer<>`.
+    // We can do this without the `buffer_mem` being a pointer, as we
+    // only declare member variables in the base class (`buffer_mem`) and not in
+    // the child class (`buffer<>).
+    auto node = get_node(ptr);
+    eigen_assert(node->first == ptr || node->first < ptr);
+    eigen_assert(ptr < static_cast<virtual_pointer_t>(node->second.m_size +
+                                                      node->first));
+    return *(static_cast<sycl_buffer_t *>(&node->second.m_buffer));
+  }
+
+  /**
+   * @brief Returns an accessor to the buffer of the given virtual pointer
+   * @param accessMode
+   * @param accessTarget
+   * @param ptr The virtual pointer
+   */
+  template <sycl_acc_mode access_mode = default_acc_mode,
+            sycl_acc_target access_target = default_acc_target,
+            typename buffer_data_type = buffer_data_type_t>
+  cl::sycl::accessor<buffer_data_type, 1, access_mode, access_target>
+  get_access(const virtual_pointer_t ptr) {
+    auto buf = get_buffer<buffer_data_type>(ptr);
+    return buf.template get_access<access_mode, access_target>();
+  }
+
+  /**
+   * @brief Returns an accessor to the buffer of the given virtual pointer
+   *        in the given command group scope
+   * @param accessMode
+   * @param accessTarget
+   * @param ptr The virtual pointer
+   * @param cgh Reference to the command group scope
+   */
+  template <sycl_acc_mode access_mode = default_acc_mode,
+            sycl_acc_target access_target = default_acc_target,
+            typename buffer_data_type = buffer_data_type_t>
+  cl::sycl::accessor<buffer_data_type, 1, access_mode, access_target>
+  get_access(const virtual_pointer_t ptr, cl::sycl::handler &cgh) {
+    auto buf = get_buffer<buffer_data_type>(ptr);
+    return buf.template get_access<access_mode, access_target>(cgh);
+  }
+
+  /*
+   * Returns the offset from the base address of this pointer.
+   */
+  inline std::ptrdiff_t get_offset(const virtual_pointer_t ptr) {
+    // The previous element to the lower bound is the node that
+    // holds this memory address
+    auto node = get_node(ptr);
+    auto start = node->first;
+    eigen_assert(start == ptr || start < ptr);
+    eigen_assert(ptr < start + node->second.m_size);
+    return (ptr - start);
+  }
+
+  /*
+   * Returns the number of elements by which the given pointer is offset from
+   * the base address.
+   */
+  template <typename buffer_data_type>
+  inline size_t get_element_offset(const virtual_pointer_t ptr) {
+    return get_offset(ptr) / sizeof(buffer_data_type);
+  }
+
+  /**
+   * Constructs the PointerMapper structure.
+   */
+  PointerMapper(base_ptr_t baseAddress = 4096)
+      : m_pointerMap{}, m_freeList{}, m_baseAddress{baseAddress} {
+    if (m_baseAddress == 0) {
+      EIGEN_THROW_X(std::invalid_argument("Base address cannot be zero\n"));
+    }
+  };
+
+  /**
+   * PointerMapper cannot be copied or moved
+   */
+  PointerMapper(const PointerMapper &) = delete;
+
+  /**
+   * Empty the pointer list
+   */
+  inline void clear() {
+    m_freeList.clear();
+    m_pointerMap.clear();
+  }
+
+  /* add_pointer.
+   * Adds an existing pointer to the map and returns the virtual pointer id.
+   */
+  inline virtual_pointer_t add_pointer(const buffer_t &b) {
+    return add_pointer_impl(b);
+  }
+
+  /* add_pointer.
+   * Adds a pointer to the map and returns the virtual pointer id.
+   */
+  inline virtual_pointer_t add_pointer(buffer_t &&b) {
+    return add_pointer_impl(b);
+  }
+
+  /**
+   * @brief Fuses the given node with the previous nodes in the
+   *        pointer map if they are free
+   *
+   * @param node A reference to the free node to be fused
+   */
+  void fuse_forward(typename pointerMap_t::iterator &node) {
+    while (node != std::prev(m_pointerMap.end())) {
+      // if following node is free
+      // remove it and extend the current node with its size
+      auto fwd_node = std::next(node);
+      if (!fwd_node->second.m_free) {
+        break;
+      }
+      auto fwd_size = fwd_node->second.m_size;
+      m_freeList.erase(fwd_node);
+      m_pointerMap.erase(fwd_node);
+
+      node->second.m_size += fwd_size;
+    }
+  }
+
+  /**
+   * @brief Fuses the given node with the following nodes in the
+   *        pointer map if they are free
+   *
+   * @param node A reference to the free node to be fused
+   */
+  void fuse_backward(typename pointerMap_t::iterator &node) {
+    while (node != m_pointerMap.begin()) {
+      // if previous node is free, extend it
+      // with the size of the current one
+      auto prev_node = std::prev(node);
+      if (!prev_node->second.m_free) {
+        break;
+      }
+      prev_node->second.m_size += node->second.m_size;
+
+      // remove the current node
+      m_freeList.erase(node);
+      m_pointerMap.erase(node);
+
+      // point to the previous node
+      node = prev_node;
+    }
+  }
+
+  /* remove_pointer.
+   * Removes the given pointer from the map.
+   * The pointer is allowed to be reused only if ReUse if true.
+   */
+  template <bool ReUse = true>
+  void remove_pointer(const virtual_pointer_t ptr) {
+    if (is_nullptr(ptr)) {
+      return;
+    }
+    auto node = this->get_node(ptr);
+
+    node->second.m_free = true;
+    m_freeList.emplace(node);
+
+    // Fuse the node
+    // with free nodes before and after it
+    fuse_forward(node);
+    fuse_backward(node);
+
+    // If after fusing the node is the last one
+    // simply remove it (since it is free)
+    if (node == std::prev(m_pointerMap.end())) {
+      m_freeList.erase(node);
+      m_pointerMap.erase(node);
+    }
+  }
+
+  /* count.
+   * Return the number of active pointers (i.e, pointers that
+   * have been malloc but not freed).
+   */
+  size_t count() const { return (m_pointerMap.size() - m_freeList.size()); }
+
+ private:
+  /* add_pointer_impl.
+   * Adds a pointer to the map and returns the virtual pointer id.
+   * BufferT is either a const buffer_t& or a buffer_t&&.
+   */
+  template <class BufferT>
+  virtual_pointer_t add_pointer_impl(BufferT b) {
+    virtual_pointer_t retVal = nullptr;
+    size_t bufSize = b.get_count();
+    pMapNode_t p{b, bufSize, false};
+    // If this is the first pointer:
+    if (m_pointerMap.empty()) {
+      virtual_pointer_t initialVal{m_baseAddress};
+      m_pointerMap.emplace(initialVal, p);
+      return initialVal;
+    }
+
+    auto lastElemIter = get_insertion_point(bufSize);
+    // We are recovering an existing free node
+    if (lastElemIter->second.m_free) {
+      lastElemIter->second.m_buffer = b;
+      lastElemIter->second.m_free = false;
+
+      // If the recovered node is bigger than the inserted one
+      // add a new free node with the remaining space
+      if (lastElemIter->second.m_size > bufSize) {
+        // create a new node with the remaining space
+        auto remainingSize = lastElemIter->second.m_size - bufSize;
+        pMapNode_t p2{b, remainingSize, true};
+
+        // update size of the current node
+        lastElemIter->second.m_size = bufSize;
+
+        // add the new free node
+        auto newFreePtr = lastElemIter->first + bufSize;
+        auto freeNode = m_pointerMap.emplace(newFreePtr, p2).first;
+        m_freeList.emplace(freeNode);
+      }
+
+      retVal = lastElemIter->first;
+    } else {
+      size_t lastSize = lastElemIter->second.m_size;
+      retVal = lastElemIter->first + lastSize;
+      m_pointerMap.emplace(retVal, p);
+    }
+    return retVal;
+  }
+
+  /**
+   * Compare two iterators to pointer map entries according to
+   * the size of the allocation on the device.
+   */
+  struct SortBySize {
+    bool operator()(typename pointerMap_t::iterator a,
+                    typename pointerMap_t::iterator b) const {
+      return ((a->first < b->first) && (a->second <= b->second)) ||
+             ((a->first < b->first) && (b->second <= a->second));
+    }
+  };
+
+  /* Maps the pointer addresses to buffer and size pairs.
+   */
+  pointerMap_t m_pointerMap;
+
+  /* List of free nodes available for re-using
+   */
+  std::set<typename pointerMap_t::iterator, SortBySize> m_freeList;
+
+  /* Base address used when issuing the first virtual pointer, allows users
+   * to specify alignment. Cannot be zero. */
+  std::intptr_t m_baseAddress;
+};
+
+/* remove_pointer.
+ * Removes the given pointer from the map.
+ * The pointer is allowed to be reused only if ReUse if true.
+ */
+template <>
+inline void PointerMapper::remove_pointer<false>(const virtual_pointer_t ptr) {
+  if (is_nullptr(ptr)) {
+    return;
+  }
+  m_pointerMap.erase(this->get_node(ptr));
+}
+
+/**
+ * Malloc-like interface to the pointer-mapper.
+ * Given a size, creates a byte-typed buffer and returns a
+ * fake pointer to keep track of it.
+ * \param size Size in bytes of the desired allocation
+ * \throw cl::sycl::exception if error while creating the buffer
+ */
+inline void *SYCLmalloc(size_t size, PointerMapper &pMap) {
+  if (size == 0) {
+    return nullptr;
+  }
+  // Create a generic buffer of the given size
+  using buffer_t = cl::sycl::buffer<buffer_data_type_t, 1>;
+  auto thePointer = pMap.add_pointer(buffer_t(cl::sycl::range<1>{size}));
+  // Store the buffer on the global list
+  return static_cast<void *>(thePointer);
+}
+
+/**
+ * Free-like interface to the pointer mapper.
+ * Given a fake-pointer created with the virtual-pointer malloc,
+ * destroys the buffer and remove it from the list.
+ * If ReUse is false, the pointer is not added to the freeList,
+ * it should be false only for sub-buffers.
+ */
+template <bool ReUse = true, typename PointerMapper>
+inline void SYCLfree(void *ptr, PointerMapper &pMap) {
+  pMap.template remove_pointer<ReUse>(ptr);
+}
+
+/**
+ * Clear all the memory allocated by SYCL.
+ */
+template <typename PointerMapper>
+inline void SYCLfreeAll(PointerMapper &pMap) {
+  pMap.clear();
+}
+
+template <cl::sycl::access::mode AcMd, typename T>
+struct RangeAccess {
+  static const auto global_access = cl::sycl::access::target::global_buffer;
+  static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
+  typedef T scalar_t;
+  typedef scalar_t &ref_t;
+  typedef typename cl::sycl::global_ptr<scalar_t>::pointer_t ptr_t;
+
+  // the accessor type does not necessarily the same as T
+  typedef cl::sycl::accessor<scalar_t, 1, AcMd, global_access, is_place_holder>
+      accessor;
+
+  typedef RangeAccess<AcMd, T> self_t;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE RangeAccess(accessor access,
+                                                    size_t offset,
+                                                    std::intptr_t virtual_ptr)
+      : access_(access), offset_(offset), virtual_ptr_(virtual_ptr) {}
+
+  RangeAccess(cl::sycl::buffer<scalar_t, 1> buff =
+                  cl::sycl::buffer<scalar_t, 1>(cl::sycl::range<1>(1)))
+      : access_{accessor{buff}}, offset_(0), virtual_ptr_(-1) {}
+
+  // This should be only used for null constructor on the host side
+  RangeAccess(std::nullptr_t) : RangeAccess() {}
+  // This template parameter must be removed and scalar_t should be replaced
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t get_pointer() const {
+    return (access_.get_pointer().get() + offset_);
+  }
+  template <typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator+=(Index offset) {
+    offset_ += (offset);
+    return *this;
+  }
+  template <typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator+(Index offset) const {
+    return self_t(access_, offset_ + offset, virtual_ptr_);
+  }
+  template <typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator-(Index offset) const {
+    return self_t(access_, offset_ - offset, virtual_ptr_);
+  }
+  template <typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator-=(Index offset) {
+    offset_ -= offset;
+    return *this;
+  }
+
+  // THIS IS FOR NULL COMPARISON ONLY
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator==(
+      const RangeAccess &lhs, std::nullptr_t) {
+    return ((lhs.virtual_ptr_ == -1));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator!=(
+      const RangeAccess &lhs, std::nullptr_t i) {
+    return !(lhs == i);
+  }
+
+  // THIS IS FOR NULL COMPARISON ONLY
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator==(
+      std::nullptr_t, const RangeAccess &rhs) {
+    return ((rhs.virtual_ptr_ == -1));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator!=(
+      std::nullptr_t i, const RangeAccess &rhs) {
+    return !(i == rhs);
+  }
+  // Prefix operator (Increment and return value)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator++() {
+    offset_++;
+    return (*this);
+  }
+
+  // Postfix operator (Return value and increment)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator++(int i) {
+    EIGEN_UNUSED_VARIABLE(i);
+    self_t temp_iterator(*this);
+    offset_++;
+    return temp_iterator;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t get_size() const {
+    return (access_.get_count() - offset_);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t get_offset() const {
+    return offset_;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_offset(std::ptrdiff_t offset) {
+    offset_ = offset;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator*() const {
+    return *get_pointer();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator*() {
+    return *get_pointer();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t operator->() = delete;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator[](int x) {
+    return *(get_pointer() + x);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator[](int x) const {
+    return *(get_pointer() + x);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_t *get_virtual_pointer() const {
+    return reinterpret_cast<scalar_t *>(virtual_ptr_ +
+                                        (offset_ * sizeof(scalar_t)));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit operator bool() const {
+    return (virtual_ptr_ != -1);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator RangeAccess<AcMd, const T>() {
+    return RangeAccess<AcMd, const T>(access_, offset_, virtual_ptr_);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  operator RangeAccess<AcMd, const T>() const {
+    return RangeAccess<AcMd, const T>(access_, offset_, virtual_ptr_);
+  }
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(
+      cl::sycl::handler &cgh) const {
+    cgh.require(access_);
+  }
+
+ private:
+  accessor access_;
+  size_t offset_;
+  std::intptr_t virtual_ptr_;  // the location of the buffer in the map
+};
+
+template <cl::sycl::access::mode AcMd, typename T>
+struct RangeAccess<AcMd, const T> : RangeAccess<AcMd, T> {
+  typedef RangeAccess<AcMd, T> Base;
+  using Base::Base;
+};
+
+}  // namespace internal
+}  // namespace TensorSycl
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h b/src/3rdparty/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h
new file mode 100644
index 000000000..9208ab21d
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h
@@ -0,0 +1,85 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TypeCasting.h
+ *
+ * \brief:
+ *  TypeCasting
+ *
+ *****************************************************************/
+
+#ifndef EIGEN_TYPE_CASTING_SYCL_H
+#define EIGEN_TYPE_CASTING_SYCL_H
+
+namespace Eigen {
+
+namespace internal {
+#ifdef SYCL_DEVICE_ONLY
+template <>
+struct type_casting_traits<float, int> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_int4
+pcast<cl::sycl::cl_float4, cl::sycl::cl_int4>(const cl::sycl::cl_float4& a) {
+  return a
+      .template convert<cl::sycl::cl_int, cl::sycl::rounding_mode::automatic>();
+}
+
+template <>
+struct type_casting_traits<int, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
+pcast<cl::sycl::cl_int4, cl::sycl::cl_float4>(const cl::sycl::cl_int4& a) {
+  return a.template convert<cl::sycl::cl_float,
+                            cl::sycl::rounding_mode::automatic>();
+}
+
+template <>
+struct type_casting_traits<double, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
+pcast<cl::sycl::cl_double2, cl::sycl::cl_float4>(
+    const cl::sycl::cl_double2& a, const cl::sycl::cl_double2& b) {
+  auto a1 = a.template convert<cl::sycl::cl_float,
+                               cl::sycl::rounding_mode::automatic>();
+  auto b1 = b.template convert<cl::sycl::cl_float,
+                               cl::sycl::rounding_mode::automatic>();
+  return cl::sycl::float4(a1.x(), a1.y(), b1.x(), b1.y());
+}
+
+template <>
+struct type_casting_traits<float, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2
+pcast<cl::sycl::cl_float4, cl::sycl::cl_double2>(const cl::sycl::cl_float4& a) {
+  // Simply discard the second half of the input
+  return cl::sycl::cl_double2(a.x(), a.y());
+}
+
+#endif
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_TYPE_CASTING_SYCL_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/ZVector/Complex.h b/src/3rdparty/eigen/Eigen/src/Core/arch/ZVector/Complex.h
new file mode 100644
index 000000000..0b9b33d99
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/ZVector/Complex.h
@@ -0,0 +1,426 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX32_ALTIVEC_H
+#define EIGEN_COMPLEX32_ALTIVEC_H
+
+namespace Eigen {
+
+namespace internal {
+
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+static Packet4ui  p4ui_CONJ_XOR = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; //vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);
+#endif
+
+static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+
+struct Packet1cd
+{
+  EIGEN_STRONG_INLINE Packet1cd() {}
+  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
+  Packet2d v;
+};
+
+struct Packet2cf
+{
+  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
+  union {
+    Packet4f v;
+    Packet1cd cd[2];
+  };
+#else
+  Packet4f v;
+#endif
+};
+
+template<> struct packet_traits<std::complex<float> >  : default_packet_traits
+{
+  typedef Packet2cf type;
+  typedef Packet2cf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket = 0,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasBlend  = 1,
+    HasSetLinear = 0
+  };
+};
+
+
+template<> struct packet_traits<std::complex<double> >  : default_packet_traits
+{
+  typedef Packet1cd type;
+  typedef Packet1cd half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 1,
+    HasHalfPacket = 0,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float>  type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; };
+
+/* Forward declaration */
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel);
+
+/* complex<double> first */
+template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
+{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride EIGEN_UNUSED)
+{
+  return pload<Packet1cd>(from);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride EIGEN_UNUSED)
+{
+  pstore<std::complex<double> >(to, from);
+}
+template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
+template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  Packet2d a_re, a_im, v1, v2;
+
+  // Permute and multiply the real parts of a and b
+  a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
+  // Get the imaginary parts of a
+  a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
+  // multiply a_re * b
+  v1 = vec_madd(a_re, b.v, p2d_ZERO);
+  // multiply a_im * b and get the conjugate result
+  v2 = vec_madd(a_im, b.v, p2d_ZERO);
+  v2 = (Packet2d) vec_sld((Packet4ui)v2, (Packet4ui)v2, 8);
+  v2 = (Packet2d) vec_xor((Packet2d)v2, (Packet2d) p2ul_CONJ_XOR1);
+
+  return Packet1cd(v1 + v2);
+}
+template<> EIGEN_STRONG_INLINE Packet1cd pand    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd por     <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pxor    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>*     from) {  return pset1<Packet1cd>(*from); }
+template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
+  Packet2d eq = vec_cmpeq (a.v, b.v);
+  Packet2d tmp = { eq[1], eq[0] };
+  return (Packet1cd)pand<Packet2d>(eq, tmp);
+}
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+
+template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
+{
+  std::complex<double> EIGEN_ALIGN16 res;
+  pstore<std::complex<double> >(&res, a);
+
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
+{
+  return pfirst(a);
+}
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
+{
+  return pfirst(a);
+}
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
+
+template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  // TODO optimize it for AltiVec
+  Packet1cd res = pmul(a,pconj(b));
+  Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_);
+  return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64)));
+}
+
+EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
+{
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
+{
+  Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
+  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
+  kernel.packet[0].v = tmp;
+}
+
+/* complex<float> follows */
+template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from)  { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from)  { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *     to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *     to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
+
+template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> EIGEN_ALIGN16 res[2];
+  pstore<std::complex<float> >(res, a);
+
+  return res[0];
+}
+
+
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
+{
+  Packet2cf res;
+  res.cd[0] = Packet1cd(vec_ld2f((const float *)&from));
+  res.cd[1] = res.cd[0];
+  return res;
+}
+#else
+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
+{
+  Packet2cf res;
+  if((std::ptrdiff_t(&from) % 16) == 0)
+    res.v = pload<Packet4f>((const float *)&from);
+  else
+    res.v = ploadu<Packet4f>((const float *)&from);
+  res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI);
+  return res;
+}
+#endif
+
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+{
+  std::complex<float> EIGEN_ALIGN16 af[2];
+  af[0] = from[0*stride];
+  af[1] = from[1*stride];
+  return pload<Packet2cf>(af);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
+{
+  std::complex<float> EIGEN_ALIGN16 af[2];
+  pstore<std::complex<float> >((std::complex<float> *) af, from);
+  to[0*stride] = af[0];
+  to[1*stride] = af[1];
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); }
+
+template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v,b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>*      from) {  return pset1<Packet2cf>(*from); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *     addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+
+
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
+
+template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
+  Packet4f eq = pcmp_eq<Packet4f> (a.v, b.v);
+  Packet2cf res;
+  Packet2d tmp1 = { eq.v4f[0][1], eq.v4f[0][0] };
+  Packet2d tmp2 = { eq.v4f[1][1], eq.v4f[1][0] };
+  res.v.v4f[0] = pand<Packet2d>(eq.v4f[0], tmp1);
+  res.v.v4f[1] = pand<Packet2d>(eq.v4f[1], tmp2);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
+{
+  Packet2cf res;
+  res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0]))).v;
+  res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1]))).v;
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  Packet2cf res;
+  res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[0]))).v;
+  res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[1]))).v;
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
+{
+  Packet2cf res;
+  res.cd[0] = a.cd[1];
+  res.cd[1] = a.cd[0];
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> res;
+  Packet1cd b = padd<Packet1cd>(a.cd[0], a.cd[1]);
+  vec_st2f(b.v, (float*)&res);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> res;
+  Packet1cd b = pmul<Packet1cd>(a.cd[0], a.cd[1]);
+  vec_st2f(b.v, (float*)&res);
+  return res;
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+
+template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  // TODO optimize it for AltiVec
+  Packet2cf res;
+  res.cd[0] = pdiv<Packet1cd>(a.cd[0], b.cd[0]);
+  res.cd[1] = pdiv<Packet1cd>(a.cd[1], b.cd[1]);
+  return res;
+}
+
+EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
+{
+  Packet2cf res;
+  res.cd[0] = pcplxflip(x.cd[0]);
+  res.cd[1] = pcplxflip(x.cd[1]);
+  return res;
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
+{
+  Packet1cd tmp = kernel.packet[0].cd[1];
+  kernel.packet[0].cd[1] = kernel.packet[1].cd[0];
+  kernel.packet[1].cd[0] = tmp;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+  Packet2cf result;
+  const Selector<4> ifPacket4 = { ifPacket.select[0], ifPacket.select[0], ifPacket.select[1], ifPacket.select[1] };
+  result.v = pblend<Packet4f>(ifPacket4, thenPacket.v, elsePacket.v);
+  return result;
+}
+#else
+template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
+  Packet4f eq = vec_cmpeq (a.v, b.v);
+  Packet4f tmp = { eq[1], eq[0], eq[3], eq[2] };
+  return (Packet2cf)pand<Packet4f>(eq, tmp);
+}
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  Packet4f a_re, a_im, prod, prod_im;
+
+  // Permute and multiply the real parts of a and b
+  a_re = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
+  
+  // Get the imaginary parts of a
+  a_im = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
+
+  // multiply a_im * b and get the conjugate result
+  prod_im = a_im * b.v;
+  prod_im = pxor<Packet4f>(prod_im, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR));
+  // permute back to a proper order
+  prod_im = vec_perm(prod_im, prod_im, p16uc_COMPLEX32_REV);
+
+  // multiply a_re * b, add prod_im
+  prod = pmadd<Packet4f>(a_re, b.v, prod_im);
+ 
+  return Packet2cf(prod);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
+{
+  Packet4f rev_a;
+  rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2);
+  return Packet2cf(rev_a);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
+{
+  Packet4f b;
+  b = vec_sld(a.v, a.v, 8);
+  b = padd<Packet4f>(a.v, b);
+  return pfirst<Packet2cf>(Packet2cf(b));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
+{
+  Packet4f b;
+  Packet2cf prod;
+  b = vec_sld(a.v, a.v, 8);
+  prod = pmul<Packet2cf>(a, Packet2cf(b));
+
+  return pfirst<Packet2cf>(prod);
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+
+template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  // TODO optimize it for AltiVec
+  Packet2cf res = pmul(a, pconj(b));
+  Packet4f s = pmul<Packet4f>(b.v, b.v);
+  return Packet2cf(pdiv(res.v, padd<Packet4f>(s, vec_perm(s, s, p16uc_COMPLEX32_REV))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
+{
+  return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
+{
+  Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
+  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
+  kernel.packet[0].v = tmp;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+  Packet2cf result;
+  result.v = reinterpret_cast<Packet4f>(pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
+  return result;
+}
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX32_ALTIVEC_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h b/src/3rdparty/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h
new file mode 100644
index 000000000..1635e128c
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h
@@ -0,0 +1,233 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* The sin, cos, exp, and log functions of this file come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+#ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H
+#define EIGEN_MATH_FUNCTIONS_ALTIVEC_H
+
+namespace Eigen {
+
+namespace internal {
+
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+static _EIGEN_DECLARE_CONST_Packet4i(23, 23);
+
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
+
+/* the smallest non denormalized float number */
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,     0xff800000); // -1.f/0.f
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan,     0xffffffff);
+  
+/* natural logarithm computed for 4 simultaneous float
+  return NaN for x <= 0
+*/
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
+static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
+#endif
+
+static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
+static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
+static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
+
+static _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
+static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d pexp<Packet2d>(const Packet2d& _x)
+{
+  Packet2d x = _x;
+
+  Packet2d tmp, fx;
+  Packet2l emm0;
+
+  // clamp x
+  x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
+
+  fx = vec_floor(fx);
+
+  tmp = pmul(fx, p2d_cephes_exp_C1);
+  Packet2d z = pmul(fx, p2d_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  Packet2d x2 = pmul(x,x);
+
+  Packet2d px = p2d_cephes_exp_p0;
+  px = pmadd(px, x2, p2d_cephes_exp_p1);
+  px = pmadd(px, x2, p2d_cephes_exp_p2);
+  px = pmul (px, x);
+
+  Packet2d qx = p2d_cephes_exp_q0;
+  qx = pmadd(qx, x2, p2d_cephes_exp_q1);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q2);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q3);
+
+  x = pdiv(px,psub(qx,px));
+  x = pmadd(p2d_2,x,p2d_1);
+
+  // build 2^n
+  emm0 = vec_ctsl(fx, 0);
+
+  static const Packet2l p2l_1023 = { 1023, 1023 };
+  static const Packet2ul p2ul_52 = { 52, 52 };
+
+  emm0 = emm0 + p2l_1023;
+  emm0 = emm0 << reinterpret_cast<Packet2l>(p2ul_52);
+
+  // Altivec's max & min operators just drop silent NaNs. Check NaNs in 
+  // inputs and return them unmodified.
+  Packet2ul isnumber_mask = reinterpret_cast<Packet2ul>(vec_cmpeq(_x, _x));
+  return vec_sel(_x, pmax(pmul(x, reinterpret_cast<Packet2d>(emm0)), _x),
+                 isnumber_mask);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pexp<Packet4f>(const Packet4f& _x)
+{
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+  Packet4f x = _x;
+
+  Packet4f tmp, fx;
+  Packet4i emm0;
+
+  // clamp x
+  x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
+
+  // express exp(x) as exp(g + n*log(2))
+  fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
+
+  fx = pfloor(fx);
+
+  tmp = pmul(fx, p4f_cephes_exp_C1);
+  Packet4f z = pmul(fx, p4f_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  z = pmul(x,x);
+
+  Packet4f y = p4f_cephes_exp_p0;
+  y = pmadd(y, x, p4f_cephes_exp_p1);
+  y = pmadd(y, x, p4f_cephes_exp_p2);
+  y = pmadd(y, x, p4f_cephes_exp_p3);
+  y = pmadd(y, x, p4f_cephes_exp_p4);
+  y = pmadd(y, x, p4f_cephes_exp_p5);
+  y = pmadd(y, z, x);
+  y = padd(y, p4f_1);
+
+  // build 2^n
+  emm0 = (Packet4i){ (int)fx[0], (int)fx[1], (int)fx[2], (int)fx[3] };
+  emm0 = emm0 + p4i_0x7f;
+  emm0 = emm0 << reinterpret_cast<Packet4i>(p4i_23);
+
+  return pmax(pmul(y, reinterpret_cast<Packet4f>(emm0)), _x);
+#else
+  Packet4f res;
+  res.v4f[0] = pexp<Packet2d>(_x.v4f[0]);
+  res.v4f[1] = pexp<Packet2d>(_x.v4f[1]);
+  return res;
+#endif
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d psqrt<Packet2d>(const Packet2d& x)
+{
+  return vec_sqrt(x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psqrt<Packet4f>(const Packet4f& x)
+{
+  Packet4f res;
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+  res = vec_sqrt(x);
+#else
+  res.v4f[0] = psqrt<Packet2d>(x.v4f[0]);
+  res.v4f[1] = psqrt<Packet2d>(x.v4f[1]);
+#endif
+  return res;
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d prsqrt<Packet2d>(const Packet2d& x) {
+  return pset1<Packet2d>(1.0) / psqrt<Packet2d>(x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& x) {
+  Packet4f res;
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+  res = pset1<Packet4f>(1.0) / psqrt<Packet4f>(x);
+#else
+  res.v4f[0] = prsqrt<Packet2d>(x.v4f[0]);
+  res.v4f[1] = prsqrt<Packet2d>(x.v4f[1]);
+#endif
+  return res;
+}
+
+// Hyperbolic Tangent function.
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+ptanh<Packet4f>(const Packet4f& x) {
+  return internal::generic_fast_tanh_float(x);
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_ALTIVEC_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h b/src/3rdparty/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h
new file mode 100644
index 000000000..1f55a90a5
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h
@@ -0,0 +1,1060 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_ZVECTOR_H
+#define EIGEN_PACKET_MATH_ZVECTOR_H
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS  32
+#endif
+
+typedef __vector int                 Packet4i;
+typedef __vector unsigned int        Packet4ui;
+typedef __vector __bool int          Packet4bi;
+typedef __vector short int           Packet8i;
+typedef __vector unsigned char       Packet16uc;
+typedef __vector double              Packet2d;
+typedef __vector unsigned long long  Packet2ul;
+typedef __vector long long           Packet2l;
+
+// Z14 has builtin support for float vectors
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+typedef __vector float               Packet4f;
+#else
+typedef struct {
+	Packet2d  v4f[2];
+} Packet4f;
+#endif
+
+typedef union {
+  numext::int32_t   i[4];
+  numext::uint32_t ui[4];
+  numext::int64_t   l[2];
+  numext::uint64_t ul[2];
+  double    d[2];
+  float     f[4];
+  Packet4i  v4i;
+  Packet4ui v4ui;
+  Packet2l  v2l;
+  Packet2ul v2ul;
+  Packet2d  v2d;
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+  Packet4f  v4f;
+#endif
+} Packet;
+
+// We don't want to write the same code all the time, but we need to reuse the constants
+// and it doesn't really work to declare them global, so we define macros instead
+
+#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
+  Packet4i p4i_##NAME = reinterpret_cast<Packet4i>(vec_splat_s32(X))
+
+#define _EIGEN_DECLARE_CONST_FAST_Packet2d(NAME,X) \
+  Packet2d p2d_##NAME = reinterpret_cast<Packet2d>(vec_splat_s64(X))
+
+#define _EIGEN_DECLARE_CONST_FAST_Packet2l(NAME,X) \
+  Packet2l p2l_##NAME = reinterpret_cast<Packet2l>(vec_splat_s64(X))
+
+#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
+  Packet4i p4i_##NAME = pset1<Packet4i>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
+  Packet2d p2d_##NAME = pset1<Packet2d>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \
+  Packet2l p2l_##NAME = pset1<Packet2l>(X)
+
+// These constants are endian-agnostic
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1}
+
+static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0);
+static _EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0);
+static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1);
+
+static Packet2d p2d_ONE = { 1.0, 1.0 };
+static Packet2d p2d_ZERO_ = { numext::bit_cast<double>0x8000000000000000ull),
+                              numext::bit_cast<double>0x8000000000000000ull) };
+
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
+  Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
+
+#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
+  Packet4f p4f_##NAME = pset1<Packet4f>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+  const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
+
+static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
+static Packet4f p4f_MZERO = { 0x80000000, 0x80000000, 0x80000000, 0x80000000};
+#endif
+
+static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
+static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
+static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet16uc>(p2d_ZERO), reinterpret_cast<Packet16uc>(p2d_ONE), 8));
+
+static Packet16uc p16uc_PSET64_HI = { 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
+static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
+
+// Mask alignment
+#define _EIGEN_MASK_ALIGNMENT	0xfffffffffffffff0
+
+#define _EIGEN_ALIGNED_PTR(x)	((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
+
+// Handle endianness properly while loading constants
+// Define global static constants:
+
+static Packet16uc p16uc_FORWARD =   { 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 };
+static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
+static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+
+static Packet16uc p16uc_PSET32_WODD   = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
+static Packet16uc p16uc_PSET32_WEVEN  = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
+/*static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8);      //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
+
+static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);     //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };*/
+static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);     //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
+/*static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16);                                         //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
+static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16);                                         //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};*/
+static Packet16uc p16uc_TRANSPOSE64_HI = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
+static Packet16uc p16uc_TRANSPOSE64_LO = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
+
+static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);                                         //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
+
+static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);                                            //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+
+
+#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+  #define EIGEN_ZVECTOR_PREFETCH(ADDR) __builtin_prefetch(ADDR);
+#else
+  #define EIGEN_ZVECTOR_PREFETCH(ADDR) asm( "   pfd [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
+#endif
+
+template<> struct packet_traits<int>    : default_packet_traits
+{
+  typedef Packet4i type;
+  typedef Packet4i half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 0,
+
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 1,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct packet_traits<float> : default_packet_traits {
+  typedef Packet4f type;
+  typedef Packet4f half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 0,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasAbs = 1,
+    HasSin = 0,
+    HasCos = 0,
+    HasLog = 0,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasTanh = 1,
+    HasErf = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasNegate = 1,
+    HasBlend = 1
+  };
+};
+
+template<> struct packet_traits<double> : default_packet_traits
+{
+  typedef Packet2d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=2,
+    HasHalfPacket = 1,
+
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 1,
+    HasMin  = 1,
+    HasMax  = 1,
+    HasAbs  = 1,
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 0,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasNegate = 1,
+    HasBlend = 1
+  };
+};
+
+template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4i half; };
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; };
+
+/* Forward declaration */
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f,4>& kernel);
+ 
+inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
+{
+  Packet vt;
+  vt.v4i = v;
+  s << vt.i[0] << ", " << vt.i[1] << ", " << vt.i[2] << ", " << vt.i[3];
+  return s;
+}
+
+inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
+{
+  Packet vt;
+  vt.v4ui = v;
+  s << vt.ui[0] << ", " << vt.ui[1] << ", " << vt.ui[2] << ", " << vt.ui[3];
+  return s;
+}
+
+inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
+{
+  Packet vt;
+  vt.v2l = v;
+  s << vt.l[0] << ", " << vt.l[1];
+  return s;
+}
+
+inline std::ostream & operator <<(std::ostream & s, const Packet2ul & v)
+{
+  Packet vt;
+  vt.v2ul = v;
+  s << vt.ul[0] << ", " << vt.ul[1] ;
+  return s;
+}
+
+inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
+{
+  Packet vt;
+  vt.v2d = v;
+  s << vt.d[0] << ", " << vt.d[1];
+  return s;
+}
+
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
+{
+  Packet vt;
+  vt.v4f = v;
+  s << vt.f[0] << ", " << vt.f[1] << ", " << vt.f[2] << ", " << vt.f[3];
+  return s;
+}
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_LOAD
+  Packet *vfrom;
+  vfrom = (Packet *) from;
+  return vfrom->v4i;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_LOAD
+  Packet *vfrom;
+  vfrom = (Packet *) from;
+  return vfrom->v2d;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_STORE
+  Packet *vto;
+  vto = (Packet *) to;
+  vto->v4i = from;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_STORE
+  Packet *vto;
+  vto = (Packet *) to;
+  vto->v2d = from;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)
+{
+  return vec_splats(from);
+}
+template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
+  return vec_splats(from);
+}
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4i>(const int *a,
+                      Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
+{
+  a3 = pload<Packet4i>(a);
+  a0 = vec_splat(a3, 0);
+  a1 = vec_splat(a3, 1);
+  a2 = vec_splat(a3, 2);
+  a3 = vec_splat(a3, 3);
+}
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet2d>(const double *a,
+                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
+{
+  a1 = pload<Packet2d>(a);
+  a0 = vec_splat(a1, 0);
+  a1 = vec_splat(a1, 1);
+  a3 = pload<Packet2d>(a+2);
+  a2 = vec_splat(a3, 0);
+  a3 = vec_splat(a3, 1);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
+{
+  int EIGEN_ALIGN16 ai[4];
+  ai[0] = from[0*stride];
+  ai[1] = from[1*stride];
+  ai[2] = from[2*stride];
+  ai[3] = from[3*stride];
+ return pload<Packet4i>(ai);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
+{
+  double EIGEN_ALIGN16 af[2];
+  af[0] = from[0*stride];
+  af[1] = from[1*stride];
+ return pload<Packet2d>(af);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
+{
+  int EIGEN_ALIGN16 ai[4];
+  pstore<int>((int *)ai, from);
+  to[0*stride] = ai[0];
+  to[1*stride] = ai[1];
+  to[2*stride] = ai[2];
+  to[3*stride] = ai[3];
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
+{
+  double EIGEN_ALIGN16 af[2];
+  pstore<double>(af, from);
+  to[0*stride] = af[0];
+  to[1*stride] = af[1];
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a + b); }
+template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a + b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a - b); }
+template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a - b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a * b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a * b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a / b); }
+template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a / b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); }
+template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd<Packet4i>(pmul<Packet4i>(a, b), c); }
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
+
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)    { return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return pand<Packet4i>(a, vec_nor(b, b)); }
+template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const  Packet2d& a) { return vec_ceil(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
+
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int*       from) { return pload<Packet4i>(from); }
+template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double*    from) { return pload<Packet2d>(from); }
+
+
+template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
+{
+  Packet4i p = pload<Packet4i>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
+{
+  Packet2d p = pload<Packet2d>(from);
+  return vec_perm(p, p, p16uc_PSET64_HI);
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*        to, const Packet4i& from) { pstore<int>(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from) { pstore<double>(to, from); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int    EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }
+
+template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
+{
+  return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
+{
+  return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) { return vec_abs(a); }
+
+template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
+{
+  Packet4i b, sum;
+  b   = vec_sld(a, a, 8);
+  sum = padd<Packet4i>(a, b);
+  b   = vec_sld(sum, sum, 4);
+  sum = padd<Packet4i>(sum, b);
+  return pfirst(sum);
+}
+
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
+{
+  Packet2d b, sum;
+  b   = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8));
+  sum = padd<Packet2d>(a, b);
+  return pfirst(sum);
+}
+
+// Other reduction functions:
+// mul
+template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
+{
+  EIGEN_ALIGN16 int aux[4];
+  pstore(aux, a);
+  return aux[0] * aux[1] * aux[2] * aux[3];
+}
+
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
+{
+  return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
+}
+
+// min
+template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
+{
+  Packet4i b, res;
+  b   = pmin<Packet4i>(a, vec_sld(a, a, 8));
+  res = pmin<Packet4i>(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
+{
+  return pfirst(pmin<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
+}
+
+// max
+template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
+{
+  Packet4i b, res;
+  b = pmax<Packet4i>(a, vec_sld(a, a, 8));
+  res = pmax<Packet4i>(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+// max
+template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
+{
+  return pfirst(pmax<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4i,4>& kernel) {
+  Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  Packet4i t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  Packet4i t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  Packet4i t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2d,2>& kernel) {
+  Packet2d t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
+  Packet2d t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
+  kernel.packet[0] = t0;
+  kernel.packet[1] = t1;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+  Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
+  Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
+  Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+/* z13 has no vector float support so we emulate that with double
+   z14 has proper vector float support.
+*/
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
+/* Helper function to simulate a vec_splat_packet4f
+ */
+template<int element> EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f&   from)
+{
+  Packet4f splat;
+  switch (element) {
+  case 0:
+    splat.v4f[0] = vec_splat(from.v4f[0], 0);
+    splat.v4f[1] = splat.v4f[0];
+    break;
+  case 1:
+    splat.v4f[0] = vec_splat(from.v4f[0], 1);
+    splat.v4f[1] = splat.v4f[0];
+    break;
+  case 2:
+    splat.v4f[0] = vec_splat(from.v4f[1], 0);
+    splat.v4f[1] = splat.v4f[0];
+    break;
+  case 3:
+    splat.v4f[0] = vec_splat(from.v4f[1], 1);
+    splat.v4f[1] = splat.v4f[0];
+    break;
+  }
+  return splat;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_LOAD
+  Packet4f vfrom;
+  vfrom.v4f[0] = vec_ld2f(&from[0]);
+  vfrom.v4f[1] = vec_ld2f(&from[2]);
+  return vfrom;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_STORE
+  vec_st2f(from.v4f[0], &to[0]);
+  vec_st2f(from.v4f[1], &to[2]);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&    from)
+{
+  Packet4f to;
+  to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
+  to.v4f[1] = to.v4f[0];
+  return to;
+}
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  a3 = pload<Packet4f>(a);
+  a0 = vec_splat_packet4f<0>(a3);
+  a1 = vec_splat_packet4f<1>(a3);
+  a2 = vec_splat_packet4f<2>(a3);
+  a3 = vec_splat_packet4f<3>(a3);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+  float EIGEN_ALIGN16 ai[4];
+  ai[0] = from[0*stride];
+  ai[1] = from[1*stride];
+  ai[2] = from[2*stride];
+  ai[3] = from[3*stride];
+ return pload<Packet4f>(ai);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+  float EIGEN_ALIGN16 ai[4];
+  pstore<float>((float *)ai, from);
+  to[0*stride] = ai[0];
+  to[1*stride] = ai[1];
+  to[2*stride] = ai[2];
+  to[3*stride] = ai[3];
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] + b.v4f[0];
+  c.v4f[1] = a.v4f[1] + b.v4f[1];
+  return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] - b.v4f[0];
+  c.v4f[1] = a.v4f[1] - b.v4f[1];
+  return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] * b.v4f[0];
+  c.v4f[1] = a.v4f[1] * b.v4f[1];
+  return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] / b.v4f[0];
+  c.v4f[1] = a.v4f[1] / b.v4f[1];
+  return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
+{
+  Packet4f c;
+  c.v4f[0] = -a.v4f[0];
+  c.v4f[1] = -a.v4f[1];
+  return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
+{
+  Packet4f res;
+  res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
+  res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = por(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = por(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pxor(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pxor(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_round(a.v4f[0]);
+  res.v4f[1] = vec_round(a.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const  Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_ceil(a.v4f[0]);
+  res.v4f[1] = vec_ceil(a.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_floor(a.v4f[0]);
+  res.v4f[1] = vec_floor(a.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*    from)
+{
+  Packet4f p = pload<Packet4f>(from);
+  p.v4f[1] = vec_splat(p.v4f[0], 1);
+  p.v4f[0] = vec_splat(p.v4f[0], 0);
+  return p;
+}
+
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float  EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
+
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{
+  Packet4f rev;
+  rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
+  rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
+  return rev;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = pabs(a.v4f[0]);
+  res.v4f[1] = pabs(a.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+  Packet2d sum;
+  sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
+  double first = predux<Packet2d>(sum);
+  return static_cast<float>(first);
+}
+
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{
+  // Return predux_mul<Packet2d> of the subvectors product
+  return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+  Packet2d b, res;
+  b   = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
+  res = pmin<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
+  return static_cast<float>(pfirst(res));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+  Packet2d b, res;
+  b   = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
+  res = pmax<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
+  return static_cast<float>(pfirst(res));
+}
+
+/* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one
+ */
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  PacketBlock<Packet2d,2> t0,t1,t2,t3;
+  // copy top-left 2x2 Packet2d block
+  t0.packet[0] = kernel.packet[0].v4f[0];
+  t0.packet[1] = kernel.packet[1].v4f[0];
+
+  // copy top-right 2x2 Packet2d block
+  t1.packet[0] = kernel.packet[0].v4f[1];
+  t1.packet[1] = kernel.packet[1].v4f[1];
+
+  // copy bottom-left 2x2 Packet2d block
+  t2.packet[0] = kernel.packet[2].v4f[0];
+  t2.packet[1] = kernel.packet[3].v4f[0];
+
+  // copy bottom-right 2x2 Packet2d block
+  t3.packet[0] = kernel.packet[2].v4f[1];
+  t3.packet[1] = kernel.packet[3].v4f[1];
+
+  // Transpose all 2x2 blocks
+  ptranspose(t0);
+  ptranspose(t1);
+  ptranspose(t2);
+  ptranspose(t3);
+
+  // Copy back transposed blocks, but exchange t1 and t2 due to transposition
+  kernel.packet[0].v4f[0] = t0.packet[0];
+  kernel.packet[0].v4f[1] = t2.packet[0];
+  kernel.packet[1].v4f[0] = t0.packet[1];
+  kernel.packet[1].v4f[1] = t2.packet[1];
+  kernel.packet[2].v4f[0] = t1.packet[0];
+  kernel.packet[2].v4f[1] = t3.packet[0];
+  kernel.packet[3].v4f[0] = t1.packet[1];
+  kernel.packet[3].v4f[1] = t3.packet[1];
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+  Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] };
+  Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] };
+  Packet2ul mask_hi = vec_cmpeq(select_hi, reinterpret_cast<Packet2ul>(p2l_ONE));
+  Packet2ul mask_lo = vec_cmpeq(select_lo, reinterpret_cast<Packet2ul>(p2l_ONE));
+  Packet4f result;
+  result.v4f[0] = vec_sel(elsePacket.v4f[0], thenPacket.v4f[0], mask_hi);
+  result.v4f[1] = vec_sel(elsePacket.v4f[1], thenPacket.v4f[1], mask_lo);
+  return result;
+}
+
+template<> Packet4f EIGEN_STRONG_INLINE pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pcmp_le(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pcmp_le(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> Packet4f EIGEN_STRONG_INLINE pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pcmp_lt(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pcmp_lt(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> Packet4f EIGEN_STRONG_INLINE pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pcmp_eq(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pcmp_eq(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+#else
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_LOAD
+  Packet *vfrom;
+  vfrom = (Packet *) from;
+  return vfrom->v4f;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_STORE
+  Packet *vto;
+  vto = (Packet *) to;
+  vto->v4f = from;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
+{
+  return vec_splats(from);
+}
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  a3 = pload<Packet4f>(a);
+  a0 = vec_splat(a3, 0);
+  a1 = vec_splat(a3, 1);
+  a2 = vec_splat(a3, 2);
+  a3 = vec_splat(a3, 3);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+  float EIGEN_ALIGN16 af[4];
+  af[0] = from[0*stride];
+  af[1] = from[1*stride];
+  af[2] = from[2*stride];
+  af[3] = from[3*stride];
+ return pload<Packet4f>(af);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+  float EIGEN_ALIGN16 af[4];
+  pstore<float>((float*)af, from);
+  to[0*stride] = af[0];
+  to[1*stride] = af[1];
+  to[2*stride] = af[2];
+  to[3*stride] = af[3];
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a + b); }
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a - b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a * b); }
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a / b); }
+template<> EIGEN_STRONG_INLINE Packet4f pnegate<Packet4f>(const Packet4f& a) { return (-a); }
+template<> EIGEN_STRONG_INLINE Packet4f pconj<Packet4f>  (const Packet4f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4f pmadd<Packet4f>  (const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>   (const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>   (const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>   (const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>    (const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>   (const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f> (const Packet4f& a) { return vec_round(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>  (const Packet4f& a) { return vec_ceil(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f> (const Packet4f& a) { return vec_floor(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>   (const Packet4f& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
+
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
+{
+  Packet4f p = pload<Packet4f>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{
+  return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+}
+
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+  Packet4f b, sum;
+  b   = vec_sld(a, a, 8);
+  sum = padd<Packet4f>(a, b);
+  b   = vec_sld(sum, sum, 4);
+  sum = padd<Packet4f>(sum, b);
+  return pfirst(sum);
+}
+
+// Other reduction functions:
+// mul
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{
+  Packet4f prod;
+  prod = pmul(a, vec_sld(a, a, 8));
+  return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
+}
+
+// min
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+  Packet4f b, res;
+  b   = pmin<Packet4f>(a, vec_sld(a, a, 8));
+  res = pmin<Packet4f>(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+// max
+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+  Packet4f b, res;
+  b = pmax<Packet4f>(a, vec_sld(a, a, 8));
+  res = pmax<Packet4f>(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  Packet4f t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  Packet4f t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  Packet4f t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  Packet4f t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+  Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+#endif
+
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f> (const float* from) { return pload<Packet4f>(from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { pstore<float>(to, from); }
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>  (const float& a)  { return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN); }
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PACKET_MATH_ZVECTOR_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/functors/AssignmentFunctors.h b/src/3rdparty/eigen/Eigen/src/Core/functors/AssignmentFunctors.h
new file mode 100644
index 000000000..bf64ef4ed
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/functors/AssignmentFunctors.h
@@ -0,0 +1,177 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ASSIGNMENT_FUNCTORS_H
+#define EIGEN_ASSIGNMENT_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+  
+/** \internal
+  * \brief Template functor for scalar/packet assignment
+  *
+  */
+template<typename DstScalar,typename SrcScalar> struct assign_op {
+
+  EIGEN_EMPTY_STRUCT_CTOR(assign_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a = b; }
+  
+  template<int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
+  { internal::pstoret<DstScalar,Packet,Alignment>(a,b); }
+};
+
+// Empty overload for void type (used by PermutationMatrix)
+template<typename DstScalar> struct assign_op<DstScalar,void> {};
+
+template<typename DstScalar,typename SrcScalar>
+struct functor_traits<assign_op<DstScalar,SrcScalar> > {
+  enum {
+    Cost = NumTraits<DstScalar>::ReadCost,
+    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::Vectorizable && packet_traits<SrcScalar>::Vectorizable
+  };
+};
+
+/** \internal
+  * \brief Template functor for scalar/packet assignment with addition
+  *
+  */
+template<typename DstScalar,typename SrcScalar> struct add_assign_op {
+
+  EIGEN_EMPTY_STRUCT_CTOR(add_assign_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a += b; }
+  
+  template<int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
+  { internal::pstoret<DstScalar,Packet,Alignment>(a,internal::padd(internal::ploadt<Packet,Alignment>(a),b)); }
+};
+template<typename DstScalar,typename SrcScalar>
+struct functor_traits<add_assign_op<DstScalar,SrcScalar> > {
+  enum {
+    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::AddCost,
+    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasAdd
+  };
+};
+
+/** \internal
+  * \brief Template functor for scalar/packet assignment with subtraction
+  *
+  */
+template<typename DstScalar,typename SrcScalar> struct sub_assign_op {
+
+  EIGEN_EMPTY_STRUCT_CTOR(sub_assign_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a -= b; }
+  
+  template<int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
+  { internal::pstoret<DstScalar,Packet,Alignment>(a,internal::psub(internal::ploadt<Packet,Alignment>(a),b)); }
+};
+template<typename DstScalar,typename SrcScalar>
+struct functor_traits<sub_assign_op<DstScalar,SrcScalar> > {
+  enum {
+    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::AddCost,
+    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasSub
+  };
+};
+
+/** \internal
+  * \brief Template functor for scalar/packet assignment with multiplication
+  *
+  */
+template<typename DstScalar, typename SrcScalar=DstScalar>
+struct mul_assign_op {
+
+  EIGEN_EMPTY_STRUCT_CTOR(mul_assign_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a *= b; }
+  
+  template<int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
+  { internal::pstoret<DstScalar,Packet,Alignment>(a,internal::pmul(internal::ploadt<Packet,Alignment>(a),b)); }
+};
+template<typename DstScalar, typename SrcScalar>
+struct functor_traits<mul_assign_op<DstScalar,SrcScalar> > {
+  enum {
+    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::MulCost,
+    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasMul
+  };
+};
+
+/** \internal
+  * \brief Template functor for scalar/packet assignment with diviving
+  *
+  */
+template<typename DstScalar, typename SrcScalar=DstScalar> struct div_assign_op {
+
+  EIGEN_EMPTY_STRUCT_CTOR(div_assign_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a /= b; }
+  
+  template<int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
+  { internal::pstoret<DstScalar,Packet,Alignment>(a,internal::pdiv(internal::ploadt<Packet,Alignment>(a),b)); }
+};
+template<typename DstScalar, typename SrcScalar>
+struct functor_traits<div_assign_op<DstScalar,SrcScalar> > {
+  enum {
+    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::MulCost,
+    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasDiv
+  };
+};
+
+/** \internal
+  * \brief Template functor for scalar/packet assignment with swapping
+  *
+  * It works as follow. For a non-vectorized evaluation loop, we have:
+  *   for(i) func(A.coeffRef(i), B.coeff(i));
+  * where B is a SwapWrapper expression. The trick is to make SwapWrapper::coeff behaves like a non-const coeffRef.
+  * Actually, SwapWrapper might not even be needed since even if B is a plain expression, since it has to be writable
+  * B.coeff already returns a const reference to the underlying scalar value.
+  * 
+  * The case of a vectorized loop is more tricky:
+  *   for(i,j) func.assignPacket<A_Align>(&A.coeffRef(i,j), B.packet<B_Align>(i,j));
+  * Here, B must be a SwapWrapper whose packet function actually returns a proxy object holding a Scalar*,
+  * the actual alignment and Packet type.
+  *
+  */
+template<typename Scalar> struct swap_assign_op {
+
+  EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const
+  {
+#ifdef EIGEN_GPUCC
+    // FIXME is there some kind of cuda::swap?
+    Scalar t=b; const_cast<Scalar&>(b)=a; a=t;
+#else
+    using std::swap;
+    swap(a,const_cast<Scalar&>(b));
+#endif
+  }
+};
+template<typename Scalar>
+struct functor_traits<swap_assign_op<Scalar> > {
+  enum {
+    Cost = 3 * NumTraits<Scalar>::ReadCost,
+    PacketAccess = 
+    #if defined(EIGEN_VECTORIZE_AVX) && EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<800 || defined(__apple_build_version__))
+    // This is a partial workaround for a bug in clang generating bad code
+    // when mixing 256/512 bits loads and 128 bits moves.
+    // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1684
+    //     https://bugs.llvm.org/show_bug.cgi?id=40815
+    0
+    #else
+    packet_traits<Scalar>::Vectorizable
+    #endif
+  };
+};
+
+} // namespace internal
+
+} // namespace Eigen
+
+#endif // EIGEN_ASSIGNMENT_FUNCTORS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/functors/BinaryFunctors.h b/src/3rdparty/eigen/Eigen/src/Core/functors/BinaryFunctors.h
new file mode 100644
index 000000000..63f09ab93
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/functors/BinaryFunctors.h
@@ -0,0 +1,541 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BINARY_FUNCTORS_H
+#define EIGEN_BINARY_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- associative binary functors ----------
+
+template<typename Arg1, typename Arg2>
+struct binary_op_base
+{
+  typedef Arg1 first_argument_type;
+  typedef Arg2 second_argument_type;
+};
+
+/** \internal
+  * \brief Template functor to compute the sum of two scalars
+  *
+  * \sa class CwiseBinaryOp, MatrixBase::operator+, class VectorwiseOp, DenseBase::sum()
+  */
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_sum_op : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_sum_op>::ReturnType result_type;
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
+#else
+  scalar_sum_op() {
+    EIGEN_SCALAR_BINARY_OP_PLUGIN
+  }
+#endif
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::padd(a,b); }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const
+  { return internal::predux(a); }
+};
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_sum_op<LhsScalar,RhsScalar> > {
+  enum {
+    Cost = (int(NumTraits<LhsScalar>::AddCost) + int(NumTraits<RhsScalar>::AddCost)) / 2, // rough estimate!
+    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasAdd && packet_traits<RhsScalar>::HasAdd
+    // TODO vectorize mixed sum
+  };
+};
+
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool scalar_sum_op<bool,bool>::operator() (const bool& a, const bool& b) const { return a || b; }
+
+
+/** \internal
+  * \brief Template functor to compute the product of two scalars
+  *
+  * \sa class CwiseBinaryOp, Cwise::operator*(), class VectorwiseOp, MatrixBase::redux()
+  */
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_product_op  : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_product_op>::ReturnType result_type;
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
+#else
+  scalar_product_op() {
+    EIGEN_SCALAR_BINARY_OP_PLUGIN
+  }
+#endif
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pmul(a,b); }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const
+  { return internal::predux_mul(a); }
+};
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_product_op<LhsScalar,RhsScalar> > {
+  enum {
+    Cost = (int(NumTraits<LhsScalar>::MulCost) + int(NumTraits<RhsScalar>::MulCost))/2, // rough estimate!
+    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasMul && packet_traits<RhsScalar>::HasMul
+    // TODO vectorize mixed product
+  };
+};
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool scalar_product_op<bool,bool>::operator() (const bool& a, const bool& b) const { return a && b; }
+
+
+/** \internal
+  * \brief Template functor to compute the conjugate product of two scalars
+  *
+  * This is a short cut for conj(x) * y which is needed for optimization purpose; in Eigen2 support mode, this becomes x * conj(y)
+  */
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_conj_product_op  : binary_op_base<LhsScalar,RhsScalar>
+{
+
+  enum {
+    Conj = NumTraits<LhsScalar>::IsComplex
+  };
+  
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_conj_product_op>::ReturnType result_type;
+  
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const
+  { return conj_helper<LhsScalar,RhsScalar,Conj,false>().pmul(a,b); }
+  
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const
+  { return conj_helper<Packet,Packet,Conj,false>().pmul(a,b); }
+};
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_conj_product_op<LhsScalar,RhsScalar> > {
+  enum {
+    Cost = NumTraits<LhsScalar>::MulCost,
+    PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMul
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the min of two scalars
+  *
+  * \sa class CwiseBinaryOp, MatrixBase::cwiseMin, class VectorwiseOp, MatrixBase::minCoeff()
+  */
+template<typename LhsScalar,typename RhsScalar, int NaNPropagation>
+struct scalar_min_op : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_min_op>::ReturnType result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const {
+    return internal::pmin<NaNPropagation>(a, b);
+  }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const
+  {
+    return internal::pmin<NaNPropagation>(a,b);
+  }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const
+  {
+    return internal::predux_min<NaNPropagation>(a);
+  }
+};
+
+template<typename LhsScalar,typename RhsScalar, int NaNPropagation>
+struct functor_traits<scalar_min_op<LhsScalar,RhsScalar, NaNPropagation> > {
+  enum {
+    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
+    PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMin
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the max of two scalars
+  *
+  * \sa class CwiseBinaryOp, MatrixBase::cwiseMax, class VectorwiseOp, MatrixBase::maxCoeff()
+  */
+template<typename LhsScalar,typename RhsScalar, int NaNPropagation>
+struct scalar_max_op : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_max_op>::ReturnType result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const {
+    return internal::pmax<NaNPropagation>(a,b);
+  }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const
+  {
+    return internal::pmax<NaNPropagation>(a,b);
+  }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const
+  {
+    return internal::predux_max<NaNPropagation>(a);
+  }
+};
+
+template<typename LhsScalar,typename RhsScalar, int NaNPropagation>
+struct functor_traits<scalar_max_op<LhsScalar,RhsScalar, NaNPropagation> > {
+  enum {
+    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
+    PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMax
+  };
+};
+
+/** \internal
+  * \brief Template functors for comparison of two scalars
+  * \todo Implement packet-comparisons
+  */
+template<typename LhsScalar, typename RhsScalar, ComparisonName cmp> struct scalar_cmp_op;
+
+template<typename LhsScalar, typename RhsScalar, ComparisonName cmp>
+struct functor_traits<scalar_cmp_op<LhsScalar,RhsScalar, cmp> > {
+  enum {
+    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
+    PacketAccess = false
+  };
+};
+
+template<ComparisonName Cmp, typename LhsScalar, typename RhsScalar>
+struct result_of<scalar_cmp_op<LhsScalar, RhsScalar, Cmp>(LhsScalar,RhsScalar)> {
+  typedef bool type;
+};
+
+
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_EQ> : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef bool result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a==b;}
+};
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_LT> : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef bool result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a<b;}
+};
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_LE> : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef bool result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a<=b;}
+};
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_GT> : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef bool result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a>b;}
+};
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_GE> : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef bool result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a>=b;}
+};
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_UNORD> : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef bool result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return !(a<=b || b<=a);}
+};
+template<typename LhsScalar, typename RhsScalar>
+struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_NEQ> : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef bool result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a!=b;}
+};
+
+/** \internal
+  * \brief Template functor to compute the hypot of two \b positive \b and \b real scalars
+  *
+  * \sa MatrixBase::stableNorm(), class Redux
+  */
+template<typename Scalar>
+struct scalar_hypot_op<Scalar,Scalar> : binary_op_base<Scalar,Scalar>
+{
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op)
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar &x, const Scalar &y) const
+  {
+    // This functor is used by hypotNorm only for which it is faster to first apply abs
+    // on all coefficients prior to reduction through hypot.
+    // This way we avoid calling abs on positive and real entries, and this also permits
+    // to seamlessly handle complexes. Otherwise we would have to handle both real and complexes
+    // through the same functor...
+    return internal::positive_real_hypot(x,y);
+  }
+};
+template<typename Scalar>
+struct functor_traits<scalar_hypot_op<Scalar,Scalar> > {
+  enum
+  {
+    Cost = 3 * NumTraits<Scalar>::AddCost +
+           2 * NumTraits<Scalar>::MulCost +
+           2 * scalar_div_cost<Scalar,false>::value,
+    PacketAccess = false
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the pow of two scalars
+  * See the specification of pow in https://en.cppreference.com/w/cpp/numeric/math/pow
+  */
+template<typename Scalar, typename Exponent>
+struct scalar_pow_op  : binary_op_base<Scalar,Exponent>
+{
+  typedef typename ScalarBinaryOpTraits<Scalar,Exponent,scalar_pow_op>::ReturnType result_type;
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_pow_op)
+#else
+  scalar_pow_op() {
+    typedef Scalar LhsScalar;
+    typedef Exponent RhsScalar;
+    EIGEN_SCALAR_BINARY_OP_PLUGIN
+  }
+#endif
+
+  EIGEN_DEVICE_FUNC
+  inline result_type operator() (const Scalar& a, const Exponent& b) const { return numext::pow(a, b); }
+
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  {
+    return generic_pow(a,b);
+  }
+};
+
+template<typename Scalar, typename Exponent>
+struct functor_traits<scalar_pow_op<Scalar,Exponent> > {
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = (!NumTraits<Scalar>::IsComplex && !NumTraits<Scalar>::IsInteger &&
+                    packet_traits<Scalar>::HasExp && packet_traits<Scalar>::HasLog &&
+                    packet_traits<Scalar>::HasRound && packet_traits<Scalar>::HasCmp &&
+                    // Temporarly disable packet access for half/bfloat16 until
+                    // accuracy is improved.
+                    !is_same<Scalar, half>::value && !is_same<Scalar, bfloat16>::value
+                    )
+  };
+};
+
+//---------- non associative binary functors ----------
+
+/** \internal
+  * \brief Template functor to compute the difference of two scalars
+  *
+  * \sa class CwiseBinaryOp, MatrixBase::operator-
+  */
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_difference_op : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_difference_op>::ReturnType result_type;
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
+#else
+  scalar_difference_op() {
+    EIGEN_SCALAR_BINARY_OP_PLUGIN
+  }
+#endif
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a - b; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::psub(a,b); }
+};
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_difference_op<LhsScalar,RhsScalar> > {
+  enum {
+    Cost = (int(NumTraits<LhsScalar>::AddCost) + int(NumTraits<RhsScalar>::AddCost)) / 2,
+    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasSub && packet_traits<RhsScalar>::HasSub
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the quotient of two scalars
+  *
+  * \sa class CwiseBinaryOp, Cwise::operator/()
+  */
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_quotient_op  : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_quotient_op>::ReturnType result_type;
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
+#else
+  scalar_quotient_op() {
+    EIGEN_SCALAR_BINARY_OP_PLUGIN
+  }
+#endif
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a / b; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pdiv(a,b); }
+};
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_quotient_op<LhsScalar,RhsScalar> > {
+  typedef typename scalar_quotient_op<LhsScalar,RhsScalar>::result_type result_type;
+  enum {
+    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasDiv && packet_traits<RhsScalar>::HasDiv,
+    Cost = scalar_div_cost<result_type,PacketAccess>::value
+  };
+};
+
+
+
+/** \internal
+  * \brief Template functor to compute the and of two booleans
+  *
+  * \sa class CwiseBinaryOp, ArrayBase::operator&&
+  */
+struct scalar_boolean_and_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_and_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pand(a,b); }
+};
+template<> struct functor_traits<scalar_boolean_and_op> {
+  enum {
+    Cost = NumTraits<bool>::AddCost,
+    PacketAccess = true
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the or of two booleans
+  *
+  * \sa class CwiseBinaryOp, ArrayBase::operator||
+  */
+struct scalar_boolean_or_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_or_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a || b; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::por(a,b); }
+};
+template<> struct functor_traits<scalar_boolean_or_op> {
+  enum {
+    Cost = NumTraits<bool>::AddCost,
+    PacketAccess = true
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the xor of two booleans
+ *
+ * \sa class CwiseBinaryOp, ArrayBase::operator^
+ */
+struct scalar_boolean_xor_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_xor_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a ^ b; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pxor(a,b); }
+};
+template<> struct functor_traits<scalar_boolean_xor_op> {
+  enum {
+    Cost = NumTraits<bool>::AddCost,
+    PacketAccess = true
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the absolute difference of two scalars
+  *
+  * \sa class CwiseBinaryOp, MatrixBase::absolute_difference
+  */
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_absolute_difference_op : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_absolute_difference_op>::ReturnType result_type;
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_absolute_difference_op)
+#else
+  scalar_absolute_difference_op() {
+    EIGEN_SCALAR_BINARY_OP_PLUGIN
+  }
+#endif
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
+  { return numext::absdiff(a,b); }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pabsdiff(a,b); }
+};
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_absolute_difference_op<LhsScalar,RhsScalar> > {
+  enum {
+    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
+    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasAbsDiff
+  };
+};
+
+
+
+//---------- binary functors bound to a constant, thus appearing as a unary functor ----------
+
+// The following two classes permits to turn any binary functor into a unary one with one argument bound to a constant value.
+// They are analogues to std::binder1st/binder2nd but with the following differences:
+//  - they are compatible with packetOp
+//  - they are portable across C++ versions (the std::binder* are deprecated in C++11)
+template<typename BinaryOp> struct bind1st_op : BinaryOp {
+
+  typedef typename BinaryOp::first_argument_type  first_argument_type;
+  typedef typename BinaryOp::second_argument_type second_argument_type;
+  typedef typename BinaryOp::result_type          result_type;
+
+  EIGEN_DEVICE_FUNC explicit bind1st_op(const first_argument_type &val) : m_value(val) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); }
+
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& b) const
+  { return BinaryOp::packetOp(internal::pset1<Packet>(m_value), b); }
+
+  first_argument_type m_value;
+};
+template<typename BinaryOp> struct functor_traits<bind1st_op<BinaryOp> > : functor_traits<BinaryOp> {};
+
+
+template<typename BinaryOp> struct bind2nd_op : BinaryOp {
+
+  typedef typename BinaryOp::first_argument_type  first_argument_type;
+  typedef typename BinaryOp::second_argument_type second_argument_type;
+  typedef typename BinaryOp::result_type          result_type;
+
+  EIGEN_DEVICE_FUNC explicit bind2nd_op(const second_argument_type &val) : m_value(val) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); }
+
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return BinaryOp::packetOp(a,internal::pset1<Packet>(m_value)); }
+
+  second_argument_type m_value;
+};
+template<typename BinaryOp> struct functor_traits<bind2nd_op<BinaryOp> > : functor_traits<BinaryOp> {};
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_BINARY_FUNCTORS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/functors/NullaryFunctors.h b/src/3rdparty/eigen/Eigen/src/Core/functors/NullaryFunctors.h
new file mode 100644
index 000000000..192f225dd
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/functors/NullaryFunctors.h
@@ -0,0 +1,189 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NULLARY_FUNCTORS_H
+#define EIGEN_NULLARY_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename Scalar>
+struct scalar_constant_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const scalar_constant_op& other) : m_other(other.m_other) { }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const Scalar& other) : m_other(other) { }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() () const { return m_other; }
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetOp() const { return internal::pset1<PacketType>(m_other); }
+  const Scalar m_other;
+};
+template<typename Scalar>
+struct functor_traits<scalar_constant_op<Scalar> >
+{ enum { Cost = 0 /* as the constant value should be loaded in register only once for the whole expression */,
+         PacketAccess = packet_traits<Scalar>::Vectorizable, IsRepeatable = true }; };
+
+template<typename Scalar> struct scalar_identity_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_identity_op)
+  template<typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType row, IndexType col) const { return row==col ? Scalar(1) : Scalar(0); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_identity_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true }; };
+
+template <typename Scalar, bool IsInteger> struct linspaced_op_impl;
+
+template <typename Scalar>
+struct linspaced_op_impl<Scalar,/*IsInteger*/false>
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  EIGEN_DEVICE_FUNC linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
+    m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : Scalar((high-low)/RealScalar(num_steps-1))),
+    m_flip(numext::abs(high)<numext::abs(low))
+  {}
+
+  template<typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const {
+    if(m_flip)
+      return (i==0)? m_low : Scalar(m_high - RealScalar(m_size1-i)*m_step);
+    else
+      return (i==m_size1)? m_high : Scalar(m_low + RealScalar(i)*m_step);
+  }
+
+  template<typename Packet, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const
+  {
+    // Principle:
+    // [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
+    if(m_flip)
+    {
+      Packet pi = plset<Packet>(Scalar(i-m_size1));
+      Packet res = padd(pset1<Packet>(m_high), pmul(pset1<Packet>(m_step), pi));
+      if (EIGEN_PREDICT_TRUE(i != 0)) return res;
+      Packet mask = pcmp_lt(pset1<Packet>(0), plset<Packet>(0));
+      return pselect<Packet>(mask, res, pset1<Packet>(m_low));
+    }
+    else
+    {
+      Packet pi = plset<Packet>(Scalar(i));
+      Packet res = padd(pset1<Packet>(m_low), pmul(pset1<Packet>(m_step), pi));
+      if(EIGEN_PREDICT_TRUE(i != m_size1-unpacket_traits<Packet>::size+1)) return res;
+      Packet mask = pcmp_lt(plset<Packet>(0), pset1<Packet>(unpacket_traits<Packet>::size-1));
+      return pselect<Packet>(mask, res, pset1<Packet>(m_high));
+    }
+  }
+
+  const Scalar m_low;
+  const Scalar m_high;
+  const Index m_size1;
+  const Scalar m_step;
+  const bool m_flip;
+};
+
+template <typename Scalar>
+struct linspaced_op_impl<Scalar,/*IsInteger*/true>
+{
+  EIGEN_DEVICE_FUNC linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
+    m_low(low),
+    m_multiplier((high-low)/convert_index<Scalar>(num_steps<=1 ? 1 : num_steps-1)),
+    m_divisor(convert_index<Scalar>((high>=low?num_steps:-num_steps)+(high-low))/((numext::abs(high-low)+1)==0?1:(numext::abs(high-low)+1))),
+    m_use_divisor(num_steps>1 && (numext::abs(high-low)+1)<num_steps)
+  {}
+
+  template<typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const Scalar operator() (IndexType i) const
+  {
+    if(m_use_divisor) return m_low + convert_index<Scalar>(i)/m_divisor;
+    else              return m_low + convert_index<Scalar>(i)*m_multiplier;
+  }
+
+  const Scalar m_low;
+  const Scalar m_multiplier;
+  const Scalar m_divisor;
+  const bool m_use_divisor;
+};
+
+// ----- Linspace functor ----------------------------------------------------------------
+
+// Forward declaration (we default to random access which does not really give
+// us a speed gain when using packet access but it allows to use the functor in
+// nested expressions).
+template <typename Scalar> struct linspaced_op;
+template <typename Scalar> struct functor_traits< linspaced_op<Scalar> >
+{
+  enum
+  {
+    Cost = 1,
+    PacketAccess =   (!NumTraits<Scalar>::IsInteger) && packet_traits<Scalar>::HasSetLinear && packet_traits<Scalar>::HasBlend,
+                  /*&& ((!NumTraits<Scalar>::IsInteger) || packet_traits<Scalar>::HasDiv),*/ // <- vectorization for integer is currently disabled
+    IsRepeatable = true
+  };
+};
+template <typename Scalar> struct linspaced_op
+{
+  EIGEN_DEVICE_FUNC linspaced_op(const Scalar& low, const Scalar& high, Index num_steps)
+    : impl((num_steps==1 ? high : low),high,num_steps)
+  {}
+
+  template<typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { return impl(i); }
+
+  template<typename Packet,typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { return impl.template packetOp<Packet>(i); }
+
+  // This proxy object handles the actual required temporaries and the different
+  // implementations (integer vs. floating point).
+  const linspaced_op_impl<Scalar,NumTraits<Scalar>::IsInteger> impl;
+};
+
+// Linear access is automatically determined from the operator() prototypes available for the given functor.
+// If it exposes an operator()(i,j), then we assume the i and j coefficients are required independently
+// and linear access is not possible. In all other cases, linear access is enabled.
+// Users should not have to deal with this structure.
+template<typename Functor> struct functor_has_linear_access { enum { ret = !has_binary_operator<Functor>::value }; };
+
+// For unreliable compilers, let's specialize the has_*ary_operator
+// helpers so that at least built-in nullary functors work fine.
+#if !( (EIGEN_COMP_MSVC>1600) || (EIGEN_GNUC_AT_LEAST(4,8)) || (EIGEN_COMP_ICC>=1600))
+template<typename Scalar,typename IndexType>
+struct has_nullary_operator<scalar_constant_op<Scalar>,IndexType> { enum { value = 1}; };
+template<typename Scalar,typename IndexType>
+struct has_unary_operator<scalar_constant_op<Scalar>,IndexType> { enum { value = 0}; };
+template<typename Scalar,typename IndexType>
+struct has_binary_operator<scalar_constant_op<Scalar>,IndexType> { enum { value = 0}; };
+
+template<typename Scalar,typename IndexType>
+struct has_nullary_operator<scalar_identity_op<Scalar>,IndexType> { enum { value = 0}; };
+template<typename Scalar,typename IndexType>
+struct has_unary_operator<scalar_identity_op<Scalar>,IndexType> { enum { value = 0}; };
+template<typename Scalar,typename IndexType>
+struct has_binary_operator<scalar_identity_op<Scalar>,IndexType> { enum { value = 1}; };
+
+template<typename Scalar,typename IndexType>
+struct has_nullary_operator<linspaced_op<Scalar>,IndexType> { enum { value = 0}; };
+template<typename Scalar,typename IndexType>
+struct has_unary_operator<linspaced_op<Scalar>,IndexType> { enum { value = 1}; };
+template<typename Scalar,typename IndexType>
+struct has_binary_operator<linspaced_op<Scalar>,IndexType> { enum { value = 0}; };
+
+template<typename Scalar,typename IndexType>
+struct has_nullary_operator<scalar_random_op<Scalar>,IndexType> { enum { value = 1}; };
+template<typename Scalar,typename IndexType>
+struct has_unary_operator<scalar_random_op<Scalar>,IndexType> { enum { value = 0}; };
+template<typename Scalar,typename IndexType>
+struct has_binary_operator<scalar_random_op<Scalar>,IndexType> { enum { value = 0}; };
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_NULLARY_FUNCTORS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/functors/StlFunctors.h b/src/3rdparty/eigen/Eigen/src/Core/functors/StlFunctors.h
new file mode 100644
index 000000000..4570c9b63
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/functors/StlFunctors.h
@@ -0,0 +1,166 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_STL_FUNCTORS_H
+#define EIGEN_STL_FUNCTORS_H
+
+namespace Eigen {
+
+// Portable replacements for certain functors.
+namespace numext {
+
+template<typename T = void>
+struct equal_to {
+  typedef bool result_type;
+  EIGEN_DEVICE_FUNC bool operator()(const T& lhs, const T& rhs) const {
+    return lhs == rhs;
+  }
+};
+
+template<typename T = void>
+struct not_equal_to {
+  typedef bool result_type;
+  EIGEN_DEVICE_FUNC bool operator()(const T& lhs, const T& rhs) const {
+    return lhs != rhs;
+  }
+};
+
+}
+
+
+namespace internal {
+
+// default functor traits for STL functors:
+
+template<typename T>
+struct functor_traits<std::multiplies<T> >
+{ enum { Cost = NumTraits<T>::MulCost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::divides<T> >
+{ enum { Cost = NumTraits<T>::MulCost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::plus<T> >
+{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::minus<T> >
+{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::negate<T> >
+{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::logical_or<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::logical_and<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::logical_not<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::greater<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::less<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::greater_equal<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::less_equal<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::equal_to<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<numext::equal_to<T> >
+  : functor_traits<std::equal_to<T> > {};
+
+template<typename T>
+struct functor_traits<std::not_equal_to<T> >
+{ enum { Cost = 1, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<numext::not_equal_to<T> >
+  : functor_traits<std::not_equal_to<T> > {};
+
+#if (EIGEN_COMP_CXXVER < 11)
+// std::binder* are deprecated since c++11 and will be removed in c++17
+template<typename T>
+struct functor_traits<std::binder2nd<T> >
+{ enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::binder1st<T> >
+{ enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
+#endif
+
+#if (EIGEN_COMP_CXXVER < 17)
+// std::unary_negate is deprecated since c++17 and will be removed in c++20
+template<typename T>
+struct functor_traits<std::unary_negate<T> >
+{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
+
+// std::binary_negate is deprecated since c++17 and will be removed in c++20
+template<typename T>
+struct functor_traits<std::binary_negate<T> >
+{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
+#endif
+
+#ifdef EIGEN_STDEXT_SUPPORT
+
+template<typename T0,typename T1>
+struct functor_traits<std::project1st<T0,T1> >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+template<typename T0,typename T1>
+struct functor_traits<std::project2nd<T0,T1> >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+template<typename T0,typename T1>
+struct functor_traits<std::select2nd<std::pair<T0,T1> > >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+template<typename T0,typename T1>
+struct functor_traits<std::select1st<std::pair<T0,T1> > >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+template<typename T0,typename T1>
+struct functor_traits<std::unary_compose<T0,T1> >
+{ enum { Cost = functor_traits<T0>::Cost + functor_traits<T1>::Cost, PacketAccess = false }; };
+
+template<typename T0,typename T1,typename T2>
+struct functor_traits<std::binary_compose<T0,T1,T2> >
+{ enum { Cost = functor_traits<T0>::Cost + functor_traits<T1>::Cost + functor_traits<T2>::Cost, PacketAccess = false }; };
+
+#endif // EIGEN_STDEXT_SUPPORT
+
+// allow to add new functors and specializations of functor_traits from outside Eigen.
+// this macro is really needed because functor_traits must be specialized after it is declared but before it is used...
+#ifdef EIGEN_FUNCTORS_PLUGIN
+#include EIGEN_FUNCTORS_PLUGIN
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_STL_FUNCTORS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/functors/TernaryFunctors.h b/src/3rdparty/eigen/Eigen/src/Core/functors/TernaryFunctors.h
new file mode 100644
index 000000000..b254e96c6
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/functors/TernaryFunctors.h
@@ -0,0 +1,25 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TERNARY_FUNCTORS_H
+#define EIGEN_TERNARY_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- associative ternary functors ----------
+
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TERNARY_FUNCTORS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/functors/UnaryFunctors.h b/src/3rdparty/eigen/Eigen/src/Core/functors/UnaryFunctors.h
new file mode 100644
index 000000000..16136d185
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/functors/UnaryFunctors.h
@@ -0,0 +1,1131 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_UNARY_FUNCTORS_H
+#define EIGEN_UNARY_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal
+  * \brief Template functor to compute the opposite of a scalar
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::operator-
+  */
+template<typename Scalar> struct scalar_opposite_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_opposite_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return -a; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::pnegate(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_opposite_op<Scalar> >
+{ enum {
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasNegate };
+};
+
+/** \internal
+  * \brief Template functor to compute the absolute value of a scalar
+  *
+  * \sa class CwiseUnaryOp, Cwise::abs
+  */
+template<typename Scalar> struct scalar_abs_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_abs_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::abs(a); }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::pabs(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_abs_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasAbs
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the score of a scalar, to chose a pivot
+  *
+  * \sa class CwiseUnaryOp
+  */
+template<typename Scalar> struct scalar_score_coeff_op : scalar_abs_op<Scalar>
+{
+  typedef void Score_is_abs;
+};
+template<typename Scalar>
+struct functor_traits<scalar_score_coeff_op<Scalar> > : functor_traits<scalar_abs_op<Scalar> > {};
+
+/* Avoid recomputing abs when we know the score and they are the same. Not a true Eigen functor.  */
+template<typename Scalar, typename=void> struct abs_knowing_score
+{
+  EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  template<typename Score>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a, const Score&) const { return numext::abs(a); }
+};
+template<typename Scalar> struct abs_knowing_score<Scalar, typename scalar_score_coeff_op<Scalar>::Score_is_abs>
+{
+  EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  template<typename Scal>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scal&, const result_type& a) const { return a; }
+};
+
+/** \internal
+  * \brief Template functor to compute the squared absolute value of a scalar
+  *
+  * \sa class CwiseUnaryOp, Cwise::abs2
+  */
+template<typename Scalar> struct scalar_abs2_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_abs2_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::abs2(a); }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::pmul(a,a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_abs2_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasAbs2 }; };
+
+/** \internal
+  * \brief Template functor to compute the conjugate of a complex value
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::conjugate()
+  */
+template<typename Scalar> struct scalar_conjugate_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_conjugate_op)
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::conj(a); }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pconj(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_conjugate_op<Scalar> >
+{
+  enum {
+    Cost = 0,
+    // Yes the cost is zero even for complexes because in most cases for which
+    // the cost is used, conjugation turns to be a no-op. Some examples:
+    //   cost(a*conj(b)) == cost(a*b)
+    //   cost(a+conj(b)) == cost(a+b)
+    //   <etc.
+    // If we don't set it to zero, then:
+    //   A.conjugate().lazyProduct(B.conjugate())
+    // will bake its operands. We definitely don't want that!
+    PacketAccess = packet_traits<Scalar>::HasConj
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the phase angle of a complex
+  *
+  * \sa class CwiseUnaryOp, Cwise::arg
+  */
+template<typename Scalar> struct scalar_arg_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_arg_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::arg(a); }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::parg(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_arg_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::IsComplex ? 5 * NumTraits<Scalar>::MulCost : NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasArg
+  };
+};
+/** \internal
+  * \brief Template functor to cast a scalar to another type
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::cast()
+  */
+template<typename Scalar, typename NewType>
+struct scalar_cast_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef NewType result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const NewType operator() (const Scalar& a) const { return cast<Scalar, NewType>(a); }
+};
+template<typename Scalar, typename NewType>
+struct functor_traits<scalar_cast_op<Scalar,NewType> >
+{ enum { Cost = is_same<Scalar, NewType>::value ? 0 : NumTraits<NewType>::AddCost, PacketAccess = false }; };
+
+/** \internal
+  * \brief Template functor to arithmetically shift a scalar right by a number of bits
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::shift_right()
+  */
+template<typename Scalar, int N>
+struct scalar_shift_right_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_shift_right_op)
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const
+  { return a >> N; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::parithmetic_shift_right<N>(a); }
+};
+template<typename Scalar, int N>
+struct functor_traits<scalar_shift_right_op<Scalar,N> >
+{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasShift }; };
+
+/** \internal
+  * \brief Template functor to logically shift a scalar left by a number of bits
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::shift_left()
+  */
+template<typename Scalar, int N>
+struct scalar_shift_left_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_shift_left_op)
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const
+  { return a << N; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::plogical_shift_left<N>(a); }
+};
+template<typename Scalar, int N>
+struct functor_traits<scalar_shift_left_op<Scalar,N> >
+{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasShift }; };
+
+/** \internal
+  * \brief Template functor to extract the real part of a complex
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::real()
+  */
+template<typename Scalar>
+struct scalar_real_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_real_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return numext::real(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_real_op<Scalar> >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+/** \internal
+  * \brief Template functor to extract the imaginary part of a complex
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::imag()
+  */
+template<typename Scalar>
+struct scalar_imag_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return numext::imag(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_imag_op<Scalar> >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+/** \internal
+  * \brief Template functor to extract the real part of a complex as a reference
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::real()
+  */
+template<typename Scalar>
+struct scalar_real_ref_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_real_ref_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE result_type& operator() (const Scalar& a) const { return numext::real_ref(*const_cast<Scalar*>(&a)); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_real_ref_op<Scalar> >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+/** \internal
+  * \brief Template functor to extract the imaginary part of a complex as a reference
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::imag()
+  */
+template<typename Scalar>
+struct scalar_imag_ref_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_ref_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE result_type& operator() (const Scalar& a) const { return numext::imag_ref(*const_cast<Scalar*>(&a)); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_imag_ref_op<Scalar> >
+{ enum { Cost = 0, PacketAccess = false }; };
+
+/** \internal
+  *
+  * \brief Template functor to compute the exponential of a scalar
+  *
+  * \sa class CwiseUnaryOp, Cwise::exp()
+  */
+template<typename Scalar> struct scalar_exp_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_exp_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::exp(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pexp(a); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_exp_op<Scalar> > {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasExp,
+    // The following numbers are based on the AVX implementation.
+#ifdef EIGEN_VECTORIZE_FMA
+    // Haswell can issue 2 add/mul/madd per cycle.
+    Cost =
+    (sizeof(Scalar) == 4
+     // float: 8 pmadd, 4 pmul, 2 padd/psub, 6 other
+     ? (8 * NumTraits<Scalar>::AddCost + 6 * NumTraits<Scalar>::MulCost)
+     // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div,  13 other
+     : (14 * NumTraits<Scalar>::AddCost +
+        6 * NumTraits<Scalar>::MulCost +
+        scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value))
+#else
+    Cost =
+    (sizeof(Scalar) == 4
+     // float: 7 pmadd, 6 pmul, 4 padd/psub, 10 other
+     ? (21 * NumTraits<Scalar>::AddCost + 13 * NumTraits<Scalar>::MulCost)
+     // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div,  13 other
+     : (23 * NumTraits<Scalar>::AddCost +
+        12 * NumTraits<Scalar>::MulCost +
+        scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value))
+#endif
+  };
+};
+
+/** \internal
+  *
+  * \brief Template functor to compute the exponential of a scalar - 1.
+  *
+  * \sa class CwiseUnaryOp, ArrayBase::expm1()
+  */
+template<typename Scalar> struct scalar_expm1_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_expm1_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::expm1(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pexpm1(a); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_expm1_op<Scalar> > {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasExpm1,
+    Cost = functor_traits<scalar_exp_op<Scalar> >::Cost // TODO measure cost of expm1
+  };
+};
+
+/** \internal
+  *
+  * \brief Template functor to compute the logarithm of a scalar
+  *
+  * \sa class CwiseUnaryOp, ArrayBase::log()
+  */
+template<typename Scalar> struct scalar_log_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_log_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::log(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog(a); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_log_op<Scalar> > {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasLog,
+    Cost =
+    (PacketAccess
+     // The following numbers are based on the AVX implementation.
+#ifdef EIGEN_VECTORIZE_FMA
+     // 8 pmadd, 6 pmul, 8 padd/psub, 16 other, can issue 2 add/mul/madd per cycle.
+     ? (20 * NumTraits<Scalar>::AddCost + 7 * NumTraits<Scalar>::MulCost)
+#else
+     // 8 pmadd, 6 pmul, 8 padd/psub, 20 other
+     ? (36 * NumTraits<Scalar>::AddCost + 14 * NumTraits<Scalar>::MulCost)
+#endif
+     // Measured cost of std::log.
+     : sizeof(Scalar)==4 ? 40 : 85)
+  };
+};
+
+/** \internal
+  *
+  * \brief Template functor to compute the logarithm of 1 plus a scalar value
+  *
+  * \sa class CwiseUnaryOp, ArrayBase::log1p()
+  */
+template<typename Scalar> struct scalar_log1p_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_log1p_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::log1p(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog1p(a); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_log1p_op<Scalar> > {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasLog1p,
+    Cost = functor_traits<scalar_log_op<Scalar> >::Cost // TODO measure cost of log1p
+  };
+};
+
+/** \internal
+  *
+  * \brief Template functor to compute the base-10 logarithm of a scalar
+  *
+  * \sa class CwiseUnaryOp, Cwise::log10()
+  */
+template<typename Scalar> struct scalar_log10_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_log10_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { EIGEN_USING_STD(log10) return log10(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog10(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_log10_op<Scalar> >
+{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog10 }; };
+
+/** \internal
+  *
+  * \brief Template functor to compute the base-2 logarithm of a scalar
+  *
+  * \sa class CwiseUnaryOp, Cwise::log2()
+  */
+template<typename Scalar> struct scalar_log2_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_log2_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return Scalar(EIGEN_LOG2E) * numext::log(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog2(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_log2_op<Scalar> >
+{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog }; };
+
+/** \internal
+  * \brief Template functor to compute the square root of a scalar
+  * \sa class CwiseUnaryOp, Cwise::sqrt()
+  */
+template<typename Scalar> struct scalar_sqrt_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sqrt(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psqrt(a); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_sqrt_op<Scalar> > {
+  enum {
+#if EIGEN_FAST_MATH
+    // The following numbers are based on the AVX implementation.
+    Cost = (sizeof(Scalar) == 8 ? 28
+                                // 4 pmul, 1 pmadd, 3 other
+                                : (3 * NumTraits<Scalar>::AddCost +
+                                   5 * NumTraits<Scalar>::MulCost)),
+#else
+    // The following numbers are based on min VSQRT throughput on Haswell.
+    Cost = (sizeof(Scalar) == 8 ? 28 : 14),
+#endif
+    PacketAccess = packet_traits<Scalar>::HasSqrt
+  };
+};
+
+// Boolean specialization to eliminate -Wimplicit-conversion-floating-point-to-bool warnings.
+template<> struct scalar_sqrt_op<bool> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op)
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator() (const bool& a) const { return a; }
+  template <typename Packet>
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return a; }
+};
+template <>
+struct functor_traits<scalar_sqrt_op<bool> > {
+  enum { Cost = 1, PacketAccess = packet_traits<bool>::Vectorizable };
+};
+
+/** \internal
+  * \brief Template functor to compute the reciprocal square root of a scalar
+  * \sa class CwiseUnaryOp, Cwise::rsqrt()
+  */
+template<typename Scalar> struct scalar_rsqrt_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_rsqrt_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::rsqrt(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::prsqrt(a); }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_rsqrt_op<Scalar> >
+{ enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasRsqrt
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the cosine of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::cos()
+  */
+template<typename Scalar> struct scalar_cos_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cos_op)
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return numext::cos(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pcos(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_cos_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasCos
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the sine of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::sin()
+  */
+template<typename Scalar> struct scalar_sin_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sin_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sin(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psin(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_sin_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasSin
+  };
+};
+
+
+/** \internal
+  * \brief Template functor to compute the tan of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::tan()
+  */
+template<typename Scalar> struct scalar_tan_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_tan_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::tan(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::ptan(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_tan_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasTan
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the arc cosine of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::acos()
+  */
+template<typename Scalar> struct scalar_acos_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_acos_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::acos(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pacos(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_acos_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasACos
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the arc sine of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::asin()
+  */
+template<typename Scalar> struct scalar_asin_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_asin_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::asin(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pasin(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_asin_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasASin
+  };
+};
+
+
+/** \internal
+  * \brief Template functor to compute the atan of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::atan()
+  */
+template<typename Scalar> struct scalar_atan_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_atan_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::atan(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::patan(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_atan_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasATan
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the tanh of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::tanh()
+  */
+template <typename Scalar>
+struct scalar_tanh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::tanh(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x) const { return ptanh(x); }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_tanh_op<Scalar> > {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasTanh,
+    Cost = ( (EIGEN_FAST_MATH && is_same<Scalar,float>::value)
+// The following numbers are based on the AVX implementation,
+#ifdef EIGEN_VECTORIZE_FMA
+                // Haswell can issue 2 add/mul/madd per cycle.
+                // 9 pmadd, 2 pmul, 1 div, 2 other
+                ? (2 * NumTraits<Scalar>::AddCost +
+                   6 * NumTraits<Scalar>::MulCost +
+                   scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value)
+#else
+                ? (11 * NumTraits<Scalar>::AddCost +
+                   11 * NumTraits<Scalar>::MulCost +
+                   scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value)
+#endif
+                // This number assumes a naive implementation of tanh
+                : (6 * NumTraits<Scalar>::AddCost +
+                   3 * NumTraits<Scalar>::MulCost +
+                   2 * scalar_div_cost<Scalar,packet_traits<Scalar>::HasDiv>::value +
+                   functor_traits<scalar_exp_op<Scalar> >::Cost))
+  };
+};
+
+#if EIGEN_HAS_CXX11_MATH
+/** \internal
+  * \brief Template functor to compute the atanh of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::atanh()
+  */
+template <typename Scalar>
+struct scalar_atanh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_atanh_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::atanh(a); }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_atanh_op<Scalar> > {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+#endif
+
+/** \internal
+  * \brief Template functor to compute the sinh of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::sinh()
+  */
+template<typename Scalar> struct scalar_sinh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sinh_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sinh(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psinh(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_sinh_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasSinh
+  };
+};
+
+#if EIGEN_HAS_CXX11_MATH
+/** \internal
+  * \brief Template functor to compute the asinh of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::asinh()
+  */
+template <typename Scalar>
+struct scalar_asinh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_asinh_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::asinh(a); }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_asinh_op<Scalar> > {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+#endif
+
+/** \internal
+  * \brief Template functor to compute the cosh of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::cosh()
+  */
+template<typename Scalar> struct scalar_cosh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cosh_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::cosh(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pcosh(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_cosh_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasCosh
+  };
+};
+
+#if EIGEN_HAS_CXX11_MATH
+/** \internal
+  * \brief Template functor to compute the acosh of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::acosh()
+  */
+template <typename Scalar>
+struct scalar_acosh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_acosh_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::acosh(a); }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_acosh_op<Scalar> > {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+#endif
+
+/** \internal
+  * \brief Template functor to compute the inverse of a scalar
+  * \sa class CwiseUnaryOp, Cwise::inverse()
+  */
+template<typename Scalar>
+struct scalar_inverse_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_inverse_op)
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return Scalar(1)/a; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
+  { return internal::pdiv(pset1<Packet>(Scalar(1)),a); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_inverse_op<Scalar> > {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasDiv,
+    Cost = scalar_div_cost<Scalar, PacketAccess>::value
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the square of a scalar
+  * \sa class CwiseUnaryOp, Cwise::square()
+  */
+template<typename Scalar>
+struct scalar_square_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_square_op)
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a*a; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
+  { return internal::pmul(a,a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_square_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
+
+// Boolean specialization to avoid -Wint-in-bool-context warnings on GCC.
+template<>
+struct scalar_square_op<bool> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_square_op)
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator() (const bool& a) const { return a; }
+  template<typename Packet>
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
+  { return a; }
+};
+template<>
+struct functor_traits<scalar_square_op<bool> >
+{ enum { Cost = 0, PacketAccess = packet_traits<bool>::Vectorizable }; };
+
+/** \internal
+  * \brief Template functor to compute the cube of a scalar
+  * \sa class CwiseUnaryOp, Cwise::cube()
+  */
+template<typename Scalar>
+struct scalar_cube_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cube_op)
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a*a*a; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
+  { return internal::pmul(a,pmul(a,a)); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_cube_op<Scalar> >
+{ enum { Cost = 2*NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
+
+// Boolean specialization to avoid -Wint-in-bool-context warnings on GCC.
+template<>
+struct scalar_cube_op<bool> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cube_op)
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator() (const bool& a) const { return a; }
+  template<typename Packet>
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
+  { return a; }
+};
+template<>
+struct functor_traits<scalar_cube_op<bool> >
+{ enum { Cost = 0, PacketAccess = packet_traits<bool>::Vectorizable }; };
+
+/** \internal
+  * \brief Template functor to compute the rounded value of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::round()
+  */
+template<typename Scalar> struct scalar_round_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_round_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::round(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pround(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_round_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasRound
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the floor of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::floor()
+  */
+template<typename Scalar> struct scalar_floor_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_floor_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::floor(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pfloor(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_floor_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasFloor
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the rounded (with current rounding mode)  value of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::rint()
+  */
+template<typename Scalar> struct scalar_rint_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_rint_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::rint(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::print(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_rint_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasRint
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the ceil of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::ceil()
+  */
+template<typename Scalar> struct scalar_ceil_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_ceil_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::ceil(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pceil(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_ceil_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasCeil
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute whether a scalar is NaN
+  * \sa class CwiseUnaryOp, ArrayBase::isnan()
+  */
+template<typename Scalar> struct scalar_isnan_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_isnan_op)
+  typedef bool result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const {
+#if defined(SYCL_DEVICE_ONLY)
+    return numext::isnan(a);
+#else
+    return (numext::isnan)(a);
+#endif
+  }
+};
+template<typename Scalar>
+struct functor_traits<scalar_isnan_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = false
+  };
+};
+
+/** \internal
+  * \brief Template functor to check whether a scalar is +/-inf
+  * \sa class CwiseUnaryOp, ArrayBase::isinf()
+  */
+template<typename Scalar> struct scalar_isinf_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_isinf_op)
+  typedef bool result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const {
+#if defined(SYCL_DEVICE_ONLY)
+    return numext::isinf(a);
+#else
+    return (numext::isinf)(a);
+#endif
+  }
+};
+template<typename Scalar>
+struct functor_traits<scalar_isinf_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = false
+  };
+};
+
+/** \internal
+  * \brief Template functor to check whether a scalar has a finite value
+  * \sa class CwiseUnaryOp, ArrayBase::isfinite()
+  */
+template<typename Scalar> struct scalar_isfinite_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_isfinite_op)
+  typedef bool result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const {
+#if defined(SYCL_DEVICE_ONLY)
+    return numext::isfinite(a);
+#else
+    return (numext::isfinite)(a);
+#endif
+  }
+};
+template<typename Scalar>
+struct functor_traits<scalar_isfinite_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = false
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the logical not of a boolean
+  *
+  * \sa class CwiseUnaryOp, ArrayBase::operator!
+  */
+template<typename Scalar> struct scalar_boolean_not_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_not_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a) const { return !a; }
+};
+template<typename Scalar>
+struct functor_traits<scalar_boolean_not_op<Scalar> > {
+  enum {
+    Cost = NumTraits<bool>::AddCost,
+    PacketAccess = false
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the signum of a scalar
+  * \sa class CwiseUnaryOp, Cwise::sign()
+  */
+template<typename Scalar,bool is_complex=(NumTraits<Scalar>::IsComplex!=0), bool is_integer=(NumTraits<Scalar>::IsInteger!=0) > struct scalar_sign_op;
+template<typename Scalar>
+struct scalar_sign_op<Scalar, false, true> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const
+  {
+      return Scalar( (a>Scalar(0)) - (a<Scalar(0)) );
+  }
+  //TODO
+  //template <typename Packet>
+  //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); }
+};
+
+template<typename Scalar>
+struct scalar_sign_op<Scalar, false, false> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const
+  {
+    return (numext::isnan)(a) ? a : Scalar( (a>Scalar(0)) - (a<Scalar(0)) );
+  }
+  //TODO
+  //template <typename Packet>
+  //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); }
+};
+
+template<typename Scalar, bool is_integer>
+struct scalar_sign_op<Scalar,true, is_integer> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const
+  {
+    typedef typename NumTraits<Scalar>::Real real_type;
+    real_type aa = numext::abs(a);
+    if (aa==real_type(0))
+      return Scalar(0);
+    aa = real_type(1)/aa;
+    return Scalar(a.real()*aa, a.imag()*aa );
+  }
+  //TODO
+  //template <typename Packet>
+  //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_sign_op<Scalar> >
+{ enum {
+    Cost =
+        NumTraits<Scalar>::IsComplex
+        ? ( 8*NumTraits<Scalar>::MulCost  ) // roughly
+        : ( 3*NumTraits<Scalar>::AddCost),
+    PacketAccess = packet_traits<Scalar>::HasSign
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the logistic function of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::logistic()
+  */
+template <typename T>
+struct scalar_logistic_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
+    return packetOp(x);
+  }
+
+  template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Packet packetOp(const Packet& x) const {
+    const Packet one = pset1<Packet>(T(1));
+    return pdiv(one, padd(one, pexp(pnegate(x))));
+  }
+};
+
+#ifndef EIGEN_GPU_COMPILE_PHASE
+/** \internal
+  * \brief Template specialization of the logistic function for float.
+  *
+  *  Uses just a 9/10-degree rational interpolant which
+  *  interpolates 1/(1+exp(-x)) - 0.5 up to a couple of ulps in the range
+  *  [-9, 18]. Below -9 we use the more accurate approximation
+  *  1/(1+exp(-x)) ~= exp(x), and above 18 the logistic function is 1 withing
+  *  one ulp. The shifted logistic is interpolated because it was easier to
+  *  make the fit converge.
+  *
+  */
+template <>
+struct scalar_logistic_op<float> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator()(const float& x) const {
+    return packetOp(x);
+  }
+
+  template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Packet packetOp(const Packet& _x) const {
+    const Packet cutoff_lower = pset1<Packet>(-9.f);
+    const Packet lt_mask = pcmp_lt<Packet>(_x, cutoff_lower);
+    const bool any_small = predux_any(lt_mask);
+
+    // The upper cut-off is the smallest x for which the rational approximation evaluates to 1.
+    // Choosing this value saves us a few instructions clamping the results at the end.
+#ifdef EIGEN_VECTORIZE_FMA
+    const Packet cutoff_upper = pset1<Packet>(15.7243833541870117f);
+#else
+    const Packet cutoff_upper = pset1<Packet>(15.6437711715698242f);
+#endif
+    const Packet x = pmin(_x, cutoff_upper);
+
+    // The monomial coefficients of the numerator polynomial (odd).
+    const Packet alpha_1 = pset1<Packet>(2.48287947061529e-01f);
+    const Packet alpha_3 = pset1<Packet>(8.51377133304701e-03f);
+    const Packet alpha_5 = pset1<Packet>(6.08574864600143e-05f);
+    const Packet alpha_7 = pset1<Packet>(1.15627324459942e-07f);
+    const Packet alpha_9 = pset1<Packet>(4.37031012579801e-11f);
+
+    // The monomial coefficients of the denominator polynomial (even).
+    const Packet beta_0 = pset1<Packet>(9.93151921023180e-01f);
+    const Packet beta_2 = pset1<Packet>(1.16817656904453e-01f);
+    const Packet beta_4 = pset1<Packet>(1.70198817374094e-03f);
+    const Packet beta_6 = pset1<Packet>(6.29106785017040e-06f);
+    const Packet beta_8 = pset1<Packet>(5.76102136993427e-09f);
+    const Packet beta_10 = pset1<Packet>(6.10247389755681e-13f);
+
+    // Since the polynomials are odd/even, we need x^2.
+    const Packet x2 = pmul(x, x);
+
+    // Evaluate the numerator polynomial p.
+    Packet p = pmadd(x2, alpha_9, alpha_7);
+    p = pmadd(x2, p, alpha_5);
+    p = pmadd(x2, p, alpha_3);
+    p = pmadd(x2, p, alpha_1);
+    p = pmul(x, p);
+
+    // Evaluate the denominator polynomial q.
+    Packet q = pmadd(x2, beta_10, beta_8);
+    q = pmadd(x2, q, beta_6);
+    q = pmadd(x2, q, beta_4);
+    q = pmadd(x2, q, beta_2);
+    q = pmadd(x2, q, beta_0);
+    // Divide the numerator by the denominator and shift it up.
+    const Packet logistic = padd(pdiv(p, q), pset1<Packet>(0.5f));
+    if (EIGEN_PREDICT_FALSE(any_small)) {
+      const Packet exponential = pexp(_x);
+      return pselect(lt_mask, exponential, logistic);
+    } else {
+      return logistic;
+    }
+  }
+};
+#endif  // #ifndef EIGEN_GPU_COMPILE_PHASE
+
+template <typename T>
+struct functor_traits<scalar_logistic_op<T> > {
+  enum {
+    // The cost estimate for float here here is for the common(?) case where
+    // all arguments are greater than -9.
+    Cost = scalar_div_cost<T, packet_traits<T>::HasDiv>::value +
+           (internal::is_same<T, float>::value
+                ? NumTraits<T>::AddCost * 15 + NumTraits<T>::MulCost * 11
+                : NumTraits<T>::AddCost * 2 +
+                      functor_traits<scalar_exp_op<T> >::Cost),
+    PacketAccess =
+        packet_traits<T>::HasAdd && packet_traits<T>::HasDiv &&
+        (internal::is_same<T, float>::value
+             ? packet_traits<T>::HasMul && packet_traits<T>::HasMax &&
+                   packet_traits<T>::HasMin
+             : packet_traits<T>::HasNegate && packet_traits<T>::HasExp)
+  };
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_FUNCTORS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h
new file mode 100644
index 000000000..f35b760c1
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -0,0 +1,2645 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
+#define EIGEN_GENERAL_BLOCK_PANEL_H
+
+
+namespace Eigen {
+
+namespace internal {
+
+enum GEBPPacketSizeType {
+  GEBPPacketFull = 0,
+  GEBPPacketHalf,
+  GEBPPacketQuarter
+};
+
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false, int Arch=Architecture::Target, int _PacketSize=GEBPPacketFull>
+class gebp_traits;
+
+
+/** \internal \returns b if a<=0, and returns a otherwise. */
+inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
+{
+  return a<=0 ? b : a;
+}
+
+#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
+#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
+#else
+#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
+#endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
+
+#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
+#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
+#else
+#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
+#endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
+
+#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
+#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
+#else
+#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
+#endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
+  
+#if EIGEN_ARCH_i386_OR_x86_64
+const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024);
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024);
+#elif EIGEN_ARCH_PPC
+const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024);
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024);
+#else
+const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024);
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*1024);
+#endif
+
+#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
+#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
+#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE
+
+/** \internal */
+struct CacheSizes {
+  CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
+    int l1CacheSize, l2CacheSize, l3CacheSize;
+    queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
+    m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
+    m_l2 = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
+    m_l3 = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize);
+  }
+
+  std::ptrdiff_t m_l1;
+  std::ptrdiff_t m_l2;
+  std::ptrdiff_t m_l3;
+};
+
+/** \internal */
+inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
+{
+  static CacheSizes m_cacheSizes;
+
+  if(action==SetAction)
+  {
+    // set the cpu cache size and cache all block sizes from a global cache size in byte
+    eigen_internal_assert(l1!=0 && l2!=0);
+    m_cacheSizes.m_l1 = *l1;
+    m_cacheSizes.m_l2 = *l2;
+    m_cacheSizes.m_l3 = *l3;
+  }
+  else if(action==GetAction)
+  {
+    eigen_internal_assert(l1!=0 && l2!=0);
+    *l1 = m_cacheSizes.m_l1;
+    *l2 = m_cacheSizes.m_l2;
+    *l3 = m_cacheSizes.m_l3;
+  }
+  else
+  {
+    eigen_internal_assert(false);
+  }
+}
+
+/* Helper for computeProductBlockingSizes.
+ *
+ * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
+ * this function computes the blocking size parameters along the respective dimensions
+ * for matrix products and related algorithms. The blocking sizes depends on various
+ * parameters:
+ * - the L1 and L2 cache sizes,
+ * - the register level blocking sizes defined by gebp_traits,
+ * - the number of scalars that fit into a packet (when vectorization is enabled).
+ *
+ * \sa setCpuCacheSizes */
+
+template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
+void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
+{
+  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+
+  // Explanations:
+  // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
+  // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. Moreover, A' is processed
+  // per mr x kc horizontal small panels where mr is the blocking size along the m dimension
+  // at the register level. This small horizontal panel has to stay within L1 cache.
+  std::ptrdiff_t l1, l2, l3;
+  manage_caching_sizes(GetAction, &l1, &l2, &l3);
+  #ifdef EIGEN_VECTORIZE_AVX512
+  // We need to find a rationale for that, but without this adjustment,
+  // performance with AVX512 is pretty bad, like -20% slower.
+  // One reason is that with increasing packet-size, the blocking size k
+  // has to become pretty small if we want that 1 lhs panel fit within L1.
+  // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels are:
+  //   k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144.
+  // This is quite small for a good reuse of the accumulation registers.
+  l1 *= 4;
+  #endif
+
+  if (num_threads > 1) {
+    typedef typename Traits::ResScalar ResScalar;
+    enum {
+      kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+      ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
+      kr = 8,
+      mr = Traits::mr,
+      nr = Traits::nr
+    };
+    // Increasing k gives us more time to prefetch the content of the "C"
+    // registers. However once the latency is hidden there is no point in
+    // increasing the value of k, so we'll cap it at 320 (value determined
+    // experimentally).
+    // To avoid that k vanishes, we make k_cache at least as big as kr
+    const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
+    if (k_cache < k) {
+      k = k_cache - (k_cache % kr);
+      eigen_internal_assert(k > 0);
+    }
+
+    const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
+    const Index n_per_thread = numext::div_ceil(n, num_threads);
+    if (n_cache <= n_per_thread) {
+      // Don't exceed the capacity of the l2 cache.
+      eigen_internal_assert(n_cache >= static_cast<Index>(nr));
+      n = n_cache - (n_cache % nr);
+      eigen_internal_assert(n > 0);
+    } else {
+      n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
+    }
+
+    if (l3 > l2) {
+      // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
+      const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
+      const Index m_per_thread = numext::div_ceil(m, num_threads);
+      if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
+        m = m_cache - (m_cache % mr);
+        eigen_internal_assert(m > 0);
+      } else {
+        m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
+      }
+    }
+  }
+  else {
+    // In unit tests we do not want to use extra large matrices,
+    // so we reduce the cache size to check the blocking strategy is not flawed
+#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+    l1 = 9*1024;
+    l2 = 32*1024;
+    l3 = 512*1024;
+#endif
+
+    // Early return for small problems because the computation below are time consuming for small problems.
+    // Perhaps it would make more sense to consider k*n*m??
+    // Note that for very tiny problem, this function should be bypassed anyway
+    // because we use the coefficient-based implementation for them.
+    if((numext::maxi)(k,(numext::maxi)(m,n))<48)
+      return;
+
+    typedef typename Traits::ResScalar ResScalar;
+    enum {
+      k_peeling = 8,
+      k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+      k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
+    };
+
+    // ---- 1st level of blocking on L1, yields kc ----
+
+    // Blocking on the third dimension (i.e., k) is chosen so that an horizontal panel
+    // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fits within L1 cache.
+    // We also include a register-level block of the result (mx x nr).
+    // (In an ideal world only the lhs panel would stay in L1)
+    // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
+    const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
+    const Index old_k = k;
+    if(k>max_kc)
+    {
+      // We are really blocking on the third dimension:
+      // -> reduce blocking size to make sure the last block is as large as possible
+      //    while keeping the same number of sweeps over the result.
+      k = (k%max_kc)==0 ? max_kc
+                        : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
+
+      eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
+    }
+
+    // ---- 2nd level of blocking on max(L2,L3), yields nc ----
+
+    // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
+    //      actual_l2 = max(l2, l3/nb_core_sharing_l3)
+    // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it)
+    // For instance, it corresponds to 6MB of L3 shared among 4 cores.
+    #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+    const Index actual_l2 = l3;
+    #else
+    const Index actual_l2 = 1572864; // == 1.5 MB
+    #endif
+
+    // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2.
+    // The second half is implicitly reserved to access the result and lhs coefficients.
+    // When k<max_kc, then nc can arbitrarily growth. In practice, it seems to be fruitful
+    // to limit this growth: we bound nc to growth by a factor x1.5.
+    // However, if the entire lhs block fit within L1, then we are not going to block on the rows at all,
+    // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
+    Index max_nc;
+    const Index lhs_bytes = m * k * sizeof(LhsScalar);
+    const Index remaining_l1 = l1- k_sub - lhs_bytes;
+    if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
+    {
+      // L1 blocking
+      max_nc = remaining_l1 / (k*sizeof(RhsScalar));
+    }
+    else
+    {
+      // L2 blocking
+      max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
+    }
+    // WARNING Below, we assume that Traits::nr is a power of two.
+    Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
+    if(n>nc)
+    {
+      // We are really blocking over the columns:
+      // -> reduce blocking size to make sure the last block is as large as possible
+      //    while keeping the same number of sweeps over the packed lhs.
+      //    Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
+      n = (n%nc)==0 ? nc
+                    : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
+    }
+    else if(old_k==k)
+    {
+      // So far, no blocking at all, i.e., kc==k, and nc==n.
+      // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
+      // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete.
+      Index problem_size = k*n*sizeof(LhsScalar);
+      Index actual_lm = actual_l2;
+      Index max_mc = m;
+      if(problem_size<=1024)
+      {
+        // problem is small enough to keep in L1
+        // Let's choose m such that lhs's block fit in 1/3 of L1
+        actual_lm = l1;
+      }
+      else if(l3!=0 && problem_size<=32768)
+      {
+        // we have both L2 and L3, and problem is small enough to be kept in L2
+        // Let's choose m such that lhs's block fit in 1/3 of L2
+        actual_lm = l2;
+        max_mc = (numext::mini<Index>)(576,max_mc);
+      }
+      Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
+      if (mc > Traits::mr) mc -= mc % Traits::mr;
+      else if (mc==0) return;
+      m = (m%mc)==0 ? mc
+                    : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
+    }
+  }
+}
+
+template <typename Index>
+inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
+{
+#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
+  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
+    k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
+    m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
+    n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
+    return true;
+  }
+#else
+  EIGEN_UNUSED_VARIABLE(k)
+  EIGEN_UNUSED_VARIABLE(m)
+  EIGEN_UNUSED_VARIABLE(n)
+#endif
+  return false;
+}
+
+/** \brief Computes the blocking parameters for a m x k times k x n matrix product
+  *
+  * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
+  * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
+  * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension.
+  *
+  * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
+  * this function computes the blocking size parameters along the respective dimensions
+  * for matrix products and related algorithms.
+  *
+  * The blocking size parameters may be evaluated:
+  *   - either by a heuristic based on cache sizes;
+  *   - or using fixed prescribed values (for testing purposes).
+  *
+  * \sa setCpuCacheSizes */
+
+template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
+void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
+{
+  if (!useSpecificBlockingSizes(k, m, n)) {
+    evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
+  }
+}
+
+template<typename LhsScalar, typename RhsScalar, typename Index>
+inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
+{
+  computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
+}
+
+template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
+struct RhsPanelHelper {
+ private:
+  static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken;
+ public:
+  typedef typename conditional<remaining_registers>=4, RhsPacketx4, RhsPacket>::type type;
+};
+
+template <typename Packet>
+struct QuadPacket
+{
+  Packet B_0, B1, B2, B3;
+  const Packet& get(const FixedInt<0>&) const { return B_0; }
+  const Packet& get(const FixedInt<1>&) const { return B1; }
+  const Packet& get(const FixedInt<2>&) const { return B2; }
+  const Packet& get(const FixedInt<3>&) const { return B3; }
+};
+
+template <int N, typename T1, typename T2, typename T3>
+struct packet_conditional { typedef T3 type; };
+
+template <typename T1, typename T2, typename T3>
+struct packet_conditional<GEBPPacketFull, T1, T2, T3> { typedef T1 type; };
+
+template <typename T1, typename T2, typename T3>
+struct packet_conditional<GEBPPacketHalf, T1, T2, T3> { typedef T2 type; };
+
+#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size)         \
+  typedef typename packet_conditional<packet_size,                 \
+                                      typename packet_traits<name ## Scalar>::type, \
+                                      typename packet_traits<name ## Scalar>::half, \
+                                      typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
+  prefix ## name ## Packet
+
+#define PACKET_DECL_COND(name, packet_size)                        \
+  typedef typename packet_conditional<packet_size,                 \
+                                      typename packet_traits<name ## Scalar>::type, \
+                                      typename packet_traits<name ## Scalar>::half, \
+                                      typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
+  name ## Packet
+
+#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size)        \
+  typedef typename packet_conditional<packet_size,                 \
+                                      typename packet_traits<Scalar>::type, \
+                                      typename packet_traits<Scalar>::half, \
+                                      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
+  prefix ## ScalarPacket
+
+#define PACKET_DECL_COND_SCALAR(packet_size)                       \
+  typedef typename packet_conditional<packet_size,                 \
+                                      typename packet_traits<Scalar>::type, \
+                                      typename packet_traits<Scalar>::half, \
+                                      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
+  ScalarPacket
+
+/* Vectorization logic
+ *  real*real: unpack rhs to constant packets, ...
+ * 
+ *  cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
+ *          storing each res packet into two packets (2x2),
+ *          at the end combine them: swap the second and addsub them 
+ *  cf*cf : same but with 2x4 blocks
+ *  cplx*real : unpack rhs to constant packets, ...
+ *  real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
+ */
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
+class gebp_traits
+{
+public:
+  typedef _LhsScalar LhsScalar;
+  typedef _RhsScalar RhsScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+
+  enum {
+    ConjLhs = _ConjLhs,
+    ConjRhs = _ConjRhs,
+    Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
+    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+    RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
+    ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
+    
+    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+
+    // register block size along the N direction must be 1 or 4
+    nr = 4,
+
+    // register block size along the M direction (currently, this one cannot be modified)
+    default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \
+    && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914))
+    // we assume 16 registers or more
+    // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
+    // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
+    // Bug 1515: MSVC prior to v19.14 yields to register spilling.
+    mr = Vectorizable ? 3*LhsPacketSize : default_mr,
+#else
+    mr = default_mr,
+#endif
+    
+    LhsProgress = LhsPacketSize,
+    RhsProgress = 1
+  };
+
+
+  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+  typedef LhsPacket LhsPacket4Packing;
+
+  typedef QuadPacket<RhsPacket> RhsPacketx4;
+  typedef ResPacket AccPacket;
+  
+  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
+  {
+    p = pset1<ResPacket>(ResScalar(0));
+  }
+
+  template<typename RhsPacketType>
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
+  {
+    dest = pset1<RhsPacketType>(*b);
+  }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+  {
+    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
+  }
+
+  template<typename RhsPacketType>
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
+  {
+    loadRhs(b, dest);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+  {
+  }
+
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+  {
+    dest = ploadquad<RhsPacket>(b);
+  }
+
+  template<typename LhsPacketType>
+  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
+  {
+    dest = pload<LhsPacketType>(a);
+  }
+
+  template<typename LhsPacketType>
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
+  {
+    dest = ploadu<LhsPacketType>(a);
+  }
+
+  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
+  {
+    conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
+    // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
+    // let gcc allocate the register in which to store the result of the pmul
+    // (in the case where there is no FMA) gcc fails to figure out how to avoid
+    // spilling register.
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+    EIGEN_UNUSED_VARIABLE(tmp);
+    c = cj.pmadd(a,b,c);
+#else
+    tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
+#endif
+  }
+
+  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+  {
+    madd(a, b.get(lane), c, tmp, lane);
+  }
+
+  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+  {
+    r = pmadd(c,alpha,r);
+  }
+  
+  template<typename ResPacketHalf>
+  EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
+  {
+    r = pmadd(c,alpha,r);
+  }
+
+};
+
+template<typename RealScalar, bool _ConjLhs, int Arch, int _PacketSize>
+class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch, _PacketSize>
+{
+public:
+  typedef std::complex<RealScalar> LhsScalar;
+  typedef RealScalar RhsScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+
+  enum {
+    ConjLhs = _ConjLhs,
+    ConjRhs = false,
+    Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
+    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+    RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
+    ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
+    
+    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+    nr = 4,
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
+    // we assume 16 registers
+    mr = 3*LhsPacketSize,
+#else
+    mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
+#endif
+
+    LhsProgress = LhsPacketSize,
+    RhsProgress = 1
+  };
+
+  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+  typedef LhsPacket LhsPacket4Packing;
+
+  typedef QuadPacket<RhsPacket> RhsPacketx4;
+
+  typedef ResPacket AccPacket;
+
+  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
+  {
+    p = pset1<ResPacket>(ResScalar(0));
+  }
+
+  template<typename RhsPacketType>
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
+  {
+    dest = pset1<RhsPacketType>(*b);
+  }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+  {
+    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
+  }
+
+  template<typename RhsPacketType>
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
+  {
+    loadRhs(b, dest);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+  {}
+  
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+  {
+    loadRhsQuad_impl(b,dest, typename conditional<RhsPacketSize==16,true_type,false_type>::type());
+  }
+
+  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const
+  {
+    // FIXME we can do better!
+    // what we want here is a ploadheight
+    RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]};
+    dest = ploadquad<RhsPacket>(tmp);
+  }
+
+  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const
+  {
+    eigen_internal_assert(RhsPacketSize<=8);
+    dest = pset1<RhsPacket>(*b);
+  }
+
+  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+  {
+    dest = pload<LhsPacket>(a);
+  }
+
+  template<typename LhsPacketType>
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
+  {
+    dest = ploadu<LhsPacketType>(a);
+  }
+
+  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
+  {
+    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
+  }
+
+  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
+  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
+  {
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+    EIGEN_UNUSED_VARIABLE(tmp);
+    c.v = pmadd(a.v,b,c.v);
+#else
+    tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
+#endif
+  }
+
+  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
+  {
+    c += a * b;
+  }
+
+  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+  {
+    madd(a, b.get(lane), c, tmp, lane);
+  }
+
+  template <typename ResPacketType, typename AccPacketType>
+  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
+  {
+    conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj;
+    r = cj.pmadd(c,alpha,r);
+  }
+
+protected:
+};
+
+template<typename Packet>
+struct DoublePacket
+{
+  Packet first;
+  Packet second;
+};
+
+template<typename Packet>
+DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
+{
+  DoublePacket<Packet> res;
+  res.first  = padd(a.first, b.first);
+  res.second = padd(a.second,b.second);
+  return res;
+}
+
+// note that for DoublePacket<RealPacket> the "4" in "downto4"
+// corresponds to the number of complexes, so it means "8"
+// it terms of real coefficients.
+
+template<typename Packet>
+const DoublePacket<Packet>&
+predux_half_dowto4(const DoublePacket<Packet> &a,
+                   typename enable_if<unpacket_traits<Packet>::size<=8>::type* = 0)
+{
+  return a;
+}
+
+template<typename Packet>
+DoublePacket<typename unpacket_traits<Packet>::half>
+predux_half_dowto4(const DoublePacket<Packet> &a,
+                   typename enable_if<unpacket_traits<Packet>::size==16>::type* = 0)
+{
+  // yes, that's pretty hackish :(
+  DoublePacket<typename unpacket_traits<Packet>::half> res;
+  typedef std::complex<typename unpacket_traits<Packet>::type> Cplx;
+  typedef typename packet_traits<Cplx>::type CplxPacket;
+  res.first  = predux_half_dowto4(CplxPacket(a.first)).v;
+  res.second = predux_half_dowto4(CplxPacket(a.second)).v;
+  return res;
+}
+
+// same here, "quad" actually means "8" in terms of real coefficients
+template<typename Scalar, typename RealPacket>
+void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
+                            typename enable_if<unpacket_traits<RealPacket>::size<=8>::type* = 0)
+{
+  dest.first  = pset1<RealPacket>(numext::real(*b));
+  dest.second = pset1<RealPacket>(numext::imag(*b));
+}
+
+template<typename Scalar, typename RealPacket>
+void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
+                            typename enable_if<unpacket_traits<RealPacket>::size==16>::type* = 0)
+{
+  // yes, that's pretty hackish too :(
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])};
+  RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])};
+  dest.first  = ploadquad<RealPacket>(r);
+  dest.second = ploadquad<RealPacket>(i);
+}
+
+
+template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > {
+  typedef DoublePacket<typename unpacket_traits<Packet>::half> half;
+};
+// template<typename Packet>
+// DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
+// {
+//   DoublePacket<Packet> res;
+//   res.first  = padd(a.first, b.first);
+//   res.second = padd(a.second,b.second);
+//   return res;
+// }
+
+template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
+class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs, Arch, _PacketSize >
+{
+public:
+  typedef std::complex<RealScalar>  Scalar;
+  typedef std::complex<RealScalar>  LhsScalar;
+  typedef std::complex<RealScalar>  RhsScalar;
+  typedef std::complex<RealScalar>  ResScalar;
+  
+  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+  PACKET_DECL_COND(Real, _PacketSize);
+  PACKET_DECL_COND_SCALAR(_PacketSize);
+
+  enum {
+    ConjLhs = _ConjLhs,
+    ConjRhs = _ConjRhs,
+    Vectorizable = unpacket_traits<RealPacket>::vectorizable
+                && unpacket_traits<ScalarPacket>::vectorizable,
+    ResPacketSize   = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
+    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+    RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1,
+    RealPacketSize  = Vectorizable ? unpacket_traits<RealPacket>::size : 1,
+
+    // FIXME: should depend on NumberOfRegisters
+    nr = 4,
+    mr = ResPacketSize,
+
+    LhsProgress = ResPacketSize,
+    RhsProgress = 1
+  };
+  
+  typedef DoublePacket<RealPacket>                 DoublePacketType;
+
+  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type LhsPacket4Packing;
+  typedef typename conditional<Vectorizable,RealPacket,  Scalar>::type LhsPacket;
+  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
+  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
+  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
+
+  // this actualy holds 8 packets!
+  typedef QuadPacket<RhsPacket> RhsPacketx4;
+  
+  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
+
+  EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
+  {
+    p.first   = pset1<RealPacket>(RealScalar(0));
+    p.second  = pset1<RealPacket>(RealScalar(0));
+  }
+
+  // Scalar path
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const
+  {
+    dest = pset1<ScalarPacket>(*b);
+  }
+
+  // Vectorized path
+  template<typename RealPacketType>
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
+  {
+    dest.first  = pset1<RealPacketType>(numext::real(*b));
+    dest.second = pset1<RealPacketType>(numext::imag(*b));
+  }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+  {
+    loadRhs(b, dest.B_0);
+    loadRhs(b + 1, dest.B1);
+    loadRhs(b + 2, dest.B2);
+    loadRhs(b + 3, dest.B3);
+  }
+
+  // Scalar path
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const
+  {
+    loadRhs(b, dest);
+  }
+
+  // Vectorized path
+  template<typename RealPacketType>
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
+  {
+    loadRhs(b, dest);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
+  
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
+  {
+    loadRhs(b,dest);
+  }
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
+  {
+    loadQuadToDoublePacket(b,dest);
+  }
+
+  // nothing special here
+  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+  {
+    dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
+  }
+
+  template<typename LhsPacketType>
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
+  {
+    dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a));
+  }
+
+  template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType>
+  EIGEN_STRONG_INLINE
+  typename enable_if<!is_same<RhsPacketType,RhsPacketx4>::value>::type
+  madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/, const LaneIdType&) const
+  {
+    c.first   = padd(pmul(a,b.first), c.first);
+    c.second  = padd(pmul(a,b.second),c.second);
+  }
+
+  template<typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const
+  {
+    c = cj.pmadd(a,b,c);
+  }
+
+  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+  {
+    madd(a, b.get(lane), c, tmp, lane);
+  }
+  
+  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
+  
+  template<typename RealPacketType, typename ResPacketType>
+  EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha, ResPacketType& r) const
+  {
+    // assemble c
+    ResPacketType tmp;
+    if((!ConjLhs)&&(!ConjRhs))
+    {
+      tmp = pcplxflip(pconj(ResPacketType(c.second)));
+      tmp = padd(ResPacketType(c.first),tmp);
+    }
+    else if((!ConjLhs)&&(ConjRhs))
+    {
+      tmp = pconj(pcplxflip(ResPacketType(c.second)));
+      tmp = padd(ResPacketType(c.first),tmp);
+    }
+    else if((ConjLhs)&&(!ConjRhs))
+    {
+      tmp = pcplxflip(ResPacketType(c.second));
+      tmp = padd(pconj(ResPacketType(c.first)),tmp);
+    }
+    else if((ConjLhs)&&(ConjRhs))
+    {
+      tmp = pcplxflip(ResPacketType(c.second));
+      tmp = psub(pconj(ResPacketType(c.first)),tmp);
+    }
+    
+    r = pmadd(tmp,alpha,r);
+  }
+
+protected:
+  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
+};
+
+template<typename RealScalar, bool _ConjRhs, int Arch, int _PacketSize>
+class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch, _PacketSize >
+{
+public:
+  typedef std::complex<RealScalar>  Scalar;
+  typedef RealScalar  LhsScalar;
+  typedef Scalar      RhsScalar;
+  typedef Scalar      ResScalar;
+
+  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Real, _PacketSize);
+  PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize);
+
+#undef PACKET_DECL_COND_SCALAR_PREFIX
+#undef PACKET_DECL_COND_PREFIX
+#undef PACKET_DECL_COND_SCALAR
+#undef PACKET_DECL_COND
+
+  enum {
+    ConjLhs = false,
+    ConjRhs = _ConjRhs,
+    Vectorizable = unpacket_traits<_RealPacket>::vectorizable
+                && unpacket_traits<_ScalarPacket>::vectorizable,
+    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+    RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
+    ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
+    
+    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+    // FIXME: should depend on NumberOfRegisters
+    nr = 4,
+    mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize,
+
+    LhsProgress = ResPacketSize,
+    RhsProgress = 1
+  };
+
+  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+  typedef LhsPacket LhsPacket4Packing;
+  typedef QuadPacket<RhsPacket> RhsPacketx4;
+  typedef ResPacket AccPacket;
+
+  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
+  {
+    p = pset1<ResPacket>(ResScalar(0));
+  }
+
+  template<typename RhsPacketType>
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
+  {
+    dest = pset1<RhsPacketType>(*b);
+  }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+  {
+    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
+  }
+
+  template<typename RhsPacketType>
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
+  {
+    loadRhs(b, dest);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+  {}
+
+  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+  {
+    dest = ploaddup<LhsPacket>(a);
+  }
+  
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+  {
+    dest = ploadquad<RhsPacket>(b);
+  }
+
+  template<typename LhsPacketType>
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
+  {
+    dest = ploaddup<LhsPacketType>(a);
+  }
+
+  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
+  {
+    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
+  }
+
+  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
+  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
+  {
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+    EIGEN_UNUSED_VARIABLE(tmp);
+    c.v = pmadd(a,b.v,c.v);
+#else
+    tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
+#endif
+    
+  }
+
+  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
+  {
+    c += a * b;
+  }
+
+  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+  {
+    madd(a, b.get(lane), c, tmp, lane);
+  }
+
+  template <typename ResPacketType, typename AccPacketType>
+  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
+  {
+    conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj;
+    r = cj.pmadd(alpha,c,r);
+  }
+
+protected:
+
+};
+
+/* optimized General packed Block * packed Panel product kernel
+ *
+ * Mixing type logic: C += A * B
+ *  |  A  |  B  | comments
+ *  |real |cplx | no vectorization yet, would require to pack A with duplication
+ *  |cplx |real | easy vectorization
+ */
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel
+{
+  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
+  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketHalf> HalfTraits;
+  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketQuarter> QuarterTraits;
+  
+  typedef typename Traits::ResScalar ResScalar;
+  typedef typename Traits::LhsPacket LhsPacket;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::ResPacket ResPacket;
+  typedef typename Traits::AccPacket AccPacket;
+  typedef typename Traits::RhsPacketx4 RhsPacketx4;
+
+  typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;
+
+  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
+
+  typedef typename SwappedTraits::ResScalar SResScalar;
+  typedef typename SwappedTraits::LhsPacket SLhsPacket;
+  typedef typename SwappedTraits::RhsPacket SRhsPacket;
+  typedef typename SwappedTraits::ResPacket SResPacket;
+  typedef typename SwappedTraits::AccPacket SAccPacket;
+
+  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
+  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
+  typedef typename HalfTraits::ResPacket ResPacketHalf;
+  typedef typename HalfTraits::AccPacket AccPacketHalf;
+
+  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
+  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
+  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
+  typedef typename QuarterTraits::AccPacket AccPacketQuarter;
+
+  typedef typename DataMapper::LinearMapper LinearMapper;
+
+  enum {
+    Vectorizable  = Traits::Vectorizable,
+    LhsProgress   = Traits::LhsProgress,
+    LhsProgressHalf      = HalfTraits::LhsProgress,
+    LhsProgressQuarter   = QuarterTraits::LhsProgress,
+    RhsProgress   = Traits::RhsProgress,
+    RhsProgressHalf      = HalfTraits::RhsProgress,
+    RhsProgressQuarter   = QuarterTraits::RhsProgress,
+    ResPacketSize = Traits::ResPacketSize
+  };
+
+  EIGEN_DONT_INLINE
+  void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
+                  Index rows, Index depth, Index cols, ResScalar alpha,
+                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+};
+
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs,
+int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target>::LhsProgress>
+struct last_row_process_16_packets
+{
+  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
+  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
+
+  typedef typename Traits::ResScalar ResScalar;
+  typedef typename SwappedTraits::LhsPacket SLhsPacket;
+  typedef typename SwappedTraits::RhsPacket SRhsPacket;
+  typedef typename SwappedTraits::ResPacket SResPacket;
+  typedef typename SwappedTraits::AccPacket SAccPacket;
+
+  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
+                  const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
+                  ResScalar alpha, SAccPacket &C0)
+    {
+      EIGEN_UNUSED_VARIABLE(res);
+      EIGEN_UNUSED_VARIABLE(straits);
+      EIGEN_UNUSED_VARIABLE(blA);
+      EIGEN_UNUSED_VARIABLE(blB);
+      EIGEN_UNUSED_VARIABLE(depth);
+      EIGEN_UNUSED_VARIABLE(endk);
+      EIGEN_UNUSED_VARIABLE(i);
+      EIGEN_UNUSED_VARIABLE(j2);
+      EIGEN_UNUSED_VARIABLE(alpha);
+      EIGEN_UNUSED_VARIABLE(C0);
+    }
+};
+
+
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper,  mr,  nr, ConjugateLhs,  ConjugateRhs, 16> {
+  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
+  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
+
+  typedef typename Traits::ResScalar ResScalar;
+  typedef typename SwappedTraits::LhsPacket SLhsPacket;
+  typedef typename SwappedTraits::RhsPacket SRhsPacket;
+  typedef typename SwappedTraits::ResPacket SResPacket;
+  typedef typename SwappedTraits::AccPacket SAccPacket;
+
+  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
+                  const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
+                  ResScalar alpha, SAccPacket &C0)
+  {
+    typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter;
+    typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter;
+    typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter;
+    typedef typename unpacket_traits<typename unpacket_traits<SAccPacket>::half>::half SAccPacketQuarter;
+
+    SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
+    SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);
+
+    if (depth - endk > 0)
+      {
+	// We have to handle the last row(s) of the rhs, which
+	// correspond to a half-packet
+	SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0));
+
+	for (Index kk = endk; kk < depth; kk++)
+	  {
+	    SLhsPacketQuarter a0;
+	    SRhsPacketQuarter b0;
+	    straits.loadLhsUnaligned(blB, a0);
+	    straits.loadRhs(blA, b0);
+	    straits.madd(a0,b0,c0,b0, fix<0>);
+	    blB += SwappedTraits::LhsProgress/4;
+	    blA += 1;
+	  }
+	straits.acc(c0, alphav, R);
+      }
+    else
+      {
+	straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R);
+      }
+    res.scatterPacket(i, j2, R);
+  }
+};
+
+template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
+struct lhs_process_one_packet
+{
+  typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
+
+  EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
+  {
+    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
+    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
+    traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
+    traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
+    traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
+    traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
+    traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
+    traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
+    #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
+    __asm__  ("" : "+x,m" (*A0));
+    #endif
+    EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
+  }
+
+  EIGEN_STRONG_INLINE void operator()(
+    const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha,
+    Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB,
+    int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
+  {
+    GEBPTraits traits;
+
+    // loops on each largest micro horizontal panel of lhs
+    // (LhsProgress x depth)
+    for(Index i=peelStart; i<peelEnd; i+=LhsProgress)
+    {
+      // loops on each largest micro vertical panel of rhs (depth * nr)
+      for(Index j2=0; j2<packet_cols4; j2+=nr)
+      {
+        // We select a LhsProgress x nr micro block of res
+        // which is entirely stored into 1 x nr registers.
+
+        const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
+        prefetch(&blA[0]);
+
+        // gets res block as register
+        AccPacket C0, C1, C2, C3;
+        traits.initAcc(C0);
+        traits.initAcc(C1);
+        traits.initAcc(C2);
+        traits.initAcc(C3);
+        // To improve instruction pipelining, let's double the accumulation registers:
+        //  even k will accumulate in C*, while odd k will accumulate in D*.
+        // This trick is crutial to get good performance with FMA, otherwise it is 
+        // actually faster to perform separated MUL+ADD because of a naturally
+        // better instruction-level parallelism.
+        AccPacket D0, D1, D2, D3;
+        traits.initAcc(D0);
+        traits.initAcc(D1);
+        traits.initAcc(D2);
+        traits.initAcc(D3);
+
+        LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+        LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+        LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+        LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+        r0.prefetch(prefetch_res_offset);
+        r1.prefetch(prefetch_res_offset);
+        r2.prefetch(prefetch_res_offset);
+        r3.prefetch(prefetch_res_offset);
+
+        // performs "inner" products
+        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+        prefetch(&blB[0]);
+        LhsPacket A0, A1;
+
+        for(Index k=0; k<peeled_kc; k+=pk)
+        {
+          EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
+          RhsPacketx4 rhs_panel;
+          RhsPacket T0;
+
+          internal::prefetch(blB+(48+0));
+          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+          peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+          internal::prefetch(blB+(48+16));
+          peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+          peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+
+          blB += pk*4*RhsProgress;
+          blA += pk*LhsProgress;
+
+          EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4");
+        }
+        C0 = padd(C0,D0);
+        C1 = padd(C1,D1);
+        C2 = padd(C2,D2);
+        C3 = padd(C3,D3);
+
+        // process remaining peeled loop
+        for(Index k=peeled_kc; k<depth; k++)
+        {
+          RhsPacketx4 rhs_panel;
+          RhsPacket T0;
+          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          blB += 4*RhsProgress;
+          blA += LhsProgress;
+        }
+
+        ResPacket R0, R1;
+        ResPacket alphav = pset1<ResPacket>(alpha);
+
+        R0 = r0.template loadPacket<ResPacket>(0);
+        R1 = r1.template loadPacket<ResPacket>(0);
+        traits.acc(C0, alphav, R0);
+        traits.acc(C1,  alphav, R1);
+        r0.storePacket(0, R0);
+        r1.storePacket(0, R1);
+
+        R0 = r2.template loadPacket<ResPacket>(0);
+        R1 = r3.template loadPacket<ResPacket>(0);
+        traits.acc(C2,  alphav, R0);
+        traits.acc(C3,  alphav, R1);
+        r2.storePacket(0, R0);
+        r3.storePacket(0, R1);
+      }
+
+      // Deal with remaining columns of the rhs
+      for(Index j2=packet_cols4; j2<cols; j2++)
+      {
+        // One column at a time
+        const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
+        prefetch(&blA[0]);
+
+        // gets res block as register
+        AccPacket C0;
+        traits.initAcc(C0);
+
+        LinearMapper r0 = res.getLinearMapper(i, j2);
+
+        // performs "inner" products
+        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+        LhsPacket A0;
+
+        for(Index k= 0; k<peeled_kc; k+=pk)
+        {
+          EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
+          RhsPacket B_0;
+
+#define EIGEN_GEBGP_ONESTEP(K)                                          \
+	      do {                                                      \
+		EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
+		EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+    /* FIXME: why unaligned???? */ \
+		traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \
+		traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);		\
+		traits.madd(A0, B_0, C0, B_0, fix<0>);				\
+		EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
+	      } while(false);
+
+          EIGEN_GEBGP_ONESTEP(0);
+          EIGEN_GEBGP_ONESTEP(1);
+          EIGEN_GEBGP_ONESTEP(2);
+          EIGEN_GEBGP_ONESTEP(3);
+          EIGEN_GEBGP_ONESTEP(4);
+          EIGEN_GEBGP_ONESTEP(5);
+          EIGEN_GEBGP_ONESTEP(6);
+          EIGEN_GEBGP_ONESTEP(7);
+
+          blB += pk*RhsProgress;
+          blA += pk*LhsProgress;
+
+          EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1");
+        }
+
+        // process remaining peeled loop
+        for(Index k=peeled_kc; k<depth; k++)
+        {
+          RhsPacket B_0;
+          EIGEN_GEBGP_ONESTEP(0);
+          blB += RhsProgress;
+          blA += LhsProgress;
+        }
+#undef EIGEN_GEBGP_ONESTEP
+        ResPacket R0;
+        ResPacket alphav = pset1<ResPacket>(alpha);
+        R0 = r0.template loadPacket<ResPacket>(0);
+        traits.acc(C0, alphav, R0);
+        r0.storePacket(0, R0);
+      }
+    }
+  }
+};
+
+template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
+struct lhs_process_fraction_of_packet : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper>
+{
+
+EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
+  {
+        EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
+        EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
+        traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0);
+        traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
+        traits.madd(*A0, *B_0, *C0, *B_0);
+        traits.madd(*A0, *B1,  *C1, *B1);
+        traits.madd(*A0, *B2,  *C2, *B2);
+        traits.madd(*A0, *B3,  *C3, *B3);
+        EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
+  }
+};
+
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_DONT_INLINE
+void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
+  ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
+               Index rows, Index depth, Index cols, ResScalar alpha,
+               Index strideA, Index strideB, Index offsetA, Index offsetB)
+  {
+    Traits traits;
+    SwappedTraits straits;
+    
+    if(strideA==-1) strideA = depth;
+    if(strideB==-1) strideB = depth;
+    conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
+    Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
+    const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
+    const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
+    const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0;
+    const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0;
+    const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;
+    enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
+    const Index peeled_kc  = depth & ~(pk-1);
+    const int prefetch_res_offset = 32/sizeof(ResScalar);    
+//     const Index depth2     = depth & ~1;
+
+    //---------- Process 3 * LhsProgress rows at once ----------
+    // This corresponds to 3*LhsProgress x nr register blocks.
+    // Usually, make sense only with FMA
+    if(mr>=3*Traits::LhsProgress)
+    {
+      // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
+      // and on each largest micro vertical panel of the rhs (depth * nr).
+      // Blocking sizes, i.e., 'depth' has been computed so that the micro horizontal panel of the lhs fit in L1.
+      // However, if depth is too small, we can extend the number of rows of these horizontal panels.
+      // This actual number of rows is computed as follow:
+      const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
+      // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
+      // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
+      // or because we are testing specific blocking sizes.
+      const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
+      for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
+      {
+        const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
+        for(Index j2=0; j2<packet_cols4; j2+=nr)
+        {
+          for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
+          {
+          
+          // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
+          // stored into 3 x nr registers.
+          
+          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
+          prefetch(&blA[0]);
+
+          // gets res block as register
+          AccPacket C0, C1, C2,  C3,
+                    C4, C5, C6,  C7,
+                    C8, C9, C10, C11;
+          traits.initAcc(C0);  traits.initAcc(C1);  traits.initAcc(C2);  traits.initAcc(C3);
+          traits.initAcc(C4);  traits.initAcc(C5);  traits.initAcc(C6);  traits.initAcc(C7);
+          traits.initAcc(C8);  traits.initAcc(C9);  traits.initAcc(C10); traits.initAcc(C11);
+
+          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+          r0.prefetch(0);
+          r1.prefetch(0);
+          r2.prefetch(0);
+          r3.prefetch(0);
+
+          // performs "inner" products
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+          prefetch(&blB[0]);
+          LhsPacket A0, A1;
+
+          for(Index k=0; k<peeled_kc; k+=pk)
+          {
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
+            // 15 registers are taken (12 for acc, 2 for lhs).
+            RhsPanel15 rhs_panel;
+            RhsPacket T0;
+            LhsPacket A2;
+            #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
+            // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
+            // without this workaround A0, A1, and A2 are loaded in the same register,
+            // which is not good for pipelining
+            #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__  ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
+            #else
+            #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
+            #endif
+#define EIGEN_GEBP_ONESTEP(K)                                                     \
+            do {                                                                  \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4");          \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              internal::prefetch(blA + (3 * K + 16) * LhsProgress);               \
+              if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) {                            \
+                internal::prefetch(blB + (4 * K + 16) * RhsProgress);             \
+              } /* Bug 953 */                                                     \
+              traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0);                \
+              traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1);                \
+              traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2);                \
+              EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
+              traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel);     \
+              traits.madd(A0, rhs_panel, C0, T0, fix<0>);                         \
+              traits.madd(A1, rhs_panel, C4, T0, fix<0>);                         \
+              traits.madd(A2, rhs_panel, C8, T0, fix<0>);                         \
+              traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel);   \
+              traits.madd(A0, rhs_panel, C1, T0, fix<1>);                         \
+              traits.madd(A1, rhs_panel, C5, T0, fix<1>);                         \
+              traits.madd(A2, rhs_panel, C9, T0, fix<1>);                         \
+              traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel);   \
+              traits.madd(A0, rhs_panel, C2, T0, fix<2>);                         \
+              traits.madd(A1, rhs_panel, C6, T0, fix<2>);                         \
+              traits.madd(A2, rhs_panel, C10, T0, fix<2>);                        \
+              traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel);   \
+              traits.madd(A0, rhs_panel, C3, T0, fix<3>);                         \
+              traits.madd(A1, rhs_panel, C7, T0, fix<3>);                         \
+              traits.madd(A2, rhs_panel, C11, T0, fix<3>);                        \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4");            \
+            } while (false)
+
+            internal::prefetch(blB);
+            EIGEN_GEBP_ONESTEP(0);
+            EIGEN_GEBP_ONESTEP(1);
+            EIGEN_GEBP_ONESTEP(2);
+            EIGEN_GEBP_ONESTEP(3);
+            EIGEN_GEBP_ONESTEP(4);
+            EIGEN_GEBP_ONESTEP(5);
+            EIGEN_GEBP_ONESTEP(6);
+            EIGEN_GEBP_ONESTEP(7);
+
+            blB += pk*4*RhsProgress;
+            blA += pk*3*Traits::LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
+          }
+          // process remaining peeled loop
+          for(Index k=peeled_kc; k<depth; k++)
+          {
+            RhsPanel15 rhs_panel;
+            RhsPacket T0;
+            LhsPacket A2;
+            EIGEN_GEBP_ONESTEP(0);
+            blB += 4*RhsProgress;
+            blA += 3*Traits::LhsProgress;
+          }
+
+#undef EIGEN_GEBP_ONESTEP
+
+          ResPacket R0, R1, R2;
+          ResPacket alphav = pset1<ResPacket>(alpha);
+
+          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+          traits.acc(C0, alphav, R0);
+          traits.acc(C4, alphav, R1);
+          traits.acc(C8, alphav, R2);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r0.storePacket(1 * Traits::ResPacketSize, R1);
+          r0.storePacket(2 * Traits::ResPacketSize, R2);
+
+          R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+          traits.acc(C1, alphav, R0);
+          traits.acc(C5, alphav, R1);
+          traits.acc(C9, alphav, R2);
+          r1.storePacket(0 * Traits::ResPacketSize, R0);
+          r1.storePacket(1 * Traits::ResPacketSize, R1);
+          r1.storePacket(2 * Traits::ResPacketSize, R2);
+
+          R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+          traits.acc(C2, alphav, R0);
+          traits.acc(C6, alphav, R1);
+          traits.acc(C10, alphav, R2);
+          r2.storePacket(0 * Traits::ResPacketSize, R0);
+          r2.storePacket(1 * Traits::ResPacketSize, R1);
+          r2.storePacket(2 * Traits::ResPacketSize, R2);
+
+          R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+          traits.acc(C3, alphav, R0);
+          traits.acc(C7, alphav, R1);
+          traits.acc(C11, alphav, R2);
+          r3.storePacket(0 * Traits::ResPacketSize, R0);
+          r3.storePacket(1 * Traits::ResPacketSize, R1);
+          r3.storePacket(2 * Traits::ResPacketSize, R2);          
+          }
+        }
+
+        // Deal with remaining columns of the rhs
+        for(Index j2=packet_cols4; j2<cols; j2++)
+        {
+          for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
+          {
+          // One column at a time
+          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
+          prefetch(&blA[0]);
+
+          // gets res block as register
+          AccPacket C0, C4, C8;
+          traits.initAcc(C0);
+          traits.initAcc(C4);
+          traits.initAcc(C8);
+
+          LinearMapper r0 = res.getLinearMapper(i, j2);
+          r0.prefetch(0);
+
+          // performs "inner" products
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+          LhsPacket A0, A1, A2;
+          
+          for(Index k=0; k<peeled_kc; k+=pk)
+          {
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
+            RhsPacket B_0;
+#define EIGEN_GEBGP_ONESTEP(K)                                                    \
+            do {                                                                  \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1");          \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0);                \
+              traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1);                \
+              traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2);                \
+              traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0);                   \
+              traits.madd(A0, B_0, C0, B_0, fix<0>);                              \
+              traits.madd(A1, B_0, C4, B_0, fix<0>);                              \
+              traits.madd(A2, B_0, C8, B_0, fix<0>);                              \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1");            \
+            } while (false)
+
+            EIGEN_GEBGP_ONESTEP(0);
+            EIGEN_GEBGP_ONESTEP(1);
+            EIGEN_GEBGP_ONESTEP(2);
+            EIGEN_GEBGP_ONESTEP(3);
+            EIGEN_GEBGP_ONESTEP(4);
+            EIGEN_GEBGP_ONESTEP(5);
+            EIGEN_GEBGP_ONESTEP(6);
+            EIGEN_GEBGP_ONESTEP(7);
+
+            blB += int(pk) * int(RhsProgress);
+            blA += int(pk) * 3 * int(Traits::LhsProgress);
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
+          }
+
+          // process remaining peeled loop
+          for(Index k=peeled_kc; k<depth; k++)
+          {
+            RhsPacket B_0;
+            EIGEN_GEBGP_ONESTEP(0);
+            blB += RhsProgress;
+            blA += 3*Traits::LhsProgress;
+          }
+#undef EIGEN_GEBGP_ONESTEP
+          ResPacket R0, R1, R2;
+          ResPacket alphav = pset1<ResPacket>(alpha);
+
+          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+          traits.acc(C0, alphav, R0);
+          traits.acc(C4, alphav, R1);
+          traits.acc(C8, alphav, R2);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r0.storePacket(1 * Traits::ResPacketSize, R1);
+          r0.storePacket(2 * Traits::ResPacketSize, R2);          
+          }
+        }
+      }
+    }
+
+    //---------- Process 2 * LhsProgress rows at once ----------
+    if(mr>=2*Traits::LhsProgress)
+    {
+      const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
+      // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
+      // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
+      // or because we are testing specific blocking sizes.
+      Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));
+
+      for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
+      {
+        Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
+        for(Index j2=0; j2<packet_cols4; j2+=nr)
+        {
+          for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
+          {
+          
+          // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
+          // stored into 2 x nr registers.
+          
+          const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
+          prefetch(&blA[0]);
+
+          // gets res block as register
+          AccPacket C0, C1, C2, C3,
+                    C4, C5, C6, C7;
+          traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
+          traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
+
+          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+          r0.prefetch(prefetch_res_offset);
+          r1.prefetch(prefetch_res_offset);
+          r2.prefetch(prefetch_res_offset);
+          r3.prefetch(prefetch_res_offset);
+
+          // performs "inner" products
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+          prefetch(&blB[0]);
+          LhsPacket A0, A1;
+
+          for(Index k=0; k<peeled_kc; k+=pk)
+          {
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
+            RhsPacketx4 rhs_panel;
+            RhsPacket T0;
+
+          // NOTE: the begin/end asm comments below work around bug 935!
+          // but they are not enough for gcc>=6 without FMA (bug 1637)
+          #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
+            #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__  ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
+          #else
+            #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
+          #endif
+#define EIGEN_GEBGP_ONESTEP(K)                                            \
+            do {                                                          \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4");  \
+              traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0);        \
+              traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1);        \
+              traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
+              traits.madd(A0, rhs_panel, C0, T0, fix<0>);                 \
+              traits.madd(A1, rhs_panel, C4, T0, fix<0>);                 \
+              traits.madd(A0, rhs_panel, C1, T0, fix<1>);                 \
+              traits.madd(A1, rhs_panel, C5, T0, fix<1>);                 \
+              traits.madd(A0, rhs_panel, C2, T0, fix<2>);                 \
+              traits.madd(A1, rhs_panel, C6, T0, fix<2>);                 \
+              traits.madd(A0, rhs_panel, C3, T0, fix<3>);                 \
+              traits.madd(A1, rhs_panel, C7, T0, fix<3>);                 \
+              EIGEN_GEBP_2PX4_SPILLING_WORKAROUND                         \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4");    \
+            } while (false)
+
+            internal::prefetch(blB+(48+0));
+            EIGEN_GEBGP_ONESTEP(0);
+            EIGEN_GEBGP_ONESTEP(1);
+            EIGEN_GEBGP_ONESTEP(2);
+            EIGEN_GEBGP_ONESTEP(3);
+            internal::prefetch(blB+(48+16));
+            EIGEN_GEBGP_ONESTEP(4);
+            EIGEN_GEBGP_ONESTEP(5);
+            EIGEN_GEBGP_ONESTEP(6);
+            EIGEN_GEBGP_ONESTEP(7);
+
+            blB += pk*4*RhsProgress;
+            blA += pk*(2*Traits::LhsProgress);
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
+          }
+          // process remaining peeled loop
+          for(Index k=peeled_kc; k<depth; k++)
+          {
+            RhsPacketx4 rhs_panel;
+            RhsPacket T0;
+            EIGEN_GEBGP_ONESTEP(0);
+            blB += 4*RhsProgress;
+            blA += 2*Traits::LhsProgress;
+          }
+#undef EIGEN_GEBGP_ONESTEP
+
+          ResPacket R0, R1, R2, R3;
+          ResPacket alphav = pset1<ResPacket>(alpha);
+
+          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          traits.acc(C0, alphav, R0);
+          traits.acc(C4, alphav, R1);
+          traits.acc(C1, alphav, R2);
+          traits.acc(C5, alphav, R3);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r0.storePacket(1 * Traits::ResPacketSize, R1);
+          r1.storePacket(0 * Traits::ResPacketSize, R2);
+          r1.storePacket(1 * Traits::ResPacketSize, R3);
+
+          R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          traits.acc(C2,  alphav, R0);
+          traits.acc(C6,  alphav, R1);
+          traits.acc(C3,  alphav, R2);
+          traits.acc(C7,  alphav, R3);
+          r2.storePacket(0 * Traits::ResPacketSize, R0);
+          r2.storePacket(1 * Traits::ResPacketSize, R1);
+          r3.storePacket(0 * Traits::ResPacketSize, R2);
+          r3.storePacket(1 * Traits::ResPacketSize, R3);
+          }
+        }
+      
+        // Deal with remaining columns of the rhs
+        for(Index j2=packet_cols4; j2<cols; j2++)
+        {
+          for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
+          {
+          // One column at a time
+          const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
+          prefetch(&blA[0]);
+
+          // gets res block as register
+          AccPacket C0, C4;
+          traits.initAcc(C0);
+          traits.initAcc(C4);
+
+          LinearMapper r0 = res.getLinearMapper(i, j2);
+          r0.prefetch(prefetch_res_offset);
+
+          // performs "inner" products
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+          LhsPacket A0, A1;
+
+          for(Index k=0; k<peeled_kc; k+=pk)
+          {
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
+            RhsPacket B_0, B1;
+        
+#define EIGEN_GEBGP_ONESTEP(K) \
+            do {                                                                  \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1");          \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0);                      \
+              traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1);                      \
+              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);                       \
+              traits.madd(A0, B_0, C0, B1, fix<0>);                               \
+              traits.madd(A1, B_0, C4, B_0, fix<0>);                              \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1");            \
+            } while(false)
+        
+            EIGEN_GEBGP_ONESTEP(0);
+            EIGEN_GEBGP_ONESTEP(1);
+            EIGEN_GEBGP_ONESTEP(2);
+            EIGEN_GEBGP_ONESTEP(3);
+            EIGEN_GEBGP_ONESTEP(4);
+            EIGEN_GEBGP_ONESTEP(5);
+            EIGEN_GEBGP_ONESTEP(6);
+            EIGEN_GEBGP_ONESTEP(7);
+
+            blB += int(pk) * int(RhsProgress);
+            blA += int(pk) * 2 * int(Traits::LhsProgress);
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
+          }
+
+          // process remaining peeled loop
+          for(Index k=peeled_kc; k<depth; k++)
+          {
+            RhsPacket B_0, B1;
+            EIGEN_GEBGP_ONESTEP(0);
+            blB += RhsProgress;
+            blA += 2*Traits::LhsProgress;
+          }
+#undef EIGEN_GEBGP_ONESTEP
+          ResPacket R0, R1;
+          ResPacket alphav = pset1<ResPacket>(alpha);
+
+          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          traits.acc(C0, alphav, R0);
+          traits.acc(C4, alphav, R1);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r0.storePacket(1 * Traits::ResPacketSize, R1);
+          }
+        }
+      }
+    }
+    //---------- Process 1 * LhsProgress rows at once ----------
+    if(mr>=1*Traits::LhsProgress)
+    {
+      lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p;
+      p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
+    }
+    //---------- Process LhsProgressHalf rows at once ----------
+    if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf)
+    {
+      lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p;
+      p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
+    }
+    //---------- Process LhsProgressQuarter rows at once ----------
+    if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter)
+    {
+      lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p;
+      p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
+    }
+    //---------- Process remaining rows, 1 at once ----------
+    if(peeled_mc_quarter<rows)
+    {
+      // loop on each panel of the rhs
+      for(Index j2=0; j2<packet_cols4; j2+=nr)
+      {
+        // loop on each row of the lhs (1*LhsProgress x depth)
+        for(Index i=peeled_mc_quarter; i<rows; i+=1)
+        {
+          const LhsScalar* blA = &blockA[i*strideA+offsetA];
+          prefetch(&blA[0]);
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+
+          // If LhsProgress is 8 or 16, it assumes that there is a
+          // half or quarter packet, respectively, of the same size as
+          // nr (which is currently 4) for the return type.
+          const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
+          const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
+          if ((SwappedTraits::LhsProgress % 4) == 0 &&
+              (SwappedTraits::LhsProgress<=16) &&
+              (SwappedTraits::LhsProgress!=8  || SResPacketHalfSize==nr) &&
+              (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
+          {
+            SAccPacket C0, C1, C2, C3;
+            straits.initAcc(C0);
+            straits.initAcc(C1);
+            straits.initAcc(C2);
+            straits.initAcc(C3);
+
+            const Index spk   = (std::max)(1,SwappedTraits::LhsProgress/4);
+            const Index endk  = (depth/spk)*spk;
+            const Index endk4 = (depth/(spk*4))*(spk*4);
+
+            Index k=0;
+            for(; k<endk4; k+=4*spk)
+            {
+              SLhsPacket A0,A1;
+              SRhsPacket B_0,B_1;
+
+              straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
+              straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
+
+              straits.loadRhsQuad(blA+0*spk, B_0);
+              straits.loadRhsQuad(blA+1*spk, B_1);
+              straits.madd(A0,B_0,C0,B_0, fix<0>);
+              straits.madd(A1,B_1,C1,B_1, fix<0>);
+
+              straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
+              straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
+              straits.loadRhsQuad(blA+2*spk, B_0);
+              straits.loadRhsQuad(blA+3*spk, B_1);
+              straits.madd(A0,B_0,C2,B_0, fix<0>);
+              straits.madd(A1,B_1,C3,B_1, fix<0>);
+
+              blB += 4*SwappedTraits::LhsProgress;
+              blA += 4*spk;
+            }
+            C0 = padd(padd(C0,C1),padd(C2,C3));
+            for(; k<endk; k+=spk)
+            {
+              SLhsPacket A0;
+              SRhsPacket B_0;
+
+              straits.loadLhsUnaligned(blB, A0);
+              straits.loadRhsQuad(blA, B_0);
+              straits.madd(A0,B_0,C0,B_0, fix<0>);
+
+              blB += SwappedTraits::LhsProgress;
+              blA += spk;
+            }
+            if(SwappedTraits::LhsProgress==8)
+            {
+              // Special case where we have to first reduce the accumulation register C0
+              typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
+              typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
+              typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
+              typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
+
+              SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
+              SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
+
+              if(depth-endk>0)
+              {
+                // We have to handle the last row of the rhs which corresponds to a half-packet
+                SLhsPacketHalf a0;
+                SRhsPacketHalf b0;
+                straits.loadLhsUnaligned(blB, a0);
+                straits.loadRhs(blA, b0);
+                SAccPacketHalf c0 = predux_half_dowto4(C0);
+                straits.madd(a0,b0,c0,b0, fix<0>);
+                straits.acc(c0, alphav, R);
+              }
+              else
+              {
+                straits.acc(predux_half_dowto4(C0), alphav, R);
+              }
+              res.scatterPacket(i, j2, R);
+            }
+            else if (SwappedTraits::LhsProgress==16)
+            {
+              // Special case where we have to first reduce the
+              // accumulation register C0. We specialize the block in
+              // template form, so that LhsProgress < 16 paths don't
+              // fail to compile
+              last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
+	            p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0);
+            }
+            else
+            {
+              SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
+              SResPacket alphav = pset1<SResPacket>(alpha);
+              straits.acc(C0, alphav, R);
+              res.scatterPacket(i, j2, R);
+            }
+          }
+          else // scalar path
+          {
+            // get a 1 x 4 res block as registers
+            ResScalar C0(0), C1(0), C2(0), C3(0);
+
+            for(Index k=0; k<depth; k++)
+            {
+              LhsScalar A0;
+              RhsScalar B_0, B_1;
+
+              A0 = blA[k];
+
+              B_0 = blB[0];
+              B_1 = blB[1];
+              C0 = cj.pmadd(A0,B_0,C0);
+              C1 = cj.pmadd(A0,B_1,C1);
+
+              B_0 = blB[2];
+              B_1 = blB[3];
+              C2 = cj.pmadd(A0,B_0,C2);
+              C3 = cj.pmadd(A0,B_1,C3);
+
+              blB += 4;
+            }
+            res(i, j2 + 0) += alpha * C0;
+            res(i, j2 + 1) += alpha * C1;
+            res(i, j2 + 2) += alpha * C2;
+            res(i, j2 + 3) += alpha * C3;
+          }
+        }
+      }
+      // remaining columns
+      for(Index j2=packet_cols4; j2<cols; j2++)
+      {
+        // loop on each row of the lhs (1*LhsProgress x depth)
+        for(Index i=peeled_mc_quarter; i<rows; i+=1)
+        {
+          const LhsScalar* blA = &blockA[i*strideA+offsetA];
+          prefetch(&blA[0]);
+          // gets a 1 x 1 res block as registers
+          ResScalar C0(0);
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+          for(Index k=0; k<depth; k++)
+          {
+            LhsScalar A0 = blA[k];
+            RhsScalar B_0 = blB[k];
+            C0 = cj.pmadd(A0, B_0, C0);
+          }
+          res(i, j2) += alpha * C0;
+        }
+      }
+    }
+  }
+
+
+// pack a block of the lhs
+// The traversal is as follow (mr==4):
+//   0  4  8 12 ...
+//   1  5  9 13 ...
+//   2  6 10 14 ...
+//   3  7 11 15 ...
+//
+//  16 20 24 28 ...
+//  17 21 25 29 ...
+//  18 22 26 30 ...
+//  19 23 27 31 ...
+//
+//  32 33 34 35 ...
+//  36 36 38 39 ...
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+{
+  typedef typename DataMapper::LinearMapper LinearMapper;
+  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+  typedef typename unpacket_traits<Packet>::half HalfPacket;
+  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
+  enum { PacketSize = unpacket_traits<Packet>::size,
+         HalfPacketSize = unpacket_traits<HalfPacket>::size,
+         QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
+         HasHalf = (int)HalfPacketSize < (int)PacketSize,
+         HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
+
+  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
+  EIGEN_UNUSED_VARIABLE(stride);
+  EIGEN_UNUSED_VARIABLE(offset);
+  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+  eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
+  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+  Index count = 0;
+
+  const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
+  const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
+  const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
+  const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
+  const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0;
+  const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
+  const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter
+                         : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0;
+
+  Index i=0;
+
+  // Pack 3 packets
+  if(Pack1>=3*PacketSize)
+  {
+    for(; i<peeled_mc3; i+=3*PacketSize)
+    {
+      if(PanelMode) count += (3*PacketSize) * offset;
+
+      for(Index k=0; k<depth; k++)
+      {
+        Packet A, B, C;
+        A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
+        B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
+        C = lhs.template loadPacket<Packet>(i+2*PacketSize, k);
+        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
+        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
+        pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
+      }
+      if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
+    }
+  }
+  // Pack 2 packets
+  if(Pack1>=2*PacketSize)
+  {
+    for(; i<peeled_mc2; i+=2*PacketSize)
+    {
+      if(PanelMode) count += (2*PacketSize) * offset;
+
+      for(Index k=0; k<depth; k++)
+      {
+        Packet A, B;
+        A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
+        B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
+        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
+        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
+      }
+      if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
+    }
+  }
+  // Pack 1 packets
+  if(Pack1>=1*PacketSize)
+  {
+    for(; i<peeled_mc1; i+=1*PacketSize)
+    {
+      if(PanelMode) count += (1*PacketSize) * offset;
+
+      for(Index k=0; k<depth; k++)
+      {
+        Packet A;
+        A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
+        pstore(blockA+count, cj.pconj(A));
+        count+=PacketSize;
+      }
+      if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
+    }
+  }
+  // Pack half packets
+  if(HasHalf && Pack1>=HalfPacketSize)
+  {
+    for(; i<peeled_mc_half; i+=HalfPacketSize)
+    {
+      if(PanelMode) count += (HalfPacketSize) * offset;
+
+      for(Index k=0; k<depth; k++)
+      {
+        HalfPacket A;
+        A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k);
+        pstoreu(blockA+count, cj.pconj(A));
+        count+=HalfPacketSize;
+      }
+      if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth);
+    }
+  }
+  // Pack quarter packets
+  if(HasQuarter && Pack1>=QuarterPacketSize)
+  {
+    for(; i<peeled_mc_quarter; i+=QuarterPacketSize)
+    {
+      if(PanelMode) count += (QuarterPacketSize) * offset;
+
+      for(Index k=0; k<depth; k++)
+      {
+        QuarterPacket A;
+        A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k);
+        pstoreu(blockA+count, cj.pconj(A));
+        count+=QuarterPacketSize;
+      }
+      if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth);
+    }
+  }
+  // Pack2 may be *smaller* than PacketSize—that happens for
+  // products like real * complex, where we have to go half the
+  // progress on the lhs in order to duplicate those operands to
+  // address both real & imaginary parts on the rhs. This portion will
+  // pack those half ones until they match the number expected on the
+  // last peeling loop at this point (for the rhs).
+  if(Pack2<PacketSize && Pack2>1)
+  {
+    for(; i<peeled_mc0; i+=last_lhs_progress)
+    {
+      if(PanelMode) count += last_lhs_progress * offset;
+
+      for(Index k=0; k<depth; k++)
+        for(Index w=0; w<last_lhs_progress; w++)
+          blockA[count++] = cj(lhs(i+w, k));
+
+      if(PanelMode) count += last_lhs_progress * (stride-offset-depth);
+    }
+  }
+  // Pack scalars
+  for(; i<rows; i++)
+  {
+    if(PanelMode) count += offset;
+    for(Index k=0; k<depth; k++)
+      blockA[count++] = cj(lhs(i, k));
+    if(PanelMode) count += (stride-offset-depth);
+  }
+}
+
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+{
+  typedef typename DataMapper::LinearMapper LinearMapper;
+  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+  typedef typename unpacket_traits<Packet>::half HalfPacket;
+  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
+  enum { PacketSize = unpacket_traits<Packet>::size,
+         HalfPacketSize = unpacket_traits<HalfPacket>::size,
+         QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
+         HasHalf = (int)HalfPacketSize < (int)PacketSize,
+         HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
+
+  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
+  EIGEN_UNUSED_VARIABLE(stride);
+  EIGEN_UNUSED_VARIABLE(offset);
+  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+  Index count = 0;
+  bool gone_half = false, gone_quarter = false, gone_last = false;
+
+  Index i = 0;
+  int pack = Pack1;
+  int psize = PacketSize;
+  while(pack>0)
+  {
+    Index remaining_rows = rows-i;
+    Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack;
+    Index starting_pos = i;
+    for(; i<peeled_mc; i+=pack)
+    {
+      if(PanelMode) count += pack * offset;
+
+      Index k=0;
+      if(pack>=psize && psize >= QuarterPacketSize)
+      {
+        const Index peeled_k = (depth/psize)*psize;
+        for(; k<peeled_k; k+=psize)
+        {
+          for (Index m = 0; m < pack; m += psize)
+          {
+            if (psize == PacketSize) {
+              PacketBlock<Packet> kernel;
+              for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
+              ptranspose(kernel);
+              for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
+            } else if (HasHalf && psize == HalfPacketSize) {
+              gone_half = true;
+              PacketBlock<HalfPacket> kernel_half;
+              for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
+              ptranspose(kernel_half);
+              for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p]));
+            } else if (HasQuarter && psize == QuarterPacketSize) {
+              gone_quarter = true;
+              PacketBlock<QuarterPacket> kernel_quarter;
+              for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
+              ptranspose(kernel_quarter);
+              for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p]));
+	    }
+          }
+          count += psize*pack;
+        }
+      }
+
+      for(; k<depth; k++)
+      {
+        Index w=0;
+        for(; w<pack-3; w+=4)
+        {
+          Scalar a(cj(lhs(i+w+0, k))),
+                 b(cj(lhs(i+w+1, k))),
+                 c(cj(lhs(i+w+2, k))),
+                 d(cj(lhs(i+w+3, k)));
+          blockA[count++] = a;
+          blockA[count++] = b;
+          blockA[count++] = c;
+          blockA[count++] = d;
+        }
+        if(pack%4)
+          for(;w<pack;++w)
+            blockA[count++] = cj(lhs(i+w, k));
+      }
+
+      if(PanelMode) count += pack * (stride-offset-depth);
+    }
+
+    pack -= psize;
+    Index left = rows - i;
+    if (pack <= 0) {
+      if (!gone_last &&
+          (starting_pos == i || left >= psize/2 || left >= psize/4) &&
+          ((psize/2 == HalfPacketSize && HasHalf && !gone_half) ||
+           (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
+        psize /= 2;
+        pack = psize;
+        continue;
+      }
+      // Pack2 may be *smaller* than PacketSize—that happens for
+      // products like real * complex, where we have to go half the
+      // progress on the lhs in order to duplicate those operands to
+      // address both real & imaginary parts on the rhs. This portion will
+      // pack those half ones until they match the number expected on the
+      // last peeling loop at this point (for the rhs).
+      if (Pack2 < PacketSize && !gone_last) {
+        gone_last = true;
+        psize = pack = left & ~1;
+      }
+    }
+  }
+
+  for(; i<rows; i++)
+  {
+    if(PanelMode) count += offset;
+    for(Index k=0; k<depth; k++)
+      blockA[count++] = cj(lhs(i, k));
+    if(PanelMode) count += (stride-offset-depth);
+  }
+}
+
+// copy a complete panel of the rhs
+// this version is optimized for column major matrices
+// The traversal order is as follow: (nr==4):
+//  0  1  2  3   12 13 14 15   24 27
+//  4  5  6  7   16 17 18 19   25 28
+//  8  9 10 11   20 21 22 23   26 29
+//  .  .  .  .    .  .  .  .    .  .
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+{
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename DataMapper::LinearMapper LinearMapper;
+  enum { PacketSize = packet_traits<Scalar>::size };
+  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
+  EIGEN_UNUSED_VARIABLE(stride);
+  EIGEN_UNUSED_VARIABLE(offset);
+  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
+  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
+  Index count = 0;
+  const Index peeled_k = (depth/PacketSize)*PacketSize;
+//   if(nr>=8)
+//   {
+//     for(Index j2=0; j2<packet_cols8; j2+=8)
+//     {
+//       // skip what we have before
+//       if(PanelMode) count += 8 * offset;
+//       const Scalar* b0 = &rhs[(j2+0)*rhsStride];
+//       const Scalar* b1 = &rhs[(j2+1)*rhsStride];
+//       const Scalar* b2 = &rhs[(j2+2)*rhsStride];
+//       const Scalar* b3 = &rhs[(j2+3)*rhsStride];
+//       const Scalar* b4 = &rhs[(j2+4)*rhsStride];
+//       const Scalar* b5 = &rhs[(j2+5)*rhsStride];
+//       const Scalar* b6 = &rhs[(j2+6)*rhsStride];
+//       const Scalar* b7 = &rhs[(j2+7)*rhsStride];
+//       Index k=0;
+//       if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
+//       {
+//         for(; k<peeled_k; k+=PacketSize) {
+//           PacketBlock<Packet> kernel;
+//           for (int p = 0; p < PacketSize; ++p) {
+//             kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
+//           }
+//           ptranspose(kernel);
+//           for (int p = 0; p < PacketSize; ++p) {
+//             pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
+//             count+=PacketSize;
+//           }
+//         }
+//       }
+//       for(; k<depth; k++)
+//       {
+//         blockB[count+0] = cj(b0[k]);
+//         blockB[count+1] = cj(b1[k]);
+//         blockB[count+2] = cj(b2[k]);
+//         blockB[count+3] = cj(b3[k]);
+//         blockB[count+4] = cj(b4[k]);
+//         blockB[count+5] = cj(b5[k]);
+//         blockB[count+6] = cj(b6[k]);
+//         blockB[count+7] = cj(b7[k]);
+//         count += 8;
+//       }
+//       // skip what we have after
+//       if(PanelMode) count += 8 * (stride-offset-depth);
+//     }
+//   }
+
+  if(nr>=4)
+  {
+    for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
+    {
+      // skip what we have before
+      if(PanelMode) count += 4 * offset;
+      const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+      const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+      const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+      const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+
+      Index k=0;
+      if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
+      {
+        for(; k<peeled_k; k+=PacketSize) {
+          PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
+          kernel.packet[0           ] = dm0.template loadPacket<Packet>(k);
+          kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
+          kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
+          kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
+          ptranspose(kernel);
+          pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
+          pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
+          pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
+          pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
+          count+=4*PacketSize;
+        }
+      }
+      for(; k<depth; k++)
+      {
+        blockB[count+0] = cj(dm0(k));
+        blockB[count+1] = cj(dm1(k));
+        blockB[count+2] = cj(dm2(k));
+        blockB[count+3] = cj(dm3(k));
+        count += 4;
+      }
+      // skip what we have after
+      if(PanelMode) count += 4 * (stride-offset-depth);
+    }
+  }
+
+  // copy the remaining columns one at a time (nr==1)
+  for(Index j2=packet_cols4; j2<cols; ++j2)
+  {
+    if(PanelMode) count += offset;
+    const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
+    for(Index k=0; k<depth; k++)
+    {
+      blockB[count] = cj(dm0(k));
+      count += 1;
+    }
+    if(PanelMode) count += (stride-offset-depth);
+  }
+}
+
+// this version is optimized for row major matrices
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+{
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename unpacket_traits<Packet>::half HalfPacket;
+  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
+  typedef typename DataMapper::LinearMapper LinearMapper;
+  enum { PacketSize = packet_traits<Scalar>::size,
+         HalfPacketSize = unpacket_traits<HalfPacket>::size,
+		 QuarterPacketSize = unpacket_traits<QuarterPacket>::size};
+  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0)
+  {
+    EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
+    EIGEN_UNUSED_VARIABLE(stride);
+    EIGEN_UNUSED_VARIABLE(offset);
+    eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+    const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
+    const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;
+    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+    Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
+    Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
+    Index count = 0;
+
+  //   if(nr>=8)
+  //   {
+  //     for(Index j2=0; j2<packet_cols8; j2+=8)
+  //     {
+  //       // skip what we have before
+  //       if(PanelMode) count += 8 * offset;
+  //       for(Index k=0; k<depth; k++)
+  //       {
+  //         if (PacketSize==8) {
+  //           Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
+  //           pstoreu(blockB+count, cj.pconj(A));
+  //         } else if (PacketSize==4) {
+  //           Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
+  //           Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
+  //           pstoreu(blockB+count, cj.pconj(A));
+  //           pstoreu(blockB+count+PacketSize, cj.pconj(B));
+  //         } else {
+  //           const Scalar* b0 = &rhs[k*rhsStride + j2];
+  //           blockB[count+0] = cj(b0[0]);
+  //           blockB[count+1] = cj(b0[1]);
+  //           blockB[count+2] = cj(b0[2]);
+  //           blockB[count+3] = cj(b0[3]);
+  //           blockB[count+4] = cj(b0[4]);
+  //           blockB[count+5] = cj(b0[5]);
+  //           blockB[count+6] = cj(b0[6]);
+  //           blockB[count+7] = cj(b0[7]);
+  //         }
+  //         count += 8;
+  //       }
+  //       // skip what we have after
+  //       if(PanelMode) count += 8 * (stride-offset-depth);
+  //     }
+  //   }
+    if(nr>=4)
+    {
+      for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
+      {
+        // skip what we have before
+        if(PanelMode) count += 4 * offset;
+        for(Index k=0; k<depth; k++)
+        {
+          if (PacketSize==4) {
+            Packet A = rhs.template loadPacket<Packet>(k, j2);
+            pstoreu(blockB+count, cj.pconj(A));
+            count += PacketSize;
+          } else if (HasHalf && HalfPacketSize==4) {
+            HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
+            pstoreu(blockB+count, cj.pconj(A));
+            count += HalfPacketSize;
+          } else if (HasQuarter && QuarterPacketSize==4) {
+            QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
+            pstoreu(blockB+count, cj.pconj(A));
+            count += QuarterPacketSize;
+          } else {
+            const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
+            blockB[count+0] = cj(dm0(0));
+            blockB[count+1] = cj(dm0(1));
+            blockB[count+2] = cj(dm0(2));
+            blockB[count+3] = cj(dm0(3));
+            count += 4;
+          }
+        }
+        // skip what we have after
+        if(PanelMode) count += 4 * (stride-offset-depth);
+      }
+    }
+    // copy the remaining columns one at a time (nr==1)
+    for(Index j2=packet_cols4; j2<cols; ++j2)
+    {
+      if(PanelMode) count += offset;
+      for(Index k=0; k<depth; k++)
+      {
+        blockB[count] = cj(rhs(k, j2));
+        count += 1;
+      }
+      if(PanelMode) count += stride-offset-depth;
+    }
+  }
+};
+
+} // end namespace internal
+
+/** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
+  * \sa setCpuCacheSize */
+inline std::ptrdiff_t l1CacheSize()
+{
+  std::ptrdiff_t l1, l2, l3;
+  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
+  return l1;
+}
+
+/** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
+  * \sa setCpuCacheSize */
+inline std::ptrdiff_t l2CacheSize()
+{
+  std::ptrdiff_t l1, l2, l3;
+  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
+  return l2;
+}
+
+/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size paramete\
+rs.                                                                                                                
+* \sa setCpuCacheSize */
+inline std::ptrdiff_t l3CacheSize()
+{
+  std::ptrdiff_t l1, l2, l3;
+  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
+  return l3;
+}
+
+/** Set the cpu L1 and L2 cache sizes (in bytes).
+  * These values are use to adjust the size of the blocks
+  * for the algorithms working per blocks.
+  *
+  * \sa computeProductBlockingSizes */
+inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
+{
+  internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_BLOCK_PANEL_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h
new file mode 100644
index 000000000..caa65fccc
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -0,0 +1,517 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename _LhsScalar, typename _RhsScalar> class level3_blocking;
+
+/* Specialization for a row-major destination matrix => simple transposition of the product */
+template<
+  typename Index,
+  typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+  typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
+  int ResInnerStride>
+struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride>
+{
+  typedef gebp_traits<RhsScalar,LhsScalar> Traits;
+
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  static EIGEN_STRONG_INLINE void run(
+    Index rows, Index cols, Index depth,
+    const LhsScalar* lhs, Index lhsStride,
+    const RhsScalar* rhs, Index rhsStride,
+    ResScalar* res, Index resIncr, Index resStride,
+    ResScalar alpha,
+    level3_blocking<RhsScalar,LhsScalar>& blocking,
+    GemmParallelInfo<Index>* info = 0)
+  {
+    // transpose the product such that the result is column major
+    general_matrix_matrix_product<Index,
+      RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
+      LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
+      ColMajor,ResInnerStride>
+    ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resIncr,resStride,alpha,blocking,info);
+  }
+};
+
+/*  Specialization for a col-major destination matrix
+ *    => Blocking algorithm following Goto's paper */
+template<
+  typename Index,
+  typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+  typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
+  int ResInnerStride>
+struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride>
+{
+
+typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+
+typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+static void run(Index rows, Index cols, Index depth,
+  const LhsScalar* _lhs, Index lhsStride,
+  const RhsScalar* _rhs, Index rhsStride,
+  ResScalar* _res, Index resIncr, Index resStride,
+  ResScalar alpha,
+  level3_blocking<LhsScalar,RhsScalar>& blocking,
+  GemmParallelInfo<Index>* info = 0)
+{
+  typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
+  typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
+  typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor,Unaligned,ResInnerStride> ResMapper;
+  LhsMapper lhs(_lhs, lhsStride);
+  RhsMapper rhs(_rhs, rhsStride);
+  ResMapper res(_res, resStride, resIncr);
+
+  Index kc = blocking.kc();                   // cache block size along the K direction
+  Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
+  Index nc = (std::min)(cols,blocking.nc());  // cache block size along the N direction
+
+  gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
+  gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+  gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
+
+#ifdef EIGEN_HAS_OPENMP
+  if(info)
+  {
+    // this is the parallel version!
+    int tid = omp_get_thread_num();
+    int threads = omp_get_num_threads();
+
+    LhsScalar* blockA = blocking.blockA();
+    eigen_internal_assert(blockA!=0);
+
+    std::size_t sizeB = kc*nc;
+    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0);
+
+    // For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
+    for(Index k=0; k<depth; k+=kc)
+    {
+      const Index actual_kc = (std::min)(k+kc,depth)-k; // => rows of B', and cols of the A'
+
+      // In order to reduce the chance that a thread has to wait for the other,
+      // let's start by packing B'.
+      pack_rhs(blockB, rhs.getSubMapper(k,0), actual_kc, nc);
+
+      // Pack A_k to A' in a parallel fashion:
+      // each thread packs the sub block A_k,i to A'_i where i is the thread id.
+
+      // However, before copying to A'_i, we have to make sure that no other thread is still using it,
+      // i.e., we test that info[tid].users equals 0.
+      // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
+      while(info[tid].users!=0) {}
+      info[tid].users = threads;
+
+      pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length);
+
+      // Notify the other threads that the part A'_i is ready to go.
+      info[tid].sync = k;
+
+      // Computes C_i += A' * B' per A'_i
+      for(int shift=0; shift<threads; ++shift)
+      {
+        int i = (tid+shift)%threads;
+
+        // At this point we have to make sure that A'_i has been updated by the thread i,
+        // we use testAndSetOrdered to mimic a volatile access.
+        // However, no need to wait for the B' part which has been updated by the current thread!
+        if (shift>0) {
+          while(info[i].sync!=k) {
+          }
+        }
+
+        gebp(res.getSubMapper(info[i].lhs_start, 0), blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
+      }
+
+      // Then keep going as usual with the remaining B'
+      for(Index j=nc; j<cols; j+=nc)
+      {
+        const Index actual_nc = (std::min)(j+nc,cols)-j;
+
+        // pack B_k,j to B'
+        pack_rhs(blockB, rhs.getSubMapper(k,j), actual_kc, actual_nc);
+
+        // C_j += A' * B'
+        gebp(res.getSubMapper(0, j), blockA, blockB, rows, actual_kc, actual_nc, alpha);
+      }
+
+      // Release all the sub blocks A'_i of A' for the current thread,
+      // i.e., we simply decrement the number of users by 1
+      for(Index i=0; i<threads; ++i)
+#if !EIGEN_HAS_CXX11_ATOMIC
+        #pragma omp atomic
+#endif
+        info[i].users -= 1;
+    }
+  }
+  else
+#endif // EIGEN_HAS_OPENMP
+  {
+    EIGEN_UNUSED_VARIABLE(info);
+
+    // this is the sequential version!
+    std::size_t sizeA = kc*mc;
+    std::size_t sizeB = kc*nc;
+
+    ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
+    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
+
+    const bool pack_rhs_once = mc!=rows && kc==depth && nc==cols;
+
+    // For each horizontal panel of the rhs, and corresponding panel of the lhs...
+    for(Index i2=0; i2<rows; i2+=mc)
+    {
+      const Index actual_mc = (std::min)(i2+mc,rows)-i2;
+
+      for(Index k2=0; k2<depth; k2+=kc)
+      {
+        const Index actual_kc = (std::min)(k2+kc,depth)-k2;
+
+        // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
+        // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
+        // Note that this panel will be read as many times as the number of blocks in the rhs's
+        // horizontal panel which is, in practice, a very low number.
+        pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc);
+
+        // For each kc x nc block of the rhs's horizontal panel...
+        for(Index j2=0; j2<cols; j2+=nc)
+        {
+          const Index actual_nc = (std::min)(j2+nc,cols)-j2;
+
+          // We pack the rhs's block into a sequential chunk of memory (L2 caching)
+          // Note that this block will be read a very high number of times, which is equal to the number of
+          // micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
+          if((!pack_rhs_once) || i2==0)
+            pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc);
+
+          // Everything is packed, we can now call the panel * block kernel:
+          gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha);
+        }
+      }
+    }
+  }
+}
+
+};
+
+/*********************************************************************************
+*  Specialization of generic_product_impl for "large" GEMM, i.e.,
+*  implementation of the high level wrapper to general_matrix_matrix_product
+**********************************************************************************/
+
+template<typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest, typename BlockingType>
+struct gemm_functor
+{
+  gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, BlockingType& blocking)
+    : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking)
+  {}
+
+  void initParallelSession(Index num_threads) const
+  {
+    m_blocking.initParallel(m_lhs.rows(), m_rhs.cols(), m_lhs.cols(), num_threads);
+    m_blocking.allocateA();
+  }
+
+  void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo<Index>* info=0) const
+  {
+    if(cols==-1)
+      cols = m_rhs.cols();
+
+    Gemm::run(rows, cols, m_lhs.cols(),
+              &m_lhs.coeffRef(row,0), m_lhs.outerStride(),
+              &m_rhs.coeffRef(0,col), m_rhs.outerStride(),
+              (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.innerStride(), m_dest.outerStride(),
+              m_actualAlpha, m_blocking, info);
+  }
+
+  typedef typename Gemm::Traits Traits;
+
+  protected:
+    const Lhs& m_lhs;
+    const Rhs& m_rhs;
+    Dest& m_dest;
+    Scalar m_actualAlpha;
+    BlockingType& m_blocking;
+};
+
+template<int StorageOrder, typename LhsScalar, typename RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor=1,
+bool FiniteAtCompileTime = MaxRows!=Dynamic && MaxCols!=Dynamic && MaxDepth != Dynamic> class gemm_blocking_space;
+
+template<typename _LhsScalar, typename _RhsScalar>
+class level3_blocking
+{
+    typedef _LhsScalar LhsScalar;
+    typedef _RhsScalar RhsScalar;
+
+  protected:
+    LhsScalar* m_blockA;
+    RhsScalar* m_blockB;
+
+    Index m_mc;
+    Index m_nc;
+    Index m_kc;
+
+  public:
+
+    level3_blocking()
+      : m_blockA(0), m_blockB(0), m_mc(0), m_nc(0), m_kc(0)
+    {}
+
+    inline Index mc() const { return m_mc; }
+    inline Index nc() const { return m_nc; }
+    inline Index kc() const { return m_kc; }
+
+    inline LhsScalar* blockA() { return m_blockA; }
+    inline RhsScalar* blockB() { return m_blockB; }
+};
+
+template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
+class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true /* == FiniteAtCompileTime */>
+  : public level3_blocking<
+      typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type,
+      typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type>
+{
+    enum {
+      Transpose = StorageOrder==RowMajor,
+      ActualRows = Transpose ? MaxCols : MaxRows,
+      ActualCols = Transpose ? MaxRows : MaxCols
+    };
+    typedef typename conditional<Transpose,_RhsScalar,_LhsScalar>::type LhsScalar;
+    typedef typename conditional<Transpose,_LhsScalar,_RhsScalar>::type RhsScalar;
+    typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+    enum {
+      SizeA = ActualRows * MaxDepth,
+      SizeB = ActualCols * MaxDepth
+    };
+
+#if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES
+    EIGEN_ALIGN_MAX LhsScalar m_staticA[SizeA];
+    EIGEN_ALIGN_MAX RhsScalar m_staticB[SizeB];
+#else
+    EIGEN_ALIGN_MAX char m_staticA[SizeA * sizeof(LhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES-1];
+    EIGEN_ALIGN_MAX char m_staticB[SizeB * sizeof(RhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES-1];
+#endif
+
+  public:
+
+    gemm_blocking_space(Index /*rows*/, Index /*cols*/, Index /*depth*/, Index /*num_threads*/, bool /*full_rows = false*/)
+    {
+      this->m_mc = ActualRows;
+      this->m_nc = ActualCols;
+      this->m_kc = MaxDepth;
+#if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES
+      this->m_blockA = m_staticA;
+      this->m_blockB = m_staticB;
+#else
+      this->m_blockA = reinterpret_cast<LhsScalar*>((internal::UIntPtr(m_staticA) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
+      this->m_blockB = reinterpret_cast<RhsScalar*>((internal::UIntPtr(m_staticB) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
+#endif
+    }
+
+    void initParallel(Index, Index, Index, Index)
+    {}
+
+    inline void allocateA() {}
+    inline void allocateB() {}
+    inline void allocateAll() {}
+};
+
+template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
+class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, false>
+  : public level3_blocking<
+      typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type,
+      typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type>
+{
+    enum {
+      Transpose = StorageOrder==RowMajor
+    };
+    typedef typename conditional<Transpose,_RhsScalar,_LhsScalar>::type LhsScalar;
+    typedef typename conditional<Transpose,_LhsScalar,_RhsScalar>::type RhsScalar;
+    typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+
+    Index m_sizeA;
+    Index m_sizeB;
+
+  public:
+
+    gemm_blocking_space(Index rows, Index cols, Index depth, Index num_threads, bool l3_blocking)
+    {
+      this->m_mc = Transpose ? cols : rows;
+      this->m_nc = Transpose ? rows : cols;
+      this->m_kc = depth;
+
+      if(l3_blocking)
+      {
+        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc, num_threads);
+      }
+      else  // no l3 blocking
+      {
+        Index n = this->m_nc;
+        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n, num_threads);
+      }
+
+      m_sizeA = this->m_mc * this->m_kc;
+      m_sizeB = this->m_kc * this->m_nc;
+    }
+
+    void initParallel(Index rows, Index cols, Index depth, Index num_threads)
+    {
+      this->m_mc = Transpose ? cols : rows;
+      this->m_nc = Transpose ? rows : cols;
+      this->m_kc = depth;
+
+      eigen_internal_assert(this->m_blockA==0 && this->m_blockB==0);
+      Index m = this->m_mc;
+      computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc, num_threads);
+      m_sizeA = this->m_mc * this->m_kc;
+      m_sizeB = this->m_kc * this->m_nc;
+    }
+
+    void allocateA()
+    {
+      if(this->m_blockA==0)
+        this->m_blockA = aligned_new<LhsScalar>(m_sizeA);
+    }
+
+    void allocateB()
+    {
+      if(this->m_blockB==0)
+        this->m_blockB = aligned_new<RhsScalar>(m_sizeB);
+    }
+
+    void allocateAll()
+    {
+      allocateA();
+      allocateB();
+    }
+
+    ~gemm_blocking_space()
+    {
+      aligned_delete(this->m_blockA, m_sizeA);
+      aligned_delete(this->m_blockB, m_sizeB);
+    }
+};
+
+} // end namespace internal
+
+namespace internal {
+
+template<typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
+  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> >
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  typedef typename Lhs::Scalar LhsScalar;
+  typedef typename Rhs::Scalar RhsScalar;
+
+  typedef internal::blas_traits<Lhs> LhsBlasTraits;
+  typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+  typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
+
+  typedef internal::blas_traits<Rhs> RhsBlasTraits;
+  typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+  typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
+
+  enum {
+    MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime)
+  };
+
+  typedef generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> lazyproduct;
+
+  template<typename Dst>
+  static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=404 for a discussion and helper program
+    // to determine the following heuristic.
+    // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h,
+    // unless it has been specialized by the user or for a given architecture.
+    // Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs.
+    // I'm not sure it is still required.
+    if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
+      lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar,Scalar>());
+    else
+    {
+      dst.setZero();
+      scaleAndAddTo(dst, lhs, rhs, Scalar(1));
+    }
+  }
+
+  template<typename Dst>
+  static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
+      lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar,Scalar>());
+    else
+      scaleAndAddTo(dst,lhs, rhs, Scalar(1));
+  }
+
+  template<typename Dst>
+  static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
+      lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op<typename Dst::Scalar,Scalar>());
+    else
+      scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
+  }
+
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha)
+  {
+    eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols());
+    if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0)
+      return;
+
+    if (dst.cols() == 1)
+    {
+      // Fallback to GEMV if either the lhs or rhs is a runtime vector
+      typename Dest::ColXpr dst_vec(dst.col(0));
+      return internal::generic_product_impl<Lhs,typename Rhs::ConstColXpr,DenseShape,DenseShape,GemvProduct>
+        ::scaleAndAddTo(dst_vec, a_lhs, a_rhs.col(0), alpha);
+    }
+    else if (dst.rows() == 1)
+    {
+      // Fallback to GEMV if either the lhs or rhs is a runtime vector
+      typename Dest::RowXpr dst_vec(dst.row(0));
+      return internal::generic_product_impl<typename Lhs::ConstRowXpr,Rhs,DenseShape,DenseShape,GemvProduct>
+        ::scaleAndAddTo(dst_vec, a_lhs.row(0), a_rhs, alpha);
+    }
+
+    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
+
+    Scalar actualAlpha = combine_scalar_factors(alpha, a_lhs, a_rhs);
+
+    typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar,
+            Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType;
+
+    typedef internal::gemm_functor<
+      Scalar, Index,
+      internal::general_matrix_matrix_product<
+        Index,
+        LhsScalar, (ActualLhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate),
+        RhsScalar, (ActualRhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
+        (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,
+        Dest::InnerStrideAtCompileTime>,
+      ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor;
+
+    BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
+    internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>
+        (GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), a_lhs.cols(), Dest::Flags&RowMajorBit);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
new file mode 100644
index 000000000..6ba0d9bdb
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -0,0 +1,317 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
+
+namespace Eigen { 
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjLhs, bool ConjRhs>
+struct selfadjoint_rank1_update;
+
+namespace internal {
+
+/**********************************************************************
+* This file implements a general A * B product while
+* evaluating only one triangular part of the product.
+* This is a more general version of self adjoint product (C += A A^T)
+* as the level 3 SYRK Blas routine.
+**********************************************************************/
+
+// forward declarations (defined at the end of this file)
+template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int ResInnerStride, int UpLo>
+struct tribb_kernel;
+  
+/* Optimized matrix-matrix product evaluating only one triangular half */
+template <typename Index,
+          typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+          typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
+                              int ResStorageOrder, int ResInnerStride, int  UpLo, int Version = Specialized>
+struct general_matrix_matrix_triangular_product;
+
+// as usual if the result is row major => we transpose the product
+template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+                          typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
+                          int ResInnerStride, int  UpLo, int Version>
+struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride,UpLo,Version>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride,
+                                      const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resIncr, Index resStride,
+                                      const ResScalar& alpha, level3_blocking<RhsScalar,LhsScalar>& blocking)
+  {
+    general_matrix_matrix_triangular_product<Index,
+        RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
+        LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
+        ColMajor, ResInnerStride, UpLo==Lower?Upper:Lower>
+      ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resIncr,resStride,alpha,blocking);
+  }
+};
+
+template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+                          typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
+                          int ResInnerStride, int  UpLo, int Version>
+struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,UpLo,Version>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
+                                      const RhsScalar* _rhs, Index rhsStride,
+                                      ResScalar* _res, Index resIncr, Index resStride,
+                                      const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking)
+  {
+    typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+
+    typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
+    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
+    LhsMapper lhs(_lhs,lhsStride);
+    RhsMapper rhs(_rhs,rhsStride);
+    ResMapper res(_res, resStride, resIncr);
+
+    Index kc = blocking.kc();
+    Index mc = (std::min)(size,blocking.mc());
+
+    // !!! mc must be a multiple of nr:
+    if(mc > Traits::nr)
+      mc = (mc/Traits::nr)*Traits::nr;
+
+    std::size_t sizeA = kc*mc;
+    std::size_t sizeB = kc*size;
+
+    ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
+    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
+
+    gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
+    gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+    gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
+    tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, ResInnerStride, UpLo> sybb;
+
+    for(Index k2=0; k2<depth; k2+=kc)
+    {
+      const Index actual_kc = (std::min)(k2+kc,depth)-k2;
+
+      // note that the actual rhs is the transpose/adjoint of mat
+      pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, size);
+
+      for(Index i2=0; i2<size; i2+=mc)
+      {
+        const Index actual_mc = (std::min)(i2+mc,size)-i2;
+
+        pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
+
+        // the selected actual_mc * size panel of res is split into three different part:
+        //  1 - before the diagonal => processed with gebp or skipped
+        //  2 - the actual_mc x actual_mc symmetric block => processed with a special kernel
+        //  3 - after the diagonal => processed with gebp or skipped
+        if (UpLo==Lower)
+          gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc,
+               (std::min)(size,i2), alpha, -1, -1, 0, 0);
+
+        sybb(_res+resStride*i2 + resIncr*i2, resIncr, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
+
+        if (UpLo==Upper)
+        {
+          Index j2 = i2+actual_mc;
+          gebp(res.getSubMapper(i2, j2), blockA, blockB+actual_kc*j2, actual_mc,
+               actual_kc, (std::max)(Index(0), size-j2), alpha, -1, -1, 0, 0);
+        }
+      }
+    }
+  }
+};
+
+// Optimized packed Block * packed Block product kernel evaluating only one given triangular part
+// This kernel is built on top of the gebp kernel:
+// - the current destination block is processed per panel of actual_mc x BlockSize
+//   where BlockSize is set to the minimal value allowing gebp to be as fast as possible
+// - then, as usual, each panel is split into three parts along the diagonal,
+//   the sub blocks above and below the diagonal are processed as usual,
+//   while the triangular block overlapping the diagonal is evaluated into a
+//   small temporary buffer which is then accumulated into the result using a
+//   triangular traversal.
+template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int ResInnerStride, int UpLo>
+struct tribb_kernel
+{
+  typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits;
+  typedef typename Traits::ResScalar ResScalar;
+
+  enum {
+    BlockSize  = meta_least_common_multiple<EIGEN_PLAIN_ENUM_MAX(mr,nr),EIGEN_PLAIN_ENUM_MIN(mr,nr)>::ret
+  };
+  void operator()(ResScalar* _res, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
+  {
+    typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
+    typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned> BufferMapper;
+    ResMapper res(_res, resStride, resIncr);
+    gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel1;
+    gebp_kernel<LhsScalar, RhsScalar, Index, BufferMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel2;
+
+    Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer((internal::constructor_without_unaligned_array_assert()));
+
+    // let's process the block per panel of actual_mc x BlockSize,
+    // again, each is split into three parts, etc.
+    for (Index j=0; j<size; j+=BlockSize)
+    {
+      Index actualBlockSize = std::min<Index>(BlockSize,size - j);
+      const RhsScalar* actual_b = blockB+j*depth;
+
+      if(UpLo==Upper)
+        gebp_kernel1(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
+                     -1, -1, 0, 0);
+      
+      // selfadjoint micro block
+      {
+        Index i = j;
+        buffer.setZero();
+        // 1 - apply the kernel on the temporary buffer
+        gebp_kernel2(BufferMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
+                     -1, -1, 0, 0);
+
+        // 2 - triangular accumulation
+        for(Index j1=0; j1<actualBlockSize; ++j1)
+        {
+          typename ResMapper::LinearMapper r = res.getLinearMapper(i,j+j1);
+          for(Index i1=UpLo==Lower ? j1 : 0;
+              UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1)
+            r(i1) += buffer(i1,j1);
+        }
+      }
+
+      if(UpLo==Lower)
+      {
+        Index i = j+actualBlockSize;
+        gebp_kernel1(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i, 
+                     depth, actualBlockSize, alpha, -1, -1, 0, 0);
+      }
+    }
+  }
+};
+
+} // end namespace internal
+
+// high level API
+
+template<typename MatrixType, typename ProductType, int UpLo, bool IsOuterProduct>
+struct general_product_to_triangular_selector;
+
+
+template<typename MatrixType, typename ProductType, int UpLo>
+struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true>
+{
+  static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha, bool beta)
+  {
+    typedef typename MatrixType::Scalar Scalar;
+    
+    typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs;
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs;
+    typedef typename internal::remove_all<ActualLhs>::type _ActualLhs;
+    typename internal::add_const_on_value_type<ActualLhs>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
+    
+    typedef typename internal::remove_all<typename ProductType::RhsNested>::type Rhs;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhs;
+    typedef typename internal::remove_all<ActualRhs>::type _ActualRhs;
+    typename internal::add_const_on_value_type<ActualRhs>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
+
+    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived());
+
+    if(!beta)
+      mat.template triangularView<UpLo>().setZero();
+
+    enum {
+      StorageOrder = (internal::traits<MatrixType>::Flags&RowMajorBit) ? RowMajor : ColMajor,
+      UseLhsDirectly = _ActualLhs::InnerStrideAtCompileTime==1,
+      UseRhsDirectly = _ActualRhs::InnerStrideAtCompileTime==1
+    };
+    
+    internal::gemv_static_vector_if<Scalar,Lhs::SizeAtCompileTime,Lhs::MaxSizeAtCompileTime,!UseLhsDirectly> static_lhs;
+    ei_declare_aligned_stack_constructed_variable(Scalar, actualLhsPtr, actualLhs.size(),
+      (UseLhsDirectly ? const_cast<Scalar*>(actualLhs.data()) : static_lhs.data()));
+    if(!UseLhsDirectly) Map<typename _ActualLhs::PlainObject>(actualLhsPtr, actualLhs.size()) = actualLhs;
+    
+    internal::gemv_static_vector_if<Scalar,Rhs::SizeAtCompileTime,Rhs::MaxSizeAtCompileTime,!UseRhsDirectly> static_rhs;
+    ei_declare_aligned_stack_constructed_variable(Scalar, actualRhsPtr, actualRhs.size(),
+      (UseRhsDirectly ? const_cast<Scalar*>(actualRhs.data()) : static_rhs.data()));
+    if(!UseRhsDirectly) Map<typename _ActualRhs::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
+    
+    
+    selfadjoint_rank1_update<Scalar,Index,StorageOrder,UpLo,
+                              LhsBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex,
+                              RhsBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex>
+          ::run(actualLhs.size(), mat.data(), mat.outerStride(), actualLhsPtr, actualRhsPtr, actualAlpha);
+  }
+};
+
+template<typename MatrixType, typename ProductType, int UpLo>
+struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
+{
+  static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha, bool beta)
+  {
+    typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs;
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs;
+    typedef typename internal::remove_all<ActualLhs>::type _ActualLhs;
+    typename internal::add_const_on_value_type<ActualLhs>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
+    
+    typedef typename internal::remove_all<typename ProductType::RhsNested>::type Rhs;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhs;
+    typedef typename internal::remove_all<ActualRhs>::type _ActualRhs;
+    typename internal::add_const_on_value_type<ActualRhs>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
+
+    typename ProductType::Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived());
+
+    if(!beta)
+      mat.template triangularView<UpLo>().setZero();
+
+    enum {
+      IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0,
+      LhsIsRowMajor = _ActualLhs::Flags&RowMajorBit ? 1 : 0,
+      RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0,
+      SkipDiag = (UpLo&(UnitDiag|ZeroDiag))!=0
+    };
+
+    Index size = mat.cols();
+    if(SkipDiag)
+      size--;
+    Index depth = actualLhs.cols();
+
+    typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor,typename Lhs::Scalar,typename Rhs::Scalar,
+          MatrixType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime, _ActualRhs::MaxColsAtCompileTime> BlockingType;
+
+    BlockingType blocking(size, size, depth, 1, false);
+
+    internal::general_matrix_matrix_triangular_product<Index,
+      typename Lhs::Scalar, LhsIsRowMajor ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
+      typename Rhs::Scalar, RhsIsRowMajor ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
+      IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo&(Lower|Upper)>
+      ::run(size, depth,
+            &actualLhs.coeffRef(SkipDiag&&(UpLo&Lower)==Lower ? 1 : 0,0), actualLhs.outerStride(),
+            &actualRhs.coeffRef(0,SkipDiag&&(UpLo&Upper)==Upper ? 1 : 0), actualRhs.outerStride(),
+            mat.data() + (SkipDiag ? (bool(IsRowMajor) != ((UpLo&Lower)==Lower) ? mat.innerStride() : mat.outerStride() ) : 0),
+            mat.innerStride(), mat.outerStride(), actualAlpha, blocking);
+  }
+};
+
+template<typename MatrixType, unsigned int UpLo>
+template<typename ProductType>
+EIGEN_DEVICE_FUNC TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta)
+{
+  EIGEN_STATIC_ASSERT((UpLo&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED);
+  eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols());
+
+  general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta);
+
+  return derived();
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
new file mode 100644
index 000000000..9a650ec23
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
@@ -0,0 +1,145 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to BLAS F77
+ *   Level 3 BLAS SYRK/HERK implementation.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int UpLo>
+struct general_matrix_matrix_rankupdate :
+       general_matrix_matrix_triangular_product<
+         Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,1,UpLo,BuiltIn> {};
+
+
+// try to go to BLAS specialization
+#define EIGEN_BLAS_RANKUPDATE_SPECIALIZE(Scalar) \
+template <typename Index, int LhsStorageOrder, bool ConjugateLhs, \
+                          int RhsStorageOrder, bool ConjugateRhs, int  UpLo> \
+struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,ConjugateLhs, \
+               Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,1,UpLo,Specialized> { \
+  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \
+                          const Scalar* rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) \
+  { \
+    if ( lhs==rhs && ((UpLo&(Lower|Upper))==UpLo) ) { \
+      general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \
+      ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \
+    } else { \
+      general_matrix_matrix_triangular_product<Index, \
+        Scalar, LhsStorageOrder, ConjugateLhs, \
+        Scalar, RhsStorageOrder, ConjugateRhs, \
+        ColMajor, 1, UpLo, BuiltIn> \
+      ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resIncr,resStride,alpha,blocking); \
+    } \
+  } \
+};
+
+EIGEN_BLAS_RANKUPDATE_SPECIALIZE(double)
+EIGEN_BLAS_RANKUPDATE_SPECIALIZE(float)
+// TODO handle complex cases
+// EIGEN_BLAS_RANKUPDATE_SPECIALIZE(dcomplex)
+// EIGEN_BLAS_RANKUPDATE_SPECIALIZE(scomplex)
+
+// SYRK for float/double
+#define EIGEN_BLAS_RANKUPDATE_R(EIGTYPE, BLASTYPE, BLASFUNC) \
+template <typename Index, int AStorageOrder, bool ConjugateA, int  UpLo> \
+struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
+  enum { \
+    IsLower = (UpLo&Lower) == Lower, \
+    LowUp = IsLower ? Lower : Upper, \
+    conjA = ((AStorageOrder==ColMajor) && ConjugateA) ? 1 : 0 \
+  }; \
+  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
+                          const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
+  { \
+  /* typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs;*/ \
+\
+   BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \
+   char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'T':'N'); \
+   EIGTYPE beta(1); \
+   BLASFUNC(&uplo, &trans, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), lhs, &lda, (const BLASTYPE*)&numext::real_ref(beta), res, &ldc); \
+  } \
+};
+
+// HERK for complex data
+#define EIGEN_BLAS_RANKUPDATE_C(EIGTYPE, BLASTYPE, RTYPE, BLASFUNC) \
+template <typename Index, int AStorageOrder, bool ConjugateA, int  UpLo> \
+struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
+  enum { \
+    IsLower = (UpLo&Lower) == Lower, \
+    LowUp = IsLower ? Lower : Upper, \
+    conjA = (((AStorageOrder==ColMajor) && ConjugateA) || ((AStorageOrder==RowMajor) && !ConjugateA)) ? 1 : 0 \
+  }; \
+  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
+                          const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
+  { \
+   typedef Matrix<EIGTYPE, Dynamic, Dynamic, AStorageOrder> MatrixType; \
+\
+   BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \
+   char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'C':'N'); \
+   RTYPE alpha_, beta_; \
+   const EIGTYPE* a_ptr; \
+\
+   alpha_ = alpha.real(); \
+   beta_ = 1.0; \
+/* Copy with conjugation in some cases*/ \
+   MatrixType a; \
+   if (conjA) { \
+     Map<const MatrixType, 0, OuterStride<> > mapA(lhs,n,k,OuterStride<>(lhsStride)); \
+     a = mapA.conjugate(); \
+     lda = a.outerStride(); \
+     a_ptr = a.data(); \
+   } else a_ptr=lhs; \
+   BLASFUNC(&uplo, &trans, &n, &k, &alpha_, (BLASTYPE*)a_ptr, &lda, &beta_, (BLASTYPE*)res, &ldc); \
+  } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk)
+EIGEN_BLAS_RANKUPDATE_R(float,  float,  ssyrk)
+#else
+EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_)
+EIGEN_BLAS_RANKUPDATE_R(float,  float,  ssyrk_)
+#endif
+
+// TODO hanlde complex cases
+// EIGEN_BLAS_RANKUPDATE_C(dcomplex, double, double, zherk_)
+// EIGEN_BLAS_RANKUPDATE_C(scomplex, float,  float, cherk_)
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
new file mode 100644
index 000000000..71abf4013
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
@@ -0,0 +1,124 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to BLAS F77
+ *   General matrix-matrix product functionality based on ?GEMM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/**********************************************************************
+* This file implements general matrix-matrix multiplication using BLAS
+* gemm function via partial specialization of
+* general_matrix_matrix_product::run(..) method for float, double,
+* std::complex<float> and std::complex<double> types
+**********************************************************************/
+
+// gemm specialization
+
+#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASFUNC) \
+template< \
+  typename Index, \
+  int LhsStorageOrder, bool ConjugateLhs, \
+  int RhsStorageOrder, bool ConjugateRhs> \
+struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1> \
+{ \
+typedef gebp_traits<EIGTYPE,EIGTYPE> Traits; \
+\
+static void run(Index rows, Index cols, Index depth, \
+  const EIGTYPE* _lhs, Index lhsStride, \
+  const EIGTYPE* _rhs, Index rhsStride, \
+  EIGTYPE* res, Index resIncr, Index resStride, \
+  EIGTYPE alpha, \
+  level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, \
+  GemmParallelInfo<Index>* /*info = 0*/) \
+{ \
+  using std::conj; \
+\
+  EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
+  eigen_assert(resIncr == 1); \
+  char transa, transb; \
+  BlasIndex m, n, k, lda, ldb, ldc; \
+  const EIGTYPE *a, *b; \
+  EIGTYPE beta(1); \
+  MatrixX##EIGPREFIX a_tmp, b_tmp; \
+\
+/* Set transpose options */ \
+  transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
+  transb = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
+\
+/* Set m, n, k */ \
+  m = convert_index<BlasIndex>(rows);  \
+  n = convert_index<BlasIndex>(cols);  \
+  k = convert_index<BlasIndex>(depth); \
+\
+/* Set lda, ldb, ldc */ \
+  lda = convert_index<BlasIndex>(lhsStride); \
+  ldb = convert_index<BlasIndex>(rhsStride); \
+  ldc = convert_index<BlasIndex>(resStride); \
+\
+/* Set a, b, c */ \
+  if ((LhsStorageOrder==ColMajor) && (ConjugateLhs)) { \
+    Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,k,OuterStride<>(lhsStride)); \
+    a_tmp = lhs.conjugate(); \
+    a = a_tmp.data(); \
+    lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
+  } else a = _lhs; \
+\
+  if ((RhsStorageOrder==ColMajor) && (ConjugateRhs)) { \
+    Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,k,n,OuterStride<>(rhsStride)); \
+    b_tmp = rhs.conjugate(); \
+    b = b_tmp.data(); \
+    ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
+  } else b = _rhs; \
+\
+  BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+}};
+
+#ifdef EIGEN_USE_MKL
+GEMM_SPECIALIZATION(double,   d,  double, dgemm)
+GEMM_SPECIALIZATION(float,    f,  float,  sgemm)
+GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, zgemm)
+GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8,  cgemm)
+#else
+GEMM_SPECIALIZATION(double,   d,  double, dgemm_)
+GEMM_SPECIALIZATION(float,    f,  float,  sgemm_)
+GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_)
+GEMM_SPECIALIZATION(scomplex, cf, float,  cgemm_)
+#endif
+
+} // end namespase internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixVector.h b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixVector.h
new file mode 100644
index 000000000..dfb6aebce
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -0,0 +1,518 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
+#define EIGEN_GENERAL_MATRIX_VECTOR_H
+
+namespace Eigen {
+
+namespace internal {
+
+enum GEMVPacketSizeType {
+  GEMVPacketFull = 0,
+  GEMVPacketHalf,
+  GEMVPacketQuarter
+};
+
+template <int N, typename T1, typename T2, typename T3>
+struct gemv_packet_cond { typedef T3 type; };
+
+template <typename T1, typename T2, typename T3>
+struct gemv_packet_cond<GEMVPacketFull, T1, T2, T3> { typedef T1 type; };
+
+template <typename T1, typename T2, typename T3>
+struct gemv_packet_cond<GEMVPacketHalf, T1, T2, T3> { typedef T2 type; };
+
+template<typename LhsScalar, typename RhsScalar, int _PacketSize=GEMVPacketFull>
+class gemv_traits
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size)                        \
+  typedef typename gemv_packet_cond<packet_size,                                  \
+                                    typename packet_traits<name ## Scalar>::type, \
+                                    typename packet_traits<name ## Scalar>::half, \
+                                    typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
+  prefix ## name ## Packet
+
+  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+#undef PACKET_DECL_COND_PREFIX
+
+public:
+  enum {
+        Vectorizable = unpacket_traits<_LhsPacket>::vectorizable &&
+        unpacket_traits<_RhsPacket>::vectorizable &&
+        int(unpacket_traits<_LhsPacket>::size)==int(unpacket_traits<_RhsPacket>::size),
+        LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+        RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
+        ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1
+  };
+
+  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+};
+
+
+/* Optimized col-major matrix * vector product:
+ * This algorithm processes the matrix per vertical panels,
+ * which are then processed horizontaly per chunck of 8*PacketSize x 1 vertical segments.
+ *
+ * Mixing type logic: C += alpha * A * B
+ *  |  A  |  B  |alpha| comments
+ *  |real |cplx |cplx | no vectorization
+ *  |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
+ *  |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
+ *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
+ *
+ * The same reasoning apply for the transposed case.
+ */
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
+{
+  typedef gemv_traits<LhsScalar,RhsScalar> Traits;
+  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
+  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;
+
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+  typedef typename Traits::LhsPacket LhsPacket;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::ResPacket ResPacket;
+
+  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
+  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
+  typedef typename HalfTraits::ResPacket ResPacketHalf;
+
+  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
+  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
+  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
+
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
+  Index rows, Index cols,
+  const LhsMapper& lhs,
+  const RhsMapper& rhs,
+        ResScalar* res, Index resIncr,
+  RhsScalar alpha);
+};
+
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
+  Index rows, Index cols,
+  const LhsMapper& alhs,
+  const RhsMapper& rhs,
+        ResScalar* res, Index resIncr,
+  RhsScalar alpha)
+{
+  EIGEN_UNUSED_VARIABLE(resIncr);
+  eigen_internal_assert(resIncr==1);
+
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function
+  // This helps GCC to generate propoer code.
+  LhsMapper lhs(alhs);
+
+  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
+  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
+  conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
+  conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;
+
+  const Index lhsStride = lhs.stride();
+  // TODO: for padded aligned inputs, we could enable aligned reads
+  enum { LhsAlignment = Unaligned,
+         ResPacketSize = Traits::ResPacketSize,
+         ResPacketSizeHalf = HalfTraits::ResPacketSize,
+         ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
+         LhsPacketSize = Traits::LhsPacketSize,
+         HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
+         HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
+  };
+
+  const Index n8 = rows-8*ResPacketSize+1;
+  const Index n4 = rows-4*ResPacketSize+1;
+  const Index n3 = rows-3*ResPacketSize+1;
+  const Index n2 = rows-2*ResPacketSize+1;
+  const Index n1 = rows-1*ResPacketSize+1;
+  const Index n_half = rows-1*ResPacketSizeHalf+1;
+  const Index n_quarter = rows-1*ResPacketSizeQuarter+1;
+
+  // TODO: improve the following heuristic:
+  const Index block_cols = cols<128 ? cols : (lhsStride*sizeof(LhsScalar)<32000?16:4);
+  ResPacket palpha = pset1<ResPacket>(alpha);
+  ResPacketHalf palpha_half = pset1<ResPacketHalf>(alpha);
+  ResPacketQuarter palpha_quarter = pset1<ResPacketQuarter>(alpha);
+
+  for(Index j2=0; j2<cols; j2+=block_cols)
+  {
+    Index jend = numext::mini(j2+block_cols,cols);
+    Index i=0;
+    for(; i<n8; i+=ResPacketSize*8)
+    {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+                c1 = pset1<ResPacket>(ResScalar(0)),
+                c2 = pset1<ResPacket>(ResScalar(0)),
+                c3 = pset1<ResPacket>(ResScalar(0)),
+                c4 = pset1<ResPacket>(ResScalar(0)),
+                c5 = pset1<ResPacket>(ResScalar(0)),
+                c6 = pset1<ResPacket>(ResScalar(0)),
+                c7 = pset1<ResPacket>(ResScalar(0));
+
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
+        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
+        c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
+        c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*4,j),b0,c4);
+        c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*5,j),b0,c5);
+        c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*6,j),b0,c6);
+        c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*7,j),b0,c7);
+      }
+      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
+      pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
+      pstoreu(res+i+ResPacketSize*4, pmadd(c4,palpha,ploadu<ResPacket>(res+i+ResPacketSize*4)));
+      pstoreu(res+i+ResPacketSize*5, pmadd(c5,palpha,ploadu<ResPacket>(res+i+ResPacketSize*5)));
+      pstoreu(res+i+ResPacketSize*6, pmadd(c6,palpha,ploadu<ResPacket>(res+i+ResPacketSize*6)));
+      pstoreu(res+i+ResPacketSize*7, pmadd(c7,palpha,ploadu<ResPacket>(res+i+ResPacketSize*7)));
+    }
+    if(i<n4)
+    {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+                c1 = pset1<ResPacket>(ResScalar(0)),
+                c2 = pset1<ResPacket>(ResScalar(0)),
+                c3 = pset1<ResPacket>(ResScalar(0));
+
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
+        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
+        c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
+      }
+      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
+      pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
+
+      i+=ResPacketSize*4;
+    }
+    if(i<n3)
+    {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+                c1 = pset1<ResPacket>(ResScalar(0)),
+                c2 = pset1<ResPacket>(ResScalar(0));
+
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
+        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
+      }
+      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
+
+      i+=ResPacketSize*3;
+    }
+    if(i<n2)
+    {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+                c1 = pset1<ResPacket>(ResScalar(0));
+
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
+      }
+      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+      i+=ResPacketSize*2;
+    }
+    if(i<n1)
+    {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0));
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
+      }
+      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+      i+=ResPacketSize;
+    }
+    if(HasHalf && i<n_half)
+    {
+      ResPacketHalf c0 = pset1<ResPacketHalf>(ResScalar(0));
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacketHalf b0 = pset1<RhsPacketHalf>(rhs(j,0));
+        c0 = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i+0,j),b0,c0);
+      }
+      pstoreu(res+i+ResPacketSizeHalf*0, pmadd(c0,palpha_half,ploadu<ResPacketHalf>(res+i+ResPacketSizeHalf*0)));
+      i+=ResPacketSizeHalf;
+    }
+    if(HasQuarter && i<n_quarter)
+    {
+      ResPacketQuarter c0 = pset1<ResPacketQuarter>(ResScalar(0));
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacketQuarter b0 = pset1<RhsPacketQuarter>(rhs(j,0));
+        c0 = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i+0,j),b0,c0);
+      }
+      pstoreu(res+i+ResPacketSizeQuarter*0, pmadd(c0,palpha_quarter,ploadu<ResPacketQuarter>(res+i+ResPacketSizeQuarter*0)));
+      i+=ResPacketSizeQuarter;
+    }
+    for(;i<rows;++i)
+    {
+      ResScalar c0(0);
+      for(Index j=j2; j<jend; j+=1)
+        c0 += cj.pmul(lhs(i,j), rhs(j,0));
+      res[i] += alpha*c0;
+    }
+  }
+}
+
+/* Optimized row-major matrix * vector product:
+ * This algorithm processes 4 rows at once that allows to both reduce
+ * the number of load/stores of the result by a factor 4 and to reduce
+ * the instruction dependency. Moreover, we know that all bands have the
+ * same alignment pattern.
+ *
+ * Mixing type logic:
+ *  - alpha is always a complex (or converted to a complex)
+ *  - no vectorization
+ */
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
+{
+  typedef gemv_traits<LhsScalar,RhsScalar> Traits;
+  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
+  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;
+
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+  typedef typename Traits::LhsPacket LhsPacket;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::ResPacket ResPacket;
+
+  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
+  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
+  typedef typename HalfTraits::ResPacket ResPacketHalf;
+
+  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
+  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
+  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
+
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
+  Index rows, Index cols,
+  const LhsMapper& lhs,
+  const RhsMapper& rhs,
+        ResScalar* res, Index resIncr,
+  ResScalar alpha);
+};
+
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
+  Index rows, Index cols,
+  const LhsMapper& alhs,
+  const RhsMapper& rhs,
+  ResScalar* res, Index resIncr,
+  ResScalar alpha)
+{
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function
+  // This helps GCC to generate propoer code.
+  LhsMapper lhs(alhs);
+
+  eigen_internal_assert(rhs.stride()==1);
+  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
+  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
+  conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
+  conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;
+
+  // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
+  //       processing 8 rows at once might be counter productive wrt cache.
+  const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7;
+  const Index n4 = rows-3;
+  const Index n2 = rows-1;
+
+  // TODO: for padded aligned inputs, we could enable aligned reads
+  enum { LhsAlignment = Unaligned,
+         ResPacketSize = Traits::ResPacketSize,
+         ResPacketSizeHalf = HalfTraits::ResPacketSize,
+         ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
+         LhsPacketSize = Traits::LhsPacketSize,
+         LhsPacketSizeHalf = HalfTraits::LhsPacketSize,
+         LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize,
+         HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
+         HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
+  };
+
+  Index i=0;
+  for(; i<n8; i+=8)
+  {
+    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+              c1 = pset1<ResPacket>(ResScalar(0)),
+              c2 = pset1<ResPacket>(ResScalar(0)),
+              c3 = pset1<ResPacket>(ResScalar(0)),
+              c4 = pset1<ResPacket>(ResScalar(0)),
+              c5 = pset1<ResPacket>(ResScalar(0)),
+              c6 = pset1<ResPacket>(ResScalar(0)),
+              c7 = pset1<ResPacket>(ResScalar(0));
+
+    Index j=0;
+    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
+    {
+      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
+
+      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
+      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
+      c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
+      c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
+      c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+4,j),b0,c4);
+      c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+5,j),b0,c5);
+      c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+6,j),b0,c6);
+      c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+7,j),b0,c7);
+    }
+    ResScalar cc0 = predux(c0);
+    ResScalar cc1 = predux(c1);
+    ResScalar cc2 = predux(c2);
+    ResScalar cc3 = predux(c3);
+    ResScalar cc4 = predux(c4);
+    ResScalar cc5 = predux(c5);
+    ResScalar cc6 = predux(c6);
+    ResScalar cc7 = predux(c7);
+    for(; j<cols; ++j)
+    {
+      RhsScalar b0 = rhs(j,0);
+
+      cc0 += cj.pmul(lhs(i+0,j), b0);
+      cc1 += cj.pmul(lhs(i+1,j), b0);
+      cc2 += cj.pmul(lhs(i+2,j), b0);
+      cc3 += cj.pmul(lhs(i+3,j), b0);
+      cc4 += cj.pmul(lhs(i+4,j), b0);
+      cc5 += cj.pmul(lhs(i+5,j), b0);
+      cc6 += cj.pmul(lhs(i+6,j), b0);
+      cc7 += cj.pmul(lhs(i+7,j), b0);
+    }
+    res[(i+0)*resIncr] += alpha*cc0;
+    res[(i+1)*resIncr] += alpha*cc1;
+    res[(i+2)*resIncr] += alpha*cc2;
+    res[(i+3)*resIncr] += alpha*cc3;
+    res[(i+4)*resIncr] += alpha*cc4;
+    res[(i+5)*resIncr] += alpha*cc5;
+    res[(i+6)*resIncr] += alpha*cc6;
+    res[(i+7)*resIncr] += alpha*cc7;
+  }
+  for(; i<n4; i+=4)
+  {
+    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+              c1 = pset1<ResPacket>(ResScalar(0)),
+              c2 = pset1<ResPacket>(ResScalar(0)),
+              c3 = pset1<ResPacket>(ResScalar(0));
+
+    Index j=0;
+    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
+    {
+      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
+
+      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
+      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
+      c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
+      c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
+    }
+    ResScalar cc0 = predux(c0);
+    ResScalar cc1 = predux(c1);
+    ResScalar cc2 = predux(c2);
+    ResScalar cc3 = predux(c3);
+    for(; j<cols; ++j)
+    {
+      RhsScalar b0 = rhs(j,0);
+
+      cc0 += cj.pmul(lhs(i+0,j), b0);
+      cc1 += cj.pmul(lhs(i+1,j), b0);
+      cc2 += cj.pmul(lhs(i+2,j), b0);
+      cc3 += cj.pmul(lhs(i+3,j), b0);
+    }
+    res[(i+0)*resIncr] += alpha*cc0;
+    res[(i+1)*resIncr] += alpha*cc1;
+    res[(i+2)*resIncr] += alpha*cc2;
+    res[(i+3)*resIncr] += alpha*cc3;
+  }
+  for(; i<n2; i+=2)
+  {
+    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+              c1 = pset1<ResPacket>(ResScalar(0));
+
+    Index j=0;
+    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
+    {
+      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
+
+      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
+      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
+    }
+    ResScalar cc0 = predux(c0);
+    ResScalar cc1 = predux(c1);
+    for(; j<cols; ++j)
+    {
+      RhsScalar b0 = rhs(j,0);
+
+      cc0 += cj.pmul(lhs(i+0,j), b0);
+      cc1 += cj.pmul(lhs(i+1,j), b0);
+    }
+    res[(i+0)*resIncr] += alpha*cc0;
+    res[(i+1)*resIncr] += alpha*cc1;
+  }
+  for(; i<rows; ++i)
+  {
+    ResPacket c0 = pset1<ResPacket>(ResScalar(0));
+    ResPacketHalf c0_h = pset1<ResPacketHalf>(ResScalar(0));
+    ResPacketQuarter c0_q = pset1<ResPacketQuarter>(ResScalar(0));
+    Index j=0;
+    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
+    {
+      RhsPacket b0 = rhs.template load<RhsPacket,Unaligned>(j,0);
+      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i,j),b0,c0);
+    }
+    ResScalar cc0 = predux(c0);
+    if (HasHalf) {
+      for(; j+LhsPacketSizeHalf<=cols; j+=LhsPacketSizeHalf)
+        {
+          RhsPacketHalf b0 = rhs.template load<RhsPacketHalf,Unaligned>(j,0);
+          c0_h = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i,j),b0,c0_h);
+        }
+      cc0 += predux(c0_h);
+    }
+    if (HasQuarter) {
+      for(; j+LhsPacketSizeQuarter<=cols; j+=LhsPacketSizeQuarter)
+        {
+          RhsPacketQuarter b0 = rhs.template load<RhsPacketQuarter,Unaligned>(j,0);
+          c0_q = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i,j),b0,c0_q);
+        }
+      cc0 += predux(c0_q);
+    }
+    for(; j<cols; ++j)
+    {
+      cc0 += cj.pmul(lhs(i,j), rhs(j,0));
+    }
+    res[i*resIncr] += alpha*cc0;
+  }
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_VECTOR_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h
new file mode 100644
index 000000000..6e36c2b3c
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h
@@ -0,0 +1,136 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to BLAS F77
+ *   General matrix-vector product functionality based on ?GEMV.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H
+#define EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/**********************************************************************
+* This file implements general matrix-vector multiplication using BLAS
+* gemv function via partial specialization of
+* general_matrix_vector_product::run(..) method for float, double,
+* std::complex<float> and std::complex<double> types
+**********************************************************************/
+
+// gemv specialization
+
+template<typename Index, typename LhsScalar, int StorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
+struct general_matrix_vector_product_gemv;
+
+#define EIGEN_BLAS_GEMV_SPECIALIZE(Scalar) \
+template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
+struct general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,ConjugateRhs,Specialized> { \
+static void run( \
+  Index rows, Index cols, \
+  const const_blas_data_mapper<Scalar,Index,ColMajor> &lhs, \
+  const const_blas_data_mapper<Scalar,Index,RowMajor> &rhs, \
+  Scalar* res, Index resIncr, Scalar alpha) \
+{ \
+  if (ConjugateLhs) { \
+    general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,ConjugateRhs,BuiltIn>::run( \
+      rows, cols, lhs, rhs, res, resIncr, alpha); \
+  } else { \
+    general_matrix_vector_product_gemv<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
+      rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \
+  } \
+} \
+}; \
+template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
+struct general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,RowMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ConjugateRhs,Specialized> { \
+static void run( \
+  Index rows, Index cols, \
+  const const_blas_data_mapper<Scalar,Index,RowMajor> &lhs, \
+  const const_blas_data_mapper<Scalar,Index,ColMajor> &rhs, \
+  Scalar* res, Index resIncr, Scalar alpha) \
+{ \
+    general_matrix_vector_product_gemv<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
+      rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \
+} \
+}; \
+
+EIGEN_BLAS_GEMV_SPECIALIZE(double)
+EIGEN_BLAS_GEMV_SPECIALIZE(float)
+EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex)
+EIGEN_BLAS_GEMV_SPECIALIZE(scomplex)
+
+#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \
+template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \
+struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \
+{ \
+typedef Matrix<EIGTYPE,Dynamic,1,ColMajor> GEMVVector;\
+\
+static void run( \
+  Index rows, Index cols, \
+  const EIGTYPE* lhs, Index lhsStride, \
+  const EIGTYPE* rhs, Index rhsIncr, \
+  EIGTYPE* res, Index resIncr, EIGTYPE alpha) \
+{ \
+  BlasIndex m=convert_index<BlasIndex>(rows), n=convert_index<BlasIndex>(cols), \
+            lda=convert_index<BlasIndex>(lhsStride), incx=convert_index<BlasIndex>(rhsIncr), incy=convert_index<BlasIndex>(resIncr); \
+  const EIGTYPE beta(1); \
+  const EIGTYPE *x_ptr; \
+  char trans=(LhsStorageOrder==ColMajor) ? 'N' : (ConjugateLhs) ? 'C' : 'T'; \
+  if (LhsStorageOrder==RowMajor) { \
+    m = convert_index<BlasIndex>(cols); \
+    n = convert_index<BlasIndex>(rows); \
+  }\
+  GEMVVector x_tmp; \
+  if (ConjugateRhs) { \
+    Map<const GEMVVector, 0, InnerStride<> > map_x(rhs,cols,1,InnerStride<>(incx)); \
+    x_tmp=map_x.conjugate(); \
+    x_ptr=x_tmp.data(); \
+    incx=1; \
+  } else x_ptr=rhs; \
+  BLASFUNC(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \
+}\
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_GEMV_SPECIALIZATION(double,   double, dgemv)
+EIGEN_BLAS_GEMV_SPECIALIZATION(float,    float,  sgemv)
+EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, zgemv)
+EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, MKL_Complex8 , cgemv)
+#else
+EIGEN_BLAS_GEMV_SPECIALIZATION(double,   double, dgemv_)
+EIGEN_BLAS_GEMV_SPECIALIZATION(float,    float,  sgemv_)
+EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, zgemv_)
+EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float,  cgemv_)
+#endif
+
+} // end namespase internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/Parallelizer.h b/src/3rdparty/eigen/Eigen/src/Core/products/Parallelizer.h
new file mode 100644
index 000000000..8f91879e4
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/Parallelizer.h
@@ -0,0 +1,180 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PARALLELIZER_H
+#define EIGEN_PARALLELIZER_H
+
+#if EIGEN_HAS_CXX11_ATOMIC
+#include <atomic>
+#endif
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal */
+inline void manage_multi_threading(Action action, int* v)
+{
+  static int m_maxThreads = -1;
+  EIGEN_UNUSED_VARIABLE(m_maxThreads)
+
+  if(action==SetAction)
+  {
+    eigen_internal_assert(v!=0);
+    m_maxThreads = *v;
+  }
+  else if(action==GetAction)
+  {
+    eigen_internal_assert(v!=0);
+    #ifdef EIGEN_HAS_OPENMP
+    if(m_maxThreads>0)
+      *v = m_maxThreads;
+    else
+      *v = omp_get_max_threads();
+    #else
+    *v = 1;
+    #endif
+  }
+  else
+  {
+    eigen_internal_assert(false);
+  }
+}
+
+}
+
+/** Must be call first when calling Eigen from multiple threads */
+inline void initParallel()
+{
+  int nbt;
+  internal::manage_multi_threading(GetAction, &nbt);
+  std::ptrdiff_t l1, l2, l3;
+  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
+}
+
+/** \returns the max number of threads reserved for Eigen
+  * \sa setNbThreads */
+inline int nbThreads()
+{
+  int ret;
+  internal::manage_multi_threading(GetAction, &ret);
+  return ret;
+}
+
+/** Sets the max number of threads reserved for Eigen
+  * \sa nbThreads */
+inline void setNbThreads(int v)
+{
+  internal::manage_multi_threading(SetAction, &v);
+}
+
+namespace internal {
+
+template<typename Index> struct GemmParallelInfo
+{
+  GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}
+
+  // volatile is not enough on all architectures (see bug 1572)
+  // to guarantee that when thread A says to thread B that it is
+  // done with packing a block, then all writes have been really
+  // carried out... C++11 memory model+atomic guarantees this.
+#if EIGEN_HAS_CXX11_ATOMIC
+  std::atomic<Index> sync;
+  std::atomic<int> users;
+#else
+  Index volatile sync;
+  int volatile users;
+#endif
+
+  Index lhs_start;
+  Index lhs_length;
+};
+
+template<bool Condition, typename Functor, typename Index>
+void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, bool transpose)
+{
+  // TODO when EIGEN_USE_BLAS is defined,
+  // we should still enable OMP for other scalar types
+  // Without C++11, we have to disable GEMM's parallelization on
+  // non x86 architectures because there volatile is not enough for our purpose.
+  // See bug 1572.
+#if (! defined(EIGEN_HAS_OPENMP)) || defined(EIGEN_USE_BLAS) || ((!EIGEN_HAS_CXX11_ATOMIC) && !(EIGEN_ARCH_i386_OR_x86_64))
+  // FIXME the transpose variable is only needed to properly split
+  // the matrix product when multithreading is enabled. This is a temporary
+  // fix to support row-major destination matrices. This whole
+  // parallelizer mechanism has to be redesigned anyway.
+  EIGEN_UNUSED_VARIABLE(depth);
+  EIGEN_UNUSED_VARIABLE(transpose);
+  func(0,rows, 0,cols);
+#else
+
+  // Dynamically check whether we should enable or disable OpenMP.
+  // The conditions are:
+  // - the max number of threads we can create is greater than 1
+  // - we are not already in a parallel code
+  // - the sizes are large enough
+
+  // compute the maximal number of threads from the size of the product:
+  // This first heuristic takes into account that the product kernel is fully optimized when working with nr columns at once.
+  Index size = transpose ? rows : cols;
+  Index pb_max_threads = std::max<Index>(1,size / Functor::Traits::nr);
+
+  // compute the maximal number of threads from the total amount of work:
+  double work = static_cast<double>(rows) * static_cast<double>(cols) *
+      static_cast<double>(depth);
+  double kMinTaskSize = 50000;  // FIXME improve this heuristic.
+  pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, static_cast<Index>( work / kMinTaskSize ) ));
+
+  // compute the number of threads we are going to use
+  Index threads = std::min<Index>(nbThreads(), pb_max_threads);
+
+  // if multi-threading is explicitly disabled, not useful, or if we already are in a parallel session,
+  // then abort multi-threading
+  // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp?
+  if((!Condition) || (threads==1) || (omp_get_num_threads()>1))
+    return func(0,rows, 0,cols);
+
+  Eigen::initParallel();
+  func.initParallelSession(threads);
+
+  if(transpose)
+    std::swap(rows,cols);
+
+  ei_declare_aligned_stack_constructed_variable(GemmParallelInfo<Index>,info,threads,0);
+
+  #pragma omp parallel num_threads(threads)
+  {
+    Index i = omp_get_thread_num();
+    // Note that the actual number of threads might be lower than the number of request ones.
+    Index actual_threads = omp_get_num_threads();
+
+    Index blockCols = (cols / actual_threads) & ~Index(0x3);
+    Index blockRows = (rows / actual_threads);
+    blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;
+
+    Index r0 = i*blockRows;
+    Index actualBlockRows = (i+1==actual_threads) ? rows-r0 : blockRows;
+
+    Index c0 = i*blockCols;
+    Index actualBlockCols = (i+1==actual_threads) ? cols-c0 : blockCols;
+
+    info[i].lhs_start = r0;
+    info[i].lhs_length = actualBlockRows;
+
+    if(transpose) func(c0, actualBlockCols, 0, rows, info);
+    else          func(0, rows, c0, actualBlockCols, info);
+  }
+#endif
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PARALLELIZER_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
new file mode 100644
index 000000000..33ecf10f6
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -0,0 +1,544 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_H
+#define EIGEN_SELFADJOINT_MATRIX_MATRIX_H
+
+namespace Eigen { 
+
+namespace internal {
+
+// pack a selfadjoint block diagonal for use with the gebp_kernel
+template<typename Scalar, typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
+struct symm_pack_lhs
+{
+  template<int BlockRows> inline
+  void pack(Scalar* blockA, const const_blas_data_mapper<Scalar,Index,StorageOrder>& lhs, Index cols, Index i, Index& count)
+  {
+    // normal copy
+    for(Index k=0; k<i; k++)
+      for(Index w=0; w<BlockRows; w++)
+        blockA[count++] = lhs(i+w,k);           // normal
+    // symmetric copy
+    Index h = 0;
+    for(Index k=i; k<i+BlockRows; k++)
+    {
+      for(Index w=0; w<h; w++)
+        blockA[count++] = numext::conj(lhs(k, i+w)); // transposed
+
+      blockA[count++] = numext::real(lhs(k,k));   // real (diagonal)
+
+      for(Index w=h+1; w<BlockRows; w++)
+        blockA[count++] = lhs(i+w, k);          // normal
+      ++h;
+    }
+    // transposed copy
+    for(Index k=i+BlockRows; k<cols; k++)
+      for(Index w=0; w<BlockRows; w++)
+        blockA[count++] = numext::conj(lhs(k, i+w)); // transposed
+  }
+  void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
+  {
+    typedef typename unpacket_traits<typename packet_traits<Scalar>::type>::half HalfPacket;
+    typedef typename unpacket_traits<typename unpacket_traits<typename packet_traits<Scalar>::type>::half>::half QuarterPacket;
+    enum { PacketSize = packet_traits<Scalar>::size,
+           HalfPacketSize = unpacket_traits<HalfPacket>::size,
+           QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
+           HasHalf = (int)HalfPacketSize < (int)PacketSize,
+           HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
+
+    const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
+    Index count = 0;
+    //Index peeled_mc3 = (rows/Pack1)*Pack1;
+    
+    const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
+    const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
+    const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
+    const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
+    const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? peeled_mc_half+((rows-peeled_mc_half)/(QuarterPacketSize))*(QuarterPacketSize) : 0;
+    
+    if(Pack1>=3*PacketSize)
+      for(Index i=0; i<peeled_mc3; i+=3*PacketSize)
+        pack<3*PacketSize>(blockA, lhs, cols, i, count);
+    
+    if(Pack1>=2*PacketSize)
+      for(Index i=peeled_mc3; i<peeled_mc2; i+=2*PacketSize)
+        pack<2*PacketSize>(blockA, lhs, cols, i, count);
+    
+    if(Pack1>=1*PacketSize)
+      for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize)
+        pack<1*PacketSize>(blockA, lhs, cols, i, count);
+
+    if(HasHalf && Pack1>=HalfPacketSize)
+      for(Index i=peeled_mc1; i<peeled_mc_half; i+=HalfPacketSize)
+        pack<HalfPacketSize>(blockA, lhs, cols, i, count);
+
+    if(HasQuarter && Pack1>=QuarterPacketSize)
+      for(Index i=peeled_mc_half; i<peeled_mc_quarter; i+=QuarterPacketSize)
+        pack<QuarterPacketSize>(blockA, lhs, cols, i, count);
+
+    // do the same with mr==1
+    for(Index i=peeled_mc_quarter; i<rows; i++)
+    {
+      for(Index k=0; k<i; k++)
+        blockA[count++] = lhs(i, k);                   // normal
+
+      blockA[count++] = numext::real(lhs(i, i));       // real (diagonal)
+
+      for(Index k=i+1; k<cols; k++)
+        blockA[count++] = numext::conj(lhs(k, i));     // transposed
+    }
+  }
+};
+
+template<typename Scalar, typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs
+{
+  enum { PacketSize = packet_traits<Scalar>::size };
+  void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
+  {
+    Index end_k = k2 + rows;
+    Index count = 0;
+    const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(_rhs,rhsStride);
+    Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
+    Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
+
+    // first part: normal case
+    for(Index j2=0; j2<k2; j2+=nr)
+    {
+      for(Index k=k2; k<end_k; k++)
+      {
+        blockB[count+0] = rhs(k,j2+0);
+        blockB[count+1] = rhs(k,j2+1);
+        if (nr>=4)
+        {
+          blockB[count+2] = rhs(k,j2+2);
+          blockB[count+3] = rhs(k,j2+3);
+        }
+        if (nr>=8)
+        {
+          blockB[count+4] = rhs(k,j2+4);
+          blockB[count+5] = rhs(k,j2+5);
+          blockB[count+6] = rhs(k,j2+6);
+          blockB[count+7] = rhs(k,j2+7);
+        }
+        count += nr;
+      }
+    }
+
+    // second part: diagonal block
+    Index end8 = nr>=8 ? (std::min)(k2+rows,packet_cols8) : k2;
+    if(nr>=8)
+    {
+      for(Index j2=k2; j2<end8; j2+=8)
+      {
+        // again we can split vertically in three different parts (transpose, symmetric, normal)
+        // transpose
+        for(Index k=k2; k<j2; k++)
+        {
+          blockB[count+0] = numext::conj(rhs(j2+0,k));
+          blockB[count+1] = numext::conj(rhs(j2+1,k));
+          blockB[count+2] = numext::conj(rhs(j2+2,k));
+          blockB[count+3] = numext::conj(rhs(j2+3,k));
+          blockB[count+4] = numext::conj(rhs(j2+4,k));
+          blockB[count+5] = numext::conj(rhs(j2+5,k));
+          blockB[count+6] = numext::conj(rhs(j2+6,k));
+          blockB[count+7] = numext::conj(rhs(j2+7,k));
+          count += 8;
+        }
+        // symmetric
+        Index h = 0;
+        for(Index k=j2; k<j2+8; k++)
+        {
+          // normal
+          for (Index w=0 ; w<h; ++w)
+            blockB[count+w] = rhs(k,j2+w);
+
+          blockB[count+h] = numext::real(rhs(k,k));
+
+          // transpose
+          for (Index w=h+1 ; w<8; ++w)
+            blockB[count+w] = numext::conj(rhs(j2+w,k));
+          count += 8;
+          ++h;
+        }
+        // normal
+        for(Index k=j2+8; k<end_k; k++)
+        {
+          blockB[count+0] = rhs(k,j2+0);
+          blockB[count+1] = rhs(k,j2+1);
+          blockB[count+2] = rhs(k,j2+2);
+          blockB[count+3] = rhs(k,j2+3);
+          blockB[count+4] = rhs(k,j2+4);
+          blockB[count+5] = rhs(k,j2+5);
+          blockB[count+6] = rhs(k,j2+6);
+          blockB[count+7] = rhs(k,j2+7);
+          count += 8;
+        }
+      }
+    }
+    if(nr>=4)
+    {
+      for(Index j2=end8; j2<(std::min)(k2+rows,packet_cols4); j2+=4)
+      {
+        // again we can split vertically in three different parts (transpose, symmetric, normal)
+        // transpose
+        for(Index k=k2; k<j2; k++)
+        {
+          blockB[count+0] = numext::conj(rhs(j2+0,k));
+          blockB[count+1] = numext::conj(rhs(j2+1,k));
+          blockB[count+2] = numext::conj(rhs(j2+2,k));
+          blockB[count+3] = numext::conj(rhs(j2+3,k));
+          count += 4;
+        }
+        // symmetric
+        Index h = 0;
+        for(Index k=j2; k<j2+4; k++)
+        {
+          // normal
+          for (Index w=0 ; w<h; ++w)
+            blockB[count+w] = rhs(k,j2+w);
+
+          blockB[count+h] = numext::real(rhs(k,k));
+
+          // transpose
+          for (Index w=h+1 ; w<4; ++w)
+            blockB[count+w] = numext::conj(rhs(j2+w,k));
+          count += 4;
+          ++h;
+        }
+        // normal
+        for(Index k=j2+4; k<end_k; k++)
+        {
+          blockB[count+0] = rhs(k,j2+0);
+          blockB[count+1] = rhs(k,j2+1);
+          blockB[count+2] = rhs(k,j2+2);
+          blockB[count+3] = rhs(k,j2+3);
+          count += 4;
+        }
+      }
+    }
+
+    // third part: transposed
+    if(nr>=8)
+    {
+      for(Index j2=k2+rows; j2<packet_cols8; j2+=8)
+      {
+        for(Index k=k2; k<end_k; k++)
+        {
+          blockB[count+0] = numext::conj(rhs(j2+0,k));
+          blockB[count+1] = numext::conj(rhs(j2+1,k));
+          blockB[count+2] = numext::conj(rhs(j2+2,k));
+          blockB[count+3] = numext::conj(rhs(j2+3,k));
+          blockB[count+4] = numext::conj(rhs(j2+4,k));
+          blockB[count+5] = numext::conj(rhs(j2+5,k));
+          blockB[count+6] = numext::conj(rhs(j2+6,k));
+          blockB[count+7] = numext::conj(rhs(j2+7,k));
+          count += 8;
+        }
+      }
+    }
+    if(nr>=4)
+    {
+      for(Index j2=(std::max)(packet_cols8,k2+rows); j2<packet_cols4; j2+=4)
+      {
+        for(Index k=k2; k<end_k; k++)
+        {
+          blockB[count+0] = numext::conj(rhs(j2+0,k));
+          blockB[count+1] = numext::conj(rhs(j2+1,k));
+          blockB[count+2] = numext::conj(rhs(j2+2,k));
+          blockB[count+3] = numext::conj(rhs(j2+3,k));
+          count += 4;
+        }
+      }
+    }
+
+    // copy the remaining columns one at a time (=> the same with nr==1)
+    for(Index j2=packet_cols4; j2<cols; ++j2)
+    {
+      // transpose
+      Index half = (std::min)(end_k,j2);
+      for(Index k=k2; k<half; k++)
+      {
+        blockB[count] = numext::conj(rhs(j2,k));
+        count += 1;
+      }
+
+      if(half==j2 && half<k2+rows)
+      {
+        blockB[count] = numext::real(rhs(j2,j2));
+        count += 1;
+      }
+      else
+        half--;
+
+      // normal
+      for(Index k=half+1; k<k2+rows; k++)
+      {
+        blockB[count] = rhs(k,j2);
+        count += 1;
+      }
+    }
+  }
+};
+
+/* Optimized selfadjoint matrix * matrix (_SYMM) product built on top of
+ * the general matrix matrix product.
+ */
+template <typename Scalar, typename Index,
+          int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
+          int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs,
+          int ResStorageOrder, int ResInnerStride>
+struct product_selfadjoint_matrix;
+
+template <typename Scalar, typename Index,
+          int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
+          int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs,
+          int ResInnerStride>
+struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,ConjugateLhs, RhsStorageOrder,RhsSelfAdjoint,ConjugateRhs,RowMajor,ResInnerStride>
+{
+
+  static EIGEN_STRONG_INLINE void run(
+    Index rows, Index cols,
+    const Scalar* lhs, Index lhsStride,
+    const Scalar* rhs, Index rhsStride,
+    Scalar* res,       Index resIncr, Index resStride,
+    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
+  {
+    product_selfadjoint_matrix<Scalar, Index,
+      EIGEN_LOGICAL_XOR(RhsSelfAdjoint,RhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
+      RhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsSelfAdjoint,ConjugateRhs),
+      EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
+      LhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs),
+      ColMajor,ResInnerStride>
+      ::run(cols, rows,  rhs, rhsStride,  lhs, lhsStride,  res, resIncr, resStride,  alpha, blocking);
+  }
+};
+
+template <typename Scalar, typename Index,
+          int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride>
+struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor,ResInnerStride>
+{
+
+  static EIGEN_DONT_INLINE void run(
+    Index rows, Index cols,
+    const Scalar* _lhs, Index lhsStride,
+    const Scalar* _rhs, Index rhsStride,
+    Scalar* res,        Index resIncr, Index resStride,
+    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
+};
+
+template <typename Scalar, typename Index,
+          int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride>
+EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor,ResInnerStride>::run(
+    Index rows, Index cols,
+    const Scalar* _lhs, Index lhsStride,
+    const Scalar* _rhs, Index rhsStride,
+    Scalar* _res,       Index resIncr, Index resStride,
+    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
+  {
+    Index size = rows;
+
+    typedef gebp_traits<Scalar,Scalar> Traits;
+
+    typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+    typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? ColMajor : RowMajor> LhsTransposeMapper;
+    typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
+    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
+    LhsMapper lhs(_lhs,lhsStride);
+    LhsTransposeMapper lhs_transpose(_lhs,lhsStride);
+    RhsMapper rhs(_rhs,rhsStride);
+    ResMapper res(_res, resStride, resIncr);
+
+    Index kc = blocking.kc();                   // cache block size along the K direction
+    Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
+    // kc must be smaller than mc
+    kc = (std::min)(kc,mc);
+    std::size_t sizeA = kc*mc;
+    std::size_t sizeB = kc*cols;
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+    gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+    symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
+    gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
+
+    for(Index k2=0; k2<size; k2+=kc)
+    {
+      const Index actual_kc = (std::min)(k2+kc,size)-k2;
+
+      // we have selected one row panel of rhs and one column panel of lhs
+      // pack rhs's panel into a sequential chunk of memory
+      // and expand each coeff to a constant packet for further reuse
+      pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, cols);
+
+      // the select lhs's panel has to be split in three different parts:
+      //  1 - the transposed panel above the diagonal block => transposed packed copy
+      //  2 - the diagonal block => special packed copy
+      //  3 - the panel below the diagonal block => generic packed copy
+      for(Index i2=0; i2<k2; i2+=mc)
+      {
+        const Index actual_mc = (std::min)(i2+mc,k2)-i2;
+        // transposed packed copy
+        pack_lhs_transposed(blockA, lhs_transpose.getSubMapper(i2, k2), actual_kc, actual_mc);
+
+        gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
+      }
+      // the block diagonal
+      {
+        const Index actual_mc = (std::min)(k2+kc,size)-k2;
+        // symmetric packed copy
+        pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);
+
+        gebp_kernel(res.getSubMapper(k2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
+      }
+
+      for(Index i2=k2+kc; i2<size; i2+=mc)
+      {
+        const Index actual_mc = (std::min)(i2+mc,size)-i2;
+        gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder,false>()
+          (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
+
+        gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
+      }
+    }
+  }
+
+// matrix * selfadjoint product
+template <typename Scalar, typename Index,
+          int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride>
+struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor,ResInnerStride>
+{
+
+  static EIGEN_DONT_INLINE void run(
+    Index rows, Index cols,
+    const Scalar* _lhs, Index lhsStride,
+    const Scalar* _rhs, Index rhsStride,
+    Scalar* res,        Index resIncr, Index resStride,
+    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
+};
+
+template <typename Scalar, typename Index,
+          int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride>
+EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor,ResInnerStride>::run(
+    Index rows, Index cols,
+    const Scalar* _lhs, Index lhsStride,
+    const Scalar* _rhs, Index rhsStride,
+    Scalar* _res,       Index resIncr, Index resStride,
+    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
+  {
+    Index size = cols;
+
+    typedef gebp_traits<Scalar,Scalar> Traits;
+
+    typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
+    LhsMapper lhs(_lhs,lhsStride);
+    ResMapper res(_res,resStride, resIncr);
+
+    Index kc = blocking.kc();                   // cache block size along the K direction
+    Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
+    std::size_t sizeA = kc*mc;
+    std::size_t sizeB = kc*cols;
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+    gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
+    symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
+
+    for(Index k2=0; k2<size; k2+=kc)
+    {
+      const Index actual_kc = (std::min)(k2+kc,size)-k2;
+
+      pack_rhs(blockB, _rhs, rhsStride, actual_kc, cols, k2);
+
+      // => GEPP
+      for(Index i2=0; i2<rows; i2+=mc)
+      {
+        const Index actual_mc = (std::min)(i2+mc,rows)-i2;
+        pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
+
+        gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
+      }
+    }
+  }
+
+} // end namespace internal
+
+/***************************************************************************
+* Wrapper to product_selfadjoint_matrix
+***************************************************************************/
+
+namespace internal {
+  
+template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
+struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,RhsMode,false>
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  typedef internal::blas_traits<Lhs> LhsBlasTraits;
+  typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+  typedef internal::blas_traits<Rhs> RhsBlasTraits;
+  typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+  
+  enum {
+    LhsIsUpper = (LhsMode&(Upper|Lower))==Upper,
+    LhsIsSelfAdjoint = (LhsMode&SelfAdjoint)==SelfAdjoint,
+    RhsIsUpper = (RhsMode&(Upper|Lower))==Upper,
+    RhsIsSelfAdjoint = (RhsMode&SelfAdjoint)==SelfAdjoint
+  };
+  
+  template<typename Dest>
+  static void run(Dest &dst, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
+  {
+    eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols());
+
+    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
+
+    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
+                               * RhsBlasTraits::extractScalarFactor(a_rhs);
+
+    typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
+              Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,1> BlockingType;
+
+    BlockingType blocking(lhs.rows(), rhs.cols(), lhs.cols(), 1, false);
+
+    internal::product_selfadjoint_matrix<Scalar, Index,
+      EIGEN_LOGICAL_XOR(LhsIsUpper,internal::traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
+      NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)),
+      EIGEN_LOGICAL_XOR(RhsIsUpper,internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
+      NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)),
+      internal::traits<Dest>::Flags&RowMajorBit  ? RowMajor : ColMajor,
+      Dest::InnerStrideAtCompileTime>
+      ::run(
+        lhs.rows(), rhs.cols(),                 // sizes
+        &lhs.coeffRef(0,0), lhs.outerStride(),  // lhs info
+        &rhs.coeffRef(0,0), rhs.outerStride(),  // rhs info
+        &dst.coeffRef(0,0), dst.innerStride(), dst.outerStride(),  // result info
+        actualAlpha, blocking                   // alpha
+      );
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
new file mode 100644
index 000000000..61396dbdf
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
@@ -0,0 +1,295 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+ ********************************************************************************
+ *   Content : Eigen bindings to BLAS F77
+ *   Self adjoint matrix * matrix product functionality based on ?SYMM/?HEMM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H
+#define EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H
+
+namespace Eigen { 
+
+namespace internal {
+
+
+/* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */
+
+#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
+template <typename Index, \
+          int LhsStorageOrder, bool ConjugateLhs, \
+          int RhsStorageOrder, bool ConjugateRhs> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor,1> \
+{\
+\
+  static void run( \
+    Index rows, Index cols, \
+    const EIGTYPE* _lhs, Index lhsStride, \
+    const EIGTYPE* _rhs, Index rhsStride, \
+    EIGTYPE* res,        Index resIncr, Index resStride, \
+    EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
+  { \
+    EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
+    eigen_assert(resIncr == 1); \
+    char side='L', uplo='L'; \
+    BlasIndex m, n, lda, ldb, ldc; \
+    const EIGTYPE *a, *b; \
+    EIGTYPE beta(1); \
+    MatrixX##EIGPREFIX b_tmp; \
+\
+/* Set transpose options */ \
+/* Set m, n, k */ \
+    m = convert_index<BlasIndex>(rows);  \
+    n = convert_index<BlasIndex>(cols);  \
+\
+/* Set lda, ldb, ldc */ \
+    lda = convert_index<BlasIndex>(lhsStride); \
+    ldb = convert_index<BlasIndex>(rhsStride); \
+    ldc = convert_index<BlasIndex>(resStride); \
+\
+/* Set a, b, c */ \
+    if (LhsStorageOrder==RowMajor) uplo='U'; \
+    a = _lhs; \
+\
+    if (RhsStorageOrder==RowMajor) { \
+      Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
+      b_tmp = rhs.adjoint(); \
+      b = b_tmp.data(); \
+      ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
+    } else b = _rhs; \
+\
+    BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+\
+  } \
+};
+
+
+#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
+template <typename Index, \
+          int LhsStorageOrder, bool ConjugateLhs, \
+          int RhsStorageOrder, bool ConjugateRhs> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor,1> \
+{\
+  static void run( \
+    Index rows, Index cols, \
+    const EIGTYPE* _lhs, Index lhsStride, \
+    const EIGTYPE* _rhs, Index rhsStride, \
+    EIGTYPE* res,        Index resIncr, Index resStride, \
+    EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
+  { \
+    EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
+    eigen_assert(resIncr == 1); \
+    char side='L', uplo='L'; \
+    BlasIndex m, n, lda, ldb, ldc; \
+    const EIGTYPE *a, *b; \
+    EIGTYPE beta(1); \
+    MatrixX##EIGPREFIX b_tmp; \
+    Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> a_tmp; \
+\
+/* Set transpose options */ \
+/* Set m, n, k */ \
+    m = convert_index<BlasIndex>(rows); \
+    n = convert_index<BlasIndex>(cols); \
+\
+/* Set lda, ldb, ldc */ \
+    lda = convert_index<BlasIndex>(lhsStride); \
+    ldb = convert_index<BlasIndex>(rhsStride); \
+    ldc = convert_index<BlasIndex>(resStride); \
+\
+/* Set a, b, c */ \
+    if (((LhsStorageOrder==ColMajor) && ConjugateLhs) || ((LhsStorageOrder==RowMajor) && (!ConjugateLhs))) { \
+      Map<const Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder>, 0, OuterStride<> > lhs(_lhs,m,m,OuterStride<>(lhsStride)); \
+      a_tmp = lhs.conjugate(); \
+      a = a_tmp.data(); \
+      lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
+    } else a = _lhs; \
+    if (LhsStorageOrder==RowMajor) uplo='U'; \
+\
+    if (RhsStorageOrder==ColMajor && (!ConjugateRhs)) { \
+       b = _rhs; } \
+    else { \
+      if (RhsStorageOrder==ColMajor && ConjugateRhs) { \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,m,n,OuterStride<>(rhsStride)); \
+        b_tmp = rhs.conjugate(); \
+      } else \
+      if (ConjugateRhs) { \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
+        b_tmp = rhs.adjoint(); \
+      } else { \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
+        b_tmp = rhs.transpose(); \
+      } \
+      b = b_tmp.data(); \
+      ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
+    } \
+\
+    BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+\
+  } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_SYMM_L(double, double, d, dsymm)
+EIGEN_BLAS_SYMM_L(float, float, f, ssymm)
+EIGEN_BLAS_HEMM_L(dcomplex, MKL_Complex16, cd, zhemm)
+EIGEN_BLAS_HEMM_L(scomplex, MKL_Complex8, cf, chemm)
+#else
+EIGEN_BLAS_SYMM_L(double, double, d, dsymm_)
+EIGEN_BLAS_SYMM_L(float, float, f, ssymm_)
+EIGEN_BLAS_HEMM_L(dcomplex, double, cd, zhemm_)
+EIGEN_BLAS_HEMM_L(scomplex, float, cf, chemm_)
+#endif
+
+/* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */
+
+#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
+template <typename Index, \
+          int LhsStorageOrder, bool ConjugateLhs, \
+          int RhsStorageOrder, bool ConjugateRhs> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor,1> \
+{\
+\
+  static void run( \
+    Index rows, Index cols, \
+    const EIGTYPE* _lhs, Index lhsStride, \
+    const EIGTYPE* _rhs, Index rhsStride, \
+    EIGTYPE* res,        Index resIncr, Index resStride, \
+    EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
+  { \
+    EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
+    eigen_assert(resIncr == 1); \
+    char side='R', uplo='L'; \
+    BlasIndex m, n, lda, ldb, ldc; \
+    const EIGTYPE *a, *b; \
+    EIGTYPE beta(1); \
+    MatrixX##EIGPREFIX b_tmp; \
+\
+/* Set m, n, k */ \
+    m = convert_index<BlasIndex>(rows);  \
+    n = convert_index<BlasIndex>(cols);  \
+\
+/* Set lda, ldb, ldc */ \
+    lda = convert_index<BlasIndex>(rhsStride); \
+    ldb = convert_index<BlasIndex>(lhsStride); \
+    ldc = convert_index<BlasIndex>(resStride); \
+\
+/* Set a, b, c */ \
+    if (RhsStorageOrder==RowMajor) uplo='U'; \
+    a = _rhs; \
+\
+    if (LhsStorageOrder==RowMajor) { \
+      Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(rhsStride)); \
+      b_tmp = lhs.adjoint(); \
+      b = b_tmp.data(); \
+      ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
+    } else b = _lhs; \
+\
+    BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+\
+  } \
+};
+
+
+#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
+template <typename Index, \
+          int LhsStorageOrder, bool ConjugateLhs, \
+          int RhsStorageOrder, bool ConjugateRhs> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor,1> \
+{\
+  static void run( \
+    Index rows, Index cols, \
+    const EIGTYPE* _lhs, Index lhsStride, \
+    const EIGTYPE* _rhs, Index rhsStride, \
+    EIGTYPE* res,        Index resIncr, Index resStride, \
+    EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
+  { \
+    EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
+    eigen_assert(resIncr == 1); \
+    char side='R', uplo='L'; \
+    BlasIndex m, n, lda, ldb, ldc; \
+    const EIGTYPE *a, *b; \
+    EIGTYPE beta(1); \
+    MatrixX##EIGPREFIX b_tmp; \
+    Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> a_tmp; \
+\
+/* Set m, n, k */ \
+    m = convert_index<BlasIndex>(rows); \
+    n = convert_index<BlasIndex>(cols); \
+\
+/* Set lda, ldb, ldc */ \
+    lda = convert_index<BlasIndex>(rhsStride); \
+    ldb = convert_index<BlasIndex>(lhsStride); \
+    ldc = convert_index<BlasIndex>(resStride); \
+\
+/* Set a, b, c */ \
+    if (((RhsStorageOrder==ColMajor) && ConjugateRhs) || ((RhsStorageOrder==RowMajor) && (!ConjugateRhs))) { \
+      Map<const Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder>, 0, OuterStride<> > rhs(_rhs,n,n,OuterStride<>(rhsStride)); \
+      a_tmp = rhs.conjugate(); \
+      a = a_tmp.data(); \
+      lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
+    } else a = _rhs; \
+    if (RhsStorageOrder==RowMajor) uplo='U'; \
+\
+    if (LhsStorageOrder==ColMajor && (!ConjugateLhs)) { \
+       b = _lhs; } \
+    else { \
+      if (LhsStorageOrder==ColMajor && ConjugateLhs) { \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,n,OuterStride<>(lhsStride)); \
+        b_tmp = lhs.conjugate(); \
+      } else \
+      if (ConjugateLhs) { \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \
+        b_tmp = lhs.adjoint(); \
+      } else { \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \
+        b_tmp = lhs.transpose(); \
+      } \
+      b = b_tmp.data(); \
+      ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
+    } \
+\
+    BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+  } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_SYMM_R(double, double, d, dsymm)
+EIGEN_BLAS_SYMM_R(float, float, f, ssymm)
+EIGEN_BLAS_HEMM_R(dcomplex, MKL_Complex16, cd, zhemm)
+EIGEN_BLAS_HEMM_R(scomplex, MKL_Complex8, cf, chemm)
+#else
+EIGEN_BLAS_SYMM_R(double, double, d, dsymm_)
+EIGEN_BLAS_SYMM_R(float, float, f, ssymm_)
+EIGEN_BLAS_HEMM_R(dcomplex, double, cd, zhemm_)
+EIGEN_BLAS_HEMM_R(scomplex, float, cf, chemm_)
+#endif
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h
new file mode 100644
index 000000000..d38fd72b2
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -0,0 +1,262 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_H
+#define EIGEN_SELFADJOINT_MATRIX_VECTOR_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/* Optimized selfadjoint matrix * vector product:
+ * This algorithm processes 2 columns at once that allows to both reduce
+ * the number of load/stores of the result by a factor 2 and to reduce
+ * the instruction dependency.
+ */
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version=Specialized>
+struct selfadjoint_matrix_vector_product;
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version>
+struct selfadjoint_matrix_vector_product
+
+{
+static EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC
+void run(
+  Index size,
+  const Scalar*  lhs, Index lhsStride,
+  const Scalar*  rhs,
+  Scalar* res,
+  Scalar alpha);
+};
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC
+void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
+  Index size,
+  const Scalar*  lhs, Index lhsStride,
+  const Scalar*  rhs,
+  Scalar* res,
+  Scalar alpha)
+{
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  const Index PacketSize = sizeof(Packet)/sizeof(Scalar);
+
+  enum {
+    IsRowMajor = StorageOrder==RowMajor ? 1 : 0,
+    IsLower = UpLo == Lower ? 1 : 0,
+    FirstTriangular = IsRowMajor == IsLower
+  };
+
+  conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs,  IsRowMajor), ConjugateRhs> cj0;
+  conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1;
+  conj_helper<RealScalar,Scalar,false, ConjugateRhs> cjd;
+
+  conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs,  IsRowMajor), ConjugateRhs> pcj0;
+  conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> pcj1;
+
+  Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha;
+
+  Index bound = numext::maxi(Index(0), size-8) & 0xfffffffe;
+  if (FirstTriangular)
+    bound = size - bound;
+
+  for (Index j=FirstTriangular ? bound : 0;
+       j<(FirstTriangular ? size : bound);j+=2)
+  {
+    const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
+    const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride;
+
+    Scalar t0 = cjAlpha * rhs[j];
+    Packet ptmp0 = pset1<Packet>(t0);
+    Scalar t1 = cjAlpha * rhs[j+1];
+    Packet ptmp1 = pset1<Packet>(t1);
+
+    Scalar t2(0);
+    Packet ptmp2 = pset1<Packet>(t2);
+    Scalar t3(0);
+    Packet ptmp3 = pset1<Packet>(t3);
+
+    Index starti = FirstTriangular ? 0 : j+2;
+    Index endi   = FirstTriangular ? j : size;
+    Index alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi-starti);
+    Index alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);
+
+    res[j]   += cjd.pmul(numext::real(A0[j]), t0);
+    res[j+1] += cjd.pmul(numext::real(A1[j+1]), t1);
+    if(FirstTriangular)
+    {
+      res[j]   += cj0.pmul(A1[j],   t1);
+      t3       += cj1.pmul(A1[j],   rhs[j]);
+    }
+    else
+    {
+      res[j+1] += cj0.pmul(A0[j+1],t0);
+      t2 += cj1.pmul(A0[j+1], rhs[j+1]);
+    }
+
+    for (Index i=starti; i<alignedStart; ++i)
+    {
+      res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
+      t2 += cj1.pmul(A0[i], rhs[i]);
+      t3 += cj1.pmul(A1[i], rhs[i]);
+    }
+    // Yes this an optimization for gcc 4.3 and 4.4 (=> huge speed up)
+    // gcc 4.2 does this optimization automatically.
+    const Scalar* EIGEN_RESTRICT a0It  = A0  + alignedStart;
+    const Scalar* EIGEN_RESTRICT a1It  = A1  + alignedStart;
+    const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
+          Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
+    for (Index i=alignedStart; i<alignedEnd; i+=PacketSize)
+    {
+      Packet A0i = ploadu<Packet>(a0It);  a0It  += PacketSize;
+      Packet A1i = ploadu<Packet>(a1It);  a1It  += PacketSize;
+      Packet Bi  = ploadu<Packet>(rhsIt); rhsIt += PacketSize; // FIXME should be aligned in most cases
+      Packet Xi  = pload <Packet>(resIt);
+
+      Xi    = pcj0.pmadd(A0i,ptmp0, pcj0.pmadd(A1i,ptmp1,Xi));
+      ptmp2 = pcj1.pmadd(A0i,  Bi, ptmp2);
+      ptmp3 = pcj1.pmadd(A1i,  Bi, ptmp3);
+      pstore(resIt,Xi); resIt += PacketSize;
+    }
+    for (Index i=alignedEnd; i<endi; i++)
+    {
+      res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
+      t2 += cj1.pmul(A0[i], rhs[i]);
+      t3 += cj1.pmul(A1[i], rhs[i]);
+    }
+
+    res[j]   += alpha * (t2 + predux(ptmp2));
+    res[j+1] += alpha * (t3 + predux(ptmp3));
+  }
+  for (Index j=FirstTriangular ? 0 : bound;j<(FirstTriangular ? bound : size);j++)
+  {
+    const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
+
+    Scalar t1 = cjAlpha * rhs[j];
+    Scalar t2(0);
+    res[j] += cjd.pmul(numext::real(A0[j]), t1);
+    for (Index i=FirstTriangular ? 0 : j+1; i<(FirstTriangular ? j : size); i++)
+    {
+      res[i] += cj0.pmul(A0[i], t1);
+      t2 += cj1.pmul(A0[i], rhs[i]);
+    }
+    res[j] += alpha * t2;
+  }
+}
+
+} // end namespace internal 
+
+/***************************************************************************
+* Wrapper to product_selfadjoint_vector
+***************************************************************************/
+
+namespace internal {
+
+template<typename Lhs, int LhsMode, typename Rhs>
+struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true>
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  typedef internal::blas_traits<Lhs> LhsBlasTraits;
+  typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+  typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
+  
+  typedef internal::blas_traits<Rhs> RhsBlasTraits;
+  typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+  typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
+
+  enum { LhsUpLo = LhsMode&(Upper|Lower) };
+
+  template<typename Dest>
+  static EIGEN_DEVICE_FUNC
+  void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
+  {
+    typedef typename Dest::Scalar ResScalar;
+    typedef typename Rhs::Scalar RhsScalar;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
+    
+    eigen_assert(dest.rows()==a_lhs.rows() && dest.cols()==a_rhs.cols());
+
+    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
+
+    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
+                               * RhsBlasTraits::extractScalarFactor(a_rhs);
+
+    enum {
+      EvalToDest = (Dest::InnerStrideAtCompileTime==1),
+      UseRhs = (ActualRhsTypeCleaned::InnerStrideAtCompileTime==1)
+    };
+    
+    internal::gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,!EvalToDest> static_dest;
+    internal::gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!UseRhs> static_rhs;
+
+    ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
+                                                  EvalToDest ? dest.data() : static_dest.data());
+                                                  
+    ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,rhs.size(),
+        UseRhs ? const_cast<RhsScalar*>(rhs.data()) : static_rhs.data());
+    
+    if(!EvalToDest)
+    {
+      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      Index size = dest.size();
+      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      #endif
+      MappedDest(actualDestPtr, dest.size()) = dest;
+    }
+      
+    if(!UseRhs)
+    {
+      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      Index size = rhs.size();
+      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      #endif
+      Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, rhs.size()) = rhs;
+    }
+      
+      
+    internal::selfadjoint_matrix_vector_product<Scalar, Index, (internal::traits<ActualLhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor,
+                                                int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>::run
+      (
+        lhs.rows(),                             // size
+        &lhs.coeffRef(0,0),  lhs.outerStride(), // lhs info
+        actualRhsPtr,                           // rhs info
+        actualDestPtr,                          // result info
+        actualAlpha                             // scale factor
+      );
+    
+    if(!EvalToDest)
+      dest = MappedDest(actualDestPtr, dest.size());
+  }
+};
+
+template<typename Lhs, typename Rhs, int RhsMode>
+struct selfadjoint_product_impl<Lhs,0,true,Rhs,RhsMode,false>
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  enum { RhsUpLo = RhsMode&(Upper|Lower)  };
+
+  template<typename Dest>
+  static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
+  {
+    // let's simply transpose the product
+    Transpose<Dest> destT(dest);
+    selfadjoint_product_impl<Transpose<const Rhs>, int(RhsUpLo)==Upper ? Lower : Upper, false,
+                             Transpose<const Lhs>, 0, true>::run(destT, a_rhs.transpose(), a_lhs.transpose(), alpha);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h
new file mode 100644
index 000000000..1238345e3
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h
@@ -0,0 +1,118 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to BLAS F77
+ *   Selfadjoint matrix-vector product functionality based on ?SYMV/HEMV.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H
+#define EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/**********************************************************************
+* This file implements selfadjoint matrix-vector multiplication using BLAS
+**********************************************************************/
+
+// symv/hemv specialization
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs>
+struct selfadjoint_matrix_vector_product_symv :
+  selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,BuiltIn> {};
+
+#define EIGEN_BLAS_SYMV_SPECIALIZE(Scalar) \
+template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \
+struct selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Specialized> { \
+static void run( \
+  Index size, const Scalar*  lhs, Index lhsStride, \
+  const Scalar* _rhs, Scalar* res, Scalar alpha) { \
+    enum {\
+      IsColMajor = StorageOrder==ColMajor \
+    }; \
+    if (IsColMajor == ConjugateLhs) {\
+      selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,BuiltIn>::run( \
+        size, lhs, lhsStride, _rhs, res, alpha);  \
+    } else {\
+      selfadjoint_matrix_vector_product_symv<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs>::run( \
+        size, lhs, lhsStride, _rhs, res, alpha);  \
+    }\
+  } \
+}; \
+
+EIGEN_BLAS_SYMV_SPECIALIZE(double)
+EIGEN_BLAS_SYMV_SPECIALIZE(float)
+EIGEN_BLAS_SYMV_SPECIALIZE(dcomplex)
+EIGEN_BLAS_SYMV_SPECIALIZE(scomplex)
+
+#define EIGEN_BLAS_SYMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \
+template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \
+struct selfadjoint_matrix_vector_product_symv<EIGTYPE,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs> \
+{ \
+typedef Matrix<EIGTYPE,Dynamic,1,ColMajor> SYMVVector;\
+\
+static void run( \
+Index size, const EIGTYPE*  lhs, Index lhsStride, \
+const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \
+{ \
+  enum {\
+    IsRowMajor = StorageOrder==RowMajor ? 1 : 0, \
+    IsLower = UpLo == Lower ? 1 : 0 \
+  }; \
+  BlasIndex n=convert_index<BlasIndex>(size), lda=convert_index<BlasIndex>(lhsStride), incx=1, incy=1; \
+  EIGTYPE beta(1); \
+  const EIGTYPE *x_ptr; \
+  char uplo=(IsRowMajor) ? (IsLower ? 'U' : 'L') : (IsLower ? 'L' : 'U'); \
+  SYMVVector x_tmp; \
+  if (ConjugateRhs) { \
+    Map<const SYMVVector, 0 > map_x(_rhs,size,1); \
+    x_tmp=map_x.conjugate(); \
+    x_ptr=x_tmp.data(); \
+  } else x_ptr=_rhs; \
+  BLASFUNC(&uplo, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \
+}\
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_SYMV_SPECIALIZATION(double,   double, dsymv)
+EIGEN_BLAS_SYMV_SPECIALIZATION(float,    float,  ssymv)
+EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv)
+EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, MKL_Complex8,  chemv)
+#else
+EIGEN_BLAS_SYMV_SPECIALIZATION(double,   double, dsymv_)
+EIGEN_BLAS_SYMV_SPECIALIZATION(float,    float,  ssymv_)
+EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_)
+EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float,  chemv_)
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointProduct.h b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointProduct.h
new file mode 100644
index 000000000..a21be8050
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointProduct.h
@@ -0,0 +1,133 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFADJOINT_PRODUCT_H
+#define EIGEN_SELFADJOINT_PRODUCT_H
+
+/**********************************************************************
+* This file implements a self adjoint product: C += A A^T updating only
+* half of the selfadjoint matrix C.
+* It corresponds to the level 3 SYRK and level 2 SYR Blas routines.
+**********************************************************************/
+
+namespace Eigen { 
+
+
+template<typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs>
+struct selfadjoint_rank1_update<Scalar,Index,ColMajor,UpLo,ConjLhs,ConjRhs>
+{
+  static void run(Index size, Scalar* mat, Index stride, const Scalar* vecX, const Scalar* vecY, const Scalar& alpha)
+  {
+    internal::conj_if<ConjRhs> cj;
+    typedef Map<const Matrix<Scalar,Dynamic,1> > OtherMap;
+    typedef typename internal::conditional<ConjLhs,typename OtherMap::ConjugateReturnType,const OtherMap&>::type ConjLhsType;
+    for (Index i=0; i<size; ++i)
+    {
+      Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i+(UpLo==Lower ? i : 0), (UpLo==Lower ? size-i : (i+1)))
+          += (alpha * cj(vecY[i])) * ConjLhsType(OtherMap(vecX+(UpLo==Lower ? i : 0),UpLo==Lower ? size-i : (i+1)));
+    }
+  }
+};
+
+template<typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs>
+struct selfadjoint_rank1_update<Scalar,Index,RowMajor,UpLo,ConjLhs,ConjRhs>
+{
+  static void run(Index size, Scalar* mat, Index stride, const Scalar* vecX, const Scalar* vecY, const Scalar& alpha)
+  {
+    selfadjoint_rank1_update<Scalar,Index,ColMajor,UpLo==Lower?Upper:Lower,ConjRhs,ConjLhs>::run(size,mat,stride,vecY,vecX,alpha);
+  }
+};
+
+template<typename MatrixType, typename OtherType, int UpLo, bool OtherIsVector = OtherType::IsVectorAtCompileTime>
+struct selfadjoint_product_selector;
+
+template<typename MatrixType, typename OtherType, int UpLo>
+struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,true>
+{
+  static void run(MatrixType& mat, const OtherType& other, const typename MatrixType::Scalar& alpha)
+  {
+    typedef typename MatrixType::Scalar Scalar;
+    typedef internal::blas_traits<OtherType> OtherBlasTraits;
+    typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType;
+    typedef typename internal::remove_all<ActualOtherType>::type _ActualOtherType;
+    typename internal::add_const_on_value_type<ActualOtherType>::type actualOther = OtherBlasTraits::extract(other.derived());
+
+    Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived());
+
+    enum {
+      StorageOrder = (internal::traits<MatrixType>::Flags&RowMajorBit) ? RowMajor : ColMajor,
+      UseOtherDirectly = _ActualOtherType::InnerStrideAtCompileTime==1
+    };
+    internal::gemv_static_vector_if<Scalar,OtherType::SizeAtCompileTime,OtherType::MaxSizeAtCompileTime,!UseOtherDirectly> static_other;
+
+    ei_declare_aligned_stack_constructed_variable(Scalar, actualOtherPtr, other.size(),
+      (UseOtherDirectly ? const_cast<Scalar*>(actualOther.data()) : static_other.data()));
+      
+    if(!UseOtherDirectly)
+      Map<typename _ActualOtherType::PlainObject>(actualOtherPtr, actualOther.size()) = actualOther;
+    
+    selfadjoint_rank1_update<Scalar,Index,StorageOrder,UpLo,
+                              OtherBlasTraits::NeedToConjugate  && NumTraits<Scalar>::IsComplex,
+                            (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex>
+          ::run(other.size(), mat.data(), mat.outerStride(), actualOtherPtr, actualOtherPtr, actualAlpha);
+  }
+};
+
+template<typename MatrixType, typename OtherType, int UpLo>
+struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false>
+{
+  static void run(MatrixType& mat, const OtherType& other, const typename MatrixType::Scalar& alpha)
+  {
+    typedef typename MatrixType::Scalar Scalar;
+    typedef internal::blas_traits<OtherType> OtherBlasTraits;
+    typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType;
+    typedef typename internal::remove_all<ActualOtherType>::type _ActualOtherType;
+    typename internal::add_const_on_value_type<ActualOtherType>::type actualOther = OtherBlasTraits::extract(other.derived());
+
+    Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived());
+
+    enum {
+      IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0,
+      OtherIsRowMajor = _ActualOtherType::Flags&RowMajorBit ? 1 : 0
+    };
+
+    Index size = mat.cols();
+    Index depth = actualOther.cols();
+
+    typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor,Scalar,Scalar,
+              MatrixType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime, _ActualOtherType::MaxColsAtCompileTime> BlockingType;
+
+    BlockingType blocking(size, size, depth, 1, false);
+
+
+    internal::general_matrix_matrix_triangular_product<Index,
+      Scalar, OtherIsRowMajor ? RowMajor : ColMajor,   OtherBlasTraits::NeedToConjugate  && NumTraits<Scalar>::IsComplex,
+      Scalar, OtherIsRowMajor ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex,
+      IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo>
+      ::run(size, depth,
+            actualOther.data(), actualOther.outerStride(), actualOther.data(), actualOther.outerStride(),
+            mat.data(), mat.innerStride(), mat.outerStride(), actualAlpha, blocking);
+  }
+};
+
+// high level API
+
+template<typename MatrixType, unsigned int UpLo>
+template<typename DerivedU>
+EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
+::rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha)
+{
+  selfadjoint_product_selector<MatrixType,DerivedU,UpLo>::run(_expression().const_cast_derived(), u.derived(), alpha);
+
+  return *this;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_PRODUCT_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h
new file mode 100644
index 000000000..f752a0bf0
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h
@@ -0,0 +1,94 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFADJOINTRANK2UPTADE_H
+#define EIGEN_SELFADJOINTRANK2UPTADE_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/* Optimized selfadjoint matrix += alpha * uv' + conj(alpha)*vu'
+ * It corresponds to the Level2 syr2 BLAS routine
+ */
+
+template<typename Scalar, typename Index, typename UType, typename VType, int UpLo>
+struct selfadjoint_rank2_update_selector;
+
+template<typename Scalar, typename Index, typename UType, typename VType>
+struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Lower>
+{
+  static EIGEN_DEVICE_FUNC
+  void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
+  {
+    const Index size = u.size();
+    for (Index i=0; i<size; ++i)
+    {
+      Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i+i, size-i) +=
+                        (numext::conj(alpha) * numext::conj(u.coeff(i))) * v.tail(size-i)
+                      + (alpha * numext::conj(v.coeff(i))) * u.tail(size-i);
+    }
+  }
+};
+
+template<typename Scalar, typename Index, typename UType, typename VType>
+struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Upper>
+{
+  static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
+  {
+    const Index size = u.size();
+    for (Index i=0; i<size; ++i)
+      Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i, i+1) +=
+                        (numext::conj(alpha)  * numext::conj(u.coeff(i))) * v.head(i+1)
+                      + (alpha * numext::conj(v.coeff(i))) * u.head(i+1);
+  }
+};
+
+template<bool Cond, typename T> struct conj_expr_if
+  : conditional<!Cond, const T&,
+      CwiseUnaryOp<scalar_conjugate_op<typename traits<T>::Scalar>,T> > {};
+
+} // end namespace internal
+
+template<typename MatrixType, unsigned int UpLo>
+template<typename DerivedU, typename DerivedV>
+EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
+::rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha)
+{
+  typedef internal::blas_traits<DerivedU> UBlasTraits;
+  typedef typename UBlasTraits::DirectLinearAccessType ActualUType;
+  typedef typename internal::remove_all<ActualUType>::type _ActualUType;
+  typename internal::add_const_on_value_type<ActualUType>::type actualU = UBlasTraits::extract(u.derived());
+
+  typedef internal::blas_traits<DerivedV> VBlasTraits;
+  typedef typename VBlasTraits::DirectLinearAccessType ActualVType;
+  typedef typename internal::remove_all<ActualVType>::type _ActualVType;
+  typename internal::add_const_on_value_type<ActualVType>::type actualV = VBlasTraits::extract(v.derived());
+
+  // If MatrixType is row major, then we use the routine for lower triangular in the upper triangular case and
+  // vice versa, and take the complex conjugate of all coefficients and vector entries.
+
+  enum { IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0 };
+  Scalar actualAlpha = alpha * UBlasTraits::extractScalarFactor(u.derived())
+                             * numext::conj(VBlasTraits::extractScalarFactor(v.derived()));
+  if (IsRowMajor)
+    actualAlpha = numext::conj(actualAlpha);
+
+  typedef typename internal::remove_all<typename internal::conj_expr_if<int(IsRowMajor) ^ int(UBlasTraits::NeedToConjugate), _ActualUType>::type>::type UType;
+  typedef typename internal::remove_all<typename internal::conj_expr_if<int(IsRowMajor) ^ int(VBlasTraits::NeedToConjugate), _ActualVType>::type>::type VType;
+  internal::selfadjoint_rank2_update_selector<Scalar, Index, UType, VType,
+    (IsRowMajor ? int(UpLo==Upper ? Lower : Upper) : UpLo)>
+    ::run(_expression().const_cast_derived().data(),_expression().outerStride(),UType(actualU),VType(actualV),actualAlpha);
+
+  return *this;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINTRANK2UPTADE_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h
new file mode 100644
index 000000000..f0c60507a
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -0,0 +1,472 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_H
+#define EIGEN_TRIANGULAR_MATRIX_MATRIX_H
+
+namespace Eigen { 
+
+namespace internal {
+
+// template<typename Scalar, int mr, int StorageOrder, bool Conjugate, int Mode>
+// struct gemm_pack_lhs_triangular
+// {
+//   Matrix<Scalar,mr,mr,
+//   void operator()(Scalar* blockA, const EIGEN_RESTRICT Scalar* _lhs, int lhsStride, int depth, int rows)
+//   {
+//     conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+//     const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride);
+//     int count = 0;
+//     const int peeled_mc = (rows/mr)*mr;
+//     for(int i=0; i<peeled_mc; i+=mr)
+//     {
+//       for(int k=0; k<depth; k++)
+//         for(int w=0; w<mr; w++)
+//           blockA[count++] = cj(lhs(i+w, k));
+//     }
+//     for(int i=peeled_mc; i<rows; i++)
+//     {
+//       for(int k=0; k<depth; k++)
+//         blockA[count++] = cj(lhs(i, k));
+//     }
+//   }
+// };
+
+/* Optimized triangular matrix * matrix (_TRMM++) product built on top of
+ * the general matrix matrix product.
+ */
+template <typename Scalar, typename Index,
+          int Mode, bool LhsIsTriangular,
+          int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResStorageOrder, int ResInnerStride,
+          int Version = Specialized>
+struct product_triangular_matrix_matrix;
+
+template <typename Scalar, typename Index,
+          int Mode, bool LhsIsTriangular,
+          int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride, int Version>
+struct product_triangular_matrix_matrix<Scalar,Index,Mode,LhsIsTriangular,
+                                           LhsStorageOrder,ConjugateLhs,
+                                           RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride,Version>
+{
+  static EIGEN_STRONG_INLINE void run(
+    Index rows, Index cols, Index depth,
+    const Scalar* lhs, Index lhsStride,
+    const Scalar* rhs, Index rhsStride,
+    Scalar* res,       Index resIncr, Index resStride,
+    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
+  {
+    product_triangular_matrix_matrix<Scalar, Index,
+      (Mode&(UnitDiag|ZeroDiag)) | ((Mode&Upper) ? Lower : Upper),
+      (!LhsIsTriangular),
+      RhsStorageOrder==RowMajor ? ColMajor : RowMajor,
+      ConjugateRhs,
+      LhsStorageOrder==RowMajor ? ColMajor : RowMajor,
+      ConjugateLhs,
+      ColMajor, ResInnerStride>
+      ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resIncr, resStride, alpha, blocking);
+  }
+};
+
+// implements col-major += alpha * op(triangular) * op(general)
+template <typename Scalar, typename Index, int Mode,
+          int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride, int Version>
+struct product_triangular_matrix_matrix<Scalar,Index,Mode,true,
+                                           LhsStorageOrder,ConjugateLhs,
+                                           RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>
+{
+  
+  typedef gebp_traits<Scalar,Scalar> Traits;
+  enum {
+    SmallPanelWidth   = 2 * EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
+    IsLower = (Mode&Lower) == Lower,
+    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1
+  };
+
+  static EIGEN_DONT_INLINE void run(
+    Index _rows, Index _cols, Index _depth,
+    const Scalar* _lhs, Index lhsStride,
+    const Scalar* _rhs, Index rhsStride,
+    Scalar* res,        Index resIncr, Index resStride,
+    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
+};
+
+template <typename Scalar, typename Index, int Mode,
+          int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride, int Version>
+EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
+                                                        LhsStorageOrder,ConjugateLhs,
+                                                        RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run(
+    Index _rows, Index _cols, Index _depth,
+    const Scalar* _lhs, Index lhsStride,
+    const Scalar* _rhs, Index rhsStride,
+    Scalar* _res,       Index resIncr, Index resStride,
+    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
+  {
+    // strip zeros
+    Index diagSize  = (std::min)(_rows,_depth);
+    Index rows      = IsLower ? _rows : diagSize;
+    Index depth     = IsLower ? diagSize : _depth;
+    Index cols      = _cols;
+    
+    typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+    typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
+    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
+    LhsMapper lhs(_lhs,lhsStride);
+    RhsMapper rhs(_rhs,rhsStride);
+    ResMapper res(_res, resStride, resIncr);
+
+    Index kc = blocking.kc();                   // cache block size along the K direction
+    Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
+    // The small panel size must not be larger than blocking size.
+    // Usually this should never be the case because SmallPanelWidth^2 is very small
+    // compared to L2 cache size, but let's be safe:
+    Index panelWidth = (std::min)(Index(SmallPanelWidth),(std::min)(kc,mc));
+
+    std::size_t sizeA = kc*mc;
+    std::size_t sizeB = kc*cols;
+
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+    // To work around an "error: member reference base type 'Matrix<...>
+    // (Eigen::internal::constructor_without_unaligned_array_assert (*)())' is
+    // not a structure or union" compilation error in nvcc (tested V8.0.61),
+    // create a dummy internal::constructor_without_unaligned_array_assert
+    // object to pass to the Matrix constructor.
+    internal::constructor_without_unaligned_array_assert a;
+    Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer(a);
+    triangularBuffer.setZero();
+    if((Mode&ZeroDiag)==ZeroDiag)
+      triangularBuffer.diagonal().setZero();
+    else
+      triangularBuffer.diagonal().setOnes();
+
+    gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
+    gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
+
+    for(Index k2=IsLower ? depth : 0;
+        IsLower ? k2>0 : k2<depth;
+        IsLower ? k2-=kc : k2+=kc)
+    {
+      Index actual_kc = (std::min)(IsLower ? k2 : depth-k2, kc);
+      Index actual_k2 = IsLower ? k2-actual_kc : k2;
+
+      // align blocks with the end of the triangular part for trapezoidal lhs
+      if((!IsLower)&&(k2<rows)&&(k2+actual_kc>rows))
+      {
+        actual_kc = rows-k2;
+        k2 = k2+actual_kc-kc;
+      }
+
+      pack_rhs(blockB, rhs.getSubMapper(actual_k2,0), actual_kc, cols);
+
+      // the selected lhs's panel has to be split in three different parts:
+      //  1 - the part which is zero => skip it
+      //  2 - the diagonal block => special kernel
+      //  3 - the dense panel below (lower case) or above (upper case) the diagonal block => GEPP
+
+      // the block diagonal, if any:
+      if(IsLower || actual_k2<rows)
+      {
+        // for each small vertical panels of lhs
+        for (Index k1=0; k1<actual_kc; k1+=panelWidth)
+        {
+          Index actualPanelWidth = std::min<Index>(actual_kc-k1, panelWidth);
+          Index lengthTarget = IsLower ? actual_kc-k1-actualPanelWidth : k1;
+          Index startBlock   = actual_k2+k1;
+          Index blockBOffset = k1;
+
+          // => GEBP with the micro triangular block
+          // The trick is to pack this micro block while filling the opposite triangular part with zeros.
+          // To this end we do an extra triangular copy to a small temporary buffer
+          for (Index k=0;k<actualPanelWidth;++k)
+          {
+            if (SetDiag)
+              triangularBuffer.coeffRef(k,k) = lhs(startBlock+k,startBlock+k);
+            for (Index i=IsLower ? k+1 : 0; IsLower ? i<actualPanelWidth : i<k; ++i)
+              triangularBuffer.coeffRef(i,k) = lhs(startBlock+i,startBlock+k);
+          }
+          pack_lhs(blockA, LhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()), actualPanelWidth, actualPanelWidth);
+
+          gebp_kernel(res.getSubMapper(startBlock, 0), blockA, blockB,
+                      actualPanelWidth, actualPanelWidth, cols, alpha,
+                      actualPanelWidth, actual_kc, 0, blockBOffset);
+
+          // GEBP with remaining micro panel
+          if (lengthTarget>0)
+          {
+            Index startTarget  = IsLower ? actual_k2+k1+actualPanelWidth : actual_k2;
+
+            pack_lhs(blockA, lhs.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
+
+            gebp_kernel(res.getSubMapper(startTarget, 0), blockA, blockB,
+                        lengthTarget, actualPanelWidth, cols, alpha,
+                        actualPanelWidth, actual_kc, 0, blockBOffset);
+          }
+        }
+      }
+      // the part below (lower case) or above (upper case) the diagonal => GEPP
+      {
+        Index start = IsLower ? k2 : 0;
+        Index end   = IsLower ? rows : (std::min)(actual_k2,rows);
+        for(Index i2=start; i2<end; i2+=mc)
+        {
+          const Index actual_mc = (std::min)(i2+mc,end)-i2;
+          gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder,false>()
+            (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
+
+          gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc,
+                      actual_kc, cols, alpha, -1, -1, 0, 0);
+        }
+      }
+    }
+  }
+
+// implements col-major += alpha * op(general) * op(triangular)
+template <typename Scalar, typename Index, int Mode,
+          int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride, int Version>
+struct product_triangular_matrix_matrix<Scalar,Index,Mode,false,
+                                        LhsStorageOrder,ConjugateLhs,
+                                        RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>
+{
+  typedef gebp_traits<Scalar,Scalar> Traits;
+  enum {
+    SmallPanelWidth   = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
+    IsLower = (Mode&Lower) == Lower,
+    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1
+  };
+
+  static EIGEN_DONT_INLINE void run(
+    Index _rows, Index _cols, Index _depth,
+    const Scalar* _lhs, Index lhsStride,
+    const Scalar* _rhs, Index rhsStride,
+    Scalar* res,        Index resIncr, Index resStride,
+    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
+};
+
+template <typename Scalar, typename Index, int Mode,
+          int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride, int Version>
+EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
+                                                        LhsStorageOrder,ConjugateLhs,
+                                                        RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run(
+    Index _rows, Index _cols, Index _depth,
+    const Scalar* _lhs, Index lhsStride,
+    const Scalar* _rhs, Index rhsStride,
+    Scalar* _res,       Index resIncr, Index resStride,
+    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
+  {
+    const Index PacketBytes = packet_traits<Scalar>::size*sizeof(Scalar);
+    // strip zeros
+    Index diagSize  = (std::min)(_cols,_depth);
+    Index rows      = _rows;
+    Index depth     = IsLower ? _depth : diagSize;
+    Index cols      = IsLower ? diagSize : _cols;
+    
+    typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+    typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
+    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
+    LhsMapper lhs(_lhs,lhsStride);
+    RhsMapper rhs(_rhs,rhsStride);
+    ResMapper res(_res, resStride, resIncr);
+
+    Index kc = blocking.kc();                   // cache block size along the K direction
+    Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
+
+    std::size_t sizeA = kc*mc;
+    std::size_t sizeB = kc*cols+EIGEN_MAX_ALIGN_BYTES/sizeof(Scalar);
+
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+    internal::constructor_without_unaligned_array_assert a;
+    Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer(a);
+    triangularBuffer.setZero();
+    if((Mode&ZeroDiag)==ZeroDiag)
+      triangularBuffer.diagonal().setZero();
+    else
+      triangularBuffer.diagonal().setOnes();
+
+    gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
+    gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
+    gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
+
+    for(Index k2=IsLower ? 0 : depth;
+        IsLower ? k2<depth  : k2>0;
+        IsLower ? k2+=kc   : k2-=kc)
+    {
+      Index actual_kc = (std::min)(IsLower ? depth-k2 : k2, kc);
+      Index actual_k2 = IsLower ? k2 : k2-actual_kc;
+
+      // align blocks with the end of the triangular part for trapezoidal rhs
+      if(IsLower && (k2<cols) && (actual_k2+actual_kc>cols))
+      {
+        actual_kc = cols-k2;
+        k2 = actual_k2 + actual_kc - kc;
+      }
+
+      // remaining size
+      Index rs = IsLower ? (std::min)(cols,actual_k2) : cols - k2;
+      // size of the triangular part
+      Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc;
+
+      Scalar* geb = blockB+ts*ts;
+      geb = geb + internal::first_aligned<PacketBytes>(geb,PacketBytes/sizeof(Scalar));
+
+      pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 0 : k2), actual_kc, rs);
+
+      // pack the triangular part of the rhs padding the unrolled blocks with zeros
+      if(ts>0)
+      {
+        for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth)
+        {
+          Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
+          Index actual_j2 = actual_k2 + j2;
+          Index panelOffset = IsLower ? j2+actualPanelWidth : 0;
+          Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
+          // general part
+          pack_rhs_panel(blockB+j2*actual_kc,
+                         rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
+                         panelLength, actualPanelWidth,
+                         actual_kc, panelOffset);
+
+          // append the triangular part via a temporary buffer
+          for (Index j=0;j<actualPanelWidth;++j)
+          {
+            if (SetDiag)
+              triangularBuffer.coeffRef(j,j) = rhs(actual_j2+j,actual_j2+j);
+            for (Index k=IsLower ? j+1 : 0; IsLower ? k<actualPanelWidth : k<j; ++k)
+              triangularBuffer.coeffRef(k,j) = rhs(actual_j2+k,actual_j2+j);
+          }
+
+          pack_rhs_panel(blockB+j2*actual_kc,
+                         RhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()),
+                         actualPanelWidth, actualPanelWidth,
+                         actual_kc, j2);
+        }
+      }
+
+      for (Index i2=0; i2<rows; i2+=mc)
+      {
+        const Index actual_mc = (std::min)(mc,rows-i2);
+        pack_lhs(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
+
+        // triangular kernel
+        if(ts>0)
+        {
+          for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth)
+          {
+            Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
+            Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth;
+            Index blockOffset = IsLower ? j2 : 0;
+
+            gebp_kernel(res.getSubMapper(i2, actual_k2 + j2),
+                        blockA, blockB+j2*actual_kc,
+                        actual_mc, panelLength, actualPanelWidth,
+                        alpha,
+                        actual_kc, actual_kc,  // strides
+                        blockOffset, blockOffset);// offsets
+          }
+        }
+        gebp_kernel(res.getSubMapper(i2, IsLower ? 0 : k2),
+                    blockA, geb, actual_mc, actual_kc, rs,
+                    alpha,
+                    -1, -1, 0, 0);
+      }
+    }
+  }
+
+/***************************************************************************
+* Wrapper to product_triangular_matrix_matrix
+***************************************************************************/
+
+} // end namespace internal
+
+namespace internal {
+template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
+struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
+{
+  template<typename Dest> static void run(Dest& dst, const Lhs &a_lhs, const Rhs &a_rhs, const typename Dest::Scalar& alpha)
+  {
+    typedef typename Lhs::Scalar  LhsScalar;
+    typedef typename Rhs::Scalar  RhsScalar;
+    typedef typename Dest::Scalar Scalar;
+    
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+    typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
+    
+    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
+
+    LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(a_lhs);
+    RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(a_rhs);
+    Scalar actualAlpha = alpha * lhs_alpha * rhs_alpha;
+
+    typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
+              Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType;
+
+    enum { IsLower = (Mode&Lower) == Lower };
+    Index stripedRows  = ((!LhsIsTriangular) || (IsLower))  ? lhs.rows() : (std::min)(lhs.rows(),lhs.cols());
+    Index stripedCols  = ((LhsIsTriangular)  || (!IsLower)) ? rhs.cols() : (std::min)(rhs.cols(),rhs.rows());
+    Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows()))
+                                         : ((IsLower)  ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols()));
+
+    BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false);
+
+    internal::product_triangular_matrix_matrix<Scalar, Index,
+      Mode, LhsIsTriangular,
+      (internal::traits<ActualLhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
+      (internal::traits<ActualRhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
+      (internal::traits<Dest          >::Flags&RowMajorBit) ? RowMajor : ColMajor, Dest::InnerStrideAtCompileTime>
+      ::run(
+        stripedRows, stripedCols, stripedDepth,   // sizes
+        &lhs.coeffRef(0,0), lhs.outerStride(),    // lhs info
+        &rhs.coeffRef(0,0), rhs.outerStride(),    // rhs info
+        &dst.coeffRef(0,0), dst.innerStride(), dst.outerStride(),    // result info
+        actualAlpha, blocking
+      );
+
+    // Apply correction if the diagonal is unit and a scalar factor was nested:
+    if ((Mode&UnitDiag)==UnitDiag)
+    {
+      if (LhsIsTriangular && lhs_alpha!=LhsScalar(1))
+      {
+        Index diagSize = (std::min)(lhs.rows(),lhs.cols());
+        dst.topRows(diagSize) -= ((lhs_alpha-LhsScalar(1))*a_rhs).topRows(diagSize);
+      }
+      else if ((!LhsIsTriangular) && rhs_alpha!=RhsScalar(1))
+      {
+        Index diagSize = (std::min)(rhs.rows(),rhs.cols());
+        dst.leftCols(diagSize) -= (rhs_alpha-RhsScalar(1))*a_lhs.leftCols(diagSize);
+      }
+    }
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h
new file mode 100644
index 000000000..a98d12e4a
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h
@@ -0,0 +1,317 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to BLAS F77
+ *   Triangular matrix * matrix product functionality based on ?TRMM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H
+#define EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H
+
+namespace Eigen { 
+
+namespace internal {
+
+
+template <typename Scalar, typename Index,
+          int Mode, bool LhsIsTriangular,
+          int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResStorageOrder>
+struct product_triangular_matrix_matrix_trmm :
+       product_triangular_matrix_matrix<Scalar,Index,Mode,
+          LhsIsTriangular,LhsStorageOrder,ConjugateLhs,
+          RhsStorageOrder, ConjugateRhs, ResStorageOrder, 1, BuiltIn> {};
+
+
+// try to go to BLAS specialization
+#define EIGEN_BLAS_TRMM_SPECIALIZE(Scalar, LhsIsTriangular) \
+template <typename Index, int Mode, \
+          int LhsStorageOrder, bool ConjugateLhs, \
+          int RhsStorageOrder, bool ConjugateRhs> \
+struct product_triangular_matrix_matrix<Scalar,Index, Mode, LhsIsTriangular, \
+           LhsStorageOrder,ConjugateLhs, RhsStorageOrder,ConjugateRhs,ColMajor,1,Specialized> { \
+  static inline void run(Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride,\
+    const Scalar* _rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, Scalar alpha, level3_blocking<Scalar,Scalar>& blocking) { \
+      EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
+      eigen_assert(resIncr == 1); \
+      product_triangular_matrix_matrix_trmm<Scalar,Index,Mode, \
+        LhsIsTriangular,LhsStorageOrder,ConjugateLhs, \
+        RhsStorageOrder, ConjugateRhs, ColMajor>::run( \
+          _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
+  } \
+};
+
+EIGEN_BLAS_TRMM_SPECIALIZE(double, true)
+EIGEN_BLAS_TRMM_SPECIALIZE(double, false)
+EIGEN_BLAS_TRMM_SPECIALIZE(dcomplex, true)
+EIGEN_BLAS_TRMM_SPECIALIZE(dcomplex, false)
+EIGEN_BLAS_TRMM_SPECIALIZE(float, true)
+EIGEN_BLAS_TRMM_SPECIALIZE(float, false)
+EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, true)
+EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false)
+
+// implements col-major += alpha * op(triangular) * op(general)
+#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
+template <typename Index, int Mode, \
+          int LhsStorageOrder, bool ConjugateLhs, \
+          int RhsStorageOrder, bool ConjugateRhs> \
+struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
+         LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,ColMajor> \
+{ \
+  enum { \
+    IsLower = (Mode&Lower) == Lower, \
+    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
+    IsUnitDiag  = (Mode&UnitDiag) ? 1 : 0, \
+    IsZeroDiag  = (Mode&ZeroDiag) ? 1 : 0, \
+    LowUp = IsLower ? Lower : Upper, \
+    conjA = ((LhsStorageOrder==ColMajor) && ConjugateLhs) ? 1 : 0 \
+  }; \
+\
+  static void run( \
+    Index _rows, Index _cols, Index _depth, \
+    const EIGTYPE* _lhs, Index lhsStride, \
+    const EIGTYPE* _rhs, Index rhsStride, \
+    EIGTYPE* res,        Index resStride, \
+    EIGTYPE alpha, level3_blocking<EIGTYPE,EIGTYPE>& blocking) \
+  { \
+   Index diagSize  = (std::min)(_rows,_depth); \
+   Index rows      = IsLower ? _rows : diagSize; \
+   Index depth     = IsLower ? diagSize : _depth; \
+   Index cols      = _cols; \
+\
+   typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \
+   typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \
+\
+/* Non-square case - doesn't fit to BLAS ?TRMM. Fall to default triangular product or call BLAS ?GEMM*/ \
+   if (rows != depth) { \
+\
+     /* FIXME handle mkl_domain_get_max_threads */ \
+     /*int nthr = mkl_domain_get_max_threads(EIGEN_BLAS_DOMAIN_BLAS);*/ int nthr = 1;\
+\
+     if (((nthr==1) && (((std::max)(rows,depth)-diagSize)/(double)diagSize < 0.5))) { \
+     /* Most likely no benefit to call TRMM or GEMM from BLAS */ \
+       product_triangular_matrix_matrix<EIGTYPE,Index,Mode,true, \
+       LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, 1, BuiltIn>::run( \
+           _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, 1, resStride, alpha, blocking); \
+     /*std::cout << "TRMM_L: A is not square! Go to Eigen TRMM implementation!\n";*/ \
+     } else { \
+     /* Make sense to call GEMM */ \
+       Map<const MatrixLhs, 0, OuterStride<> > lhsMap(_lhs,rows,depth,OuterStride<>(lhsStride)); \
+       MatrixLhs aa_tmp=lhsMap.template triangularView<Mode>(); \
+       BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \
+       gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \
+       general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1>::run( \
+       rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, 1, resStride, alpha, gemm_blocking, 0); \
+\
+     /*std::cout << "TRMM_L: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \
+     } \
+     return; \
+   } \
+   char side = 'L', transa, uplo, diag = 'N'; \
+   EIGTYPE *b; \
+   const EIGTYPE *a; \
+   BlasIndex m, n, lda, ldb; \
+\
+/* Set m, n */ \
+   m = convert_index<BlasIndex>(diagSize); \
+   n = convert_index<BlasIndex>(cols); \
+\
+/* Set trans */ \
+   transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
+\
+/* Set b, ldb */ \
+   Map<const MatrixRhs, 0, OuterStride<> > rhs(_rhs,depth,cols,OuterStride<>(rhsStride)); \
+   MatrixX##EIGPREFIX b_tmp; \
+\
+   if (ConjugateRhs) b_tmp = rhs.conjugate(); else b_tmp = rhs; \
+   b = b_tmp.data(); \
+   ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
+\
+/* Set uplo */ \
+   uplo = IsLower ? 'L' : 'U'; \
+   if (LhsStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
+/* Set a, lda */ \
+   Map<const MatrixLhs, 0, OuterStride<> > lhs(_lhs,rows,depth,OuterStride<>(lhsStride)); \
+   MatrixLhs a_tmp; \
+\
+   if ((conjA!=0) || (SetDiag==0)) { \
+     if (conjA) a_tmp = lhs.conjugate(); else a_tmp = lhs; \
+     if (IsZeroDiag) \
+       a_tmp.diagonal().setZero(); \
+     else if (IsUnitDiag) \
+       a_tmp.diagonal().setOnes();\
+     a = a_tmp.data(); \
+     lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
+   } else { \
+     a = _lhs; \
+     lda = convert_index<BlasIndex>(lhsStride); \
+   } \
+   /*std::cout << "TRMM_L: A is square! Go to BLAS TRMM implementation! \n";*/ \
+/* call ?trmm*/ \
+   BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
+\
+/* Add op(a_triangular)*b into res*/ \
+   Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
+   res_tmp=res_tmp+b_tmp; \
+  } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMM_L(double, double, d, dtrmm)
+EIGEN_BLAS_TRMM_L(dcomplex, MKL_Complex16, cd, ztrmm)
+EIGEN_BLAS_TRMM_L(float, float, f, strmm)
+EIGEN_BLAS_TRMM_L(scomplex, MKL_Complex8, cf, ctrmm)
+#else
+EIGEN_BLAS_TRMM_L(double, double, d, dtrmm_)
+EIGEN_BLAS_TRMM_L(dcomplex, double, cd, ztrmm_)
+EIGEN_BLAS_TRMM_L(float, float, f, strmm_)
+EIGEN_BLAS_TRMM_L(scomplex, float, cf, ctrmm_)
+#endif
+
+// implements col-major += alpha * op(general) * op(triangular)
+#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
+template <typename Index, int Mode, \
+          int LhsStorageOrder, bool ConjugateLhs, \
+          int RhsStorageOrder, bool ConjugateRhs> \
+struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
+         LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,ColMajor> \
+{ \
+  enum { \
+    IsLower = (Mode&Lower) == Lower, \
+    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
+    IsUnitDiag  = (Mode&UnitDiag) ? 1 : 0, \
+    IsZeroDiag  = (Mode&ZeroDiag) ? 1 : 0, \
+    LowUp = IsLower ? Lower : Upper, \
+    conjA = ((RhsStorageOrder==ColMajor) && ConjugateRhs) ? 1 : 0 \
+  }; \
+\
+  static void run( \
+    Index _rows, Index _cols, Index _depth, \
+    const EIGTYPE* _lhs, Index lhsStride, \
+    const EIGTYPE* _rhs, Index rhsStride, \
+    EIGTYPE* res,        Index resStride, \
+    EIGTYPE alpha, level3_blocking<EIGTYPE,EIGTYPE>& blocking) \
+  { \
+   Index diagSize  = (std::min)(_cols,_depth); \
+   Index rows      = _rows; \
+   Index depth     = IsLower ? _depth : diagSize; \
+   Index cols      = IsLower ? diagSize : _cols; \
+\
+   typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \
+   typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \
+\
+/* Non-square case - doesn't fit to BLAS ?TRMM. Fall to default triangular product or call BLAS ?GEMM*/ \
+   if (cols != depth) { \
+\
+     int nthr = 1 /*mkl_domain_get_max_threads(EIGEN_BLAS_DOMAIN_BLAS)*/; \
+\
+     if ((nthr==1) && (((std::max)(cols,depth)-diagSize)/(double)diagSize < 0.5)) { \
+     /* Most likely no benefit to call TRMM or GEMM from BLAS*/ \
+       product_triangular_matrix_matrix<EIGTYPE,Index,Mode,false, \
+       LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, 1, BuiltIn>::run( \
+           _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, 1, resStride, alpha, blocking); \
+       /*std::cout << "TRMM_R: A is not square! Go to Eigen TRMM implementation!\n";*/ \
+     } else { \
+     /* Make sense to call GEMM */ \
+       Map<const MatrixRhs, 0, OuterStride<> > rhsMap(_rhs,depth,cols, OuterStride<>(rhsStride)); \
+       MatrixRhs aa_tmp=rhsMap.template triangularView<Mode>(); \
+       BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \
+       gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \
+       general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1>::run( \
+       rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, 1, resStride, alpha, gemm_blocking, 0); \
+\
+     /*std::cout << "TRMM_R: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \
+     } \
+     return; \
+   } \
+   char side = 'R', transa, uplo, diag = 'N'; \
+   EIGTYPE *b; \
+   const EIGTYPE *a; \
+   BlasIndex m, n, lda, ldb; \
+\
+/* Set m, n */ \
+   m = convert_index<BlasIndex>(rows); \
+   n = convert_index<BlasIndex>(diagSize); \
+\
+/* Set trans */ \
+   transa = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
+\
+/* Set b, ldb */ \
+   Map<const MatrixLhs, 0, OuterStride<> > lhs(_lhs,rows,depth,OuterStride<>(lhsStride)); \
+   MatrixX##EIGPREFIX b_tmp; \
+\
+   if (ConjugateLhs) b_tmp = lhs.conjugate(); else b_tmp = lhs; \
+   b = b_tmp.data(); \
+   ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
+\
+/* Set uplo */ \
+   uplo = IsLower ? 'L' : 'U'; \
+   if (RhsStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
+/* Set a, lda */ \
+   Map<const MatrixRhs, 0, OuterStride<> > rhs(_rhs,depth,cols, OuterStride<>(rhsStride)); \
+   MatrixRhs a_tmp; \
+\
+   if ((conjA!=0) || (SetDiag==0)) { \
+     if (conjA) a_tmp = rhs.conjugate(); else a_tmp = rhs; \
+     if (IsZeroDiag) \
+       a_tmp.diagonal().setZero(); \
+     else if (IsUnitDiag) \
+       a_tmp.diagonal().setOnes();\
+     a = a_tmp.data(); \
+     lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
+   } else { \
+     a = _rhs; \
+     lda = convert_index<BlasIndex>(rhsStride); \
+   } \
+   /*std::cout << "TRMM_R: A is square! Go to BLAS TRMM implementation! \n";*/ \
+/* call ?trmm*/ \
+   BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
+\
+/* Add op(a_triangular)*b into res*/ \
+   Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
+   res_tmp=res_tmp+b_tmp; \
+  } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMM_R(double, double, d, dtrmm)
+EIGEN_BLAS_TRMM_R(dcomplex, MKL_Complex16, cd, ztrmm)
+EIGEN_BLAS_TRMM_R(float, float, f, strmm)
+EIGEN_BLAS_TRMM_R(scomplex, MKL_Complex8, cf, ctrmm)
+#else
+EIGEN_BLAS_TRMM_R(double, double, d, dtrmm_)
+EIGEN_BLAS_TRMM_R(dcomplex, double, cd, ztrmm_)
+EIGEN_BLAS_TRMM_R(float, float, f, strmm_)
+EIGEN_BLAS_TRMM_R(scomplex, float, cf, ctrmm_)
+#endif
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixVector.h b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixVector.h
new file mode 100644
index 000000000..76bfa159c
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -0,0 +1,350 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRIANGULARMATRIXVECTOR_H
+#define EIGEN_TRIANGULARMATRIXVECTOR_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int StorageOrder, int Version=Specialized>
+struct triangular_matrix_vector_product;
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
+struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  enum {
+    IsLower = ((Mode&Lower)==Lower),
+    HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
+    HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag
+  };
+  static EIGEN_DONT_INLINE  void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
+                                     const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha);
+};
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
+EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
+  ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
+        const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha)
+  {
+    static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
+    Index size = (std::min)(_rows,_cols);
+    Index rows = IsLower ? _rows : (std::min)(_rows,_cols);
+    Index cols = IsLower ? (std::min)(_rows,_cols) : _cols;
+
+    typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
+    const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride));
+    typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs);
+
+    typedef Map<const Matrix<RhsScalar,Dynamic,1>, 0, InnerStride<> > RhsMap;
+    const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr));
+    typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs);
+
+    typedef Map<Matrix<ResScalar,Dynamic,1> > ResMap;
+    ResMap res(_res,rows);
+
+    typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
+
+    for (Index pi=0; pi<size; pi+=PanelWidth)
+    {
+      Index actualPanelWidth = (std::min)(PanelWidth, size-pi);
+      for (Index k=0; k<actualPanelWidth; ++k)
+      {
+        Index i = pi + k;
+        Index s = IsLower ? ((HasUnitDiag||HasZeroDiag) ? i+1 : i ) : pi;
+        Index r = IsLower ? actualPanelWidth-k : k+1;
+        if ((!(HasUnitDiag||HasZeroDiag)) || (--r)>0)
+          res.segment(s,r) += (alpha * cjRhs.coeff(i)) * cjLhs.col(i).segment(s,r);
+        if (HasUnitDiag)
+          res.coeffRef(i) += alpha * cjRhs.coeff(i);
+      }
+      Index r = IsLower ? rows - pi - actualPanelWidth : pi;
+      if (r>0)
+      {
+        Index s = IsLower ? pi+actualPanelWidth : 0;
+        general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run(
+            r, actualPanelWidth,
+            LhsMapper(&lhs.coeffRef(s,pi), lhsStride),
+            RhsMapper(&rhs.coeffRef(pi), rhsIncr),
+            &res.coeffRef(s), resIncr, alpha);
+      }
+    }
+    if((!IsLower) && cols>size)
+    {
+      general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run(
+          rows, cols-size,
+          LhsMapper(&lhs.coeffRef(0,size), lhsStride),
+          RhsMapper(&rhs.coeffRef(size), rhsIncr),
+          _res, resIncr, alpha);
+    }
+  }
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs,int Version>
+struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor,Version>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  enum {
+    IsLower = ((Mode&Lower)==Lower),
+    HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
+    HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag
+  };
+  static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
+                                    const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha);
+};
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs,int Version>
+EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor,Version>
+  ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
+        const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha)
+  {
+    static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
+    Index diagSize = (std::min)(_rows,_cols);
+    Index rows = IsLower ? _rows : diagSize;
+    Index cols = IsLower ? diagSize : _cols;
+
+    typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap;
+    const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride));
+    typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs);
+
+    typedef Map<const Matrix<RhsScalar,Dynamic,1> > RhsMap;
+    const RhsMap rhs(_rhs,cols);
+    typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs);
+
+    typedef Map<Matrix<ResScalar,Dynamic,1>, 0, InnerStride<> > ResMap;
+    ResMap res(_res,rows,InnerStride<>(resIncr));
+
+    typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
+
+    for (Index pi=0; pi<diagSize; pi+=PanelWidth)
+    {
+      Index actualPanelWidth = (std::min)(PanelWidth, diagSize-pi);
+      for (Index k=0; k<actualPanelWidth; ++k)
+      {
+        Index i = pi + k;
+        Index s = IsLower ? pi  : ((HasUnitDiag||HasZeroDiag) ? i+1 : i);
+        Index r = IsLower ? k+1 : actualPanelWidth-k;
+        if ((!(HasUnitDiag||HasZeroDiag)) || (--r)>0)
+          res.coeffRef(i) += alpha * (cjLhs.row(i).segment(s,r).cwiseProduct(cjRhs.segment(s,r).transpose())).sum();
+        if (HasUnitDiag)
+          res.coeffRef(i) += alpha * cjRhs.coeff(i);
+      }
+      Index r = IsLower ? pi : cols - pi - actualPanelWidth;
+      if (r>0)
+      {
+        Index s = IsLower ? 0 : pi + actualPanelWidth;
+        general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run(
+            actualPanelWidth, r,
+            LhsMapper(&lhs.coeffRef(pi,s), lhsStride),
+            RhsMapper(&rhs.coeffRef(s), rhsIncr),
+            &res.coeffRef(pi), resIncr, alpha);
+      }
+    }
+    if(IsLower && rows>diagSize)
+    {
+      general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run(
+            rows-diagSize, cols,
+            LhsMapper(&lhs.coeffRef(diagSize,0), lhsStride),
+            RhsMapper(&rhs.coeffRef(0), rhsIncr),
+            &res.coeffRef(diagSize), resIncr, alpha);
+    }
+  }
+
+/***************************************************************************
+* Wrapper to product_triangular_vector
+***************************************************************************/
+
+template<int Mode,int StorageOrder>
+struct trmv_selector;
+
+} // end namespace internal
+
+namespace internal {
+
+template<int Mode, typename Lhs, typename Rhs>
+struct triangular_product_impl<Mode,true,Lhs,false,Rhs,true>
+{
+  template<typename Dest> static void run(Dest& dst, const Lhs &lhs, const Rhs &rhs, const typename Dest::Scalar& alpha)
+  {
+    eigen_assert(dst.rows()==lhs.rows() && dst.cols()==rhs.cols());
+  
+    internal::trmv_selector<Mode,(int(internal::traits<Lhs>::Flags)&RowMajorBit) ? RowMajor : ColMajor>::run(lhs, rhs, dst, alpha);
+  }
+};
+
+template<int Mode, typename Lhs, typename Rhs>
+struct triangular_product_impl<Mode,false,Lhs,true,Rhs,false>
+{
+  template<typename Dest> static void run(Dest& dst, const Lhs &lhs, const Rhs &rhs, const typename Dest::Scalar& alpha)
+  {
+    eigen_assert(dst.rows()==lhs.rows() && dst.cols()==rhs.cols());
+
+    Transpose<Dest> dstT(dst);
+    internal::trmv_selector<(Mode & (UnitDiag|ZeroDiag)) | ((Mode & Lower) ? Upper : Lower),
+                            (int(internal::traits<Rhs>::Flags)&RowMajorBit) ? ColMajor : RowMajor>
+            ::run(rhs.transpose(),lhs.transpose(), dstT, alpha);
+  }
+};
+
+} // end namespace internal
+
+namespace internal {
+
+// TODO: find a way to factorize this piece of code with gemv_selector since the logic is exactly the same.
+  
+template<int Mode> struct trmv_selector<Mode,ColMajor>
+{
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
+  {
+    typedef typename Lhs::Scalar      LhsScalar;
+    typedef typename Rhs::Scalar      RhsScalar;
+    typedef typename Dest::Scalar     ResScalar;
+    typedef typename Dest::RealScalar RealScalar;
+    
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+    
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
+
+    typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
+
+    LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs);
+    RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs);
+    ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha;
+
+    enum {
+      // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
+      // on, the other hand it is good for the cache to pack the vector anyways...
+      EvalToDestAtCompileTime = Dest::InnerStrideAtCompileTime==1,
+      ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex),
+      MightCannotUseDest = (Dest::InnerStrideAtCompileTime!=1) || ComplexByReal
+    };
+
+    gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;
+
+    bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
+    bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
+
+    RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
+
+    ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
+                                                  evalToDest ? dest.data() : static_dest.data());
+
+    if(!evalToDest)
+    {
+      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      Index size = dest.size();
+      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      #endif
+      if(!alphaIsCompatible)
+      {
+        MappedDest(actualDestPtr, dest.size()).setZero();
+        compatibleAlpha = RhsScalar(1);
+      }
+      else
+        MappedDest(actualDestPtr, dest.size()) = dest;
+    }
+
+    internal::triangular_matrix_vector_product
+      <Index,Mode,
+       LhsScalar, LhsBlasTraits::NeedToConjugate,
+       RhsScalar, RhsBlasTraits::NeedToConjugate,
+       ColMajor>
+      ::run(actualLhs.rows(),actualLhs.cols(),
+            actualLhs.data(),actualLhs.outerStride(),
+            actualRhs.data(),actualRhs.innerStride(),
+            actualDestPtr,1,compatibleAlpha);
+
+    if (!evalToDest)
+    {
+      if(!alphaIsCompatible)
+        dest += actualAlpha * MappedDest(actualDestPtr, dest.size());
+      else
+        dest = MappedDest(actualDestPtr, dest.size());
+    }
+
+    if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) )
+    {
+      Index diagSize = (std::min)(lhs.rows(),lhs.cols());
+      dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize);
+    }
+  }
+};
+
+template<int Mode> struct trmv_selector<Mode,RowMajor>
+{
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
+  {
+    typedef typename Lhs::Scalar      LhsScalar;
+    typedef typename Rhs::Scalar      RhsScalar;
+    typedef typename Dest::Scalar     ResScalar;
+    
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+    typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
+
+    typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
+    typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
+
+    LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs);
+    RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs);
+    ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha;
+
+    enum {
+      DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1
+    };
+
+    gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
+
+    ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,actualRhs.size(),
+        DirectlyUseRhs ? const_cast<RhsScalar*>(actualRhs.data()) : static_rhs.data());
+
+    if(!DirectlyUseRhs)
+    {
+      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      Index size = actualRhs.size();
+      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      #endif
+      Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
+    }
+
+    internal::triangular_matrix_vector_product
+      <Index,Mode,
+       LhsScalar, LhsBlasTraits::NeedToConjugate,
+       RhsScalar, RhsBlasTraits::NeedToConjugate,
+       RowMajor>
+      ::run(actualLhs.rows(),actualLhs.cols(),
+            actualLhs.data(),actualLhs.outerStride(),
+            actualRhsPtr,1,
+            dest.data(),dest.innerStride(),
+            actualAlpha);
+
+    if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) )
+    {
+      Index diagSize = (std::min)(lhs.rows(),lhs.cols());
+      dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize);
+    }
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULARMATRIXVECTOR_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
new file mode 100644
index 000000000..3d47a2b94
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
@@ -0,0 +1,255 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to BLAS F77
+ *   Triangular matrix-vector product functionality based on ?TRMV.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H
+#define EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/**********************************************************************
+* This file implements triangular matrix-vector multiplication using BLAS
+**********************************************************************/
+
+// trmv/hemv specialization
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int StorageOrder>
+struct triangular_matrix_vector_product_trmv :
+  triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,StorageOrder,BuiltIn> {};
+
+#define EIGEN_BLAS_TRMV_SPECIALIZE(Scalar) \
+template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
+struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor,Specialized> { \
+ static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \
+                                     const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \
+      triangular_matrix_vector_product_trmv<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor>::run( \
+        _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+  } \
+}; \
+template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
+struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,RowMajor,Specialized> { \
+ static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \
+                                     const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \
+      triangular_matrix_vector_product_trmv<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,RowMajor>::run( \
+        _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+  } \
+};
+
+EIGEN_BLAS_TRMV_SPECIALIZE(double)
+EIGEN_BLAS_TRMV_SPECIALIZE(float)
+EIGEN_BLAS_TRMV_SPECIALIZE(dcomplex)
+EIGEN_BLAS_TRMV_SPECIALIZE(scomplex)
+
+// implements col-major: res += alpha * op(triangular) * vector
+#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \
+template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
+struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor> { \
+  enum { \
+    IsLower = (Mode&Lower) == Lower, \
+    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
+    IsUnitDiag  = (Mode&UnitDiag) ? 1 : 0, \
+    IsZeroDiag  = (Mode&ZeroDiag) ? 1 : 0, \
+    LowUp = IsLower ? Lower : Upper \
+  }; \
+ static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \
+                 const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \
+ { \
+   if (ConjLhs || IsZeroDiag) { \
+     triangular_matrix_vector_product<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor,BuiltIn>::run( \
+       _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+     return; \
+   }\
+   Index size = (std::min)(_rows,_cols); \
+   Index rows = IsLower ? _rows : size; \
+   Index cols = IsLower ? size : _cols; \
+\
+   typedef VectorX##EIGPREFIX VectorRhs; \
+   EIGTYPE *x, *y;\
+\
+/* Set x*/ \
+   Map<const VectorRhs, 0, InnerStride<> > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \
+   VectorRhs x_tmp; \
+   if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+   x = x_tmp.data(); \
+\
+/* Square part handling */\
+\
+   char trans, uplo, diag; \
+   BlasIndex m, n, lda, incx, incy; \
+   EIGTYPE const *a; \
+   EIGTYPE beta(1); \
+\
+/* Set m, n */ \
+   n = convert_index<BlasIndex>(size); \
+   lda = convert_index<BlasIndex>(lhsStride); \
+   incx = 1; \
+   incy = convert_index<BlasIndex>(resIncr); \
+\
+/* Set uplo, trans and diag*/ \
+   trans = 'N'; \
+   uplo = IsLower ? 'L' : 'U'; \
+   diag = IsUnitDiag ? 'U' : 'N'; \
+\
+/* call ?TRMV*/ \
+   BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
+\
+/* Add op(a_tr)rhs into res*/ \
+   BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
+/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
+   if (size<(std::max)(rows,cols)) { \
+     if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+     x = x_tmp.data(); \
+     if (size<rows) { \
+       y = _res + size*resIncr; \
+       a = _lhs + size; \
+       m = convert_index<BlasIndex>(rows-size); \
+       n = convert_index<BlasIndex>(size); \
+     } \
+     else { \
+       x += size; \
+       y = _res; \
+       a = _lhs + size*lda; \
+       m = convert_index<BlasIndex>(size); \
+       n = convert_index<BlasIndex>(cols-size); \
+     } \
+     BLASPREFIX##gemv##BLASPOSTFIX(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \
+   } \
+  } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMV_CM(double,   double, d,  d,)
+EIGEN_BLAS_TRMV_CM(dcomplex, MKL_Complex16, cd, z,)
+EIGEN_BLAS_TRMV_CM(float,    float,  f,  s,)
+EIGEN_BLAS_TRMV_CM(scomplex, MKL_Complex8,  cf, c,)
+#else
+EIGEN_BLAS_TRMV_CM(double,   double, d,  d, _)
+EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z, _)
+EIGEN_BLAS_TRMV_CM(float,    float,  f,  s, _)
+EIGEN_BLAS_TRMV_CM(scomplex, float,  cf, c, _)
+#endif
+
+// implements row-major: res += alpha * op(triangular) * vector
+#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \
+template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
+struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor> { \
+  enum { \
+    IsLower = (Mode&Lower) == Lower, \
+    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
+    IsUnitDiag  = (Mode&UnitDiag) ? 1 : 0, \
+    IsZeroDiag  = (Mode&ZeroDiag) ? 1 : 0, \
+    LowUp = IsLower ? Lower : Upper \
+  }; \
+ static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \
+                 const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \
+ { \
+   if (IsZeroDiag) { \
+     triangular_matrix_vector_product<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor,BuiltIn>::run( \
+       _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+     return; \
+   }\
+   Index size = (std::min)(_rows,_cols); \
+   Index rows = IsLower ? _rows : size; \
+   Index cols = IsLower ? size : _cols; \
+\
+   typedef VectorX##EIGPREFIX VectorRhs; \
+   EIGTYPE *x, *y;\
+\
+/* Set x*/ \
+   Map<const VectorRhs, 0, InnerStride<> > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \
+   VectorRhs x_tmp; \
+   if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+   x = x_tmp.data(); \
+\
+/* Square part handling */\
+\
+   char trans, uplo, diag; \
+   BlasIndex m, n, lda, incx, incy; \
+   EIGTYPE const *a; \
+   EIGTYPE beta(1); \
+\
+/* Set m, n */ \
+   n = convert_index<BlasIndex>(size); \
+   lda = convert_index<BlasIndex>(lhsStride); \
+   incx = 1; \
+   incy = convert_index<BlasIndex>(resIncr); \
+\
+/* Set uplo, trans and diag*/ \
+   trans = ConjLhs ? 'C' : 'T'; \
+   uplo = IsLower ? 'U' : 'L'; \
+   diag = IsUnitDiag ? 'U' : 'N'; \
+\
+/* call ?TRMV*/ \
+   BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
+\
+/* Add op(a_tr)rhs into res*/ \
+   BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
+/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
+   if (size<(std::max)(rows,cols)) { \
+     if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+     x = x_tmp.data(); \
+     if (size<rows) { \
+       y = _res + size*resIncr; \
+       a = _lhs + size*lda; \
+       m = convert_index<BlasIndex>(rows-size); \
+       n = convert_index<BlasIndex>(size); \
+     } \
+     else { \
+       x += size; \
+       y = _res; \
+       a = _lhs + size; \
+       m = convert_index<BlasIndex>(size); \
+       n = convert_index<BlasIndex>(cols-size); \
+     } \
+     BLASPREFIX##gemv##BLASPOSTFIX(&trans, &n, &m, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \
+   } \
+  } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMV_RM(double,   double, d,  d,)
+EIGEN_BLAS_TRMV_RM(dcomplex, MKL_Complex16, cd, z,)
+EIGEN_BLAS_TRMV_RM(float,    float,  f,  s,)
+EIGEN_BLAS_TRMV_RM(scomplex, MKL_Complex8,  cf, c,)
+#else
+EIGEN_BLAS_TRMV_RM(double,   double, d,  d,_)
+EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z,_)
+EIGEN_BLAS_TRMV_RM(float,    float,  f,  s,_)
+EIGEN_BLAS_TRMV_RM(scomplex, float,  cf, c,_)
+#endif
+
+} // end namespase internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h
new file mode 100644
index 000000000..6d879ba00
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -0,0 +1,337 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_H
+#define EIGEN_TRIANGULAR_SOLVER_MATRIX_H
+
+namespace Eigen { 
+
+namespace internal {
+
+// if the rhs is row major, let's transpose the product
+template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+struct triangular_solve_matrix<Scalar,Index,Side,Mode,Conjugate,TriStorageOrder,RowMajor,OtherInnerStride>
+{
+  static void run(
+    Index size, Index cols,
+    const Scalar*  tri, Index triStride,
+    Scalar* _other, Index otherIncr, Index otherStride,
+    level3_blocking<Scalar,Scalar>& blocking)
+  {
+    triangular_solve_matrix<
+      Scalar, Index, Side==OnTheLeft?OnTheRight:OnTheLeft,
+      (Mode&UnitDiag) | ((Mode&Upper) ? Lower : Upper),
+      NumTraits<Scalar>::IsComplex && Conjugate,
+      TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor, OtherInnerStride>
+      ::run(size, cols, tri, triStride, _other, otherIncr, otherStride, blocking);
+  }
+};
+
+/* Optimized triangular solver with multiple right hand side and the triangular matrix on the left
+ */
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder,int OtherInnerStride>
+struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>
+{
+  static EIGEN_DONT_INLINE void run(
+    Index size, Index otherSize,
+    const Scalar* _tri, Index triStride,
+    Scalar* _other, Index otherIncr, Index otherStride,
+    level3_blocking<Scalar,Scalar>& blocking);
+};
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>::run(
+    Index size, Index otherSize,
+    const Scalar* _tri, Index triStride,
+    Scalar* _other, Index otherIncr, Index otherStride,
+    level3_blocking<Scalar,Scalar>& blocking)
+  {
+    Index cols = otherSize;
+
+    typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> TriMapper;
+    typedef blas_data_mapper<Scalar, Index, ColMajor, Unaligned, OtherInnerStride> OtherMapper;
+    TriMapper tri(_tri, triStride);
+    OtherMapper other(_other, otherStride, otherIncr);
+
+    typedef gebp_traits<Scalar,Scalar> Traits;
+
+    enum {
+      SmallPanelWidth   = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
+      IsLower = (Mode&Lower) == Lower
+    };
+
+    Index kc = blocking.kc();                   // cache block size along the K direction
+    Index mc = (std::min)(size,blocking.mc());  // cache block size along the M direction
+
+    std::size_t sizeA = kc*mc;
+    std::size_t sizeB = kc*cols;
+
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+    conj_if<Conjugate> conj;
+    gebp_kernel<Scalar, Scalar, Index, OtherMapper, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
+    gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, TriStorageOrder> pack_lhs;
+    gemm_pack_rhs<Scalar, Index, OtherMapper, Traits::nr, ColMajor, false, true> pack_rhs;
+
+    // the goal here is to subdivise the Rhs panels such that we keep some cache
+    // coherence when accessing the rhs elements
+    std::ptrdiff_t l1, l2, l3;
+    manage_caching_sizes(GetAction, &l1, &l2, &l3);
+    Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * std::max<Index>(otherStride,size)) : 0;
+    subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr);
+
+    for(Index k2=IsLower ? 0 : size;
+        IsLower ? k2<size : k2>0;
+        IsLower ? k2+=kc : k2-=kc)
+    {
+      const Index actual_kc = (std::min)(IsLower ? size-k2 : k2, kc);
+
+      // We have selected and packed a big horizontal panel R1 of rhs. Let B be the packed copy of this panel,
+      // and R2 the remaining part of rhs. The corresponding vertical panel of lhs is split into
+      // A11 (the triangular part) and A21 the remaining rectangular part.
+      // Then the high level algorithm is:
+      //  - B = R1                    => general block copy (done during the next step)
+      //  - R1 = A11^-1 B             => tricky part
+      //  - update B from the new R1  => actually this has to be performed continuously during the above step
+      //  - R2 -= A21 * B             => GEPP
+
+      // The tricky part: compute R1 = A11^-1 B while updating B from R1
+      // The idea is to split A11 into multiple small vertical panels.
+      // Each panel can be split into a small triangular part T1k which is processed without optimization,
+      // and the remaining small part T2k which is processed using gebp with appropriate block strides
+      for(Index j2=0; j2<cols; j2+=subcols)
+      {
+        Index actual_cols = (std::min)(cols-j2,subcols);
+        // for each small vertical panels [T1k^T, T2k^T]^T of lhs
+        for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth)
+        {
+          Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth);
+          // tr solve
+          for (Index k=0; k<actualPanelWidth; ++k)
+          {
+            // TODO write a small kernel handling this (can be shared with trsv)
+            Index i  = IsLower ? k2+k1+k : k2-k1-k-1;
+            Index rs = actualPanelWidth - k - 1; // remaining size
+            Index s  = TriStorageOrder==RowMajor ? (IsLower ? k2+k1 : i+1)
+                                                 :  IsLower ? i+1 : i-rs;
+
+            Scalar a = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(tri(i,i));
+            for (Index j=j2; j<j2+actual_cols; ++j)
+            {
+              if (TriStorageOrder==RowMajor)
+              {
+                Scalar b(0);
+                const Scalar* l = &tri(i,s);
+                typename OtherMapper::LinearMapper r = other.getLinearMapper(s,j);
+                for (Index i3=0; i3<k; ++i3)
+                  b += conj(l[i3]) * r(i3);
+
+                other(i,j) = (other(i,j) - b)*a;
+              }
+              else
+              {
+                Scalar& otherij = other(i,j);
+                otherij *= a;
+                Scalar b = otherij;
+                typename OtherMapper::LinearMapper r = other.getLinearMapper(s,j);
+                typename TriMapper::LinearMapper l = tri.getLinearMapper(s,i);
+                for (Index i3=0;i3<rs;++i3)
+                  r(i3) -= b * conj(l(i3));
+              }
+            }
+          }
+
+          Index lengthTarget = actual_kc-k1-actualPanelWidth;
+          Index startBlock   = IsLower ? k2+k1 : k2-k1-actualPanelWidth;
+          Index blockBOffset = IsLower ? k1 : lengthTarget;
+
+          // update the respective rows of B from other
+          pack_rhs(blockB+actual_kc*j2, other.getSubMapper(startBlock,j2), actualPanelWidth, actual_cols, actual_kc, blockBOffset);
+
+          // GEBP
+          if (lengthTarget>0)
+          {
+            Index startTarget  = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc;
+
+            pack_lhs(blockA, tri.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
+
+            gebp_kernel(other.getSubMapper(startTarget,j2), blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
+                        actualPanelWidth, actual_kc, 0, blockBOffset);
+          }
+        }
+      }
+      
+      // R2 -= A21 * B => GEPP
+      {
+        Index start = IsLower ? k2+kc : 0;
+        Index end   = IsLower ? size : k2-kc;
+        for(Index i2=start; i2<end; i2+=mc)
+        {
+          const Index actual_mc = (std::min)(mc,end-i2);
+          if (actual_mc>0)
+          {
+            pack_lhs(blockA, tri.getSubMapper(i2, IsLower ? k2 : k2-kc), actual_kc, actual_mc);
+
+            gebp_kernel(other.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
+          }
+        }
+      }
+    }
+  }
+
+/* Optimized triangular solver with multiple left hand sides and the triangular matrix on the right
+ */
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+struct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>
+{
+  static EIGEN_DONT_INLINE void run(
+    Index size, Index otherSize,
+    const Scalar* _tri, Index triStride,
+    Scalar* _other, Index otherIncr, Index otherStride,
+    level3_blocking<Scalar,Scalar>& blocking);
+};
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>::run(
+    Index size, Index otherSize,
+    const Scalar* _tri, Index triStride,
+    Scalar* _other, Index otherIncr, Index otherStride,
+    level3_blocking<Scalar,Scalar>& blocking)
+  {
+    Index rows = otherSize;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    typedef blas_data_mapper<Scalar, Index, ColMajor, Unaligned, OtherInnerStride> LhsMapper;
+    typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper;
+    LhsMapper lhs(_other, otherStride, otherIncr);
+    RhsMapper rhs(_tri, triStride);
+
+    typedef gebp_traits<Scalar,Scalar> Traits;
+    enum {
+      RhsStorageOrder   = TriStorageOrder,
+      SmallPanelWidth   = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
+      IsLower = (Mode&Lower) == Lower
+    };
+
+    Index kc = blocking.kc();                   // cache block size along the K direction
+    Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
+
+    std::size_t sizeA = kc*mc;
+    std::size_t sizeB = kc*size;
+
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+    conj_if<Conjugate> conj;
+    gebp_kernel<Scalar, Scalar, Index, LhsMapper, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
+    gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+    gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder,false,true> pack_rhs_panel;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor, false, true> pack_lhs_panel;
+
+    for(Index k2=IsLower ? size : 0;
+        IsLower ? k2>0 : k2<size;
+        IsLower ? k2-=kc : k2+=kc)
+    {
+      const Index actual_kc = (std::min)(IsLower ? k2 : size-k2, kc);
+      Index actual_k2 = IsLower ? k2-actual_kc : k2 ;
+
+      Index startPanel = IsLower ? 0 : k2+actual_kc;
+      Index rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc;
+      Scalar* geb = blockB+actual_kc*actual_kc;
+
+      if (rs>0) pack_rhs(geb, rhs.getSubMapper(actual_k2,startPanel), actual_kc, rs);
+
+      // triangular packing (we only pack the panels off the diagonal,
+      // neglecting the blocks overlapping the diagonal
+      {
+        for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth)
+        {
+          Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
+          Index actual_j2 = actual_k2 + j2;
+          Index panelOffset = IsLower ? j2+actualPanelWidth : 0;
+          Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
+
+          if (panelLength>0)
+          pack_rhs_panel(blockB+j2*actual_kc,
+                         rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
+                         panelLength, actualPanelWidth,
+                         actual_kc, panelOffset);
+        }
+      }
+
+      for(Index i2=0; i2<rows; i2+=mc)
+      {
+        const Index actual_mc = (std::min)(mc,rows-i2);
+
+        // triangular solver kernel
+        {
+          // for each small block of the diagonal (=> vertical panels of rhs)
+          for (Index j2 = IsLower
+                      ? (actual_kc - ((actual_kc%SmallPanelWidth) ? Index(actual_kc%SmallPanelWidth)
+                                                                  : Index(SmallPanelWidth)))
+                      : 0;
+               IsLower ? j2>=0 : j2<actual_kc;
+               IsLower ? j2-=SmallPanelWidth : j2+=SmallPanelWidth)
+          {
+            Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
+            Index absolute_j2 = actual_k2 + j2;
+            Index panelOffset = IsLower ? j2+actualPanelWidth : 0;
+            Index panelLength = IsLower ? actual_kc - j2 - actualPanelWidth : j2;
+
+            // GEBP
+            if(panelLength>0)
+            {
+              gebp_kernel(lhs.getSubMapper(i2,absolute_j2),
+                          blockA, blockB+j2*actual_kc,
+                          actual_mc, panelLength, actualPanelWidth,
+                          Scalar(-1),
+                          actual_kc, actual_kc, // strides
+                          panelOffset, panelOffset); // offsets
+            }
+
+            // unblocked triangular solve
+            for (Index k=0; k<actualPanelWidth; ++k)
+            {
+              Index j = IsLower ? absolute_j2+actualPanelWidth-k-1 : absolute_j2+k;
+
+              typename LhsMapper::LinearMapper r = lhs.getLinearMapper(i2,j);
+              for (Index k3=0; k3<k; ++k3)
+              {
+                Scalar b = conj(rhs(IsLower ? j+1+k3 : absolute_j2+k3,j));
+                typename LhsMapper::LinearMapper a = lhs.getLinearMapper(i2,IsLower ? j+1+k3 : absolute_j2+k3);
+                for (Index i=0; i<actual_mc; ++i)
+                  r(i) -= a(i) * b;
+              }
+              if((Mode & UnitDiag)==0)
+              {
+                Scalar inv_rjj = RealScalar(1)/conj(rhs(j,j));
+                for (Index i=0; i<actual_mc; ++i)
+                  r(i) *= inv_rjj;
+              }
+            }
+
+            // pack the just computed part of lhs to A
+            pack_lhs_panel(blockA, lhs.getSubMapper(i2,absolute_j2),
+                           actualPanelWidth, actual_mc,
+                           actual_kc, j2);
+          }
+        }
+
+        if (rs>0)
+          gebp_kernel(lhs.getSubMapper(i2, startPanel), blockA, geb,
+                      actual_mc, actual_kc, rs, Scalar(-1),
+                      -1, -1, 0, 0);
+      }
+    }
+  }
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h
new file mode 100644
index 000000000..621194ce6
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h
@@ -0,0 +1,167 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to BLAS F77
+ *   Triangular matrix * matrix product functionality based on ?TRMM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H
+#define EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H
+
+namespace Eigen {
+
+namespace internal {
+
+// implements LeftSide op(triangular)^-1 * general
+#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASFUNC) \
+template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
+struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,1> \
+{ \
+  enum { \
+    IsLower = (Mode&Lower) == Lower, \
+    IsUnitDiag  = (Mode&UnitDiag) ? 1 : 0, \
+    IsZeroDiag  = (Mode&ZeroDiag) ? 1 : 0, \
+    conjA = ((TriStorageOrder==ColMajor) && Conjugate) ? 1 : 0 \
+  }; \
+  static void run( \
+      Index size, Index otherSize, \
+      const EIGTYPE* _tri, Index triStride, \
+      EIGTYPE* _other, Index otherIncr, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
+  { \
+   EIGEN_ONLY_USED_FOR_DEBUG(otherIncr); \
+   eigen_assert(otherIncr == 1); \
+   BlasIndex m = convert_index<BlasIndex>(size), n = convert_index<BlasIndex>(otherSize), lda, ldb; \
+   char side = 'L', uplo, diag='N', transa; \
+   /* Set alpha_ */ \
+   EIGTYPE alpha(1); \
+   ldb = convert_index<BlasIndex>(otherStride);\
+\
+   const EIGTYPE *a; \
+/* Set trans */ \
+   transa = (TriStorageOrder==RowMajor) ? ((Conjugate) ? 'C' : 'T') : 'N'; \
+/* Set uplo */ \
+   uplo = IsLower ? 'L' : 'U'; \
+   if (TriStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
+/* Set a, lda */ \
+   typedef Matrix<EIGTYPE, Dynamic, Dynamic, TriStorageOrder> MatrixTri; \
+   Map<const MatrixTri, 0, OuterStride<> > tri(_tri,size,size,OuterStride<>(triStride)); \
+   MatrixTri a_tmp; \
+\
+   if (conjA) { \
+     a_tmp = tri.conjugate(); \
+     a = a_tmp.data(); \
+     lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
+   } else { \
+     a = _tri; \
+     lda = convert_index<BlasIndex>(triStride); \
+   } \
+   if (IsUnitDiag) diag='U'; \
+/* call ?trsm*/ \
+   BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
+ } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRSM_L(double,   double, dtrsm)
+EIGEN_BLAS_TRSM_L(dcomplex, MKL_Complex16, ztrsm)
+EIGEN_BLAS_TRSM_L(float,    float,  strsm)
+EIGEN_BLAS_TRSM_L(scomplex, MKL_Complex8, ctrsm)
+#else
+EIGEN_BLAS_TRSM_L(double,   double, dtrsm_)
+EIGEN_BLAS_TRSM_L(dcomplex, double, ztrsm_)
+EIGEN_BLAS_TRSM_L(float,    float,  strsm_)
+EIGEN_BLAS_TRSM_L(scomplex, float,  ctrsm_)
+#endif
+
+// implements RightSide general * op(triangular)^-1
+#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASFUNC) \
+template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
+struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,1> \
+{ \
+  enum { \
+    IsLower = (Mode&Lower) == Lower, \
+    IsUnitDiag  = (Mode&UnitDiag) ? 1 : 0, \
+    IsZeroDiag  = (Mode&ZeroDiag) ? 1 : 0, \
+    conjA = ((TriStorageOrder==ColMajor) && Conjugate) ? 1 : 0 \
+  }; \
+  static void run( \
+      Index size, Index otherSize, \
+      const EIGTYPE* _tri, Index triStride, \
+      EIGTYPE* _other, Index otherIncr, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
+  { \
+   EIGEN_ONLY_USED_FOR_DEBUG(otherIncr); \
+   eigen_assert(otherIncr == 1); \
+   BlasIndex m = convert_index<BlasIndex>(otherSize), n = convert_index<BlasIndex>(size), lda, ldb; \
+   char side = 'R', uplo, diag='N', transa; \
+   /* Set alpha_ */ \
+   EIGTYPE alpha(1); \
+   ldb = convert_index<BlasIndex>(otherStride);\
+\
+   const EIGTYPE *a; \
+/* Set trans */ \
+   transa = (TriStorageOrder==RowMajor) ? ((Conjugate) ? 'C' : 'T') : 'N'; \
+/* Set uplo */ \
+   uplo = IsLower ? 'L' : 'U'; \
+   if (TriStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
+/* Set a, lda */ \
+   typedef Matrix<EIGTYPE, Dynamic, Dynamic, TriStorageOrder> MatrixTri; \
+   Map<const MatrixTri, 0, OuterStride<> > tri(_tri,size,size,OuterStride<>(triStride)); \
+   MatrixTri a_tmp; \
+\
+   if (conjA) { \
+     a_tmp = tri.conjugate(); \
+     a = a_tmp.data(); \
+     lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
+   } else { \
+     a = _tri; \
+     lda = convert_index<BlasIndex>(triStride); \
+   } \
+   if (IsUnitDiag) diag='U'; \
+/* call ?trsm*/ \
+   BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
+   /*std::cout << "TRMS_L specialization!\n";*/ \
+ } \
+};
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRSM_R(double,   double, dtrsm)
+EIGEN_BLAS_TRSM_R(dcomplex, MKL_Complex16, ztrsm)
+EIGEN_BLAS_TRSM_R(float,    float,  strsm)
+EIGEN_BLAS_TRSM_R(scomplex, MKL_Complex8,  ctrsm)
+#else
+EIGEN_BLAS_TRSM_R(double,   double, dtrsm_)
+EIGEN_BLAS_TRSM_R(dcomplex, double, ztrsm_)
+EIGEN_BLAS_TRSM_R(float,    float,  strsm_)
+EIGEN_BLAS_TRSM_R(scomplex, float,  ctrsm_)
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverVector.h b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverVector.h
new file mode 100644
index 000000000..647317016
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/products/TriangularSolverVector.h
@@ -0,0 +1,148 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRIANGULAR_SOLVER_VECTOR_H
+#define EIGEN_TRIANGULAR_SOLVER_VECTOR_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate, int StorageOrder>
+struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheRight, Mode, Conjugate, StorageOrder>
+{
+  static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs)
+  {
+    triangular_solve_vector<LhsScalar,RhsScalar,Index,OnTheLeft,
+        ((Mode&Upper)==Upper ? Lower : Upper) | (Mode&UnitDiag),
+        Conjugate,StorageOrder==RowMajor?ColMajor:RowMajor
+      >::run(size, _lhs, lhsStride, rhs);
+  }
+};
+
+// forward and backward substitution, row-major, rhs is a vector
+template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate>
+struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, RowMajor>
+{
+  enum {
+    IsLower = ((Mode&Lower)==Lower)
+  };
+  static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs)
+  {
+    typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap;
+    const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
+
+    typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
+
+    typename internal::conditional<
+                          Conjugate,
+                          const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
+                          const LhsMap&>
+                        ::type cjLhs(lhs);
+    static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
+    for(Index pi=IsLower ? 0 : size;
+        IsLower ? pi<size : pi>0;
+        IsLower ? pi+=PanelWidth : pi-=PanelWidth)
+    {
+      Index actualPanelWidth = (std::min)(IsLower ? size - pi : pi, PanelWidth);
+
+      Index r = IsLower ? pi : size - pi; // remaining size
+      if (r > 0)
+      {
+        // let's directly call the low level product function because:
+        // 1 - it is faster to compile
+        // 2 - it is slightly faster at runtime
+        Index startRow = IsLower ? pi : pi-actualPanelWidth;
+        Index startCol = IsLower ? 0 : pi;
+
+        general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
+          actualPanelWidth, r,
+          LhsMapper(&lhs.coeffRef(startRow,startCol), lhsStride),
+          RhsMapper(rhs + startCol, 1),
+          rhs + startRow, 1,
+          RhsScalar(-1));
+      }
+
+      for(Index k=0; k<actualPanelWidth; ++k)
+      {
+        Index i = IsLower ? pi+k : pi-k-1;
+        Index s = IsLower ? pi   : i+1;
+        if (k>0)
+          rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum();
+
+        if((!(Mode & UnitDiag)) && numext::not_equal_strict(rhs[i],RhsScalar(0)))
+          rhs[i] /= cjLhs(i,i);
+      }
+    }
+  }
+};
+
+// forward and backward substitution, column-major, rhs is a vector
+template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate>
+struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, ColMajor>
+{
+  enum {
+    IsLower = ((Mode&Lower)==Lower)
+  };
+  static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs)
+  {
+    typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
+    const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
+    typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
+    typename internal::conditional<Conjugate,
+                                   const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
+                                   const LhsMap&
+                                  >::type cjLhs(lhs);
+    static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
+
+    for(Index pi=IsLower ? 0 : size;
+        IsLower ? pi<size : pi>0;
+        IsLower ? pi+=PanelWidth : pi-=PanelWidth)
+    {
+      Index actualPanelWidth = (std::min)(IsLower ? size - pi : pi, PanelWidth);
+      Index startBlock = IsLower ? pi : pi-actualPanelWidth;
+      Index endBlock = IsLower ? pi + actualPanelWidth : 0;
+
+      for(Index k=0; k<actualPanelWidth; ++k)
+      {
+        Index i = IsLower ? pi+k : pi-k-1;
+        if(numext::not_equal_strict(rhs[i],RhsScalar(0)))
+        {
+          if(!(Mode & UnitDiag))
+            rhs[i] /= cjLhs.coeff(i,i);
+
+          Index r = actualPanelWidth - k - 1; // remaining size
+          Index s = IsLower ? i+1 : i-r;
+          if (r>0)
+            Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r);
+        }
+      }
+      Index r = IsLower ? size - endBlock : startBlock; // remaining size
+      if (r > 0)
+      {
+        // let's directly call the low level product function because:
+        // 1 - it is faster to compile
+        // 2 - it is slightly faster at runtime
+        general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
+            r, actualPanelWidth,
+            LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride),
+            RhsMapper(rhs+startBlock, 1),
+            rhs+endBlock, 1, RhsScalar(-1));
+      }
+    }
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_SOLVER_VECTOR_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/util/BlasUtil.h b/src/3rdparty/eigen/Eigen/src/Core/util/BlasUtil.h
new file mode 100644
index 000000000..e16a56498
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/util/BlasUtil.h
@@ -0,0 +1,583 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BLASUTIL_H
+#define EIGEN_BLASUTIL_H
+
+// This file contains many lightweight helper classes used to
+// implement and control fast level 2 and level 3 BLAS-like routines.
+
+namespace Eigen {
+
+namespace internal {
+
+// forward declarations
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
+struct gebp_kernel;
+
+template<typename Scalar, typename Index, typename DataMapper, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
+struct gemm_pack_rhs;
+
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
+struct gemm_pack_lhs;
+
+template<
+  typename Index,
+  typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+  typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
+  int ResStorageOrder, int ResInnerStride>
+struct general_matrix_matrix_product;
+
+template<typename Index,
+         typename LhsScalar, typename LhsMapper, int LhsStorageOrder, bool ConjugateLhs,
+         typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version=Specialized>
+struct general_matrix_vector_product;
+
+template<typename From,typename To> struct get_factor {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE To run(const From& x) { return To(x); }
+};
+
+template<typename Scalar> struct get_factor<Scalar,typename NumTraits<Scalar>::Real> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE typename NumTraits<Scalar>::Real run(const Scalar& x) { return numext::real(x); }
+};
+
+
+template<typename Scalar, typename Index>
+class BlasVectorMapper {
+  public:
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar *data) : m_data(data) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
+    return m_data[i];
+  }
+  template <typename Packet, int AlignmentType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet load(Index i) const {
+    return ploadt<Packet, AlignmentType>(m_data + i);
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC bool aligned(Index i) const {
+    return (UIntPtr(m_data+i)%sizeof(Packet))==0;
+  }
+
+  protected:
+  Scalar* m_data;
+};
+
+template<typename Scalar, typename Index, int AlignmentType, int Incr=1>
+class BlasLinearMapper;
+
+template<typename Scalar, typename Index, int AlignmentType>
+class BlasLinearMapper<Scalar,Index,AlignmentType>
+{
+public:
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data, Index incr=1)
+    : m_data(data)
+  {
+    EIGEN_ONLY_USED_FOR_DEBUG(incr);
+    eigen_assert(incr==1);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const {
+    internal::prefetch(&operator()(i));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const {
+    return m_data[i];
+  }
+
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i) const {
+    return ploadt<PacketType, AlignmentType>(m_data + i);
+  }
+
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const {
+    pstoret<Scalar, PacketType, AlignmentType>(m_data + i, p);
+  }
+
+protected:
+  Scalar *m_data;
+};
+
+// Lightweight helper class to access matrix coefficients.
+template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned, int Incr = 1>
+class blas_data_mapper;
+
+// TMP to help PacketBlock store implementation.
+// There's currently no known use case for PacketBlock load.
+// The default implementation assumes ColMajor order.
+// It always store each packet sequentially one `stride` apart.
+template<typename Index, typename Scalar, typename Packet, int n, int idx, int StorageOrder>
+struct PacketBlockManagement
+{
+  PacketBlockManagement<Index, Scalar, Packet, n, idx - 1, StorageOrder> pbm;
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock<Packet, n> &block) const {
+    pbm.store(to, stride, i, j, block);
+    pstoreu<Scalar>(to + i + (j + idx)*stride, block.packet[idx]);
+  }
+};
+
+// PacketBlockManagement specialization to take care of RowMajor order without ifs.
+template<typename Index, typename Scalar, typename Packet, int n, int idx>
+struct PacketBlockManagement<Index, Scalar, Packet, n, idx, RowMajor>
+{
+  PacketBlockManagement<Index, Scalar, Packet, n, idx - 1, RowMajor> pbm;
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock<Packet, n> &block) const {
+    pbm.store(to, stride, i, j, block);
+    pstoreu<Scalar>(to + j + (i + idx)*stride, block.packet[idx]);
+  }
+};
+
+template<typename Index, typename Scalar, typename Packet, int n, int StorageOrder>
+struct PacketBlockManagement<Index, Scalar, Packet, n, -1, StorageOrder>
+{
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock<Packet, n> &block) const {
+    EIGEN_UNUSED_VARIABLE(to);
+    EIGEN_UNUSED_VARIABLE(stride);
+    EIGEN_UNUSED_VARIABLE(i);
+    EIGEN_UNUSED_VARIABLE(j);
+    EIGEN_UNUSED_VARIABLE(block);
+  }
+};
+
+template<typename Index, typename Scalar, typename Packet, int n>
+struct PacketBlockManagement<Index, Scalar, Packet, n, -1, RowMajor>
+{
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock<Packet, n> &block) const {
+    EIGEN_UNUSED_VARIABLE(to);
+    EIGEN_UNUSED_VARIABLE(stride);
+    EIGEN_UNUSED_VARIABLE(i);
+    EIGEN_UNUSED_VARIABLE(j);
+    EIGEN_UNUSED_VARIABLE(block);
+  }
+};
+
+template<typename Scalar, typename Index, int StorageOrder, int AlignmentType>
+class blas_data_mapper<Scalar,Index,StorageOrder,AlignmentType,1>
+{
+public:
+  typedef BlasLinearMapper<Scalar, Index, AlignmentType> LinearMapper;
+  typedef BlasVectorMapper<Scalar, Index> VectorMapper;
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr=1)
+   : m_data(data), m_stride(stride)
+  {
+    EIGEN_ONLY_USED_FOR_DEBUG(incr);
+    eigen_assert(incr==1);
+  }
+
+  EIGEN_DEVICE_FUNC  EIGEN_ALWAYS_INLINE blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>
+  getSubMapper(Index i, Index j) const {
+    return blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>(&operator()(i, j), m_stride);
+  }
+
+  EIGEN_DEVICE_FUNC  EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
+    return LinearMapper(&operator()(i, j));
+  }
+
+  EIGEN_DEVICE_FUNC  EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
+    return VectorMapper(&operator()(i, j));
+  }
+
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const {
+    return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride];
+  }
+
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i, Index j) const {
+    return ploadt<PacketType, AlignmentType>(&operator()(i, j));
+  }
+
+  template <typename PacketT, int AlignmentT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const {
+    return ploadt<PacketT, AlignmentT>(&operator()(i, j));
+  }
+
+  template<typename SubPacket>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
+    pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
+  }
+
+  template<typename SubPacket>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const {
+    return pgather<Scalar, SubPacket>(&operator()(i, j), m_stride);
+  }
+
+  EIGEN_DEVICE_FUNC const Index stride() const { return m_stride; }
+  EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; }
+
+  EIGEN_DEVICE_FUNC Index firstAligned(Index size) const {
+    if (UIntPtr(m_data)%sizeof(Scalar)) {
+      return -1;
+    }
+    return internal::first_default_aligned(m_data, size);
+  }
+
+  template<typename SubPacket, int n>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketBlock(Index i, Index j, const PacketBlock<SubPacket, n> &block) const {
+    PacketBlockManagement<Index, Scalar, SubPacket, n, n-1, StorageOrder> pbm;
+    pbm.store(m_data, m_stride, i, j, block);
+  }
+protected:
+  Scalar* EIGEN_RESTRICT m_data;
+  const Index m_stride;
+};
+
+// Implementation of non-natural increment (i.e. inner-stride != 1)
+// The exposed API is not complete yet compared to the Incr==1 case
+// because some features makes less sense in this case.
+template<typename Scalar, typename Index, int AlignmentType, int Incr>
+class BlasLinearMapper
+{
+public:
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data,Index incr) : m_data(data), m_incr(incr) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const {
+    internal::prefetch(&operator()(i));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const {
+    return m_data[i*m_incr.value()];
+  }
+
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i) const {
+    return pgather<Scalar,PacketType>(m_data + i*m_incr.value(), m_incr.value());
+  }
+
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const {
+    pscatter<Scalar, PacketType>(m_data + i*m_incr.value(), p, m_incr.value());
+  }
+
+protected:
+  Scalar *m_data;
+  const internal::variable_if_dynamic<Index,Incr> m_incr;
+};
+
+template<typename Scalar, typename Index, int StorageOrder, int AlignmentType,int Incr>
+class blas_data_mapper
+{
+public:
+  typedef BlasLinearMapper<Scalar, Index, AlignmentType,Incr> LinearMapper;
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr) : m_data(data), m_stride(stride), m_incr(incr) {}
+
+  EIGEN_DEVICE_FUNC  EIGEN_ALWAYS_INLINE blas_data_mapper
+  getSubMapper(Index i, Index j) const {
+    return blas_data_mapper(&operator()(i, j), m_stride, m_incr.value());
+  }
+
+  EIGEN_DEVICE_FUNC  EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
+    return LinearMapper(&operator()(i, j), m_incr.value());
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const {
+    return m_data[StorageOrder==RowMajor ? j*m_incr.value() + i*m_stride : i*m_incr.value() + j*m_stride];
+  }
+
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i, Index j) const {
+    return pgather<Scalar,PacketType>(&operator()(i, j),m_incr.value());
+  }
+
+  template <typename PacketT, int AlignmentT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const {
+    return pgather<Scalar,PacketT>(&operator()(i, j),m_incr.value());
+  }
+
+  template<typename SubPacket>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
+    pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
+  }
+
+  template<typename SubPacket>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const {
+    return pgather<Scalar, SubPacket>(&operator()(i, j), m_stride);
+  }
+
+  // storePacketBlock_helper defines a way to access values inside the PacketBlock, this is essentially required by the Complex types.
+  template<typename SubPacket, typename ScalarT, int n, int idx>
+  struct storePacketBlock_helper
+  {
+    storePacketBlock_helper<SubPacket, ScalarT, n, idx-1> spbh;
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>* sup, Index i, Index j, const PacketBlock<SubPacket, n>& block) const {
+      spbh.store(sup, i,j,block);
+      for(int l = 0; l < unpacket_traits<SubPacket>::size; l++)
+      {
+        ScalarT *v = &sup->operator()(i+l, j+idx);
+        *v = block.packet[idx][l];
+      }
+    }
+  };
+
+  template<typename SubPacket, int n, int idx>
+  struct storePacketBlock_helper<SubPacket, std::complex<float>, n, idx>
+  {
+    storePacketBlock_helper<SubPacket, std::complex<float>, n, idx-1> spbh;
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>* sup, Index i, Index j, const PacketBlock<SubPacket, n>& block) const {
+      spbh.store(sup,i,j,block);
+      for(int l = 0; l < unpacket_traits<SubPacket>::size; l++)
+      {
+        std::complex<float> *v = &sup->operator()(i+l, j+idx);
+        v->real(block.packet[idx].v[2*l+0]);
+        v->imag(block.packet[idx].v[2*l+1]);
+      }
+    }
+  };
+
+  template<typename SubPacket, int n, int idx>
+  struct storePacketBlock_helper<SubPacket, std::complex<double>, n, idx>
+  {
+    storePacketBlock_helper<SubPacket, std::complex<double>, n, idx-1> spbh;
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>* sup, Index i, Index j, const PacketBlock<SubPacket, n>& block) const {
+      spbh.store(sup,i,j,block);
+      for(int l = 0; l < unpacket_traits<SubPacket>::size; l++)
+      {
+        std::complex<double> *v = &sup->operator()(i+l, j+idx);
+        v->real(block.packet[idx].v[2*l+0]);
+        v->imag(block.packet[idx].v[2*l+1]);
+      }
+    }
+  };
+
+  template<typename SubPacket, typename ScalarT, int n>
+  struct storePacketBlock_helper<SubPacket, ScalarT, n, -1>
+  {
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>*, Index, Index, const PacketBlock<SubPacket, n>& ) const {
+    }
+  };
+
+  template<typename SubPacket, int n>
+  struct storePacketBlock_helper<SubPacket, std::complex<float>, n, -1>
+  {
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>*, Index, Index, const PacketBlock<SubPacket, n>& ) const {
+    }
+  };
+
+  template<typename SubPacket, int n>
+  struct storePacketBlock_helper<SubPacket, std::complex<double>, n, -1>
+  {
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>*, Index, Index, const PacketBlock<SubPacket, n>& ) const {
+    }
+  };
+  // This function stores a PacketBlock on m_data, this approach is really quite slow compare to Incr=1 and should be avoided when possible.
+  template<typename SubPacket, int n>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketBlock(Index i, Index j, const PacketBlock<SubPacket, n>&block) const {
+    storePacketBlock_helper<SubPacket, Scalar, n, n-1> spb;
+    spb.store(this, i,j,block);
+  }
+protected:
+  Scalar* EIGEN_RESTRICT m_data;
+  const Index m_stride;
+  const internal::variable_if_dynamic<Index,Incr> m_incr;
+};
+
+// lightweight helper class to access matrix coefficients (const version)
+template<typename Scalar, typename Index, int StorageOrder>
+class const_blas_data_mapper : public blas_data_mapper<const Scalar, Index, StorageOrder> {
+  public:
+  EIGEN_ALWAYS_INLINE const_blas_data_mapper(const Scalar *data, Index stride) : blas_data_mapper<const Scalar, Index, StorageOrder>(data, stride) {}
+
+  EIGEN_ALWAYS_INLINE const_blas_data_mapper<Scalar, Index, StorageOrder> getSubMapper(Index i, Index j) const {
+    return const_blas_data_mapper<Scalar, Index, StorageOrder>(&(this->operator()(i, j)), this->m_stride);
+  }
+};
+
+
+/* Helper class to analyze the factors of a Product expression.
+ * In particular it allows to pop out operator-, scalar multiples,
+ * and conjugate */
+template<typename XprType> struct blas_traits
+{
+  typedef typename traits<XprType>::Scalar Scalar;
+  typedef const XprType& ExtractType;
+  typedef XprType _ExtractType;
+  enum {
+    IsComplex = NumTraits<Scalar>::IsComplex,
+    IsTransposed = false,
+    NeedToConjugate = false,
+    HasUsableDirectAccess = (    (int(XprType::Flags)&DirectAccessBit)
+                              && (   bool(XprType::IsVectorAtCompileTime)
+                                  || int(inner_stride_at_compile_time<XprType>::ret) == 1)
+                             ) ?  1 : 0,
+    HasScalarFactor = false
+  };
+  typedef typename conditional<bool(HasUsableDirectAccess),
+    ExtractType,
+    typename _ExtractType::PlainObject
+    >::type DirectLinearAccessType;
+  static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return x; }
+  static inline EIGEN_DEVICE_FUNC const Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
+};
+
+// pop conjugate
+template<typename Scalar, typename NestedXpr>
+struct blas_traits<CwiseUnaryOp<scalar_conjugate_op<Scalar>, NestedXpr> >
+ : blas_traits<NestedXpr>
+{
+  typedef blas_traits<NestedXpr> Base;
+  typedef CwiseUnaryOp<scalar_conjugate_op<Scalar>, NestedXpr> XprType;
+  typedef typename Base::ExtractType ExtractType;
+
+  enum {
+    IsComplex = NumTraits<Scalar>::IsComplex,
+    NeedToConjugate = Base::NeedToConjugate ? 0 : IsComplex
+  };
+  static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
+  static inline Scalar extractScalarFactor(const XprType& x) { return conj(Base::extractScalarFactor(x.nestedExpression())); }
+};
+
+// pop scalar multiple
+template<typename Scalar, typename NestedXpr, typename Plain>
+struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain>, NestedXpr> >
+ : blas_traits<NestedXpr>
+{
+  enum {
+    HasScalarFactor = true
+  };
+  typedef blas_traits<NestedXpr> Base;
+  typedef CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain>, NestedXpr> XprType;
+  typedef typename Base::ExtractType ExtractType;
+  static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); }
+  static inline EIGEN_DEVICE_FUNC Scalar extractScalarFactor(const XprType& x)
+  { return x.lhs().functor().m_other * Base::extractScalarFactor(x.rhs()); }
+};
+template<typename Scalar, typename NestedXpr, typename Plain>
+struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, NestedXpr, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain> > >
+ : blas_traits<NestedXpr>
+{
+  enum {
+    HasScalarFactor = true
+  };
+  typedef blas_traits<NestedXpr> Base;
+  typedef CwiseBinaryOp<scalar_product_op<Scalar>, NestedXpr, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain> > XprType;
+  typedef typename Base::ExtractType ExtractType;
+  static inline ExtractType extract(const XprType& x) { return Base::extract(x.lhs()); }
+  static inline Scalar extractScalarFactor(const XprType& x)
+  { return Base::extractScalarFactor(x.lhs()) * x.rhs().functor().m_other; }
+};
+template<typename Scalar, typename Plain1, typename Plain2>
+struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain1>,
+                                                            const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain2> > >
+ : blas_traits<CwiseNullaryOp<scalar_constant_op<Scalar>,Plain1> >
+{};
+
+// pop opposite
+template<typename Scalar, typename NestedXpr>
+struct blas_traits<CwiseUnaryOp<scalar_opposite_op<Scalar>, NestedXpr> >
+ : blas_traits<NestedXpr>
+{
+  enum {
+    HasScalarFactor = true
+  };
+  typedef blas_traits<NestedXpr> Base;
+  typedef CwiseUnaryOp<scalar_opposite_op<Scalar>, NestedXpr> XprType;
+  typedef typename Base::ExtractType ExtractType;
+  static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
+  static inline Scalar extractScalarFactor(const XprType& x)
+  { return - Base::extractScalarFactor(x.nestedExpression()); }
+};
+
+// pop/push transpose
+template<typename NestedXpr>
+struct blas_traits<Transpose<NestedXpr> >
+ : blas_traits<NestedXpr>
+{
+  typedef typename NestedXpr::Scalar Scalar;
+  typedef blas_traits<NestedXpr> Base;
+  typedef Transpose<NestedXpr> XprType;
+  typedef Transpose<const typename Base::_ExtractType>  ExtractType; // const to get rid of a compile error; anyway blas traits are only used on the RHS
+  typedef Transpose<const typename Base::_ExtractType> _ExtractType;
+  typedef typename conditional<bool(Base::HasUsableDirectAccess),
+    ExtractType,
+    typename ExtractType::PlainObject
+    >::type DirectLinearAccessType;
+  enum {
+    IsTransposed = Base::IsTransposed ? 0 : 1
+  };
+  static inline ExtractType extract(const XprType& x) { return ExtractType(Base::extract(x.nestedExpression())); }
+  static inline Scalar extractScalarFactor(const XprType& x) { return Base::extractScalarFactor(x.nestedExpression()); }
+};
+
+template<typename T>
+struct blas_traits<const T>
+     : blas_traits<T>
+{};
+
+template<typename T, bool HasUsableDirectAccess=blas_traits<T>::HasUsableDirectAccess>
+struct extract_data_selector {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static const typename T::Scalar* run(const T& m)
+  {
+    return blas_traits<T>::extract(m).data();
+  }
+};
+
+template<typename T>
+struct extract_data_selector<T,false> {
+  static typename T::Scalar* run(const T&) { return 0; }
+};
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename T::Scalar* extract_data(const T& m)
+{
+  return extract_data_selector<T>::run(m);
+}
+
+/**
+ * \c combine_scalar_factors extracts and multiplies factors from GEMM and GEMV products.
+ * There is a specialization for booleans
+ */
+template<typename ResScalar, typename Lhs, typename Rhs>
+struct combine_scalar_factors_impl
+{
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static ResScalar run(const Lhs& lhs, const Rhs& rhs)
+  {
+    return blas_traits<Lhs>::extractScalarFactor(lhs) * blas_traits<Rhs>::extractScalarFactor(rhs);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static ResScalar run(const ResScalar& alpha, const Lhs& lhs, const Rhs& rhs)
+  {
+    return alpha * blas_traits<Lhs>::extractScalarFactor(lhs) * blas_traits<Rhs>::extractScalarFactor(rhs);
+  }
+};
+template<typename Lhs, typename Rhs>
+struct combine_scalar_factors_impl<bool, Lhs, Rhs>
+{
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(const Lhs& lhs, const Rhs& rhs)
+  {
+    return blas_traits<Lhs>::extractScalarFactor(lhs) && blas_traits<Rhs>::extractScalarFactor(rhs);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(const bool& alpha, const Lhs& lhs, const Rhs& rhs)
+  {
+    return alpha && blas_traits<Lhs>::extractScalarFactor(lhs) && blas_traits<Rhs>::extractScalarFactor(rhs);
+  }
+};
+
+template<typename ResScalar, typename Lhs, typename Rhs>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ResScalar combine_scalar_factors(const ResScalar& alpha, const Lhs& lhs, const Rhs& rhs)
+{
+  return combine_scalar_factors_impl<ResScalar,Lhs,Rhs>::run(alpha, lhs, rhs);
+}
+template<typename ResScalar, typename Lhs, typename Rhs>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ResScalar combine_scalar_factors(const Lhs& lhs, const Rhs& rhs)
+{
+  return combine_scalar_factors_impl<ResScalar,Lhs,Rhs>::run(lhs, rhs);
+}
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_BLASUTIL_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/util/ConfigureVectorization.h b/src/3rdparty/eigen/Eigen/src/Core/util/ConfigureVectorization.h
new file mode 100644
index 000000000..af4e69623
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/util/ConfigureVectorization.h
@@ -0,0 +1,512 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2020, Arm Limited and Contributors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CONFIGURE_VECTORIZATION_H
+#define EIGEN_CONFIGURE_VECTORIZATION_H
+
+//------------------------------------------------------------------------------------------
+// Static and dynamic alignment control
+//
+// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES
+// as the maximal boundary in bytes on which dynamically and statically allocated data may be alignment respectively.
+// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not,
+// a default value is automatically computed based on architecture, compiler, and OS.
+//
+// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX}
+// to be used to declare statically aligned buffers.
+//------------------------------------------------------------------------------------------
+
+
+/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements.
+ * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled,
+ * so that vectorization doesn't affect binary compatibility.
+ *
+ * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
+ * vectorized and non-vectorized code.
+ * 
+ * FIXME: this code can be cleaned up once we switch to proper C++11 only.
+ */
+#if (defined EIGEN_CUDACC)
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
+  #define EIGEN_ALIGNOF(x) __alignof(x)
+#elif EIGEN_HAS_ALIGNAS
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) alignas(n)
+  #define EIGEN_ALIGNOF(x) alignof(x)
+#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
+  #define EIGEN_ALIGNOF(x) __alignof(x)
+#elif EIGEN_COMP_MSVC
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
+  #define EIGEN_ALIGNOF(x) __alignof(x)
+#elif EIGEN_COMP_SUNCC
+  // FIXME not sure about this one:
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
+  #define EIGEN_ALIGNOF(x) __alignof(x)
+#else
+  #error Please tell me what is the equivalent of alignas(n) and alignof(x) for your compiler
+#endif
+
+// If the user explicitly disable vectorization, then we also disable alignment
+#if defined(EIGEN_DONT_VECTORIZE)
+  #if defined(EIGEN_GPUCC)
+    // GPU code is always vectorized and requires memory alignment for
+    // statically allocated buffers.
+    #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
+  #else
+    #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0
+  #endif
+#elif defined(__AVX512F__)
+  // 64 bytes static alignment is preferred only if really required
+  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64
+#elif defined(__AVX__)
+  // 32 bytes static alignment is preferred only if really required
+  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32
+#else
+  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
+#endif
+
+
+// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense
+#define EIGEN_MIN_ALIGN_BYTES 16
+
+// Defined the boundary (in bytes) on which the data needs to be aligned. Note
+// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be
+// aligned at all regardless of the value of this #define.
+
+#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN))  && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
+#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY.
+#endif
+
+// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprecated
+// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0
+#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)
+  #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES
+    #undef EIGEN_MAX_STATIC_ALIGN_BYTES
+  #endif
+  #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
+#endif
+
+#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES
+
+  // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES
+
+  // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
+  // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
+  // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
+  // certain common platform (compiler+architecture combinations) to avoid these problems.
+  // Only static alignment is really problematic (relies on nonstandard compiler extensions),
+  // try to keep heap alignment even when we have to disable static alignment.
+  #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || EIGEN_ARCH_MIPS)
+  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+  #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6)
+  // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support.
+  // Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use.
+  // 4.8 and newer seem definitely unaffected.
+  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+  #else
+  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
+  #endif
+
+  // static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX
+  #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \
+  && !EIGEN_GCC3_OR_OLDER \
+  && !EIGEN_COMP_SUNCC \
+  && !EIGEN_OS_QNX
+    #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1
+  #else
+    #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
+  #endif
+
+  #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT
+    #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+  #else
+    #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
+  #endif
+
+#endif
+
+// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_STATIC_ALIGN_BYTES
+#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES<EIGEN_MAX_STATIC_ALIGN_BYTES
+#undef EIGEN_MAX_STATIC_ALIGN_BYTES
+#define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
+#endif
+
+#if EIGEN_MAX_STATIC_ALIGN_BYTES==0 && !defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
+  #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
+#endif
+
+// At this stage, EIGEN_MAX_STATIC_ALIGN_BYTES>0 is the true test whether we want to align arrays on the stack or not.
+// It takes into account both the user choice to explicitly enable/disable alignment (by setting EIGEN_MAX_STATIC_ALIGN_BYTES)
+// and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT).
+// Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used.
+
+
+// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY
+#define EIGEN_ALIGN8  EIGEN_ALIGN_TO_BOUNDARY(8)
+#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
+#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
+#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64)
+#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
+#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES)
+#else
+#define EIGEN_ALIGN_MAX
+#endif
+
+
+// Dynamic alignment control
+
+#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0
+#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN.
+#endif
+
+#ifdef EIGEN_DONT_ALIGN
+  #ifdef EIGEN_MAX_ALIGN_BYTES
+    #undef EIGEN_MAX_ALIGN_BYTES
+  #endif
+  #define EIGEN_MAX_ALIGN_BYTES 0
+#elif !defined(EIGEN_MAX_ALIGN_BYTES)
+  #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+#endif
+
+#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES
+#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+#else
+#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
+#endif
+
+
+#ifndef EIGEN_UNALIGNED_VECTORIZE
+#define EIGEN_UNALIGNED_VECTORIZE 1
+#endif
+
+//----------------------------------------------------------------------
+
+// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into
+// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks
+#if EIGEN_MAX_ALIGN_BYTES==0
+  #ifndef EIGEN_DONT_VECTORIZE
+    #define EIGEN_DONT_VECTORIZE
+  #endif
+#endif
+
+
+// The following (except #include <malloc.h> and _M_IX86_FP ??) can likely be
+// removed as gcc 4.1 and msvc 2008 are not supported anyways.
+#if EIGEN_COMP_MSVC
+  #include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
+  #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later
+    // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
+    #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64
+      #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
+    #endif
+  #endif
+#else
+  #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) )
+    #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
+  #endif
+#endif
+
+#if !(defined(EIGEN_DONT_VECTORIZE) || defined(EIGEN_GPUCC))
+
+  #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
+
+    // Defines symbols for compile-time detection of which instructions are
+    // used.
+    // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_SSE
+    #define EIGEN_VECTORIZE_SSE2
+
+    // Detect sse3/ssse3/sse4:
+    // gcc and icc defines __SSE3__, ...
+    // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
+    // want to force the use of those instructions with msvc.
+    #ifdef __SSE3__
+      #define EIGEN_VECTORIZE_SSE3
+    #endif
+    #ifdef __SSSE3__
+      #define EIGEN_VECTORIZE_SSSE3
+    #endif
+    #ifdef __SSE4_1__
+      #define EIGEN_VECTORIZE_SSE4_1
+    #endif
+    #ifdef __SSE4_2__
+      #define EIGEN_VECTORIZE_SSE4_2
+    #endif
+    #ifdef __AVX__
+      #ifndef EIGEN_USE_SYCL 
+        #define EIGEN_VECTORIZE_AVX
+      #endif
+      #define EIGEN_VECTORIZE_SSE3
+      #define EIGEN_VECTORIZE_SSSE3
+      #define EIGEN_VECTORIZE_SSE4_1
+      #define EIGEN_VECTORIZE_SSE4_2
+    #endif
+    #ifdef __AVX2__
+      #ifndef EIGEN_USE_SYCL 
+        #define EIGEN_VECTORIZE_AVX2
+        #define EIGEN_VECTORIZE_AVX
+      #endif
+      #define EIGEN_VECTORIZE_SSE3
+      #define EIGEN_VECTORIZE_SSSE3
+      #define EIGEN_VECTORIZE_SSE4_1
+      #define EIGEN_VECTORIZE_SSE4_2
+    #endif
+    #if defined(__FMA__) || (EIGEN_COMP_MSVC && defined(__AVX2__))
+      // MSVC does not expose a switch dedicated for FMA
+      // For MSVC, AVX2 => FMA
+      #define EIGEN_VECTORIZE_FMA
+    #endif
+    #if defined(__AVX512F__)
+      #ifndef EIGEN_VECTORIZE_FMA
+      #if EIGEN_COMP_GNUC
+      #error Please add -mfma to your compiler flags: compiling with -mavx512f alone without SSE/AVX FMA is not supported (bug 1638).
+      #else
+      #error Please enable FMA in your compiler flags (e.g. -mfma): compiling with AVX512 alone without SSE/AVX FMA is not supported (bug 1638).
+      #endif
+      #endif
+      #ifndef EIGEN_USE_SYCL
+        #define EIGEN_VECTORIZE_AVX512
+        #define EIGEN_VECTORIZE_AVX2
+        #define EIGEN_VECTORIZE_AVX
+      #endif
+      #define EIGEN_VECTORIZE_FMA
+      #define EIGEN_VECTORIZE_SSE3
+      #define EIGEN_VECTORIZE_SSSE3
+      #define EIGEN_VECTORIZE_SSE4_1
+      #define EIGEN_VECTORIZE_SSE4_2
+      #ifndef EIGEN_USE_SYCL
+        #ifdef __AVX512DQ__
+          #define EIGEN_VECTORIZE_AVX512DQ
+        #endif
+        #ifdef __AVX512ER__
+          #define EIGEN_VECTORIZE_AVX512ER
+        #endif
+        #ifdef __AVX512BF16__
+          #define EIGEN_VECTORIZE_AVX512BF16
+        #endif
+      #endif
+    #endif
+
+    // Disable AVX support on broken xcode versions
+    #if defined(__apple_build_version__) && (__apple_build_version__ == 11000033 ) && ( __MAC_OS_X_VERSION_MIN_REQUIRED == 101500 )
+      // A nasty bug in the clang compiler shipped with xcode in a common compilation situation
+      // when XCode 11.0 and Mac deployment target macOS 10.15 is https://trac.macports.org/ticket/58776#no1
+      #ifdef EIGEN_VECTORIZE_AVX
+        #undef EIGEN_VECTORIZE_AVX
+        #warning "Disabling AVX support: clang compiler shipped with XCode 11.[012] generates broken assembly with -macosx-version-min=10.15 and AVX enabled. "
+        #ifdef EIGEN_VECTORIZE_AVX2
+          #undef EIGEN_VECTORIZE_AVX2
+        #endif
+        #ifdef EIGEN_VECTORIZE_FMA
+          #undef EIGEN_VECTORIZE_FMA
+        #endif
+        #ifdef EIGEN_VECTORIZE_AVX512
+          #undef EIGEN_VECTORIZE_AVX512
+        #endif
+        #ifdef EIGEN_VECTORIZE_AVX512DQ
+          #undef EIGEN_VECTORIZE_AVX512DQ
+        #endif
+        #ifdef EIGEN_VECTORIZE_AVX512ER
+          #undef EIGEN_VECTORIZE_AVX512ER
+        #endif
+      #endif
+      // NOTE: Confirmed test failures in XCode 11.0, and XCode 11.2 with  -macosx-version-min=10.15 and AVX
+      // NOTE using -macosx-version-min=10.15 with Xcode 11.0 results in runtime segmentation faults in many tests, 11.2 produce core dumps in 3 tests
+      // NOTE using -macosx-version-min=10.14 produces functioning and passing tests in all cases
+      // NOTE __clang_version__ "11.0.0 (clang-1100.0.33.8)"  XCode 11.0 <- Produces many segfault and core dumping tests
+      //                                                                    with  -macosx-version-min=10.15 and AVX
+      // NOTE __clang_version__ "11.0.0 (clang-1100.0.33.12)" XCode 11.2 <- Produces 3 core dumping tests with  
+      //                                                                    -macosx-version-min=10.15 and AVX
+    #endif
+
+    // include files
+
+    // This extern "C" works around a MINGW-w64 compilation issue
+    // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354
+    // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do).
+    // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations
+    // with conflicting linkage.  The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know;
+    // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too.
+    // notice that since these are C headers, the extern "C" is theoretically needed anyways.
+    extern "C" {
+      // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
+      // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
+      #if EIGEN_COMP_ICC >= 1110
+        #include <immintrin.h>
+      #else
+        #include <mmintrin.h>
+        #include <emmintrin.h>
+        #include <xmmintrin.h>
+        #ifdef  EIGEN_VECTORIZE_SSE3
+        #include <pmmintrin.h>
+        #endif
+        #ifdef EIGEN_VECTORIZE_SSSE3
+        #include <tmmintrin.h>
+        #endif
+        #ifdef EIGEN_VECTORIZE_SSE4_1
+        #include <smmintrin.h>
+        #endif
+        #ifdef EIGEN_VECTORIZE_SSE4_2
+        #include <nmmintrin.h>
+        #endif
+        #if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512)
+        #include <immintrin.h>
+        #endif
+      #endif
+    } // end extern "C"
+
+  #elif defined __VSX__
+
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_VSX
+    #include <altivec.h>
+    // We need to #undef all these ugly tokens defined in <altivec.h>
+    // => use __vector instead of vector
+    #undef bool
+    #undef vector
+    #undef pixel
+
+  #elif defined __ALTIVEC__
+
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_ALTIVEC
+    #include <altivec.h>
+    // We need to #undef all these ugly tokens defined in <altivec.h>
+    // => use __vector instead of vector
+    #undef bool
+    #undef vector
+    #undef pixel
+
+  #elif ((defined  __ARM_NEON) || (defined __ARM_NEON__)) && !(defined EIGEN_ARM64_USE_SVE)
+
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_NEON
+    #include <arm_neon.h>
+
+  // We currently require SVE to be enabled explicitly via EIGEN_ARM64_USE_SVE and
+  // will not select the backend automatically
+  #elif (defined __ARM_FEATURE_SVE) && (defined EIGEN_ARM64_USE_SVE)
+
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_SVE
+    #include <arm_sve.h>
+
+    // Since we depend on knowing SVE vector lengths at compile-time, we need
+    // to ensure a fixed lengths is set
+    #if defined __ARM_FEATURE_SVE_BITS
+      #define EIGEN_ARM64_SVE_VL __ARM_FEATURE_SVE_BITS
+    #else
+#error "Eigen requires a fixed SVE lector length but EIGEN_ARM64_SVE_VL is not set."
+#endif
+
+#elif (defined __s390x__ && defined __VEC__)
+
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_ZVECTOR
+#include <vecintrin.h>
+
+#elif defined __mips_msa
+
+// Limit MSA optimizations to little-endian CPUs for now.
+// TODO: Perhaps, eventually support MSA optimizations on big-endian CPUs?
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#if defined(__LP64__)
+#define EIGEN_MIPS_64
+#else
+#define EIGEN_MIPS_32
+#endif
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_MSA
+#include <msa.h>
+#endif
+
+#endif
+#endif
+
+// Following the Arm ACLE arm_neon.h should also include arm_fp16.h but not all
+// compilers seem to follow this. We therefore include it explicitly.
+// See also: https://bugs.llvm.org/show_bug.cgi?id=47955
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  #include <arm_fp16.h>
+#endif
+
+#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!defined(EIGEN_COMP_CLANG) || EIGEN_COMP_CLANG>=380))
+  // We can use the optimized fp16 to float and float to fp16 conversion routines
+  #define EIGEN_HAS_FP16_C
+
+  #if defined(EIGEN_COMP_CLANG)
+    // Workaround for clang: The FP16C intrinsics for clang are included by
+    // immintrin.h, as opposed to emmintrin.h as suggested by Intel:
+    // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#othertechs=FP16C&expand=1711
+    #include <immintrin.h>
+  #endif
+#endif
+
+#if defined EIGEN_CUDACC
+  #define EIGEN_VECTORIZE_GPU
+  #include <vector_types.h>
+  #if EIGEN_CUDA_SDK_VER >= 70500
+    #define EIGEN_HAS_CUDA_FP16
+  #endif
+#endif
+
+#if defined(EIGEN_HAS_CUDA_FP16)
+  #include <cuda_runtime_api.h>
+  #include <cuda_fp16.h>
+#endif
+
+#if defined(EIGEN_HIPCC)
+  #define EIGEN_VECTORIZE_GPU
+  #include <hip/hip_vector_types.h>
+  #define EIGEN_HAS_HIP_FP16
+  #include <hip/hip_fp16.h>
+#endif
+
+
+/** \brief Namespace containing all symbols from the %Eigen library. */
+namespace Eigen {
+
+inline static const char *SimdInstructionSetsInUse(void) {
+#if defined(EIGEN_VECTORIZE_AVX512)
+  return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_AVX)
+  return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_SSE4_2)
+  return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_SSE4_1)
+  return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
+#elif defined(EIGEN_VECTORIZE_SSSE3)
+  return "SSE, SSE2, SSE3, SSSE3";
+#elif defined(EIGEN_VECTORIZE_SSE3)
+  return "SSE, SSE2, SSE3";
+#elif defined(EIGEN_VECTORIZE_SSE2)
+  return "SSE, SSE2";
+#elif defined(EIGEN_VECTORIZE_ALTIVEC)
+  return "AltiVec";
+#elif defined(EIGEN_VECTORIZE_VSX)
+  return "VSX";
+#elif defined(EIGEN_VECTORIZE_NEON)
+  return "ARM NEON";
+#elif defined(EIGEN_VECTORIZE_SVE)
+  return "ARM SVE";
+#elif defined(EIGEN_VECTORIZE_ZVECTOR)
+  return "S390X ZVECTOR";
+#elif defined(EIGEN_VECTORIZE_MSA)
+  return "MIPS MSA";
+#else
+  return "None";
+#endif
+}
+
+} // end namespace Eigen
+
+
+#endif // EIGEN_CONFIGURE_VECTORIZATION_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/util/Constants.h b/src/3rdparty/eigen/Eigen/src/Core/util/Constants.h
new file mode 100644
index 000000000..35dcaa7b3
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/util/Constants.h
@@ -0,0 +1,563 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2007-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2020, Arm Limited and Contributors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CONSTANTS_H
+#define EIGEN_CONSTANTS_H
+
+namespace Eigen {
+
+/** This value means that a positive quantity (e.g., a size) is not known at compile-time, and that instead the value is
+  * stored in some runtime variable.
+  *
+  * Changing the value of Dynamic breaks the ABI, as Dynamic is often used as a template parameter for Matrix.
+  */
+const int Dynamic = -1;
+
+/** This value means that a signed quantity (e.g., a signed index) is not known at compile-time, and that instead its value
+  * has to be specified at runtime.
+  */
+const int DynamicIndex = 0xffffff;
+
+/** This value means that the increment to go from one value to another in a sequence is not constant for each step.
+  */
+const int UndefinedIncr = 0xfffffe;
+
+/** This value means +Infinity; it is currently used only as the p parameter to MatrixBase::lpNorm<int>().
+  * The value Infinity there means the L-infinity norm.
+  */
+const int Infinity = -1;
+
+/** This value means that the cost to evaluate an expression coefficient is either very expensive or
+  * cannot be known at compile time.
+  *
+  * This value has to be positive to (1) simplify cost computation, and (2) allow to distinguish between a very expensive and very very expensive expressions.
+  * It thus must also be large enough to make sure unrolling won't happen and that sub expressions will be evaluated, but not too large to avoid overflow.
+  */
+const int HugeCost = 10000;
+
+/** \defgroup flags Flags
+  * \ingroup Core_Module
+  *
+  * These are the possible bits which can be OR'ed to constitute the flags of a matrix or
+  * expression.
+  *
+  * It is important to note that these flags are a purely compile-time notion. They are a compile-time property of
+  * an expression type, implemented as enum's. They are not stored in memory at runtime, and they do not incur any
+  * runtime overhead.
+  *
+  * \sa MatrixBase::Flags
+  */
+
+/** \ingroup flags
+  *
+  * for a matrix, this means that the storage order is row-major.
+  * If this bit is not set, the storage order is column-major.
+  * For an expression, this determines the storage order of
+  * the matrix created by evaluation of that expression.
+  * \sa \blank  \ref TopicStorageOrders */
+const unsigned int RowMajorBit = 0x1;
+
+/** \ingroup flags
+  * means the expression should be evaluated by the calling expression */
+const unsigned int EvalBeforeNestingBit = 0x2;
+
+/** \ingroup flags
+  * \deprecated
+  * means the expression should be evaluated before any assignment */
+EIGEN_DEPRECATED
+const unsigned int EvalBeforeAssigningBit = 0x4; // FIXME deprecated
+
+/** \ingroup flags
+  *
+  * Short version: means the expression might be vectorized
+  *
+  * Long version: means that the coefficients can be handled by packets
+  * and start at a memory location whose alignment meets the requirements
+  * of the present CPU architecture for optimized packet access. In the fixed-size
+  * case, there is the additional condition that it be possible to access all the
+  * coefficients by packets (this implies the requirement that the size be a multiple of 16 bytes,
+  * and that any nontrivial strides don't break the alignment). In the dynamic-size case,
+  * there is no such condition on the total size and strides, so it might not be possible to access
+  * all coeffs by packets.
+  *
+  * \note This bit can be set regardless of whether vectorization is actually enabled.
+  *       To check for actual vectorizability, see \a ActualPacketAccessBit.
+  */
+const unsigned int PacketAccessBit = 0x8;
+
+#ifdef EIGEN_VECTORIZE
+/** \ingroup flags
+  *
+  * If vectorization is enabled (EIGEN_VECTORIZE is defined) this constant
+  * is set to the value \a PacketAccessBit.
+  *
+  * If vectorization is not enabled (EIGEN_VECTORIZE is not defined) this constant
+  * is set to the value 0.
+  */
+const unsigned int ActualPacketAccessBit = PacketAccessBit;
+#else
+const unsigned int ActualPacketAccessBit = 0x0;
+#endif
+
+/** \ingroup flags
+  *
+  * Short version: means the expression can be seen as 1D vector.
+  *
+  * Long version: means that one can access the coefficients
+  * of this expression by coeff(int), and coeffRef(int) in the case of a lvalue expression. These
+  * index-based access methods are guaranteed
+  * to not have to do any runtime computation of a (row, col)-pair from the index, so that it
+  * is guaranteed that whenever it is available, index-based access is at least as fast as
+  * (row,col)-based access. Expressions for which that isn't possible don't have the LinearAccessBit.
+  *
+  * If both PacketAccessBit and LinearAccessBit are set, then the
+  * packets of this expression can be accessed by packet(int), and writePacket(int) in the case of a
+  * lvalue expression.
+  *
+  * Typically, all vector expressions have the LinearAccessBit, but there is one exception:
+  * Product expressions don't have it, because it would be troublesome for vectorization, even when the
+  * Product is a vector expression. Thus, vector Product expressions allow index-based coefficient access but
+  * not index-based packet access, so they don't have the LinearAccessBit.
+  */
+const unsigned int LinearAccessBit = 0x10;
+
+/** \ingroup flags
+  *
+  * Means the expression has a coeffRef() method, i.e. is writable as its individual coefficients are directly addressable.
+  * This rules out read-only expressions.
+  *
+  * Note that DirectAccessBit and LvalueBit are mutually orthogonal, as there are examples of expression having one but note
+  * the other:
+  *   \li writable expressions that don't have a very simple memory layout as a strided array, have LvalueBit but not DirectAccessBit
+  *   \li Map-to-const expressions, for example Map<const Matrix>, have DirectAccessBit but not LvalueBit
+  *
+  * Expressions having LvalueBit also have their coeff() method returning a const reference instead of returning a new value.
+  */
+const unsigned int LvalueBit = 0x20;
+
+/** \ingroup flags
+  *
+  * Means that the underlying array of coefficients can be directly accessed as a plain strided array. The memory layout
+  * of the array of coefficients must be exactly the natural one suggested by rows(), cols(),
+  * outerStride(), innerStride(), and the RowMajorBit. This rules out expressions such as Diagonal, whose coefficients,
+  * though referencable, do not have such a regular memory layout.
+  *
+  * See the comment on LvalueBit for an explanation of how LvalueBit and DirectAccessBit are mutually orthogonal.
+  */
+const unsigned int DirectAccessBit = 0x40;
+
+/** \deprecated \ingroup flags
+  *
+  * means the first coefficient packet is guaranteed to be aligned.
+  * An expression cannot have the AlignedBit without the PacketAccessBit flag.
+  * In other words, this means we are allow to perform an aligned packet access to the first element regardless
+  * of the expression kind:
+  * \code
+  * expression.packet<Aligned>(0);
+  * \endcode
+  */
+EIGEN_DEPRECATED const unsigned int AlignedBit = 0x80;
+
+const unsigned int NestByRefBit = 0x100;
+
+/** \ingroup flags
+  *
+  * for an expression, this means that the storage order
+  * can be either row-major or column-major.
+  * The precise choice will be decided at evaluation time or when
+  * combined with other expressions.
+  * \sa \blank  \ref RowMajorBit, \ref TopicStorageOrders */
+const unsigned int NoPreferredStorageOrderBit = 0x200;
+
+/** \ingroup flags
+  *
+  * Means that the underlying coefficients can be accessed through pointers to the sparse (un)compressed storage format,
+  * that is, the expression provides:
+  * \code
+    inline const Scalar* valuePtr() const;
+    inline const Index* innerIndexPtr() const;
+    inline const Index* outerIndexPtr() const;
+    inline const Index* innerNonZeroPtr() const;
+    \endcode
+  */
+const unsigned int CompressedAccessBit = 0x400;
+
+
+// list of flags that are inherited by default
+const unsigned int HereditaryBits = RowMajorBit
+                                  | EvalBeforeNestingBit;
+
+/** \defgroup enums Enumerations
+  * \ingroup Core_Module
+  *
+  * Various enumerations used in %Eigen. Many of these are used as template parameters.
+  */
+
+/** \ingroup enums
+  * Enum containing possible values for the \c Mode or \c UpLo parameter of
+  * MatrixBase::selfadjointView() and MatrixBase::triangularView(), and selfadjoint solvers. */
+enum UpLoType {
+  /** View matrix as a lower triangular matrix. */
+  Lower=0x1,                      
+  /** View matrix as an upper triangular matrix. */
+  Upper=0x2,                      
+  /** %Matrix has ones on the diagonal; to be used in combination with #Lower or #Upper. */
+  UnitDiag=0x4, 
+  /** %Matrix has zeros on the diagonal; to be used in combination with #Lower or #Upper. */
+  ZeroDiag=0x8,
+  /** View matrix as a lower triangular matrix with ones on the diagonal. */
+  UnitLower=UnitDiag|Lower, 
+  /** View matrix as an upper triangular matrix with ones on the diagonal. */
+  UnitUpper=UnitDiag|Upper,
+  /** View matrix as a lower triangular matrix with zeros on the diagonal. */
+  StrictlyLower=ZeroDiag|Lower, 
+  /** View matrix as an upper triangular matrix with zeros on the diagonal. */
+  StrictlyUpper=ZeroDiag|Upper,
+  /** Used in BandMatrix and SelfAdjointView to indicate that the matrix is self-adjoint. */
+  SelfAdjoint=0x10,
+  /** Used to support symmetric, non-selfadjoint, complex matrices. */
+  Symmetric=0x20
+};
+
+/** \ingroup enums
+  * Enum for indicating whether a buffer is aligned or not. */
+enum AlignmentType {
+  Unaligned=0,        /**< Data pointer has no specific alignment. */
+  Aligned8=8,         /**< Data pointer is aligned on a 8 bytes boundary. */
+  Aligned16=16,       /**< Data pointer is aligned on a 16 bytes boundary. */
+  Aligned32=32,       /**< Data pointer is aligned on a 32 bytes boundary. */
+  Aligned64=64,       /**< Data pointer is aligned on a 64 bytes boundary. */
+  Aligned128=128,     /**< Data pointer is aligned on a 128 bytes boundary. */
+  AlignedMask=255,
+  Aligned=16,         /**< \deprecated Synonym for Aligned16. */
+#if EIGEN_MAX_ALIGN_BYTES==128
+  AlignedMax = Aligned128
+#elif EIGEN_MAX_ALIGN_BYTES==64
+  AlignedMax = Aligned64
+#elif EIGEN_MAX_ALIGN_BYTES==32
+  AlignedMax = Aligned32
+#elif EIGEN_MAX_ALIGN_BYTES==16
+  AlignedMax = Aligned16
+#elif EIGEN_MAX_ALIGN_BYTES==8
+  AlignedMax = Aligned8
+#elif EIGEN_MAX_ALIGN_BYTES==0
+  AlignedMax = Unaligned
+#else
+#error Invalid value for EIGEN_MAX_ALIGN_BYTES
+#endif
+};
+
+/** \ingroup enums
+  * Enum containing possible values for the \p Direction parameter of
+  * Reverse, PartialReduxExpr and VectorwiseOp. */
+enum DirectionType { 
+  /** For Reverse, all columns are reversed; 
+    * for PartialReduxExpr and VectorwiseOp, act on columns. */
+  Vertical, 
+  /** For Reverse, all rows are reversed; 
+    * for PartialReduxExpr and VectorwiseOp, act on rows. */
+  Horizontal, 
+  /** For Reverse, both rows and columns are reversed; 
+    * not used for PartialReduxExpr and VectorwiseOp. */
+  BothDirections 
+};
+
+/** \internal \ingroup enums
+  * Enum to specify how to traverse the entries of a matrix. */
+enum TraversalType {
+  /** \internal Default traversal, no vectorization, no index-based access */
+  DefaultTraversal,
+  /** \internal No vectorization, use index-based access to have only one for loop instead of 2 nested loops */
+  LinearTraversal,
+  /** \internal Equivalent to a slice vectorization for fixed-size matrices having good alignment
+    * and good size */
+  InnerVectorizedTraversal,
+  /** \internal Vectorization path using a single loop plus scalar loops for the
+    * unaligned boundaries */
+  LinearVectorizedTraversal,
+  /** \internal Generic vectorization path using one vectorized loop per row/column with some
+    * scalar loops to handle the unaligned boundaries */
+  SliceVectorizedTraversal,
+  /** \internal Special case to properly handle incompatible scalar types or other defecting cases*/
+  InvalidTraversal,
+  /** \internal Evaluate all entries at once */
+  AllAtOnceTraversal
+};
+
+/** \internal \ingroup enums
+  * Enum to specify whether to unroll loops when traversing over the entries of a matrix. */
+enum UnrollingType {
+  /** \internal Do not unroll loops. */
+  NoUnrolling,
+  /** \internal Unroll only the inner loop, but not the outer loop. */
+  InnerUnrolling,
+  /** \internal Unroll both the inner and the outer loop. If there is only one loop, 
+    * because linear traversal is used, then unroll that loop. */
+  CompleteUnrolling
+};
+
+/** \internal \ingroup enums
+  * Enum to specify whether to use the default (built-in) implementation or the specialization. */
+enum SpecializedType {
+  Specialized,
+  BuiltIn
+};
+
+/** \ingroup enums
+  * Enum containing possible values for the \p _Options template parameter of
+  * Matrix, Array and BandMatrix. */
+enum StorageOptions {
+  /** Storage order is column major (see \ref TopicStorageOrders). */
+  ColMajor = 0,
+  /** Storage order is row major (see \ref TopicStorageOrders). */
+  RowMajor = 0x1,  // it is only a coincidence that this is equal to RowMajorBit -- don't rely on that
+  /** Align the matrix itself if it is vectorizable fixed-size */
+  AutoAlign = 0,
+  /** Don't require alignment for the matrix itself (the array of coefficients, if dynamically allocated, may still be requested to be aligned) */ // FIXME --- clarify the situation
+  DontAlign = 0x2
+};
+
+/** \ingroup enums
+  * Enum for specifying whether to apply or solve on the left or right. */
+enum SideType {
+  /** Apply transformation on the left. */
+  OnTheLeft = 1,
+  /** Apply transformation on the right. */
+  OnTheRight = 2
+};
+
+/** \ingroup enums
+ * Enum for specifying NaN-propagation behavior, e.g. for coeff-wise min/max. */
+enum NaNPropagationOptions {
+  /**  Implementation defined behavior if NaNs are present. */
+  PropagateFast = 0,
+  /**  Always propagate NaNs. */
+  PropagateNaN,
+  /**  Always propagate not-NaNs. */
+  PropagateNumbers
+};
+
+/* the following used to be written as:
+ *
+ *   struct NoChange_t {};
+ *   namespace {
+ *     EIGEN_UNUSED NoChange_t NoChange;
+ *   }
+ *
+ * on the ground that it feels dangerous to disambiguate overloaded functions on enum/integer types.  
+ * However, this leads to "variable declared but never referenced" warnings on Intel Composer XE,
+ * and we do not know how to get rid of them (bug 450).
+ */
+
+enum NoChange_t   { NoChange };
+enum Sequential_t { Sequential };
+enum Default_t    { Default };
+
+/** \internal \ingroup enums
+  * Used in AmbiVector. */
+enum AmbiVectorMode {
+  IsDense         = 0,
+  IsSparse
+};
+
+/** \ingroup enums
+  * Used as template parameter in DenseCoeffBase and MapBase to indicate 
+  * which accessors should be provided. */
+enum AccessorLevels {
+  /** Read-only access via a member function. */
+  ReadOnlyAccessors, 
+  /** Read/write access via member functions. */
+  WriteAccessors, 
+  /** Direct read-only access to the coefficients. */
+  DirectAccessors, 
+  /** Direct read/write access to the coefficients. */
+  DirectWriteAccessors
+};
+
+/** \ingroup enums
+  * Enum with options to give to various decompositions. */
+enum DecompositionOptions {
+  /** \internal Not used (meant for LDLT?). */
+  Pivoting            = 0x01, 
+  /** \internal Not used (meant for LDLT?). */
+  NoPivoting          = 0x02, 
+  /** Used in JacobiSVD to indicate that the square matrix U is to be computed. */
+  ComputeFullU        = 0x04,
+  /** Used in JacobiSVD to indicate that the thin matrix U is to be computed. */
+  ComputeThinU        = 0x08,
+  /** Used in JacobiSVD to indicate that the square matrix V is to be computed. */
+  ComputeFullV        = 0x10,
+  /** Used in JacobiSVD to indicate that the thin matrix V is to be computed. */
+  ComputeThinV        = 0x20,
+  /** Used in SelfAdjointEigenSolver and GeneralizedSelfAdjointEigenSolver to specify
+    * that only the eigenvalues are to be computed and not the eigenvectors. */
+  EigenvaluesOnly     = 0x40,
+  /** Used in SelfAdjointEigenSolver and GeneralizedSelfAdjointEigenSolver to specify
+    * that both the eigenvalues and the eigenvectors are to be computed. */
+  ComputeEigenvectors = 0x80,
+  /** \internal */
+  EigVecMask = EigenvaluesOnly | ComputeEigenvectors,
+  /** Used in GeneralizedSelfAdjointEigenSolver to indicate that it should
+    * solve the generalized eigenproblem \f$ Ax = \lambda B x \f$. */
+  Ax_lBx              = 0x100,
+  /** Used in GeneralizedSelfAdjointEigenSolver to indicate that it should
+    * solve the generalized eigenproblem \f$ ABx = \lambda x \f$. */
+  ABx_lx              = 0x200,
+  /** Used in GeneralizedSelfAdjointEigenSolver to indicate that it should
+    * solve the generalized eigenproblem \f$ BAx = \lambda x \f$. */
+  BAx_lx              = 0x400,
+  /** \internal */
+  GenEigMask = Ax_lBx | ABx_lx | BAx_lx
+};
+
+/** \ingroup enums
+  * Possible values for the \p QRPreconditioner template parameter of JacobiSVD. */
+enum QRPreconditioners {
+  /** Do not specify what is to be done if the SVD of a non-square matrix is asked for. */
+  NoQRPreconditioner,
+  /** Use a QR decomposition without pivoting as the first step. */
+  HouseholderQRPreconditioner,
+  /** Use a QR decomposition with column pivoting as the first step. */
+  ColPivHouseholderQRPreconditioner,
+  /** Use a QR decomposition with full pivoting as the first step. */
+  FullPivHouseholderQRPreconditioner
+};
+
+#ifdef Success
+#error The preprocessor symbol 'Success' is defined, possibly by the X11 header file X.h
+#endif
+
+/** \ingroup enums
+  * Enum for reporting the status of a computation. */
+enum ComputationInfo {
+  /** Computation was successful. */
+  Success = 0,        
+  /** The provided data did not satisfy the prerequisites. */
+  NumericalIssue = 1, 
+  /** Iterative procedure did not converge. */
+  NoConvergence = 2,
+  /** The inputs are invalid, or the algorithm has been improperly called.
+    * When assertions are enabled, such errors trigger an assert. */
+  InvalidInput = 3
+};
+
+/** \ingroup enums
+  * Enum used to specify how a particular transformation is stored in a matrix.
+  * \sa Transform, Hyperplane::transform(). */
+enum TransformTraits {
+  /** Transformation is an isometry. */
+  Isometry      = 0x1,
+  /** Transformation is an affine transformation stored as a (Dim+1)^2 matrix whose last row is 
+    * assumed to be [0 ... 0 1]. */
+  Affine        = 0x2,
+  /** Transformation is an affine transformation stored as a (Dim) x (Dim+1) matrix. */
+  AffineCompact = 0x10 | Affine,
+  /** Transformation is a general projective transformation stored as a (Dim+1)^2 matrix. */
+  Projective    = 0x20
+};
+
+/** \internal \ingroup enums
+  * Enum used to choose between implementation depending on the computer architecture. */
+namespace Architecture
+{
+  enum Type {
+    Generic = 0x0,
+    SSE = 0x1,
+    AltiVec = 0x2,
+    VSX = 0x3,
+    NEON = 0x4,
+    MSA = 0x5,
+    SVE = 0x6,
+#if defined EIGEN_VECTORIZE_SSE
+    Target = SSE
+#elif defined EIGEN_VECTORIZE_ALTIVEC
+    Target = AltiVec
+#elif defined EIGEN_VECTORIZE_VSX
+    Target = VSX
+#elif defined EIGEN_VECTORIZE_NEON
+    Target = NEON
+#elif defined EIGEN_VECTORIZE_SVE
+    Target = SVE
+#elif defined EIGEN_VECTORIZE_MSA
+    Target = MSA
+#else
+    Target = Generic
+#endif
+  };
+}
+
+/** \internal \ingroup enums
+  * Enum used as template parameter in Product and product evaluators. */
+enum ProductImplType
+{ DefaultProduct=0, LazyProduct, AliasFreeProduct, CoeffBasedProductMode, LazyCoeffBasedProductMode, OuterProduct, InnerProduct, GemvProduct, GemmProduct };
+
+/** \internal \ingroup enums
+  * Enum used in experimental parallel implementation. */
+enum Action {GetAction, SetAction};
+
+/** The type used to identify a dense storage. */
+struct Dense {};
+
+/** The type used to identify a general sparse storage. */
+struct Sparse {};
+
+/** The type used to identify a general solver (factored) storage. */
+struct SolverStorage {};
+
+/** The type used to identify a permutation storage. */
+struct PermutationStorage {};
+
+/** The type used to identify a permutation storage. */
+struct TranspositionsStorage {};
+
+/** The type used to identify a matrix expression */
+struct MatrixXpr {};
+
+/** The type used to identify an array expression */
+struct ArrayXpr {};
+
+// An evaluator must define its shape. By default, it can be one of the following:
+struct DenseShape             { static std::string debugName() { return "DenseShape"; } };
+struct SolverShape            { static std::string debugName() { return "SolverShape"; } };
+struct HomogeneousShape       { static std::string debugName() { return "HomogeneousShape"; } };
+struct DiagonalShape          { static std::string debugName() { return "DiagonalShape"; } };
+struct BandShape              { static std::string debugName() { return "BandShape"; } };
+struct TriangularShape        { static std::string debugName() { return "TriangularShape"; } };
+struct SelfAdjointShape       { static std::string debugName() { return "SelfAdjointShape"; } };
+struct PermutationShape       { static std::string debugName() { return "PermutationShape"; } };
+struct TranspositionsShape    { static std::string debugName() { return "TranspositionsShape"; } };
+struct SparseShape            { static std::string debugName() { return "SparseShape"; } };
+
+namespace internal {
+
+  // random access iterators based on coeff*() accessors.
+struct IndexBased {};
+
+// evaluator based on iterators to access coefficients. 
+struct IteratorBased {};
+
+/** \internal
+ * Constants for comparison functors
+ */
+enum ComparisonName {
+  cmp_EQ = 0,
+  cmp_LT = 1,
+  cmp_LE = 2,
+  cmp_UNORD = 3,
+  cmp_NEQ = 4,
+  cmp_GT = 5,
+  cmp_GE = 6
+};
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CONSTANTS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/util/DisableStupidWarnings.h b/src/3rdparty/eigen/Eigen/src/Core/util/DisableStupidWarnings.h
new file mode 100644
index 000000000..fe0cfec0b
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -0,0 +1,106 @@
+#ifndef EIGEN_WARNINGS_DISABLED
+#define EIGEN_WARNINGS_DISABLED
+
+#ifdef _MSC_VER
+  // 4100 - unreferenced formal parameter (occurred e.g. in aligned_allocator::destroy(pointer p))
+  // 4101 - unreferenced local variable
+  // 4181 - qualifier applied to reference type ignored
+  // 4211 - nonstandard extension used : redefined extern to static
+  // 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data
+  // 4273 - QtAlignedMalloc, inconsistent DLL linkage
+  // 4324 - structure was padded due to declspec(align())
+  // 4503 - decorated name length exceeded, name was truncated
+  // 4512 - assignment operator could not be generated
+  // 4522 - 'class' : multiple assignment operators specified
+  // 4700 - uninitialized local variable 'xyz' used
+  // 4714 - function marked as __forceinline not inlined
+  // 4717 - 'function' : recursive on all control paths, function will cause runtime stack overflow
+  // 4800 - 'type' : forcing value to bool 'true' or 'false' (performance warning)
+  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+    #pragma warning( push )
+  #endif
+  #pragma warning( disable : 4100 4101 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
+
+#elif defined __INTEL_COMPILER
+  // 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
+  //        ICC 12 generates this warning even without any inline keyword, when defining class methods 'inline' i.e. inside of class body
+  //        typedef that may be a reference type.
+  // 279  - controlling expression is constant
+  //        ICC 12 generates this warning on assert(constant_expression_depending_on_template_params) and frankly this is a legitimate use case.
+  // 1684 - conversion from pointer to same-sized integral type (potential portability problem)
+  // 2259 - non-pointer conversion from "Eigen::Index={ptrdiff_t={long}}" to "int" may lose significant bits
+  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+    #pragma warning push
+  #endif
+  #pragma warning disable 2196 279 1684 2259
+
+#elif defined __clang__
+  // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant
+  //     this is really a stupid warning as it warns on compile-time expressions involving enums
+  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+    #pragma clang diagnostic push
+  #endif
+  #pragma clang diagnostic ignored "-Wconstant-logical-operand"
+  #if __clang_major__ >= 3 && __clang_minor__ >= 5
+    #pragma clang diagnostic ignored "-Wabsolute-value"
+  #endif
+  #if __clang_major__ >= 10
+    #pragma clang diagnostic ignored "-Wimplicit-int-float-conversion"
+  #endif
+  #if ( defined(__ALTIVEC__) || defined(__VSX__) ) && __cplusplus < 201103L
+    // warning: generic selections are a C11-specific feature
+    // ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h
+    #pragma clang diagnostic ignored "-Wc11-extensions"
+  #endif
+
+#elif defined __GNUC__
+
+  #if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) &&  (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
+    #pragma GCC diagnostic push
+  #endif
+  // g++ warns about local variables shadowing member functions, which is too strict
+  #pragma GCC diagnostic ignored "-Wshadow"
+  #if __GNUC__ == 4 && __GNUC_MINOR__ < 8
+    // Until g++-4.7 there are warnings when comparing unsigned int vs 0, even in templated functions:
+    #pragma GCC diagnostic ignored "-Wtype-limits"
+  #endif
+  #if __GNUC__>=6
+    #pragma GCC diagnostic ignored "-Wignored-attributes"
+  #endif
+  #if __GNUC__==7
+    // See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89325
+    #pragma GCC diagnostic ignored "-Wattributes"
+  #endif
+#endif
+
+#if defined __NVCC__
+  #pragma diag_suppress boolean_controlling_expr_is_constant
+  // Disable the "statement is unreachable" message
+  #pragma diag_suppress code_is_unreachable
+  // Disable the "dynamic initialization in unreachable code" message
+  #pragma diag_suppress initialization_not_reachable
+  // Disable the "invalid error number" message that we get with older versions of nvcc
+  #pragma diag_suppress 1222
+  // Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages (yes, there are many of them and they seem to change with every version of the compiler)
+  #pragma diag_suppress 2527
+  #pragma diag_suppress 2529
+  #pragma diag_suppress 2651
+  #pragma diag_suppress 2653
+  #pragma diag_suppress 2668
+  #pragma diag_suppress 2669
+  #pragma diag_suppress 2670
+  #pragma diag_suppress 2671
+  #pragma diag_suppress 2735
+  #pragma diag_suppress 2737
+  #pragma diag_suppress 2739
+#endif
+
+#else
+// warnings already disabled:
+# ifndef EIGEN_WARNINGS_DISABLED_2
+#  define EIGEN_WARNINGS_DISABLED_2
+# elif defined(EIGEN_INTERNAL_DEBUGGING)
+#  error "Do not include \"DisableStupidWarnings.h\" recursively more than twice!"
+# endif
+
+#endif // not EIGEN_WARNINGS_DISABLED
diff --git a/src/3rdparty/eigen/Eigen/src/Core/util/ForwardDeclarations.h b/src/3rdparty/eigen/Eigen/src/Core/util/ForwardDeclarations.h
new file mode 100644
index 000000000..2f9cc4491
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/util/ForwardDeclarations.h
@@ -0,0 +1,322 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_FORWARDDECLARATIONS_H
+#define EIGEN_FORWARDDECLARATIONS_H
+
+namespace Eigen {
+namespace internal {
+
+template<typename T> struct traits;
+
+// here we say once and for all that traits<const T> == traits<T>
+// When constness must affect traits, it has to be constness on template parameters on which T itself depends.
+// For example, traits<Map<const T> > != traits<Map<T> >, but
+//              traits<const Map<T> > == traits<Map<T> >
+template<typename T> struct traits<const T> : traits<T> {};
+
+template<typename Derived> struct has_direct_access
+{
+  enum { ret = (traits<Derived>::Flags & DirectAccessBit) ? 1 : 0 };
+};
+
+template<typename Derived> struct accessors_level
+{
+  enum { has_direct_access = (traits<Derived>::Flags & DirectAccessBit) ? 1 : 0,
+         has_write_access = (traits<Derived>::Flags & LvalueBit) ? 1 : 0,
+         value = has_direct_access ? (has_write_access ? DirectWriteAccessors : DirectAccessors)
+                                   : (has_write_access ? WriteAccessors       : ReadOnlyAccessors)
+  };
+};
+
+template<typename T> struct evaluator_traits;
+
+template< typename T> struct evaluator;
+
+} // end namespace internal
+
+template<typename T> struct NumTraits;
+
+template<typename Derived> struct EigenBase;
+template<typename Derived> class DenseBase;
+template<typename Derived> class PlainObjectBase;
+template<typename Derived, int Level> class DenseCoeffsBase;
+
+template<typename _Scalar, int _Rows, int _Cols,
+         int _Options = AutoAlign |
+#if EIGEN_GNUC_AT(3,4)
+    // workaround a bug in at least gcc 3.4.6
+    // the innermost ?: ternary operator is misparsed. We write it slightly
+    // differently and this makes gcc 3.4.6 happy, but it's ugly.
+    // The error would only show up with EIGEN_DEFAULT_TO_ROW_MAJOR is defined
+    // (when EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION is RowMajor)
+                          ( (_Rows==1 && _Cols!=1) ? Eigen::RowMajor
+                          : !(_Cols==1 && _Rows!=1) ?  EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION
+                          : Eigen::ColMajor ),
+#else
+                          ( (_Rows==1 && _Cols!=1) ? Eigen::RowMajor
+                          : (_Cols==1 && _Rows!=1) ? Eigen::ColMajor
+                          : EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION ),
+#endif
+         int _MaxRows = _Rows,
+         int _MaxCols = _Cols
+> class Matrix;
+
+template<typename Derived> class MatrixBase;
+template<typename Derived> class ArrayBase;
+
+template<typename ExpressionType, unsigned int Added, unsigned int Removed> class Flagged;
+template<typename ExpressionType, template <typename> class StorageBase > class NoAlias;
+template<typename ExpressionType> class NestByValue;
+template<typename ExpressionType> class ForceAlignedAccess;
+template<typename ExpressionType> class SwapWrapper;
+
+template<typename XprType, int BlockRows=Dynamic, int BlockCols=Dynamic, bool InnerPanel = false> class Block;
+template<typename XprType, typename RowIndices, typename ColIndices> class IndexedView;
+template<typename XprType, int Rows=Dynamic, int Cols=Dynamic, int Order=0> class Reshaped;
+
+template<typename MatrixType, int Size=Dynamic> class VectorBlock;
+template<typename MatrixType> class Transpose;
+template<typename MatrixType> class Conjugate;
+template<typename NullaryOp, typename MatrixType>         class CwiseNullaryOp;
+template<typename UnaryOp,   typename MatrixType>         class CwiseUnaryOp;
+template<typename ViewOp,    typename MatrixType>         class CwiseUnaryView;
+template<typename BinaryOp,  typename Lhs, typename Rhs>  class CwiseBinaryOp;
+template<typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>  class CwiseTernaryOp;
+template<typename Decomposition, typename Rhstype>        class Solve;
+template<typename XprType>                                class Inverse;
+
+template<typename Lhs, typename Rhs, int Option = DefaultProduct> class Product;
+
+template<typename Derived> class DiagonalBase;
+template<typename _DiagonalVectorType> class DiagonalWrapper;
+template<typename _Scalar, int SizeAtCompileTime, int MaxSizeAtCompileTime=SizeAtCompileTime> class DiagonalMatrix;
+template<typename MatrixType, typename DiagonalType, int ProductOrder> class DiagonalProduct;
+template<typename MatrixType, int Index = 0> class Diagonal;
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime = SizeAtCompileTime, typename IndexType=int> class PermutationMatrix;
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime = SizeAtCompileTime, typename IndexType=int> class Transpositions;
+template<typename Derived> class PermutationBase;
+template<typename Derived> class TranspositionsBase;
+template<typename _IndicesType> class PermutationWrapper;
+template<typename _IndicesType> class TranspositionsWrapper;
+
+template<typename Derived,
+         int Level = internal::accessors_level<Derived>::has_write_access ? WriteAccessors : ReadOnlyAccessors
+> class MapBase;
+template<int OuterStrideAtCompileTime, int InnerStrideAtCompileTime> class Stride;
+template<int Value = Dynamic> class InnerStride;
+template<int Value = Dynamic> class OuterStride;
+template<typename MatrixType, int MapOptions=Unaligned, typename StrideType = Stride<0,0> > class Map;
+template<typename Derived> class RefBase;
+template<typename PlainObjectType, int Options = 0,
+         typename StrideType = typename internal::conditional<PlainObjectType::IsVectorAtCompileTime,InnerStride<1>,OuterStride<> >::type > class Ref;
+
+template<typename Derived> class TriangularBase;
+template<typename MatrixType, unsigned int Mode> class TriangularView;
+template<typename MatrixType, unsigned int Mode> class SelfAdjointView;
+template<typename MatrixType> class SparseView;
+template<typename ExpressionType> class WithFormat;
+template<typename MatrixType> struct CommaInitializer;
+template<typename Derived> class ReturnByValue;
+template<typename ExpressionType> class ArrayWrapper;
+template<typename ExpressionType> class MatrixWrapper;
+template<typename Derived> class SolverBase;
+template<typename XprType> class InnerIterator;
+
+namespace internal {
+template<typename XprType> class generic_randaccess_stl_iterator;
+template<typename XprType> class pointer_based_stl_iterator;
+template<typename XprType, DirectionType Direction> class subvector_stl_iterator;
+template<typename XprType, DirectionType Direction> class subvector_stl_reverse_iterator;
+template<typename DecompositionType> struct kernel_retval_base;
+template<typename DecompositionType> struct kernel_retval;
+template<typename DecompositionType> struct image_retval_base;
+template<typename DecompositionType> struct image_retval;
+} // end namespace internal
+
+namespace internal {
+template<typename _Scalar, int Rows=Dynamic, int Cols=Dynamic, int Supers=Dynamic, int Subs=Dynamic, int Options=0> class BandMatrix;
+}
+
+namespace internal {
+template<typename Lhs, typename Rhs> struct product_type;
+
+template<bool> struct EnableIf;
+
+/** \internal
+  * \class product_evaluator
+  * Products need their own evaluator with more template arguments allowing for
+  * easier partial template specializations.
+  */
+template< typename T,
+          int ProductTag = internal::product_type<typename T::Lhs,typename T::Rhs>::ret,
+          typename LhsShape = typename evaluator_traits<typename T::Lhs>::Shape,
+          typename RhsShape = typename evaluator_traits<typename T::Rhs>::Shape,
+          typename LhsScalar = typename traits<typename T::Lhs>::Scalar,
+          typename RhsScalar = typename traits<typename T::Rhs>::Scalar
+        > struct product_evaluator;
+}
+
+template<typename Lhs, typename Rhs,
+         int ProductType = internal::product_type<Lhs,Rhs>::value>
+struct ProductReturnType;
+
+// this is a workaround for sun CC
+template<typename Lhs, typename Rhs> struct LazyProductReturnType;
+
+namespace internal {
+
+// Provides scalar/packet-wise product and product with accumulation
+// with optional conjugation of the arguments.
+template<typename LhsScalar, typename RhsScalar, bool ConjLhs=false, bool ConjRhs=false> struct conj_helper;
+
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_sum_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_difference_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_conj_product_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar, int NaNPropagation=PropagateFast> struct scalar_min_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar, int NaNPropagation=PropagateFast> struct scalar_max_op;
+template<typename Scalar> struct scalar_opposite_op;
+template<typename Scalar> struct scalar_conjugate_op;
+template<typename Scalar> struct scalar_real_op;
+template<typename Scalar> struct scalar_imag_op;
+template<typename Scalar> struct scalar_abs_op;
+template<typename Scalar> struct scalar_abs2_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_absolute_difference_op;
+template<typename Scalar> struct scalar_sqrt_op;
+template<typename Scalar> struct scalar_rsqrt_op;
+template<typename Scalar> struct scalar_exp_op;
+template<typename Scalar> struct scalar_log_op;
+template<typename Scalar> struct scalar_cos_op;
+template<typename Scalar> struct scalar_sin_op;
+template<typename Scalar> struct scalar_acos_op;
+template<typename Scalar> struct scalar_asin_op;
+template<typename Scalar> struct scalar_tan_op;
+template<typename Scalar> struct scalar_inverse_op;
+template<typename Scalar> struct scalar_square_op;
+template<typename Scalar> struct scalar_cube_op;
+template<typename Scalar, typename NewType> struct scalar_cast_op;
+template<typename Scalar> struct scalar_random_op;
+template<typename Scalar> struct scalar_constant_op;
+template<typename Scalar> struct scalar_identity_op;
+template<typename Scalar,bool is_complex, bool is_integer> struct scalar_sign_op;
+template<typename Scalar,typename ScalarExponent> struct scalar_pow_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_hypot_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_product_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_quotient_op;
+
+// SpecialFunctions module
+template<typename Scalar> struct scalar_lgamma_op;
+template<typename Scalar> struct scalar_digamma_op;
+template<typename Scalar> struct scalar_erf_op;
+template<typename Scalar> struct scalar_erfc_op;
+template<typename Scalar> struct scalar_ndtri_op;
+template<typename Scalar> struct scalar_igamma_op;
+template<typename Scalar> struct scalar_igammac_op;
+template<typename Scalar> struct scalar_zeta_op;
+template<typename Scalar> struct scalar_betainc_op;
+
+// Bessel functions in SpecialFunctions module
+template<typename Scalar> struct scalar_bessel_i0_op;
+template<typename Scalar> struct scalar_bessel_i0e_op;
+template<typename Scalar> struct scalar_bessel_i1_op;
+template<typename Scalar> struct scalar_bessel_i1e_op;
+template<typename Scalar> struct scalar_bessel_j0_op;
+template<typename Scalar> struct scalar_bessel_y0_op;
+template<typename Scalar> struct scalar_bessel_j1_op;
+template<typename Scalar> struct scalar_bessel_y1_op;
+template<typename Scalar> struct scalar_bessel_k0_op;
+template<typename Scalar> struct scalar_bessel_k0e_op;
+template<typename Scalar> struct scalar_bessel_k1_op;
+template<typename Scalar> struct scalar_bessel_k1e_op;
+
+
+} // end namespace internal
+
+struct IOFormat;
+
+// Array module
+template<typename _Scalar, int _Rows, int _Cols,
+         int _Options = AutoAlign |
+#if EIGEN_GNUC_AT(3,4)
+    // workaround a bug in at least gcc 3.4.6
+    // the innermost ?: ternary operator is misparsed. We write it slightly
+    // differently and this makes gcc 3.4.6 happy, but it's ugly.
+    // The error would only show up with EIGEN_DEFAULT_TO_ROW_MAJOR is defined
+    // (when EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION is RowMajor)
+                          ( (_Rows==1 && _Cols!=1) ? Eigen::RowMajor
+                          : !(_Cols==1 && _Rows!=1) ?  EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION
+                          : Eigen::ColMajor ),
+#else
+                          ( (_Rows==1 && _Cols!=1) ? Eigen::RowMajor
+                          : (_Cols==1 && _Rows!=1) ? Eigen::ColMajor
+                          : EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION ),
+#endif
+         int _MaxRows = _Rows, int _MaxCols = _Cols> class Array;
+template<typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType> class Select;
+template<typename MatrixType, typename BinaryOp, int Direction> class PartialReduxExpr;
+template<typename ExpressionType, int Direction> class VectorwiseOp;
+template<typename MatrixType,int RowFactor,int ColFactor> class Replicate;
+template<typename MatrixType, int Direction = BothDirections> class Reverse;
+
+template<typename MatrixType> class FullPivLU;
+template<typename MatrixType> class PartialPivLU;
+namespace internal {
+template<typename MatrixType> struct inverse_impl;
+}
+template<typename MatrixType> class HouseholderQR;
+template<typename MatrixType> class ColPivHouseholderQR;
+template<typename MatrixType> class FullPivHouseholderQR;
+template<typename MatrixType> class CompleteOrthogonalDecomposition;
+template<typename MatrixType> class SVDBase;
+template<typename MatrixType, int QRPreconditioner = ColPivHouseholderQRPreconditioner> class JacobiSVD;
+template<typename MatrixType> class BDCSVD;
+template<typename MatrixType, int UpLo = Lower> class LLT;
+template<typename MatrixType, int UpLo = Lower> class LDLT;
+template<typename VectorsType, typename CoeffsType, int Side=OnTheLeft> class HouseholderSequence;
+template<typename Scalar>     class JacobiRotation;
+
+// Geometry module:
+template<typename Derived, int _Dim> class RotationBase;
+template<typename Lhs, typename Rhs> class Cross;
+template<typename Derived> class QuaternionBase;
+template<typename Scalar> class Rotation2D;
+template<typename Scalar> class AngleAxis;
+template<typename Scalar,int Dim> class Translation;
+template<typename Scalar,int Dim> class AlignedBox;
+template<typename Scalar, int Options = AutoAlign> class Quaternion;
+template<typename Scalar,int Dim,int Mode,int _Options=AutoAlign> class Transform;
+template <typename _Scalar, int _AmbientDim, int Options=AutoAlign> class ParametrizedLine;
+template <typename _Scalar, int _AmbientDim, int Options=AutoAlign> class Hyperplane;
+template<typename Scalar> class UniformScaling;
+template<typename MatrixType,int Direction> class Homogeneous;
+
+// Sparse module:
+template<typename Derived> class SparseMatrixBase;
+
+// MatrixFunctions module
+template<typename Derived> struct MatrixExponentialReturnValue;
+template<typename Derived> class MatrixFunctionReturnValue;
+template<typename Derived> class MatrixSquareRootReturnValue;
+template<typename Derived> class MatrixLogarithmReturnValue;
+template<typename Derived> class MatrixPowerReturnValue;
+template<typename Derived> class MatrixComplexPowerReturnValue;
+
+namespace internal {
+template <typename Scalar>
+struct stem_function
+{
+  typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
+  typedef ComplexScalar type(ComplexScalar, int);
+};
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_FORWARDDECLARATIONS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/util/IndexedViewHelper.h b/src/3rdparty/eigen/Eigen/src/Core/util/IndexedViewHelper.h
new file mode 100644
index 000000000..f85de305f
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/util/IndexedViewHelper.h
@@ -0,0 +1,186 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_INDEXED_VIEW_HELPER_H
+#define EIGEN_INDEXED_VIEW_HELPER_H
+
+namespace Eigen {
+
+namespace internal {
+struct symbolic_last_tag {};
+}
+
+/** \var last
+  * \ingroup Core_Module
+  *
+  * Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically reference the last element/row/columns
+  * of the underlying vector or matrix once passed to DenseBase::operator()(const RowIndices&, const ColIndices&).
+  *
+  * This symbolic placeholder supports standard arithmetic operations.
+  *
+  * A typical usage example would be:
+  * \code
+  * using namespace Eigen;
+  * using Eigen::last;
+  * VectorXd v(n);
+  * v(seq(2,last-2)).setOnes();
+  * \endcode
+  *
+  * \sa end
+  */
+static const symbolic::SymbolExpr<internal::symbolic_last_tag> last; // PLEASE use Eigen::last   instead of Eigen::placeholders::last
+
+/** \var lastp1
+  * \ingroup Core_Module
+  *
+  * Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically
+  * reference the last+1 element/row/columns of the underlying vector or matrix once
+  * passed to DenseBase::operator()(const RowIndices&, const ColIndices&).
+  *
+  * This symbolic placeholder supports standard arithmetic operations.
+  * It is essentially an alias to last+fix<1>.
+  *
+  * \sa last
+  */
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+static const auto lastp1 = last+fix<1>;
+#else
+// Using a FixedExpr<1> expression is important here to make sure the compiler
+// can fully optimize the computation starting indices with zero overhead.
+static const symbolic::AddExpr<symbolic::SymbolExpr<internal::symbolic_last_tag>,symbolic::ValueExpr<Eigen::internal::FixedInt<1> > > lastp1(last+fix<1>());
+#endif
+
+namespace internal {
+
+ // Replace symbolic last/end "keywords" by their true runtime value
+inline Index eval_expr_given_size(Index x, Index /* size */)   { return x; }
+
+template<int N>
+FixedInt<N> eval_expr_given_size(FixedInt<N> x, Index /*size*/)   { return x; }
+
+template<typename Derived>
+Index eval_expr_given_size(const symbolic::BaseExpr<Derived> &x, Index size)
+{
+  return x.derived().eval(last=size-1);
+}
+
+// Extract increment/step at compile time
+template<typename T, typename EnableIf = void> struct get_compile_time_incr {
+  enum { value = UndefinedIncr };
+};
+
+// Analogue of std::get<0>(x), but tailored for our needs.
+template<typename T>
+EIGEN_CONSTEXPR Index first(const T& x) EIGEN_NOEXCEPT { return x.first(); }
+
+// IndexedViewCompatibleType/makeIndexedViewCompatible turn an arbitrary object of type T into something usable by MatrixSlice
+// The generic implementation is a no-op
+template<typename T,int XprSize,typename EnableIf=void>
+struct IndexedViewCompatibleType {
+  typedef T type;
+};
+
+template<typename T,typename Q>
+const T& makeIndexedViewCompatible(const T& x, Index /*size*/, Q) { return x; }
+
+//--------------------------------------------------------------------------------
+// Handling of a single Index
+//--------------------------------------------------------------------------------
+
+struct SingleRange {
+  enum {
+    SizeAtCompileTime = 1
+  };
+  SingleRange(Index val) : m_value(val) {}
+  Index operator[](Index) const { return m_value; }
+  static EIGEN_CONSTEXPR Index size() EIGEN_NOEXCEPT { return 1; }
+  Index first() const EIGEN_NOEXCEPT { return m_value; }
+  Index m_value;
+};
+
+template<> struct get_compile_time_incr<SingleRange> {
+  enum { value = 1 }; // 1 or 0 ??
+};
+
+// Turn a single index into something that looks like an array (i.e., that exposes a .size(), and operator[](int) methods)
+template<typename T, int XprSize>
+struct IndexedViewCompatibleType<T,XprSize,typename internal::enable_if<internal::is_integral<T>::value>::type> {
+  // Here we could simply use Array, but maybe it's less work for the compiler to use
+  // a simpler wrapper as SingleRange
+  //typedef Eigen::Array<Index,1,1> type;
+  typedef SingleRange type;
+};
+
+template<typename T, int XprSize>
+struct IndexedViewCompatibleType<T, XprSize, typename enable_if<symbolic::is_symbolic<T>::value>::type> {
+  typedef SingleRange type;
+};
+
+
+template<typename T>
+typename enable_if<symbolic::is_symbolic<T>::value,SingleRange>::type
+makeIndexedViewCompatible(const T& id, Index size, SpecializedType) {
+  return eval_expr_given_size(id,size);
+}
+
+//--------------------------------------------------------------------------------
+// Handling of all
+//--------------------------------------------------------------------------------
+
+struct all_t { all_t() {} };
+
+// Convert a symbolic 'all' into a usable range type
+template<int XprSize>
+struct AllRange {
+  enum { SizeAtCompileTime = XprSize };
+  AllRange(Index size = XprSize) : m_size(size) {}
+  EIGEN_CONSTEXPR Index operator[](Index i) const EIGEN_NOEXCEPT { return i; }
+  EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_size.value(); }
+  EIGEN_CONSTEXPR Index first() const EIGEN_NOEXCEPT { return 0; }
+  variable_if_dynamic<Index,XprSize> m_size;
+};
+
+template<int XprSize>
+struct IndexedViewCompatibleType<all_t,XprSize> {
+  typedef AllRange<XprSize> type;
+};
+
+template<typename XprSizeType>
+inline AllRange<get_fixed_value<XprSizeType>::value> makeIndexedViewCompatible(all_t , XprSizeType size, SpecializedType) {
+  return AllRange<get_fixed_value<XprSizeType>::value>(size);
+}
+
+template<int Size> struct get_compile_time_incr<AllRange<Size> > {
+  enum { value = 1 };
+};
+
+} // end namespace internal
+
+
+/** \var all
+  * \ingroup Core_Module
+  * Can be used as a parameter to DenseBase::operator()(const RowIndices&, const ColIndices&) to index all rows or columns
+  */
+static const Eigen::internal::all_t all; // PLEASE use Eigen::all instead of Eigen::placeholders::all
+
+
+namespace placeholders {
+  typedef symbolic::SymbolExpr<internal::symbolic_last_tag> last_t;
+  typedef symbolic::AddExpr<symbolic::SymbolExpr<internal::symbolic_last_tag>,symbolic::ValueExpr<Eigen::internal::FixedInt<1> > > end_t;
+  typedef Eigen::internal::all_t all_t;
+
+  EIGEN_DEPRECATED static const all_t  all  = Eigen::all;    // PLEASE use Eigen::all    instead of Eigen::placeholders::all
+  EIGEN_DEPRECATED static const last_t last = Eigen::last;   // PLEASE use Eigen::last   instead of Eigen::placeholders::last
+  EIGEN_DEPRECATED static const end_t  end  = Eigen::lastp1; // PLEASE use Eigen::lastp1 instead of Eigen::placeholders::end
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_INDEXED_VIEW_HELPER_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/util/IntegralConstant.h b/src/3rdparty/eigen/Eigen/src/Core/util/IntegralConstant.h
new file mode 100644
index 000000000..945d426ea
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/util/IntegralConstant.h
@@ -0,0 +1,272 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_INTEGRAL_CONSTANT_H
+#define EIGEN_INTEGRAL_CONSTANT_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<int N> class FixedInt;
+template<int N> class VariableAndFixedInt;
+
+/** \internal
+  * \class FixedInt
+  *
+  * This class embeds a compile-time integer \c N.
+  *
+  * It is similar to c++11 std::integral_constant<int,N> but with some additional features
+  * such as:
+  *  - implicit conversion to int
+  *  - arithmetic and some bitwise operators: -, +, *, /, %, &, |
+  *  - c++98/14 compatibility with fix<N> and fix<N>() syntax to define integral constants.
+  *
+  * It is strongly discouraged to directly deal with this class FixedInt. Instances are expcected to
+  * be created by the user using Eigen::fix<N> or Eigen::fix<N>(). In C++98-11, the former syntax does
+  * not create a FixedInt<N> instance but rather a point to function that needs to be \em cleaned-up
+  * using the generic helper:
+  * \code
+  * internal::cleanup_index_type<T>::type
+  * internal::cleanup_index_type<T,DynamicKey>::type
+  * \endcode
+  * where T can a FixedInt<N>, a pointer to function FixedInt<N> (*)(), or numerous other integer-like representations.
+  * \c DynamicKey is either Dynamic (default) or DynamicIndex and used to identify true compile-time values.
+  *
+  * For convenience, you can extract the compile-time value \c N in a generic way using the following helper:
+  * \code
+  * internal::get_fixed_value<T,DefaultVal>::value
+  * \endcode
+  * that will give you \c N if T equals FixedInt<N> or FixedInt<N> (*)(), and \c DefaultVal if T does not embed any compile-time value (e.g., T==int).
+  *
+  * \sa fix<N>, class VariableAndFixedInt
+  */
+template<int N> class FixedInt
+{
+public:
+  static const int value = N;
+  EIGEN_CONSTEXPR operator int() const { return value; }
+  FixedInt() {}
+  FixedInt( VariableAndFixedInt<N> other) {
+    #ifndef EIGEN_INTERNAL_DEBUGGING
+    EIGEN_UNUSED_VARIABLE(other);
+    #endif
+    eigen_internal_assert(int(other)==N);
+  }
+
+  FixedInt<-N> operator-() const { return FixedInt<-N>(); }
+  template<int M>
+  FixedInt<N+M> operator+( FixedInt<M>) const { return FixedInt<N+M>(); }
+  template<int M>
+  FixedInt<N-M> operator-( FixedInt<M>) const { return FixedInt<N-M>(); }
+  template<int M>
+  FixedInt<N*M> operator*( FixedInt<M>) const { return FixedInt<N*M>(); }
+  template<int M>
+  FixedInt<N/M> operator/( FixedInt<M>) const { return FixedInt<N/M>(); }
+  template<int M>
+  FixedInt<N%M> operator%( FixedInt<M>) const { return FixedInt<N%M>(); }
+  template<int M>
+  FixedInt<N|M> operator|( FixedInt<M>) const { return FixedInt<N|M>(); }
+  template<int M>
+  FixedInt<N&M> operator&( FixedInt<M>) const { return FixedInt<N&M>(); }
+
+#if EIGEN_HAS_CXX14_VARIABLE_TEMPLATES
+  // Needed in C++14 to allow fix<N>():
+  FixedInt operator() () const { return *this; }
+
+  VariableAndFixedInt<N> operator() (int val) const { return VariableAndFixedInt<N>(val); }
+#else
+  FixedInt ( FixedInt<N> (*)() ) {}
+#endif
+
+#if EIGEN_HAS_CXX11
+  FixedInt(std::integral_constant<int,N>) {}
+#endif
+};
+
+/** \internal
+  * \class VariableAndFixedInt
+  *
+  * This class embeds both a compile-time integer \c N and a runtime integer.
+  * Both values are supposed to be equal unless the compile-time value \c N has a special
+  * value meaning that the runtime-value should be used. Depending on the context, this special
+  * value can be either Eigen::Dynamic (for positive quantities) or Eigen::DynamicIndex (for
+  * quantities that can be negative).
+  *
+  * It is the return-type of the function Eigen::fix<N>(int), and most of the time this is the only
+  * way it is used. It is strongly discouraged to directly deal with instances of VariableAndFixedInt.
+  * Indeed, in order to write generic code, it is the responsibility of the callee to properly convert
+  * it to either a true compile-time quantity (i.e. a FixedInt<N>), or to a runtime quantity (e.g., an Index)
+  * using the following generic helper:
+  * \code
+  * internal::cleanup_index_type<T>::type
+  * internal::cleanup_index_type<T,DynamicKey>::type
+  * \endcode
+  * where T can be a template instantiation of VariableAndFixedInt or numerous other integer-like representations.
+  * \c DynamicKey is either Dynamic (default) or DynamicIndex and used to identify true compile-time values.
+  *
+  * For convenience, you can also extract the compile-time value \c N using the following helper:
+  * \code
+  * internal::get_fixed_value<T,DefaultVal>::value
+  * \endcode
+  * that will give you \c N if T equals VariableAndFixedInt<N>, and \c DefaultVal if T does not embed any compile-time value (e.g., T==int).
+  *
+  * \sa fix<N>(int), class FixedInt
+  */
+template<int N> class VariableAndFixedInt
+{
+public:
+  static const int value = N;
+  operator int() const { return m_value; }
+  VariableAndFixedInt(int val) { m_value = val; }
+protected:
+  int m_value;
+};
+
+template<typename T, int Default=Dynamic> struct get_fixed_value {
+  static const int value = Default;
+};
+
+template<int N,int Default> struct get_fixed_value<FixedInt<N>,Default> {
+  static const int value = N;
+};
+
+#if !EIGEN_HAS_CXX14
+template<int N,int Default> struct get_fixed_value<FixedInt<N> (*)(),Default> {
+  static const int value = N;
+};
+#endif
+
+template<int N,int Default> struct get_fixed_value<VariableAndFixedInt<N>,Default> {
+  static const int value = N ;
+};
+
+template<typename T, int N, int Default>
+struct get_fixed_value<variable_if_dynamic<T,N>,Default> {
+  static const int value = N;
+};
+
+template<typename T> EIGEN_DEVICE_FUNC Index get_runtime_value(const T &x) { return x; }
+#if !EIGEN_HAS_CXX14
+template<int N> EIGEN_DEVICE_FUNC Index get_runtime_value(FixedInt<N> (*)()) { return N; }
+#endif
+
+// Cleanup integer/FixedInt/VariableAndFixedInt/etc types:
+
+// By default, no cleanup:
+template<typename T, int DynamicKey=Dynamic, typename EnableIf=void> struct cleanup_index_type { typedef T type; };
+
+// Convert any integral type (e.g., short, int, unsigned int, etc.) to Eigen::Index
+template<typename T, int DynamicKey> struct cleanup_index_type<T,DynamicKey,typename internal::enable_if<internal::is_integral<T>::value>::type> { typedef Index type; };
+
+#if !EIGEN_HAS_CXX14
+// In c++98/c++11, fix<N> is a pointer to function that we better cleanup to a true FixedInt<N>:
+template<int N, int DynamicKey> struct cleanup_index_type<FixedInt<N> (*)(), DynamicKey> { typedef FixedInt<N> type; };
+#endif
+
+// If VariableAndFixedInt does not match DynamicKey, then we turn it to a pure compile-time value:
+template<int N, int DynamicKey> struct cleanup_index_type<VariableAndFixedInt<N>, DynamicKey> { typedef FixedInt<N> type; };
+// If VariableAndFixedInt matches DynamicKey, then we turn it to a pure runtime-value (aka Index):
+template<int DynamicKey> struct cleanup_index_type<VariableAndFixedInt<DynamicKey>, DynamicKey> { typedef Index type; };
+
+#if EIGEN_HAS_CXX11
+template<int N, int DynamicKey> struct cleanup_index_type<std::integral_constant<int,N>, DynamicKey> { typedef FixedInt<N> type; };
+#endif
+
+} // end namespace internal
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+
+#if EIGEN_HAS_CXX14_VARIABLE_TEMPLATES
+template<int N>
+static const internal::FixedInt<N> fix{};
+#else
+template<int N>
+inline internal::FixedInt<N> fix() { return internal::FixedInt<N>(); }
+
+// The generic typename T is mandatory. Otherwise, a code like fix<N> could refer to either the function above or this next overload.
+// This way a code like fix<N> can only refer to the previous function.
+template<int N,typename T>
+inline internal::VariableAndFixedInt<N> fix(T val) { return internal::VariableAndFixedInt<N>(internal::convert_index<int>(val)); }
+#endif
+
+#else // EIGEN_PARSED_BY_DOXYGEN
+
+/** \var fix<N>()
+  * \ingroup Core_Module
+  *
+  * This \em identifier permits to construct an object embedding a compile-time integer \c N.
+  *
+  * \tparam N the compile-time integer value
+  *
+  * It is typically used in conjunction with the Eigen::seq and Eigen::seqN functions to pass compile-time values to them:
+  * \code
+  * seqN(10,fix<4>,fix<-3>)   // <=> [10 7 4 1]
+  * \endcode
+  *
+  * See also the function fix(int) to pass both a compile-time and runtime value.
+  *
+  * In c++14, it is implemented as:
+  * \code
+  * template<int N> static const internal::FixedInt<N> fix{};
+  * \endcode
+  * where internal::FixedInt<N> is an internal template class similar to
+  * <a href="http://en.cppreference.com/w/cpp/types/integral_constant">\c std::integral_constant </a><tt> <int,N> </tt>
+  * Here, \c fix<N> is thus an object of type \c internal::FixedInt<N>.
+  *
+  * In c++98/11, it is implemented as a function:
+  * \code
+  * template<int N> inline internal::FixedInt<N> fix();
+  * \endcode
+  * Here internal::FixedInt<N> is thus a pointer to function.
+  *
+  * If for some reason you want a true object in c++98 then you can write: \code fix<N>() \endcode which is also valid in c++14.
+  *
+  * \sa fix<N>(int), seq, seqN
+  */
+template<int N>
+static const auto fix();
+
+/** \fn fix<N>(int)
+  * \ingroup Core_Module
+  *
+  * This function returns an object embedding both a compile-time integer \c N, and a fallback runtime value \a val.
+  *
+  * \tparam N the compile-time integer value
+  * \param  val the fallback runtime integer value
+  *
+  * This function is a more general version of the \ref fix identifier/function that can be used in template code
+  * where the compile-time value could turn out to actually mean "undefined at compile-time". For positive integers
+  * such as a size or a dimension, this case is identified by Eigen::Dynamic, whereas runtime signed integers
+  * (e.g., an increment/stride) are identified as Eigen::DynamicIndex. In such a case, the runtime value \a val
+  * will be used as a fallback.
+  *
+  * A typical use case would be:
+  * \code
+  * template<typename Derived> void foo(const MatrixBase<Derived> &mat) {
+  *   const int N = Derived::RowsAtCompileTime==Dynamic ? Dynamic : Derived::RowsAtCompileTime/2;
+  *   const int n = mat.rows()/2;
+  *   ... mat( seqN(0,fix<N>(n) ) ...;
+  * }
+  * \endcode
+  * In this example, the function Eigen::seqN knows that the second argument is expected to be a size.
+  * If the passed compile-time value N equals Eigen::Dynamic, then the proxy object returned by fix will be dissmissed, and converted to an Eigen::Index of value \c n.
+  * Otherwise, the runtime-value \c n will be dissmissed, and the returned ArithmeticSequence will be of the exact same type as <tt> seqN(0,fix<N>) </tt>.
+  *
+  * \sa fix, seqN, class ArithmeticSequence
+  */
+template<int N>
+static const auto fix(int val);
+
+#endif // EIGEN_PARSED_BY_DOXYGEN
+
+} // end namespace Eigen
+
+#endif // EIGEN_INTEGRAL_CONSTANT_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/util/MKL_support.h b/src/3rdparty/eigen/Eigen/src/Core/util/MKL_support.h
new file mode 100644
index 000000000..17963fad4
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/util/MKL_support.h
@@ -0,0 +1,137 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to Intel(R) MKL
+ *   Include file with common MKL declarations
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_MKL_SUPPORT_H
+#define EIGEN_MKL_SUPPORT_H
+
+#ifdef EIGEN_USE_MKL_ALL
+  #ifndef EIGEN_USE_BLAS
+    #define EIGEN_USE_BLAS
+  #endif
+  #ifndef EIGEN_USE_LAPACKE
+    #define EIGEN_USE_LAPACKE
+  #endif
+  #ifndef EIGEN_USE_MKL_VML
+    #define EIGEN_USE_MKL_VML
+  #endif
+#endif
+
+#ifdef EIGEN_USE_LAPACKE_STRICT
+  #define EIGEN_USE_LAPACKE
+#endif
+
+#if defined(EIGEN_USE_MKL_VML) && !defined(EIGEN_USE_MKL)
+  #define EIGEN_USE_MKL
+#endif
+
+
+#if defined EIGEN_USE_MKL
+#   if (!defined MKL_DIRECT_CALL) && (!defined EIGEN_MKL_NO_DIRECT_CALL)
+#       define MKL_DIRECT_CALL
+#       define MKL_DIRECT_CALL_JUST_SET
+#   endif
+#   include <mkl.h>
+/*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/
+#   ifndef INTEL_MKL_VERSION
+#       undef EIGEN_USE_MKL /* INTEL_MKL_VERSION is not even defined on older versions */
+#   elif INTEL_MKL_VERSION < 100305    /* the intel-mkl-103-release-notes say this was when the lapacke.h interface was added*/
+#       undef EIGEN_USE_MKL
+#   endif
+#   ifndef EIGEN_USE_MKL
+    /*If the MKL version is too old, undef everything*/
+#       undef   EIGEN_USE_MKL_ALL
+#       undef   EIGEN_USE_LAPACKE
+#       undef   EIGEN_USE_MKL_VML
+#       undef   EIGEN_USE_LAPACKE_STRICT
+#       undef   EIGEN_USE_LAPACKE
+#       ifdef   MKL_DIRECT_CALL_JUST_SET
+#           undef MKL_DIRECT_CALL
+#       endif
+#   endif
+#endif
+
+#if defined EIGEN_USE_MKL
+
+#define EIGEN_MKL_VML_THRESHOLD 128
+
+/* MKL_DOMAIN_BLAS, etc are defined only in 10.3 update 7 */
+/* MKL_BLAS, etc are not defined in 11.2 */
+#ifdef MKL_DOMAIN_ALL
+#define EIGEN_MKL_DOMAIN_ALL MKL_DOMAIN_ALL
+#else
+#define EIGEN_MKL_DOMAIN_ALL MKL_ALL
+#endif
+
+#ifdef MKL_DOMAIN_BLAS
+#define EIGEN_MKL_DOMAIN_BLAS MKL_DOMAIN_BLAS
+#else
+#define EIGEN_MKL_DOMAIN_BLAS MKL_BLAS
+#endif
+
+#ifdef MKL_DOMAIN_FFT
+#define EIGEN_MKL_DOMAIN_FFT MKL_DOMAIN_FFT
+#else
+#define EIGEN_MKL_DOMAIN_FFT MKL_FFT
+#endif
+
+#ifdef MKL_DOMAIN_VML
+#define EIGEN_MKL_DOMAIN_VML MKL_DOMAIN_VML
+#else
+#define EIGEN_MKL_DOMAIN_VML MKL_VML
+#endif
+
+#ifdef MKL_DOMAIN_PARDISO
+#define EIGEN_MKL_DOMAIN_PARDISO MKL_DOMAIN_PARDISO
+#else
+#define EIGEN_MKL_DOMAIN_PARDISO MKL_PARDISO
+#endif
+#endif
+
+#if defined(EIGEN_USE_BLAS) && !defined(EIGEN_USE_MKL)
+#include "../../misc/blas.h"
+#endif
+
+namespace Eigen {
+
+typedef std::complex<double> dcomplex;
+typedef std::complex<float>  scomplex;
+
+#if defined(EIGEN_USE_MKL)
+typedef MKL_INT BlasIndex;
+#else
+typedef int BlasIndex;
+#endif
+
+} // end namespace Eigen
+
+
+#endif // EIGEN_MKL_SUPPORT_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/util/Macros.h b/src/3rdparty/eigen/Eigen/src/Core/util/Macros.h
new file mode 100644
index 000000000..986c3d44d
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/util/Macros.h
@@ -0,0 +1,1464 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MACROS_H
+#define EIGEN_MACROS_H
+
+//------------------------------------------------------------------------------------------
+// Eigen version and basic defaults
+//------------------------------------------------------------------------------------------
+
+#define EIGEN_WORLD_VERSION 3
+#define EIGEN_MAJOR_VERSION 4
+#define EIGEN_MINOR_VERSION 0
+
+#define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
+                                      (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
+                                                                 EIGEN_MINOR_VERSION>=z))))
+
+#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
+#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor
+#else
+#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::ColMajor
+#endif
+
+#ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE std::ptrdiff_t
+#endif
+
+// Upperbound on the C++ version to use.
+// Expected values are 03, 11, 14, 17, etc.
+// By default, let's use an arbitrarily large C++ version.
+#ifndef EIGEN_MAX_CPP_VER
+#define EIGEN_MAX_CPP_VER 99
+#endif
+
+/** Allows to disable some optimizations which might affect the accuracy of the result.
+  * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them.
+  * They currently include:
+  *   - single precision ArrayBase::sin() and ArrayBase::cos() for SSE and AVX vectorization.
+  */
+#ifndef EIGEN_FAST_MATH
+#define EIGEN_FAST_MATH 1
+#endif
+
+#ifndef EIGEN_STACK_ALLOCATION_LIMIT
+// 131072 == 128 KB
+#define EIGEN_STACK_ALLOCATION_LIMIT 131072
+#endif
+
+//------------------------------------------------------------------------------------------
+// Compiler identification, EIGEN_COMP_*
+//------------------------------------------------------------------------------------------
+
+/// \internal EIGEN_COMP_GNUC set to 1 for all compilers compatible with GCC
+#ifdef __GNUC__
+  #define EIGEN_COMP_GNUC (__GNUC__*10+__GNUC_MINOR__)
+#else
+  #define EIGEN_COMP_GNUC 0
+#endif
+
+/// \internal EIGEN_COMP_CLANG set to major+minor version (e.g., 307 for clang 3.7) if the compiler is clang
+#if defined(__clang__)
+  #define EIGEN_COMP_CLANG (__clang_major__*100+__clang_minor__)
+#else
+  #define EIGEN_COMP_CLANG 0
+#endif
+
+/// \internal EIGEN_COMP_CASTXML set to 1 if being preprocessed by CastXML
+#if defined(__castxml__)
+  #define EIGEN_COMP_CASTXML 1
+#else
+  #define EIGEN_COMP_CASTXML 0
+#endif
+
+/// \internal EIGEN_COMP_LLVM set to 1 if the compiler backend is llvm
+#if defined(__llvm__)
+  #define EIGEN_COMP_LLVM 1
+#else
+  #define EIGEN_COMP_LLVM 0
+#endif
+
+/// \internal EIGEN_COMP_ICC set to __INTEL_COMPILER if the compiler is Intel compiler, 0 otherwise
+#if defined(__INTEL_COMPILER)
+  #define EIGEN_COMP_ICC __INTEL_COMPILER
+#else
+  #define EIGEN_COMP_ICC 0
+#endif
+
+/// \internal EIGEN_COMP_MINGW set to 1 if the compiler is mingw
+#if defined(__MINGW32__)
+  #define EIGEN_COMP_MINGW 1
+#else
+  #define EIGEN_COMP_MINGW 0
+#endif
+
+/// \internal EIGEN_COMP_SUNCC set to 1 if the compiler is Solaris Studio
+#if defined(__SUNPRO_CC)
+  #define EIGEN_COMP_SUNCC 1
+#else
+  #define EIGEN_COMP_SUNCC 0
+#endif
+
+/// \internal EIGEN_COMP_MSVC set to _MSC_VER if the compiler is Microsoft Visual C++, 0 otherwise.
+#if defined(_MSC_VER)
+  #define EIGEN_COMP_MSVC _MSC_VER
+#else
+  #define EIGEN_COMP_MSVC 0
+#endif
+
+#if defined(__NVCC__)
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+  #define EIGEN_COMP_NVCC  ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
+#elif defined(__CUDACC_VER__)
+  #define EIGEN_COMP_NVCC __CUDACC_VER__
+#else
+  #error "NVCC did not define compiler version."
+#endif
+#else
+  #define EIGEN_COMP_NVCC 0
+#endif
+
+// For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC:
+//  name        ver   MSC_VER
+//  2008         9      1500
+//  2010        10      1600
+//  2012        11      1700
+//  2013        12      1800
+//  2015        14      1900
+//  "15"        15      1900
+//  2017-14.1   15.0    1910
+//  2017-14.11  15.3    1911
+//  2017-14.12  15.5    1912
+//  2017-14.13  15.6    1913
+//  2017-14.14  15.7    1914
+
+/// \internal EIGEN_COMP_MSVC_LANG set to _MSVC_LANG if the compiler is Microsoft Visual C++, 0 otherwise.
+#if defined(_MSVC_LANG)
+  #define EIGEN_COMP_MSVC_LANG _MSVC_LANG
+#else
+  #define EIGEN_COMP_MSVC_LANG 0
+#endif
+
+// For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC_LANG:
+// MSVC option                          Standard  MSVC_LANG
+// /std:c++14 (default as of VS 2019)   C++14     201402L
+// /std:c++17                           C++17     201703L
+// /std:c++latest                       >C++17    >201703L
+
+/// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC or clang-cl
+#if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC || EIGEN_COMP_LLVM || EIGEN_COMP_CLANG)
+  #define EIGEN_COMP_MSVC_STRICT _MSC_VER
+#else
+  #define EIGEN_COMP_MSVC_STRICT 0
+#endif
+
+/// \internal EIGEN_COMP_IBM set to xlc version if the compiler is IBM XL C++
+// XLC   version
+// 3.1   0x0301
+// 4.5   0x0405
+// 5.0   0x0500
+// 12.1  0x0C01
+#if defined(__IBMCPP__) || defined(__xlc__) || defined(__ibmxl__)
+  #define EIGEN_COMP_IBM __xlC__
+#else
+  #define EIGEN_COMP_IBM 0
+#endif
+
+/// \internal EIGEN_COMP_PGI set to PGI version if the compiler is Portland Group Compiler
+#if defined(__PGI)
+  #define EIGEN_COMP_PGI (__PGIC__*100+__PGIC_MINOR__)
+#else
+  #define EIGEN_COMP_PGI 0
+#endif
+
+/// \internal EIGEN_COMP_ARM set to 1 if the compiler is ARM Compiler
+#if defined(__CC_ARM) || defined(__ARMCC_VERSION)
+  #define EIGEN_COMP_ARM 1
+#else
+  #define EIGEN_COMP_ARM 0
+#endif
+
+/// \internal EIGEN_COMP_EMSCRIPTEN set to 1 if the compiler is Emscripten Compiler
+#if defined(__EMSCRIPTEN__)
+  #define EIGEN_COMP_EMSCRIPTEN 1
+#else
+  #define EIGEN_COMP_EMSCRIPTEN 0
+#endif
+
+
+/// \internal EIGEN_GNUC_STRICT set to 1 if the compiler is really GCC and not a compatible compiler (e.g., ICC, clang, mingw, etc.)
+#if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM || EIGEN_COMP_EMSCRIPTEN)
+  #define EIGEN_COMP_GNUC_STRICT 1
+#else
+  #define EIGEN_COMP_GNUC_STRICT 0
+#endif
+
+
+#if EIGEN_COMP_GNUC
+  #define EIGEN_GNUC_AT_LEAST(x,y) ((__GNUC__==x && __GNUC_MINOR__>=y) || __GNUC__>x)
+  #define EIGEN_GNUC_AT_MOST(x,y)  ((__GNUC__==x && __GNUC_MINOR__<=y) || __GNUC__<x)
+  #define EIGEN_GNUC_AT(x,y)       ( __GNUC__==x && __GNUC_MINOR__==y )
+#else
+  #define EIGEN_GNUC_AT_LEAST(x,y) 0
+  #define EIGEN_GNUC_AT_MOST(x,y)  0
+  #define EIGEN_GNUC_AT(x,y)       0
+#endif
+
+// FIXME: could probably be removed as we do not support gcc 3.x anymore
+#if EIGEN_COMP_GNUC && (__GNUC__ <= 3)
+#define EIGEN_GCC3_OR_OLDER 1
+#else
+#define EIGEN_GCC3_OR_OLDER 0
+#endif
+
+
+
+//------------------------------------------------------------------------------------------
+// Architecture identification, EIGEN_ARCH_*
+//------------------------------------------------------------------------------------------
+
+
+#if defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)) || defined(__amd64)
+  #define EIGEN_ARCH_x86_64 1
+#else
+  #define EIGEN_ARCH_x86_64 0
+#endif
+
+#if defined(__i386__) || defined(_M_IX86) || defined(_X86_) || defined(__i386)
+  #define EIGEN_ARCH_i386 1
+#else
+  #define EIGEN_ARCH_i386 0
+#endif
+
+#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_i386
+  #define EIGEN_ARCH_i386_OR_x86_64 1
+#else
+  #define EIGEN_ARCH_i386_OR_x86_64 0
+#endif
+
+/// \internal EIGEN_ARCH_ARM set to 1 if the architecture is ARM
+#if defined(__arm__)
+  #define EIGEN_ARCH_ARM 1
+#else
+  #define EIGEN_ARCH_ARM 0
+#endif
+
+/// \internal EIGEN_ARCH_ARM64 set to 1 if the architecture is ARM64
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+  #define EIGEN_ARCH_ARM64 1
+#else
+  #define EIGEN_ARCH_ARM64 0
+#endif
+
+/// \internal EIGEN_ARCH_ARM_OR_ARM64 set to 1 if the architecture is ARM or ARM64
+#if EIGEN_ARCH_ARM || EIGEN_ARCH_ARM64
+  #define EIGEN_ARCH_ARM_OR_ARM64 1
+#else
+  #define EIGEN_ARCH_ARM_OR_ARM64 0
+#endif
+
+/// \internal EIGEN_ARCH_ARMV8 set to 1 if the architecture is armv8 or greater.
+#if EIGEN_ARCH_ARM_OR_ARM64 && defined(__ARM_ARCH) && __ARM_ARCH >= 8
+#define EIGEN_ARCH_ARMV8 1
+#else
+#define EIGEN_ARCH_ARMV8 0
+#endif
+
+
+/// \internal EIGEN_HAS_ARM64_FP16 set to 1 if the architecture provides an IEEE
+/// compliant Arm fp16 type
+#if EIGEN_ARCH_ARM64
+  #ifndef EIGEN_HAS_ARM64_FP16
+    #if defined(__ARM_FP16_FORMAT_IEEE)
+      #define EIGEN_HAS_ARM64_FP16 1
+    #else
+      #define EIGEN_HAS_ARM64_FP16 0
+    #endif
+  #endif
+#endif
+
+/// \internal EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC set to 1 if the architecture
+/// supports Neon vector intrinsics for fp16.
+#if EIGEN_ARCH_ARM64
+  #ifndef EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+    #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+      #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 1
+    #else
+      #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 0
+    #endif
+  #endif
+#endif
+
+/// \internal EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC set to 1 if the architecture
+/// supports Neon scalar intrinsics for fp16.
+#if EIGEN_ARCH_ARM64
+  #ifndef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC
+    #if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
+      #define EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC 1
+    #endif
+  #endif
+#endif
+
+/// \internal EIGEN_ARCH_MIPS set to 1 if the architecture is MIPS
+#if defined(__mips__) || defined(__mips)
+  #define EIGEN_ARCH_MIPS 1
+#else
+  #define EIGEN_ARCH_MIPS 0
+#endif
+
+/// \internal EIGEN_ARCH_SPARC set to 1 if the architecture is SPARC
+#if defined(__sparc__) || defined(__sparc)
+  #define EIGEN_ARCH_SPARC 1
+#else
+  #define EIGEN_ARCH_SPARC 0
+#endif
+
+/// \internal EIGEN_ARCH_IA64 set to 1 if the architecture is Intel Itanium
+#if defined(__ia64__)
+  #define EIGEN_ARCH_IA64 1
+#else
+  #define EIGEN_ARCH_IA64 0
+#endif
+
+/// \internal EIGEN_ARCH_PPC set to 1 if the architecture is PowerPC
+#if defined(__powerpc__) || defined(__ppc__) || defined(_M_PPC)
+  #define EIGEN_ARCH_PPC 1
+#else
+  #define EIGEN_ARCH_PPC 0
+#endif
+
+
+
+//------------------------------------------------------------------------------------------
+// Operating system identification, EIGEN_OS_*
+//------------------------------------------------------------------------------------------
+
+/// \internal EIGEN_OS_UNIX set to 1 if the OS is a unix variant
+#if defined(__unix__) || defined(__unix)
+  #define EIGEN_OS_UNIX 1
+#else
+  #define EIGEN_OS_UNIX 0
+#endif
+
+/// \internal EIGEN_OS_LINUX set to 1 if the OS is based on Linux kernel
+#if defined(__linux__)
+  #define EIGEN_OS_LINUX 1
+#else
+  #define EIGEN_OS_LINUX 0
+#endif
+
+/// \internal EIGEN_OS_ANDROID set to 1 if the OS is Android
+// note: ANDROID is defined when using ndk_build, __ANDROID__ is defined when using a standalone toolchain.
+#if defined(__ANDROID__) || defined(ANDROID)
+  #define EIGEN_OS_ANDROID 1
+#else
+  #define EIGEN_OS_ANDROID 0
+#endif
+
+/// \internal EIGEN_OS_GNULINUX set to 1 if the OS is GNU Linux and not Linux-based OS (e.g., not android)
+#if defined(__gnu_linux__) && !(EIGEN_OS_ANDROID)
+  #define EIGEN_OS_GNULINUX 1
+#else
+  #define EIGEN_OS_GNULINUX 0
+#endif
+
+/// \internal EIGEN_OS_BSD set to 1 if the OS is a BSD variant
+#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__bsdi__) || defined(__DragonFly__)
+  #define EIGEN_OS_BSD 1
+#else
+  #define EIGEN_OS_BSD 0
+#endif
+
+/// \internal EIGEN_OS_MAC set to 1 if the OS is MacOS
+#if defined(__APPLE__)
+  #define EIGEN_OS_MAC 1
+#else
+  #define EIGEN_OS_MAC 0
+#endif
+
+/// \internal EIGEN_OS_QNX set to 1 if the OS is QNX
+#if defined(__QNX__)
+  #define EIGEN_OS_QNX 1
+#else
+  #define EIGEN_OS_QNX 0
+#endif
+
+/// \internal EIGEN_OS_WIN set to 1 if the OS is Windows based
+#if defined(_WIN32)
+  #define EIGEN_OS_WIN 1
+#else
+  #define EIGEN_OS_WIN 0
+#endif
+
+/// \internal EIGEN_OS_WIN64 set to 1 if the OS is Windows 64bits
+#if defined(_WIN64)
+  #define EIGEN_OS_WIN64 1
+#else
+  #define EIGEN_OS_WIN64 0
+#endif
+
+/// \internal EIGEN_OS_WINCE set to 1 if the OS is Windows CE
+#if defined(_WIN32_WCE)
+  #define EIGEN_OS_WINCE 1
+#else
+  #define EIGEN_OS_WINCE 0
+#endif
+
+/// \internal EIGEN_OS_CYGWIN set to 1 if the OS is Windows/Cygwin
+#if defined(__CYGWIN__)
+  #define EIGEN_OS_CYGWIN 1
+#else
+  #define EIGEN_OS_CYGWIN 0
+#endif
+
+/// \internal EIGEN_OS_WIN_STRICT set to 1 if the OS is really Windows and not some variants
+#if EIGEN_OS_WIN && !( EIGEN_OS_WINCE || EIGEN_OS_CYGWIN )
+  #define EIGEN_OS_WIN_STRICT 1
+#else
+  #define EIGEN_OS_WIN_STRICT 0
+#endif
+
+/// \internal EIGEN_OS_SUN set to __SUNPRO_C if the OS is SUN
+// compiler  solaris   __SUNPRO_C
+// version   studio
+// 5.7       10        0x570
+// 5.8       11        0x580
+// 5.9       12        0x590
+// 5.10	     12.1      0x5100
+// 5.11	     12.2      0x5110
+// 5.12	     12.3      0x5120
+#if (defined(sun) || defined(__sun)) && !(defined(__SVR4) || defined(__svr4__))
+  #define EIGEN_OS_SUN __SUNPRO_C
+#else
+  #define EIGEN_OS_SUN 0
+#endif
+
+/// \internal EIGEN_OS_SOLARIS set to 1 if the OS is Solaris
+#if (defined(sun) || defined(__sun)) && (defined(__SVR4) || defined(__svr4__))
+  #define EIGEN_OS_SOLARIS 1
+#else
+  #define EIGEN_OS_SOLARIS 0
+#endif
+
+
+//------------------------------------------------------------------------------------------
+// Detect GPU compilers and architectures
+//------------------------------------------------------------------------------------------
+
+// NVCC is not supported as the target platform for HIPCC
+// Note that this also makes EIGEN_CUDACC and EIGEN_HIPCC mutually exclusive
+#if defined(__NVCC__) && defined(__HIPCC__)
+  #error "NVCC as the target platform for HIPCC is currently not supported."
+#endif
+
+#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
+  // Means the compiler is either nvcc or clang with CUDA enabled
+  #define EIGEN_CUDACC __CUDACC__
+#endif
+
+#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA)
+  // Means we are generating code for the device
+  #define EIGEN_CUDA_ARCH __CUDA_ARCH__
+#endif
+
+#if defined(EIGEN_CUDACC)
+#include <cuda.h>
+  #define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10)
+#else
+  #define EIGEN_CUDA_SDK_VER 0
+#endif
+
+#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP)
+  // Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP)
+  #define EIGEN_HIPCC __HIPCC__
+
+  // We need to include hip_runtime.h here because it pulls in
+  // ++ hip_common.h which contains the define for  __HIP_DEVICE_COMPILE__
+  // ++ host_defines.h which contains the defines for the __host__ and __device__ macros
+  #include <hip/hip_runtime.h>
+
+  #if defined(__HIP_DEVICE_COMPILE__)
+    // analogous to EIGEN_CUDA_ARCH, but for HIP
+    #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
+  #endif
+
+  // For HIP (ROCm 3.5 and higher), we need to explicitly set the launch_bounds attribute
+  // value to 1024. The compiler assigns a default value of 256 when the attribute is not
+  // specified. This results in failures on the HIP platform, for cases when a GPU kernel
+  // without an explicit launch_bounds attribute is called with a threads_per_block value
+  // greater than 256.
+  //
+  // This is a regression in functioanlity and is expected to be fixed within the next
+  // couple of ROCm releases (compiler will go back to using 1024 value as the default)
+  //
+  // In the meantime, we will use a "only enabled for HIP" macro to set the launch_bounds
+  // attribute.
+
+  #define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024)
+
+#endif
+
+#if !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024)
+#define EIGEN_HIP_LAUNCH_BOUNDS_1024
+#endif // !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024)
+
+// Unify CUDA/HIPCC
+
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
+//
+// If either EIGEN_CUDACC or EIGEN_HIPCC is defined, then define EIGEN_GPUCC
+//
+#define EIGEN_GPUCC
+//
+// EIGEN_HIPCC implies the HIP compiler and is used to tweak Eigen code for use in HIP kernels
+// EIGEN_CUDACC implies the CUDA compiler and is used to tweak Eigen code for use in CUDA kernels
+//
+// In most cases the same tweaks are required to the Eigen code to enable in both the HIP and CUDA kernels.
+// For those cases, the corresponding code should be guarded with
+//      #if defined(EIGEN_GPUCC)
+// instead of
+//      #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
+//
+// For cases where the tweak is specific to HIP, the code should be guarded with
+//      #if defined(EIGEN_HIPCC)
+//
+// For cases where the tweak is specific to CUDA, the code should be guarded with
+//      #if defined(EIGEN_CUDACC)
+//
+#endif
+
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
+//
+// If either EIGEN_CUDA_ARCH or EIGEN_HIP_DEVICE_COMPILE is defined, then define EIGEN_GPU_COMPILE_PHASE
+//
+#define EIGEN_GPU_COMPILE_PHASE
+//
+// GPU compilers (HIPCC, NVCC) typically do two passes over the source code,
+//   + one to compile the source for the "host" (ie CPU)
+//   + another to compile the source for the "device" (ie. GPU)
+//
+// Code that needs to enabled only during the either the "host" or "device" compilation phase
+// needs to be guarded with a macro that indicates the current compilation phase
+//
+// EIGEN_HIP_DEVICE_COMPILE implies the device compilation phase in HIP
+// EIGEN_CUDA_ARCH implies the device compilation phase in CUDA
+//
+// In most cases, the "host" / "device" specific code is the same for both HIP and CUDA
+// For those cases, the code should be guarded with
+//       #if defined(EIGEN_GPU_COMPILE_PHASE)
+// instead of
+//       #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
+//
+// For cases where the tweak is specific to HIP, the code should be guarded with
+//      #if defined(EIGEN_HIP_DEVICE_COMPILE)
+//
+// For cases where the tweak is specific to CUDA, the code should be guarded with
+//      #if defined(EIGEN_CUDA_ARCH)
+//
+#endif
+
+#if defined(EIGEN_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__)
+// EIGEN_USE_SYCL is a user-defined macro while __SYCL_DEVICE_ONLY__ is a compiler-defined macro.
+// In most cases we want to check if both macros are defined which can be done using the define below.
+#define SYCL_DEVICE_ONLY
+#endif
+
+//------------------------------------------------------------------------------------------
+// Detect Compiler/Architecture/OS specific features
+//------------------------------------------------------------------------------------------
+
+#if EIGEN_GNUC_AT_MOST(4,3) && !EIGEN_COMP_CLANG
+  // see bug 89
+  #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 0
+#else
+  #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 1
+#endif
+
+// Cross compiler wrapper around LLVM's __has_builtin
+#ifdef __has_builtin
+#  define EIGEN_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#  define EIGEN_HAS_BUILTIN(x) 0
+#endif
+
+// A Clang feature extension to determine compiler features.
+// We use it to determine 'cxx_rvalue_references'
+#ifndef __has_feature
+# define __has_feature(x) 0
+#endif
+
+// Some old compilers do not support template specializations like:
+// template<typename T,int N> void foo(const T x[N]);
+#if !(   EIGEN_COMP_CLANG && (   (EIGEN_COMP_CLANG<309)                                                       \
+                              || (defined(__apple_build_version__) && (__apple_build_version__ < 9000000)))  \
+      || EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<49)
+#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 1
+#else
+#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 0
+#endif
+
+// The macro EIGEN_CPLUSPLUS is a replacement for __cplusplus/_MSVC_LANG that
+// works for both platforms, indicating the C++ standard version number.
+//
+// With MSVC, without defining /Zc:__cplusplus, the __cplusplus macro will
+// report 199711L regardless of the language standard specified via /std.
+// We need to rely on _MSVC_LANG instead, which is only available after
+// VS2015.3.
+#if EIGEN_COMP_MSVC_LANG > 0
+#define EIGEN_CPLUSPLUS EIGEN_COMP_MSVC_LANG
+#elif EIGEN_COMP_MSVC >= 1900
+#define EIGEN_CPLUSPLUS 201103L
+#elif defined(__cplusplus)
+#define EIGEN_CPLUSPLUS __cplusplus
+#else
+#define EIGEN_CPLUSPLUS 0
+#endif
+
+// The macro EIGEN_COMP_CXXVER defines the c++ verson expected by the compiler.
+// For instance, if compiling with gcc and -std=c++17, then EIGEN_COMP_CXXVER
+// is defined to 17.
+#if EIGEN_CPLUSPLUS > 201703L
+  #define EIGEN_COMP_CXXVER 20
+#elif EIGEN_CPLUSPLUS > 201402L
+  #define EIGEN_COMP_CXXVER 17
+#elif EIGEN_CPLUSPLUS > 201103L
+  #define EIGEN_COMP_CXXVER 14
+#elif EIGEN_CPLUSPLUS >= 201103L
+  #define EIGEN_COMP_CXXVER 11
+#else
+  #define EIGEN_COMP_CXXVER 03
+#endif
+
+#ifndef EIGEN_HAS_CXX14_VARIABLE_TEMPLATES
+  #if defined(__cpp_variable_templates) && __cpp_variable_templates >= 201304 && EIGEN_MAX_CPP_VER>=14
+    #define EIGEN_HAS_CXX14_VARIABLE_TEMPLATES 1
+  #else
+    #define EIGEN_HAS_CXX14_VARIABLE_TEMPLATES 0
+  #endif
+#endif
+
+
+// The macros EIGEN_HAS_CXX?? defines a rough estimate of available c++ features
+// but in practice we should not rely on them but rather on the availabilty of
+// individual features as defined later.
+// This is why there is no EIGEN_HAS_CXX17.
+// FIXME: get rid of EIGEN_HAS_CXX14 and maybe even EIGEN_HAS_CXX11.
+#if EIGEN_MAX_CPP_VER>=11 && EIGEN_COMP_CXXVER>=11
+#define EIGEN_HAS_CXX11 1
+#else
+#define EIGEN_HAS_CXX11 0
+#endif
+
+#if EIGEN_MAX_CPP_VER>=14 && EIGEN_COMP_CXXVER>=14
+#define EIGEN_HAS_CXX14 1
+#else
+#define EIGEN_HAS_CXX14 0
+#endif
+
+// Do we support r-value references?
+#ifndef EIGEN_HAS_RVALUE_REFERENCES
+#if EIGEN_MAX_CPP_VER>=11 && \
+    (__has_feature(cxx_rvalue_references) || \
+     (EIGEN_COMP_CXXVER >= 11) || (EIGEN_COMP_MSVC >= 1600))
+  #define EIGEN_HAS_RVALUE_REFERENCES 1
+#else
+  #define EIGEN_HAS_RVALUE_REFERENCES 0
+#endif
+#endif
+
+// Does the compiler support C99?
+// Need to include <cmath> to make sure _GLIBCXX_USE_C99 gets defined
+#include <cmath>
+#ifndef EIGEN_HAS_C99_MATH
+#if EIGEN_MAX_CPP_VER>=11 && \
+    ((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901))       \
+  || (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) \
+  || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) \
+  || (EIGEN_COMP_MSVC >= 1900) || defined(SYCL_DEVICE_ONLY))
+  #define EIGEN_HAS_C99_MATH 1
+#else
+  #define EIGEN_HAS_C99_MATH 0
+#endif
+#endif
+
+// Does the compiler support result_of?
+// result_of was deprecated in c++17 and removed in c++ 20
+#ifndef EIGEN_HAS_STD_RESULT_OF
+#if EIGEN_HAS_CXX11 && EIGEN_COMP_CXXVER < 17
+#define EIGEN_HAS_STD_RESULT_OF 1
+#else
+#define EIGEN_HAS_STD_RESULT_OF 0
+#endif
+#endif
+
+// Does the compiler support std::hash?
+#ifndef EIGEN_HAS_STD_HASH
+// The std::hash struct is defined in C++11 but is not labelled as a __device__
+// function and is not constexpr, so cannot be used on device.
+#if EIGEN_HAS_CXX11 && !defined(EIGEN_GPU_COMPILE_PHASE)
+#define EIGEN_HAS_STD_HASH 1
+#else
+#define EIGEN_HAS_STD_HASH 0
+#endif
+#endif  // EIGEN_HAS_STD_HASH
+
+#ifndef EIGEN_HAS_STD_INVOKE_RESULT
+#if EIGEN_MAX_CPP_VER >= 17 && EIGEN_COMP_CXXVER >= 17
+#define EIGEN_HAS_STD_INVOKE_RESULT 1
+#else
+#define EIGEN_HAS_STD_INVOKE_RESULT 0
+#endif
+#endif
+
+#ifndef EIGEN_HAS_ALIGNAS
+#if EIGEN_MAX_CPP_VER>=11 && EIGEN_HAS_CXX11 &&   \
+      (     __has_feature(cxx_alignas)            \
+        ||  EIGEN_HAS_CXX14                       \
+        || (EIGEN_COMP_MSVC >= 1800)              \
+        || (EIGEN_GNUC_AT_LEAST(4,8))             \
+        || (EIGEN_COMP_CLANG>=305)                \
+        || (EIGEN_COMP_ICC>=1500)                 \
+        || (EIGEN_COMP_PGI>=1500)                 \
+        || (EIGEN_COMP_SUNCC>=0x5130))
+#define EIGEN_HAS_ALIGNAS 1
+#else
+#define EIGEN_HAS_ALIGNAS 0
+#endif
+#endif
+
+// Does the compiler support type_traits?
+// - full support of type traits was added only to GCC 5.1.0.
+// - 20150626 corresponds to the last release of 4.x libstdc++
+#ifndef EIGEN_HAS_TYPE_TRAITS
+#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_HAS_CXX11 || EIGEN_COMP_MSVC >= 1700) \
+  && ((!EIGEN_COMP_GNUC_STRICT) || EIGEN_GNUC_AT_LEAST(5, 1)) \
+  && ((!defined(__GLIBCXX__))   || __GLIBCXX__ > 20150626)
+#define EIGEN_HAS_TYPE_TRAITS 1
+#define EIGEN_INCLUDE_TYPE_TRAITS
+#else
+#define EIGEN_HAS_TYPE_TRAITS 0
+#endif
+#endif
+
+// Does the compiler support variadic templates?
+#ifndef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_COMP_CXXVER >= 11) \
+  && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_COMP_NVCC >= 80000) )
+    // ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices:
+    //    this prevents nvcc from crashing when compiling Eigen on Tegra X1
+#define EIGEN_HAS_VARIADIC_TEMPLATES 1
+#elif  EIGEN_MAX_CPP_VER>=11 && (EIGEN_COMP_CXXVER >= 11) && defined(SYCL_DEVICE_ONLY)
+#define EIGEN_HAS_VARIADIC_TEMPLATES 1
+#else
+#define EIGEN_HAS_VARIADIC_TEMPLATES 0
+#endif
+#endif
+
+// Does the compiler fully support const expressions? (as in c++14)
+#ifndef EIGEN_HAS_CONSTEXPR
+  #if defined(EIGEN_CUDACC)
+  // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
+    #if EIGEN_MAX_CPP_VER>=14 && (EIGEN_COMP_CXXVER >= 11 && (EIGEN_COMP_CLANG || EIGEN_COMP_NVCC >= 70500))
+      #define EIGEN_HAS_CONSTEXPR 1
+    #endif
+  #elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (EIGEN_COMP_CXXVER >= 14) || \
+    (EIGEN_GNUC_AT_LEAST(4,8) && (EIGEN_COMP_CXXVER >= 11)) || \
+    (EIGEN_COMP_CLANG >= 306 && (EIGEN_COMP_CXXVER >= 11)))
+    #define EIGEN_HAS_CONSTEXPR 1
+  #endif
+
+  #ifndef EIGEN_HAS_CONSTEXPR
+    #define EIGEN_HAS_CONSTEXPR 0
+  #endif
+
+#endif // EIGEN_HAS_CONSTEXPR
+
+#if EIGEN_HAS_CONSTEXPR
+#define EIGEN_CONSTEXPR constexpr
+#else
+#define EIGEN_CONSTEXPR
+#endif
+
+// Does the compiler support C++11 math?
+// Let's be conservative and enable the default C++11 implementation only if we are sure it exists
+#ifndef EIGEN_HAS_CXX11_MATH
+  #if EIGEN_MAX_CPP_VER>=11 && ((EIGEN_COMP_CXXVER > 11) || (EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC)  \
+      && (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC))
+    #define EIGEN_HAS_CXX11_MATH 1
+  #else
+    #define EIGEN_HAS_CXX11_MATH 0
+  #endif
+#endif
+
+// Does the compiler support proper C++11 containers?
+#ifndef EIGEN_HAS_CXX11_CONTAINERS
+  #if    EIGEN_MAX_CPP_VER>=11 && \
+         ((EIGEN_COMP_CXXVER > 11) \
+      || ((EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC>=1400)))
+    #define EIGEN_HAS_CXX11_CONTAINERS 1
+  #else
+    #define EIGEN_HAS_CXX11_CONTAINERS 0
+  #endif
+#endif
+
+// Does the compiler support C++11 noexcept?
+#ifndef EIGEN_HAS_CXX11_NOEXCEPT
+  #if    EIGEN_MAX_CPP_VER>=11 && \
+         (__has_feature(cxx_noexcept) \
+      || (EIGEN_COMP_CXXVER > 11) \
+      || ((EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC>=1400)))
+    #define EIGEN_HAS_CXX11_NOEXCEPT 1
+  #else
+    #define EIGEN_HAS_CXX11_NOEXCEPT 0
+  #endif
+#endif
+
+#ifndef EIGEN_HAS_CXX11_ATOMIC
+  #if    EIGEN_MAX_CPP_VER>=11 && \
+         (__has_feature(cxx_atomic) \
+      || (EIGEN_COMP_CXXVER > 11) \
+      || ((EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_MSVC==0 || EIGEN_COMP_MSVC >= 1700)))
+    #define EIGEN_HAS_CXX11_ATOMIC 1
+  #else
+    #define EIGEN_HAS_CXX11_ATOMIC 0
+  #endif
+#endif
+
+#ifndef EIGEN_HAS_CXX11_OVERRIDE_FINAL
+  #if    EIGEN_MAX_CPP_VER>=11 && \
+       (EIGEN_COMP_CXXVER >= 11 || EIGEN_COMP_MSVC >= 1700)
+    #define EIGEN_HAS_CXX11_OVERRIDE_FINAL 1
+  #else
+    #define EIGEN_HAS_CXX11_OVERRIDE_FINAL 0
+  #endif
+#endif
+
+// NOTE: the required Apple's clang version is very conservative
+//       and it could be that XCode 9 works just fine.
+// NOTE: the MSVC version is based on https://en.cppreference.com/w/cpp/compiler_support
+//       and not tested.
+#ifndef EIGEN_HAS_CXX17_OVERALIGN
+#if EIGEN_MAX_CPP_VER>=17 && EIGEN_COMP_CXXVER>=17 && (                                 \
+           (EIGEN_COMP_MSVC >= 1912)                                                    \
+        || (EIGEN_GNUC_AT_LEAST(7,0))                                                   \
+        || ((!defined(__apple_build_version__)) && (EIGEN_COMP_CLANG>=500))             \
+        || (( defined(__apple_build_version__)) && (__apple_build_version__>=10000000)) \
+      )
+#define EIGEN_HAS_CXX17_OVERALIGN 1
+#else
+#define EIGEN_HAS_CXX17_OVERALIGN 0
+#endif
+#endif
+
+#if defined(EIGEN_CUDACC) && EIGEN_HAS_CONSTEXPR
+  // While available already with c++11, this is useful mostly starting with c++14 and relaxed constexpr rules
+  #if defined(__NVCC__)
+    // nvcc considers constexpr functions as __host__ __device__ with the option --expt-relaxed-constexpr
+    #ifdef __CUDACC_RELAXED_CONSTEXPR__
+      #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
+    #endif
+  #elif defined(__clang__) && defined(__CUDA__) && __has_feature(cxx_relaxed_constexpr)
+    // clang++ always considers constexpr functions as implicitly __host__ __device__
+    #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
+  #endif
+#endif
+
+// Does the compiler support the __int128 and __uint128_t extensions for 128-bit
+// integer arithmetic?
+//
+// Clang and GCC define __SIZEOF_INT128__ when these extensions are supported,
+// but we avoid using them in certain cases:
+//
+// * Building using Clang for Windows, where the Clang runtime library has
+//   128-bit support only on LP64 architectures, but Windows is LLP64.
+#ifndef EIGEN_HAS_BUILTIN_INT128
+#if defined(__SIZEOF_INT128__) && !(EIGEN_OS_WIN && EIGEN_COMP_CLANG)
+#define EIGEN_HAS_BUILTIN_INT128 1
+#else
+#define EIGEN_HAS_BUILTIN_INT128 0
+#endif
+#endif
+
+//------------------------------------------------------------------------------------------
+// Preprocessor programming helpers
+//------------------------------------------------------------------------------------------
+
+// This macro can be used to prevent from macro expansion, e.g.:
+//   std::max EIGEN_NOT_A_MACRO(a,b)
+#define EIGEN_NOT_A_MACRO
+
+#define EIGEN_DEBUG_VAR(x) std::cerr << #x << " = " << x << std::endl;
+
+// concatenate two tokens
+#define EIGEN_CAT2(a,b) a ## b
+#define EIGEN_CAT(a,b) EIGEN_CAT2(a,b)
+
+#define EIGEN_COMMA ,
+
+// convert a token to a string
+#define EIGEN_MAKESTRING2(a) #a
+#define EIGEN_MAKESTRING(a) EIGEN_MAKESTRING2(a)
+
+// EIGEN_STRONG_INLINE is a stronger version of the inline, using __forceinline on MSVC,
+// but it still doesn't use GCC's always_inline. This is useful in (common) situations where MSVC needs forceinline
+// but GCC is still doing fine with just inline.
+#ifndef EIGEN_STRONG_INLINE
+#if (EIGEN_COMP_MSVC || EIGEN_COMP_ICC) && !defined(EIGEN_GPUCC)
+#define EIGEN_STRONG_INLINE __forceinline
+#else
+#define EIGEN_STRONG_INLINE inline
+#endif
+#endif
+
+// EIGEN_ALWAYS_INLINE is the stronget, it has the effect of making the function inline and adding every possible
+// attribute to maximize inlining. This should only be used when really necessary: in particular,
+// it uses __attribute__((always_inline)) on GCC, which most of the time is useless and can severely harm compile times.
+// FIXME with the always_inline attribute,
+// gcc 3.4.x and 4.1 reports the following compilation error:
+//   Eval.h:91: sorry, unimplemented: inlining failed in call to 'const Eigen::Eval<Derived> Eigen::MatrixBase<Scalar, Derived>::eval() const'
+//    : function body not available
+//   See also bug 1367
+#if EIGEN_GNUC_AT_LEAST(4,2) && !defined(SYCL_DEVICE_ONLY)
+#define EIGEN_ALWAYS_INLINE __attribute__((always_inline)) inline
+#else
+#define EIGEN_ALWAYS_INLINE EIGEN_STRONG_INLINE
+#endif
+
+#if EIGEN_COMP_GNUC
+#define EIGEN_DONT_INLINE __attribute__((noinline))
+#elif EIGEN_COMP_MSVC
+#define EIGEN_DONT_INLINE __declspec(noinline)
+#else
+#define EIGEN_DONT_INLINE
+#endif
+
+#if EIGEN_COMP_GNUC
+#define EIGEN_PERMISSIVE_EXPR __extension__
+#else
+#define EIGEN_PERMISSIVE_EXPR
+#endif
+
+// GPU stuff
+
+// Disable some features when compiling with GPU compilers (NVCC/clang-cuda/SYCL/HIPCC)
+#if defined(EIGEN_CUDACC) || defined(SYCL_DEVICE_ONLY) || defined(EIGEN_HIPCC)
+  // Do not try asserts on device code
+  #ifndef EIGEN_NO_DEBUG
+  #define EIGEN_NO_DEBUG
+  #endif
+
+  #ifdef EIGEN_INTERNAL_DEBUGGING
+  #undef EIGEN_INTERNAL_DEBUGGING
+  #endif
+
+  #ifdef EIGEN_EXCEPTIONS
+  #undef EIGEN_EXCEPTIONS
+  #endif
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+  #ifndef EIGEN_DONT_VECTORIZE
+    #define EIGEN_DONT_VECTORIZE
+  #endif
+  #define EIGEN_DEVICE_FUNC __attribute__((flatten)) __attribute__((always_inline))
+// All functions callable from CUDA/HIP code must be qualified with __device__
+#elif defined(EIGEN_GPUCC)
+    #define EIGEN_DEVICE_FUNC __host__ __device__
+#else
+  #define EIGEN_DEVICE_FUNC
+#endif
+
+
+// this macro allows to get rid of linking errors about multiply defined functions.
+//  - static is not very good because it prevents definitions from different object files to be merged.
+//           So static causes the resulting linked executable to be bloated with multiple copies of the same function.
+//  - inline is not perfect either as it unwantedly hints the compiler toward inlining the function.
+#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC
+#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC inline
+
+#ifdef NDEBUG
+# ifndef EIGEN_NO_DEBUG
+#  define EIGEN_NO_DEBUG
+# endif
+#endif
+
+// eigen_plain_assert is where we implement the workaround for the assert() bug in GCC <= 4.3, see bug 89
+#ifdef EIGEN_NO_DEBUG
+  #ifdef SYCL_DEVICE_ONLY // used to silence the warning on SYCL device
+    #define eigen_plain_assert(x) EIGEN_UNUSED_VARIABLE(x)
+  #else
+    #define eigen_plain_assert(x)
+  #endif
+#else
+  #if EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO
+    namespace Eigen {
+    namespace internal {
+    inline bool copy_bool(bool b) { return b; }
+    }
+    }
+    #define eigen_plain_assert(x) assert(x)
+  #else
+    // work around bug 89
+    #include <cstdlib>   // for abort
+    #include <iostream>  // for std::cerr
+
+    namespace Eigen {
+    namespace internal {
+    // trivial function copying a bool. Must be EIGEN_DONT_INLINE, so we implement it after including Eigen headers.
+    // see bug 89.
+    namespace {
+    EIGEN_DONT_INLINE bool copy_bool(bool b) { return b; }
+    }
+    inline void assert_fail(const char *condition, const char *function, const char *file, int line)
+    {
+      std::cerr << "assertion failed: " << condition << " in function " << function << " at " << file << ":" << line << std::endl;
+      abort();
+    }
+    }
+    }
+    #define eigen_plain_assert(x) \
+      do { \
+        if(!Eigen::internal::copy_bool(x)) \
+          Eigen::internal::assert_fail(EIGEN_MAKESTRING(x), __PRETTY_FUNCTION__, __FILE__, __LINE__); \
+      } while(false)
+  #endif
+#endif
+
+// eigen_assert can be overridden
+#ifndef eigen_assert
+#define eigen_assert(x) eigen_plain_assert(x)
+#endif
+
+#ifdef EIGEN_INTERNAL_DEBUGGING
+#define eigen_internal_assert(x) eigen_assert(x)
+#else
+#define eigen_internal_assert(x)
+#endif
+
+#ifdef EIGEN_NO_DEBUG
+#define EIGEN_ONLY_USED_FOR_DEBUG(x) EIGEN_UNUSED_VARIABLE(x)
+#else
+#define EIGEN_ONLY_USED_FOR_DEBUG(x)
+#endif
+
+#ifndef EIGEN_NO_DEPRECATED_WARNING
+  #if EIGEN_COMP_GNUC
+    #define EIGEN_DEPRECATED __attribute__((deprecated))
+  #elif EIGEN_COMP_MSVC
+    #define EIGEN_DEPRECATED __declspec(deprecated)
+  #else
+    #define EIGEN_DEPRECATED
+  #endif
+#else
+  #define EIGEN_DEPRECATED
+#endif
+
+#if EIGEN_COMP_GNUC
+#define EIGEN_UNUSED __attribute__((unused))
+#else
+#define EIGEN_UNUSED
+#endif
+
+// Suppresses 'unused variable' warnings.
+namespace Eigen {
+  namespace internal {
+    template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ignore_unused_variable(const T&) {}
+  }
+}
+#define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var);
+
+#if !defined(EIGEN_ASM_COMMENT)
+  #if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64)
+    #define EIGEN_ASM_COMMENT(X)  __asm__("#" X)
+  #else
+    #define EIGEN_ASM_COMMENT(X)
+  #endif
+#endif
+
+
+// Acts as a barrier preventing operations involving `X` from crossing. This
+// occurs, for example, in the fast rounding trick where a magic constant is
+// added then subtracted, which is otherwise compiled away with -ffast-math.
+//
+// See bug 1674
+#if !defined(EIGEN_OPTIMIZATION_BARRIER)
+  #if EIGEN_COMP_GNUC
+    // According to https://gcc.gnu.org/onlinedocs/gcc/Constraints.html:
+    //   X: Any operand whatsoever.
+    //   r: A register operand is allowed provided that it is in a general
+    //      register.
+    //   g: Any register, memory or immediate integer operand is allowed, except
+    //      for registers that are not general registers.
+    //   w: (AArch32/AArch64) Floating point register, Advanced SIMD vector
+    //      register or SVE vector register.
+    //   x: (SSE) Any SSE register.
+    //      (AArch64) Like w, but restricted to registers 0 to 15 inclusive.
+    //   v: (PowerPC) An Altivec vector register.
+    //   wa:(PowerPC) A VSX register.
+    //
+    // "X" (uppercase) should work for all cases, though this seems to fail for
+    // some versions of GCC for arm/aarch64 with
+    //   "error: inconsistent operand constraints in an 'asm'"
+    // Clang x86_64/arm/aarch64 seems to require "g" to support both scalars and
+    // vectors, otherwise
+    //   "error: non-trivial scalar-to-vector conversion, possible invalid
+    //    constraint for vector type"
+    //
+    // GCC for ppc64le generates an internal compiler error with x/X/g.
+    // GCC for AVX generates an internal compiler error with X.
+    //
+    // Tested on icc/gcc/clang for sse, avx, avx2, avx512dq
+    //           gcc for arm, aarch64,
+    //           gcc for ppc64le,
+    // both vectors and scalars.
+    //
+    // Note that this is restricted to plain types - this will not work
+    // directly for std::complex<T>, Eigen::half, Eigen::bfloat16. For these,
+    // you will need to apply to the underlying POD type.
+    #if EIGEN_ARCH_PPC && EIGEN_COMP_GNUC_STRICT
+      // This seems to be broken on clang.  Packet4f is loaded into a single
+      //   register rather than a vector, zeroing out some entries.  Integer
+      //   types also generate a compile error.
+      // General, Altivec, VSX.
+      #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+r,v,wa" (X));
+    #elif EIGEN_ARCH_ARM_OR_ARM64
+      // General, NEON.
+      #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+g,w" (X));
+    #elif EIGEN_ARCH_i386_OR_x86_64
+      // General, SSE.
+      #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+g,x" (X));
+    #else
+      // Not implemented for other architectures.
+      #define EIGEN_OPTIMIZATION_BARRIER(X)
+    #endif
+  #else
+    // Not implemented for other compilers.
+    #define EIGEN_OPTIMIZATION_BARRIER(X)
+  #endif
+#endif
+
+#if EIGEN_COMP_MSVC
+  // NOTE MSVC often gives C4127 warnings with compiletime if statements. See bug 1362.
+  // This workaround is ugly, but it does the job.
+#  define EIGEN_CONST_CONDITIONAL(cond)  (void)0, cond
+#else
+#  define EIGEN_CONST_CONDITIONAL(cond)  cond
+#endif
+
+#ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD
+  #define EIGEN_RESTRICT
+#endif
+#ifndef EIGEN_RESTRICT
+  #define EIGEN_RESTRICT __restrict
+#endif
+
+
+#ifndef EIGEN_DEFAULT_IO_FORMAT
+#ifdef EIGEN_MAKING_DOCS
+// format used in Eigen's documentation
+// needed to define it here as escaping characters in CMake add_definition's argument seems very problematic.
+#define EIGEN_DEFAULT_IO_FORMAT Eigen::IOFormat(3, 0, " ", "\n", "", "")
+#else
+#define EIGEN_DEFAULT_IO_FORMAT Eigen::IOFormat()
+#endif
+#endif
+
+// just an empty macro !
+#define EIGEN_EMPTY
+
+
+// When compiling CUDA/HIP device code with NVCC or HIPCC
+// pull in math functions from the global namespace.
+// In host mode, and when device code is compiled with clang,
+// use the std versions.
+#if (defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)) || defined(EIGEN_HIP_DEVICE_COMPILE)
+  #define EIGEN_USING_STD(FUNC) using ::FUNC;
+#else
+  #define EIGEN_USING_STD(FUNC) using std::FUNC;
+#endif
+
+#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || (EIGEN_COMP_MSVC == 1900 && EIGEN_COMP_NVCC))
+  // For older MSVC versions, as well as 1900 && CUDA 8, using the base operator is necessary,
+  //   otherwise we get duplicate definition errors
+  // For later MSVC versions, we require explicit operator= definition, otherwise we get
+  //   use of implicitly deleted operator errors.
+  // (cf Bugs 920, 1000, 1324, 2291)
+  #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+    using Base::operator =;
+#elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
+  #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+    using Base::operator =; \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \
+    template <typename OtherDerived> \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase<OtherDerived>& other) { Base::operator=(other.derived()); return *this; }
+#else
+  #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+    using Base::operator =; \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) \
+    { \
+      Base::operator=(other); \
+      return *this; \
+    }
+#endif
+
+
+/**
+ * \internal
+ * \brief Macro to explicitly define the default copy constructor.
+ * This is necessary, because the implicit definition is deprecated if the copy-assignment is overridden.
+ */
+#if EIGEN_HAS_CXX11
+#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) CLASS(const CLASS&) = default;
+#else
+#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS)
+#endif
+
+
+
+/** \internal
+ * \brief Macro to manually inherit assignment operators.
+ * This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is defined.
+ * With C++11 or later this also default-implements the copy-constructor
+ */
+#define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived)  \
+    EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(Derived)
+
+/** \internal
+ * \brief Macro to manually define default constructors and destructors.
+ * This is necessary when the copy constructor is re-defined.
+ * For empty helper classes this should usually be protected, to avoid accidentally creating empty objects.
+ *
+ * Hiding the default destructor lead to problems in C++03 mode together with boost::multiprecision
+ */
+#if EIGEN_HAS_CXX11
+#define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived)  \
+    Derived() = default; \
+    ~Derived() = default;
+#else
+#define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived)  \
+    Derived() {}; \
+    /* ~Derived() {}; */
+#endif
+
+
+
+
+
+/**
+* Just a side note. Commenting within defines works only by documenting
+* behind the object (via '!<'). Comments cannot be multi-line and thus
+* we have these extra long lines. What is confusing doxygen over here is
+* that we use '\' and basically have a bunch of typedefs with their
+* documentation in a single line.
+**/
+
+#define EIGEN_GENERIC_PUBLIC_INTERFACE(Derived) \
+  typedef typename Eigen::internal::traits<Derived>::Scalar Scalar; /*!< \brief Numeric type, e.g. float, double, int or std::complex<float>. */ \
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; /*!< \brief The underlying numeric type for composed scalar types. \details In cases where Scalar is e.g. std::complex<T>, T were corresponding to RealScalar. */ \
+  typedef typename Base::CoeffReturnType CoeffReturnType; /*!< \brief The return type for coefficient access. \details Depending on whether the object allows direct coefficient access (e.g. for a MatrixXd), this type is either 'const Scalar&' or simply 'Scalar' for objects that do not allow direct coefficient access. */ \
+  typedef typename Eigen::internal::ref_selector<Derived>::type Nested; \
+  typedef typename Eigen::internal::traits<Derived>::StorageKind StorageKind; \
+  typedef typename Eigen::internal::traits<Derived>::StorageIndex StorageIndex; \
+  enum CompileTimeTraits \
+      { RowsAtCompileTime = Eigen::internal::traits<Derived>::RowsAtCompileTime, \
+        ColsAtCompileTime = Eigen::internal::traits<Derived>::ColsAtCompileTime, \
+        Flags = Eigen::internal::traits<Derived>::Flags, \
+        SizeAtCompileTime = Base::SizeAtCompileTime, \
+        MaxSizeAtCompileTime = Base::MaxSizeAtCompileTime, \
+        IsVectorAtCompileTime = Base::IsVectorAtCompileTime }; \
+  using Base::derived; \
+  using Base::const_cast_derived;
+
+
+// FIXME Maybe the EIGEN_DENSE_PUBLIC_INTERFACE could be removed as importing PacketScalar is rarely needed
+#define EIGEN_DENSE_PUBLIC_INTERFACE(Derived) \
+  EIGEN_GENERIC_PUBLIC_INTERFACE(Derived) \
+  typedef typename Base::PacketScalar PacketScalar;
+
+
+#define EIGEN_PLAIN_ENUM_MIN(a,b) (((int)a <= (int)b) ? (int)a : (int)b)
+#define EIGEN_PLAIN_ENUM_MAX(a,b) (((int)a >= (int)b) ? (int)a : (int)b)
+
+// EIGEN_SIZE_MIN_PREFER_DYNAMIC gives the min between compile-time sizes. 0 has absolute priority, followed by 1,
+// followed by Dynamic, followed by other finite values. The reason for giving Dynamic the priority over
+// finite values is that min(3, Dynamic) should be Dynamic, since that could be anything between 0 and 3.
+#define EIGEN_SIZE_MIN_PREFER_DYNAMIC(a,b) (((int)a == 0 || (int)b == 0) ? 0 \
+                           : ((int)a == 1 || (int)b == 1) ? 1 \
+                           : ((int)a == Dynamic || (int)b == Dynamic) ? Dynamic \
+                           : ((int)a <= (int)b) ? (int)a : (int)b)
+
+// EIGEN_SIZE_MIN_PREFER_FIXED is a variant of EIGEN_SIZE_MIN_PREFER_DYNAMIC comparing MaxSizes. The difference is that finite values
+// now have priority over Dynamic, so that min(3, Dynamic) gives 3. Indeed, whatever the actual value is
+// (between 0 and 3), it is not more than 3.
+#define EIGEN_SIZE_MIN_PREFER_FIXED(a,b)  (((int)a == 0 || (int)b == 0) ? 0 \
+                           : ((int)a == 1 || (int)b == 1) ? 1 \
+                           : ((int)a == Dynamic && (int)b == Dynamic) ? Dynamic \
+                           : ((int)a == Dynamic) ? (int)b \
+                           : ((int)b == Dynamic) ? (int)a \
+                           : ((int)a <= (int)b) ? (int)a : (int)b)
+
+// see EIGEN_SIZE_MIN_PREFER_DYNAMIC. No need for a separate variant for MaxSizes here.
+#define EIGEN_SIZE_MAX(a,b) (((int)a == Dynamic || (int)b == Dynamic) ? Dynamic \
+                           : ((int)a >= (int)b) ? (int)a : (int)b)
+
+#define EIGEN_LOGICAL_XOR(a,b) (((a) || (b)) && !((a) && (b)))
+
+#define EIGEN_IMPLIES(a,b) (!(a) || (b))
+
+#if EIGEN_HAS_BUILTIN(__builtin_expect) || EIGEN_COMP_GNUC
+#define EIGEN_PREDICT_FALSE(x) (__builtin_expect(x, false))
+#define EIGEN_PREDICT_TRUE(x) (__builtin_expect(false || (x), true))
+#else
+#define EIGEN_PREDICT_FALSE(x) (x)
+#define EIGEN_PREDICT_TRUE(x) (x)
+#endif
+
+// the expression type of a standard coefficient wise binary operation
+#define EIGEN_CWISE_BINARY_RETURN_TYPE(LHS,RHS,OPNAME) \
+    CwiseBinaryOp< \
+      EIGEN_CAT(EIGEN_CAT(internal::scalar_,OPNAME),_op)< \
+          typename internal::traits<LHS>::Scalar, \
+          typename internal::traits<RHS>::Scalar \
+      >, \
+      const LHS, \
+      const RHS \
+    >
+
+#define EIGEN_MAKE_CWISE_BINARY_OP(METHOD,OPNAME) \
+  template<typename OtherDerived> \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,OPNAME) \
+  (METHOD)(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const \
+  { \
+    return EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,OPNAME)(derived(), other.derived()); \
+  }
+
+#define EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,TYPEA,TYPEB) \
+  (Eigen::internal::has_ReturnType<Eigen::ScalarBinaryOpTraits<TYPEA,TYPEB,EIGEN_CAT(EIGEN_CAT(Eigen::internal::scalar_,OPNAME),_op)<TYPEA,TYPEB>  > >::value)
+
+#define EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(EXPR,SCALAR,OPNAME) \
+  CwiseBinaryOp<EIGEN_CAT(EIGEN_CAT(internal::scalar_,OPNAME),_op)<typename internal::traits<EXPR>::Scalar,SCALAR>, const EXPR, \
+                const typename internal::plain_constant_type<EXPR,SCALAR>::type>
+
+#define EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(SCALAR,EXPR,OPNAME) \
+  CwiseBinaryOp<EIGEN_CAT(EIGEN_CAT(internal::scalar_,OPNAME),_op)<SCALAR,typename internal::traits<EXPR>::Scalar>, \
+                const typename internal::plain_constant_type<EXPR,SCALAR>::type, const EXPR>
+
+// Workaround for MSVC 2010 (see ML thread "patch with compile for for MSVC 2010")
+#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC_STRICT<=1600)
+#define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) typename internal::enable_if<true,X>::type
+#else
+#define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) X
+#endif
+
+#define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME) \
+  template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+  EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg<Scalar EIGEN_COMMA T EIGEN_COMMA EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,Scalar,T)>::type,OPNAME))\
+  (METHOD)(const T& scalar) const { \
+    typedef typename internal::promote_scalar_arg<Scalar,T,EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,Scalar,T)>::type PromotedT; \
+    return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedT,OPNAME)(derived(), \
+           typename internal::plain_constant_type<Derived,PromotedT>::type(derived().rows(), derived().cols(), internal::scalar_constant_op<PromotedT>(scalar))); \
+  }
+
+#define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(METHOD,OPNAME) \
+  template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend \
+  EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg<Scalar EIGEN_COMMA T EIGEN_COMMA EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,T,Scalar)>::type,Derived,OPNAME)) \
+  (METHOD)(const T& scalar, const StorageBaseType& matrix) { \
+    typedef typename internal::promote_scalar_arg<Scalar,T,EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,T,Scalar)>::type PromotedT; \
+    return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedT,Derived,OPNAME)( \
+           typename internal::plain_constant_type<Derived,PromotedT>::type(matrix.derived().rows(), matrix.derived().cols(), internal::scalar_constant_op<PromotedT>(scalar)), matrix.derived()); \
+  }
+
+#define EIGEN_MAKE_SCALAR_BINARY_OP(METHOD,OPNAME) \
+  EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(METHOD,OPNAME) \
+  EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME)
+
+
+#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) && !defined(EIGEN_HIP_DEVICE_COMPILE)
+  #define EIGEN_EXCEPTIONS
+#endif
+
+
+#ifdef EIGEN_EXCEPTIONS
+#  define EIGEN_THROW_X(X) throw X
+#  define EIGEN_THROW throw
+#  define EIGEN_TRY try
+#  define EIGEN_CATCH(X) catch (X)
+#else
+#  if defined(EIGEN_CUDA_ARCH)
+#    define EIGEN_THROW_X(X) asm("trap;")
+#    define EIGEN_THROW asm("trap;")
+#  elif defined(EIGEN_HIP_DEVICE_COMPILE)
+#    define EIGEN_THROW_X(X) asm("s_trap 0")
+#    define EIGEN_THROW asm("s_trap 0")
+#  else
+#    define EIGEN_THROW_X(X) std::abort()
+#    define EIGEN_THROW std::abort()
+#  endif
+#  define EIGEN_TRY if (true)
+#  define EIGEN_CATCH(X) else
+#endif
+
+
+#if EIGEN_HAS_CXX11_NOEXCEPT
+#   define EIGEN_INCLUDE_TYPE_TRAITS
+#   define EIGEN_NOEXCEPT noexcept
+#   define EIGEN_NOEXCEPT_IF(x) noexcept(x)
+#   define EIGEN_NO_THROW noexcept(true)
+#   define EIGEN_EXCEPTION_SPEC(X) noexcept(false)
+#else
+#   define EIGEN_NOEXCEPT
+#   define EIGEN_NOEXCEPT_IF(x)
+#   define EIGEN_NO_THROW throw()
+#   if EIGEN_COMP_MSVC || EIGEN_COMP_CXXVER>=17
+      // MSVC does not support exception specifications (warning C4290),
+      // and they are deprecated in c++11 anyway. This is even an error in c++17.
+#     define EIGEN_EXCEPTION_SPEC(X) throw()
+#   else
+#     define EIGEN_EXCEPTION_SPEC(X) throw(X)
+#   endif
+#endif
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+// The all function is used to enable a variadic version of eigen_assert which can take a parameter pack as its input.
+namespace Eigen {
+namespace internal {
+
+inline bool all(){ return true; }
+
+template<typename T, typename ...Ts>
+bool all(T t, Ts ... ts){ return t && all(ts...); }
+
+}
+}
+#endif
+
+#if EIGEN_HAS_CXX11_OVERRIDE_FINAL
+// provide override and final specifiers if they are available:
+#   define EIGEN_OVERRIDE override
+#   define EIGEN_FINAL final
+#else
+#   define EIGEN_OVERRIDE
+#   define EIGEN_FINAL
+#endif
+
+// Wrapping #pragma unroll in a macro since it is required for SYCL
+#if defined(SYCL_DEVICE_ONLY)
+  #if defined(_MSC_VER)
+    #define EIGEN_UNROLL_LOOP __pragma(unroll)
+  #else
+    #define EIGEN_UNROLL_LOOP _Pragma("unroll")
+  #endif
+#else
+  #define EIGEN_UNROLL_LOOP
+#endif
+
+#endif // EIGEN_MACROS_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/util/Memory.h b/src/3rdparty/eigen/Eigen/src/Core/util/Memory.h
new file mode 100644
index 000000000..875318cdb
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/util/Memory.h
@@ -0,0 +1,1163 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2009 Kenneth Riddile <kfriddile@yahoo.com>
+// Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
+// Copyright (C) 2010 Thomas Capricelli <orzel@freehackers.org>
+// Copyright (C) 2013 Pavel Holoborodko <pavel@holoborodko.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+/*****************************************************************************
+*** Platform checks for aligned malloc functions                           ***
+*****************************************************************************/
+
+#ifndef EIGEN_MEMORY_H
+#define EIGEN_MEMORY_H
+
+#ifndef EIGEN_MALLOC_ALREADY_ALIGNED
+
+// Try to determine automatically if malloc is already aligned.
+
+// On 64-bit systems, glibc's malloc returns 16-byte-aligned pointers, see:
+//   http://www.gnu.org/s/libc/manual/html_node/Aligned-Memory-Blocks.html
+// This is true at least since glibc 2.8.
+// This leaves the question how to detect 64-bit. According to this document,
+//   http://gcc.fyxm.net/summit/2003/Porting%20to%2064%20bit.pdf
+// page 114, "[The] LP64 model [...] is used by all 64-bit UNIX ports" so it's indeed
+// quite safe, at least within the context of glibc, to equate 64-bit with LP64.
+#if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 8) || __GLIBC__>2) \
+ && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ ) && (EIGEN_DEFAULT_ALIGN_BYTES == 16)
+  #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1
+#else
+  #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0
+#endif
+
+// FreeBSD 6 seems to have 16-byte aligned malloc
+//   See http://svn.freebsd.org/viewvc/base/stable/6/lib/libc/stdlib/malloc.c?view=markup
+// FreeBSD 7 seems to have 16-byte aligned malloc except on ARM and MIPS architectures
+//   See http://svn.freebsd.org/viewvc/base/stable/7/lib/libc/stdlib/malloc.c?view=markup
+#if defined(__FreeBSD__) && !(EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) && (EIGEN_DEFAULT_ALIGN_BYTES == 16)
+  #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 1
+#else
+  #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0
+#endif
+
+#if (EIGEN_OS_MAC && (EIGEN_DEFAULT_ALIGN_BYTES == 16))     \
+ || (EIGEN_OS_WIN64 && (EIGEN_DEFAULT_ALIGN_BYTES == 16))   \
+ || EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED              \
+ || EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED
+  #define EIGEN_MALLOC_ALREADY_ALIGNED 1
+#else
+  #define EIGEN_MALLOC_ALREADY_ALIGNED 0
+#endif
+
+#endif
+
+namespace Eigen {
+
+namespace internal {
+
+EIGEN_DEVICE_FUNC
+inline void throw_std_bad_alloc()
+{
+  #ifdef EIGEN_EXCEPTIONS
+    throw std::bad_alloc();
+  #else
+    std::size_t huge = static_cast<std::size_t>(-1);
+    #if defined(EIGEN_HIPCC)
+    //
+    // calls to "::operator new" are to be treated as opaque function calls (i.e no inlining),
+    // and as a consequence the code in the #else block triggers the hipcc warning :
+    // "no overloaded function has restriction specifiers that are compatible with the ambient context"
+    //
+    // "throw_std_bad_alloc" has the EIGEN_DEVICE_FUNC attribute, so it seems that hipcc expects
+    // the same on "operator new"
+    // Reverting code back to the old version in this #if block for the hipcc compiler
+    //
+    new int[huge];
+    #else
+    void* unused = ::operator new(huge);
+    EIGEN_UNUSED_VARIABLE(unused);
+    #endif
+  #endif
+}
+
+/*****************************************************************************
+*** Implementation of handmade aligned functions                           ***
+*****************************************************************************/
+
+/* ----- Hand made implementations of aligned malloc/free and realloc ----- */
+
+/** \internal Like malloc, but the returned pointer is guaranteed to be 16-byte aligned.
+  * Fast, but wastes 16 additional bytes of memory. Does not throw any exception.
+  */
+EIGEN_DEVICE_FUNC inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES)
+{
+  eigen_assert(alignment >= sizeof(void*) && (alignment & (alignment-1)) == 0 && "Alignment must be at least sizeof(void*) and a power of 2");
+
+  EIGEN_USING_STD(malloc)
+  void *original = malloc(size+alignment);
+  
+  if (original == 0) return 0;
+  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(alignment-1))) + alignment);
+  *(reinterpret_cast<void**>(aligned) - 1) = original;
+  return aligned;
+}
+
+/** \internal Frees memory allocated with handmade_aligned_malloc */
+EIGEN_DEVICE_FUNC inline void handmade_aligned_free(void *ptr)
+{
+  if (ptr) {
+    EIGEN_USING_STD(free)
+    free(*(reinterpret_cast<void**>(ptr) - 1));
+  }
+}
+
+/** \internal
+  * \brief Reallocates aligned memory.
+  * Since we know that our handmade version is based on std::malloc
+  * we can use std::realloc to implement efficient reallocation.
+  */
+inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = 0)
+{
+  if (ptr == 0) return handmade_aligned_malloc(size);
+  void *original = *(reinterpret_cast<void**>(ptr) - 1);
+  std::ptrdiff_t previous_offset = static_cast<char *>(ptr)-static_cast<char *>(original);
+  original = std::realloc(original,size+EIGEN_DEFAULT_ALIGN_BYTES);
+  if (original == 0) return 0;
+  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1))) + EIGEN_DEFAULT_ALIGN_BYTES);
+  void *previous_aligned = static_cast<char *>(original)+previous_offset;
+  if(aligned!=previous_aligned)
+    std::memmove(aligned, previous_aligned, size);
+
+  *(reinterpret_cast<void**>(aligned) - 1) = original;
+  return aligned;
+}
+
+/*****************************************************************************
+*** Implementation of portable aligned versions of malloc/free/realloc     ***
+*****************************************************************************/
+
+#ifdef EIGEN_NO_MALLOC
+EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
+{
+  eigen_assert(false && "heap allocation is forbidden (EIGEN_NO_MALLOC is defined)");
+}
+#elif defined EIGEN_RUNTIME_NO_MALLOC
+EIGEN_DEVICE_FUNC inline bool is_malloc_allowed_impl(bool update, bool new_value = false)
+{
+  static bool value = true;
+  if (update == 1)
+    value = new_value;
+  return value;
+}
+EIGEN_DEVICE_FUNC inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); }
+EIGEN_DEVICE_FUNC inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); }
+EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
+{
+  eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)");
+}
+#else
+EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
+{}
+#endif
+
+/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements.
+  * On allocation error, the returned pointer is null, and std::bad_alloc is thrown.
+  */
+EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
+{
+  check_that_malloc_is_allowed();
+
+  void *result;
+  #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
+
+    EIGEN_USING_STD(malloc)
+    result = malloc(size);
+
+    #if EIGEN_DEFAULT_ALIGN_BYTES==16
+    eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade aligned memory allocator.");
+    #endif
+  #else
+    result = handmade_aligned_malloc(size);
+  #endif
+
+  if(!result && size)
+    throw_std_bad_alloc();
+
+  return result;
+}
+
+/** \internal Frees memory allocated with aligned_malloc. */
+EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr)
+{
+  #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
+
+    EIGEN_USING_STD(free)
+    free(ptr);
+
+  #else
+    handmade_aligned_free(ptr);
+  #endif
+}
+
+/**
+  * \internal
+  * \brief Reallocates an aligned block of memory.
+  * \throws std::bad_alloc on allocation failure
+  */
+inline void* aligned_realloc(void *ptr, std::size_t new_size, std::size_t old_size)
+{
+  EIGEN_UNUSED_VARIABLE(old_size)
+
+  void *result;
+#if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
+  result = std::realloc(ptr,new_size);
+#else
+  result = handmade_aligned_realloc(ptr,new_size,old_size);
+#endif
+
+  if (!result && new_size)
+    throw_std_bad_alloc();
+
+  return result;
+}
+
+/*****************************************************************************
+*** Implementation of conditionally aligned functions                      ***
+*****************************************************************************/
+
+/** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned.
+  * On allocation error, the returned pointer is null, and a std::bad_alloc is thrown.
+  */
+template<bool Align> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(std::size_t size)
+{
+  return aligned_malloc(size);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(std::size_t size)
+{
+  check_that_malloc_is_allowed();
+
+  EIGEN_USING_STD(malloc)
+  void *result = malloc(size);
+
+  if(!result && size)
+    throw_std_bad_alloc();
+  return result;
+}
+
+/** \internal Frees memory allocated with conditional_aligned_malloc */
+template<bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr)
+{
+  aligned_free(ptr);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr)
+{
+  EIGEN_USING_STD(free)
+  free(ptr);
+}
+
+template<bool Align> inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size)
+{
+  return aligned_realloc(ptr, new_size, old_size);
+}
+
+template<> inline void* conditional_aligned_realloc<false>(void* ptr, std::size_t new_size, std::size_t)
+{
+  return std::realloc(ptr, new_size);
+}
+
+/*****************************************************************************
+*** Construction/destruction of array elements                             ***
+*****************************************************************************/
+
+/** \internal Destructs the elements of an array.
+  * The \a size parameters tells on how many objects to call the destructor of T.
+  */
+template<typename T> EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T *ptr, std::size_t size)
+{
+  // always destruct an array starting from the end.
+  if(ptr)
+    while(size) ptr[--size].~T();
+}
+
+/** \internal Constructs the elements of an array.
+  * The \a size parameter tells on how many objects to call the constructor of T.
+  */
+template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, std::size_t size)
+{
+  std::size_t i;
+  EIGEN_TRY
+  {
+      for (i = 0; i < size; ++i) ::new (ptr + i) T;
+      return ptr;
+  }
+  EIGEN_CATCH(...)
+  {
+    destruct_elements_of_array(ptr, i);
+    EIGEN_THROW;
+  }
+  return NULL;
+}
+
+/*****************************************************************************
+*** Implementation of aligned new/delete-like functions                    ***
+*****************************************************************************/
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(std::size_t size)
+{
+  if(size > std::size_t(-1) / sizeof(T))
+    throw_std_bad_alloc();
+}
+
+/** \internal Allocates \a size objects of type T. The returned pointer is guaranteed to have 16 bytes alignment.
+  * On allocation error, the returned pointer is undefined, but a std::bad_alloc is thrown.
+  * The default constructor of T is called.
+  */
+template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(std::size_t size)
+{
+  check_size_for_overflow<T>(size);
+  T *result = reinterpret_cast<T*>(aligned_malloc(sizeof(T)*size));
+  EIGEN_TRY
+  {
+    return construct_elements_of_array(result, size);
+  }
+  EIGEN_CATCH(...)
+  {
+    aligned_free(result);
+    EIGEN_THROW;
+  }
+  return result;
+}
+
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(std::size_t size)
+{
+  check_size_for_overflow<T>(size);
+  T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
+  EIGEN_TRY
+  {
+    return construct_elements_of_array(result, size);
+  }
+  EIGEN_CATCH(...)
+  {
+    conditional_aligned_free<Align>(result);
+    EIGEN_THROW;
+  }
+  return result;
+}
+
+/** \internal Deletes objects constructed with aligned_new
+  * The \a size parameters tells on how many objects to call the destructor of T.
+  */
+template<typename T> EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, std::size_t size)
+{
+  destruct_elements_of_array<T>(ptr, size);
+  Eigen::internal::aligned_free(ptr);
+}
+
+/** \internal Deletes objects constructed with conditional_aligned_new
+  * The \a size parameters tells on how many objects to call the destructor of T.
+  */
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete(T *ptr, std::size_t size)
+{
+  destruct_elements_of_array<T>(ptr, size);
+  conditional_aligned_free<Align>(ptr);
+}
+
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new(T* pts, std::size_t new_size, std::size_t old_size)
+{
+  check_size_for_overflow<T>(new_size);
+  check_size_for_overflow<T>(old_size);
+  if(new_size < old_size)
+    destruct_elements_of_array(pts+new_size, old_size-new_size);
+  T *result = reinterpret_cast<T*>(conditional_aligned_realloc<Align>(reinterpret_cast<void*>(pts), sizeof(T)*new_size, sizeof(T)*old_size));
+  if(new_size > old_size)
+  {
+    EIGEN_TRY
+    {
+      construct_elements_of_array(result+old_size, new_size-old_size);
+    }
+    EIGEN_CATCH(...)
+    {
+      conditional_aligned_free<Align>(result);
+      EIGEN_THROW;
+    }
+  }
+  return result;
+}
+
+
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new_auto(std::size_t size)
+{
+  if(size==0)
+    return 0; // short-cut. Also fixes Bug 884
+  check_size_for_overflow<T>(size);
+  T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
+  if(NumTraits<T>::RequireInitialization)
+  {
+    EIGEN_TRY
+    {
+      construct_elements_of_array(result, size);
+    }
+    EIGEN_CATCH(...)
+    {
+      conditional_aligned_free<Align>(result);
+      EIGEN_THROW;
+    }
+  }
+  return result;
+}
+
+template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto(T* pts, std::size_t new_size, std::size_t old_size)
+{
+  check_size_for_overflow<T>(new_size);
+  check_size_for_overflow<T>(old_size);
+  if(NumTraits<T>::RequireInitialization && (new_size < old_size))
+    destruct_elements_of_array(pts+new_size, old_size-new_size);
+  T *result = reinterpret_cast<T*>(conditional_aligned_realloc<Align>(reinterpret_cast<void*>(pts), sizeof(T)*new_size, sizeof(T)*old_size));
+  if(NumTraits<T>::RequireInitialization && (new_size > old_size))
+  {
+    EIGEN_TRY
+    {
+      construct_elements_of_array(result+old_size, new_size-old_size);
+    }
+    EIGEN_CATCH(...)
+    {
+      conditional_aligned_free<Align>(result);
+      EIGEN_THROW;
+    }
+  }
+  return result;
+}
+
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, std::size_t size)
+{
+  if(NumTraits<T>::RequireInitialization)
+    destruct_elements_of_array<T>(ptr, size);
+  conditional_aligned_free<Align>(ptr);
+}
+
+/****************************************************************************/
+
+/** \internal Returns the index of the first element of the array that is well aligned with respect to the requested \a Alignment.
+  *
+  * \tparam Alignment requested alignment in Bytes.
+  * \param array the address of the start of the array
+  * \param size the size of the array
+  *
+  * \note If no element of the array is well aligned or the requested alignment is not a multiple of a scalar,
+  * the size of the array is returned. For example with SSE, the requested alignment is typically 16-bytes. If
+  * packet size for the given scalar type is 1, then everything is considered well-aligned.
+  *
+  * \note Otherwise, if the Alignment is larger that the scalar size, we rely on the assumptions that sizeof(Scalar) is a
+  * power of 2. On the other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for
+  * example with Scalar=double on certain 32-bit platforms, see bug #79.
+  *
+  * There is also the variant first_aligned(const MatrixBase&) defined in DenseCoeffsBase.h.
+  * \sa first_default_aligned()
+  */
+template<int Alignment, typename Scalar, typename Index>
+EIGEN_DEVICE_FUNC inline Index first_aligned(const Scalar* array, Index size)
+{
+  const Index ScalarSize = sizeof(Scalar);
+  const Index AlignmentSize = Alignment / ScalarSize;
+  const Index AlignmentMask = AlignmentSize-1;
+
+  if(AlignmentSize<=1)
+  {
+    // Either the requested alignment if smaller than a scalar, or it exactly match a 1 scalar
+    // so that all elements of the array have the same alignment.
+    return 0;
+  }
+  else if( (UIntPtr(array) & (sizeof(Scalar)-1)) || (Alignment%ScalarSize)!=0)
+  {
+    // The array is not aligned to the size of a single scalar, or the requested alignment is not a multiple of the scalar size.
+    // Consequently, no element of the array is well aligned.
+    return size;
+  }
+  else
+  {
+    Index first = (AlignmentSize - (Index((UIntPtr(array)/sizeof(Scalar))) & AlignmentMask)) & AlignmentMask;
+    return (first < size) ? first : size;
+  }
+}
+
+/** \internal Returns the index of the first element of the array that is well aligned with respect the largest packet requirement.
+   * \sa first_aligned(Scalar*,Index) and first_default_aligned(DenseBase<Derived>) */
+template<typename Scalar, typename Index>
+EIGEN_DEVICE_FUNC inline Index first_default_aligned(const Scalar* array, Index size)
+{
+  typedef typename packet_traits<Scalar>::type DefaultPacketType;
+  return first_aligned<unpacket_traits<DefaultPacketType>::alignment>(array, size);
+}
+
+/** \internal Returns the smallest integer multiple of \a base and greater or equal to \a size
+  */
+template<typename Index>
+inline Index first_multiple(Index size, Index base)
+{
+  return ((size+base-1)/base)*base;
+}
+
+// std::copy is much slower than memcpy, so let's introduce a smart_copy which
+// use memcpy on trivial types, i.e., on types that does not require an initialization ctor.
+template<typename T, bool UseMemcpy> struct smart_copy_helper;
+
+template<typename T> EIGEN_DEVICE_FUNC void smart_copy(const T* start, const T* end, T* target)
+{
+  smart_copy_helper<T,!NumTraits<T>::RequireInitialization>::run(start, end, target);
+}
+
+template<typename T> struct smart_copy_helper<T,true> {
+  EIGEN_DEVICE_FUNC static inline void run(const T* start, const T* end, T* target)
+  {
+    IntPtr size = IntPtr(end)-IntPtr(start);
+    if(size==0) return;
+    eigen_internal_assert(start!=0 && end!=0 && target!=0);
+    EIGEN_USING_STD(memcpy)
+    memcpy(target, start, size);
+  }
+};
+
+template<typename T> struct smart_copy_helper<T,false> {
+  EIGEN_DEVICE_FUNC static inline void run(const T* start, const T* end, T* target)
+  { std::copy(start, end, target); }
+};
+
+// intelligent memmove. falls back to std::memmove for POD types, uses std::copy otherwise.
+template<typename T, bool UseMemmove> struct smart_memmove_helper;
+
+template<typename T> void smart_memmove(const T* start, const T* end, T* target)
+{
+  smart_memmove_helper<T,!NumTraits<T>::RequireInitialization>::run(start, end, target);
+}
+
+template<typename T> struct smart_memmove_helper<T,true> {
+  static inline void run(const T* start, const T* end, T* target)
+  {
+    IntPtr size = IntPtr(end)-IntPtr(start);
+    if(size==0) return;
+    eigen_internal_assert(start!=0 && end!=0 && target!=0);
+    std::memmove(target, start, size);
+  }
+};
+
+template<typename T> struct smart_memmove_helper<T,false> {
+  static inline void run(const T* start, const T* end, T* target)
+  {
+    if (UIntPtr(target) < UIntPtr(start))
+    {
+      std::copy(start, end, target);
+    }
+    else
+    {
+      std::ptrdiff_t count = (std::ptrdiff_t(end)-std::ptrdiff_t(start)) / sizeof(T);
+      std::copy_backward(start, end, target + count);
+    }
+  }
+};
+
+#if EIGEN_HAS_RVALUE_REFERENCES
+template<typename T> EIGEN_DEVICE_FUNC T* smart_move(T* start, T* end, T* target)
+{
+  return std::move(start, end, target);
+}
+#else
+template<typename T> EIGEN_DEVICE_FUNC T* smart_move(T* start, T* end, T* target)
+{
+  return std::copy(start, end, target);
+}
+#endif
+
+/*****************************************************************************
+*** Implementation of runtime stack allocation (falling back to malloc)    ***
+*****************************************************************************/
+
+// you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA
+// to the appropriate stack allocation function
+#if ! defined EIGEN_ALLOCA && ! defined EIGEN_GPU_COMPILE_PHASE
+  #if EIGEN_OS_LINUX || EIGEN_OS_MAC || (defined alloca)
+    #define EIGEN_ALLOCA alloca
+  #elif EIGEN_COMP_MSVC
+    #define EIGEN_ALLOCA _alloca
+  #endif
+#endif
+
+// With clang -Oz -mthumb, alloca changes the stack pointer in a way that is
+// not allowed in Thumb2. -DEIGEN_STACK_ALLOCATION_LIMIT=0 doesn't work because
+// the compiler still emits bad code because stack allocation checks use "<=".
+// TODO: Eliminate after https://bugs.llvm.org/show_bug.cgi?id=23772
+// is fixed.
+#if defined(__clang__) && defined(__thumb__)
+  #undef EIGEN_ALLOCA
+#endif
+
+// This helper class construct the allocated memory, and takes care of destructing and freeing the handled data
+// at destruction time. In practice this helper class is mainly useful to avoid memory leak in case of exceptions.
+template<typename T> class aligned_stack_memory_handler : noncopyable
+{
+  public:
+    /* Creates a stack_memory_handler responsible for the buffer \a ptr of size \a size.
+     * Note that \a ptr can be 0 regardless of the other parameters.
+     * This constructor takes care of constructing/initializing the elements of the buffer if required by the scalar type T (see NumTraits<T>::RequireInitialization).
+     * In this case, the buffer elements will also be destructed when this handler will be destructed.
+     * Finally, if \a dealloc is true, then the pointer \a ptr is freed.
+     **/
+    EIGEN_DEVICE_FUNC
+    aligned_stack_memory_handler(T* ptr, std::size_t size, bool dealloc)
+      : m_ptr(ptr), m_size(size), m_deallocate(dealloc)
+    {
+      if(NumTraits<T>::RequireInitialization && m_ptr)
+        Eigen::internal::construct_elements_of_array(m_ptr, size);
+    }
+    EIGEN_DEVICE_FUNC
+    ~aligned_stack_memory_handler()
+    {
+      if(NumTraits<T>::RequireInitialization && m_ptr)
+        Eigen::internal::destruct_elements_of_array<T>(m_ptr, m_size);
+      if(m_deallocate)
+        Eigen::internal::aligned_free(m_ptr);
+    }
+  protected:
+    T* m_ptr;
+    std::size_t m_size;
+    bool m_deallocate;
+};
+
+#ifdef EIGEN_ALLOCA
+
+template<typename Xpr, int NbEvaluations,
+         bool MapExternalBuffer = nested_eval<Xpr,NbEvaluations>::Evaluate && Xpr::MaxSizeAtCompileTime==Dynamic
+         >
+struct local_nested_eval_wrapper
+{
+  static const bool NeedExternalBuffer = false;
+  typedef typename Xpr::Scalar Scalar;
+  typedef typename nested_eval<Xpr,NbEvaluations>::type ObjectType;
+  ObjectType object;
+
+  EIGEN_DEVICE_FUNC
+  local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) : object(xpr)
+  {
+    EIGEN_UNUSED_VARIABLE(ptr);
+    eigen_internal_assert(ptr==0);
+  }
+};
+
+template<typename Xpr, int NbEvaluations>
+struct local_nested_eval_wrapper<Xpr,NbEvaluations,true>
+{
+  static const bool NeedExternalBuffer = true;
+  typedef typename Xpr::Scalar Scalar;
+  typedef typename plain_object_eval<Xpr>::type PlainObject;
+  typedef Map<PlainObject,EIGEN_DEFAULT_ALIGN_BYTES> ObjectType;
+  ObjectType object;
+
+  EIGEN_DEVICE_FUNC
+  local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr)
+    : object(ptr==0 ? reinterpret_cast<Scalar*>(Eigen::internal::aligned_malloc(sizeof(Scalar)*xpr.size())) : ptr, xpr.rows(), xpr.cols()),
+      m_deallocate(ptr==0)
+  {
+    if(NumTraits<Scalar>::RequireInitialization && object.data())
+      Eigen::internal::construct_elements_of_array(object.data(), object.size());
+    object = xpr;
+  }
+
+  EIGEN_DEVICE_FUNC
+  ~local_nested_eval_wrapper()
+  {
+    if(NumTraits<Scalar>::RequireInitialization && object.data())
+      Eigen::internal::destruct_elements_of_array(object.data(), object.size());
+    if(m_deallocate)
+      Eigen::internal::aligned_free(object.data());
+  }
+
+private:
+  bool m_deallocate;
+};
+
+#endif // EIGEN_ALLOCA
+
+template<typename T> class scoped_array : noncopyable
+{
+  T* m_ptr;
+public:
+  explicit scoped_array(std::ptrdiff_t size)
+  {
+    m_ptr = new T[size];
+  }
+  ~scoped_array()
+  {
+    delete[] m_ptr;
+  }
+  T& operator[](std::ptrdiff_t i) { return m_ptr[i]; }
+  const T& operator[](std::ptrdiff_t i) const { return m_ptr[i]; }
+  T* &ptr() { return m_ptr; }
+  const T* ptr() const { return m_ptr; }
+  operator const T*() const { return m_ptr; }
+};
+
+template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
+{
+  std::swap(a.ptr(),b.ptr());
+}
+
+} // end namespace internal
+
+/** \internal
+  *
+  * The macro ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) declares, allocates,
+  * and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack
+  * if the size in bytes is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
+  * (currently, this is Linux, OSX and Visual Studio only). Otherwise the memory is allocated on the heap.
+  * The allocated buffer is automatically deleted when exiting the scope of this declaration.
+  * If BUFFER is non null, then the declared variable is simply an alias for BUFFER, and no allocation/deletion occurs.
+  * Here is an example:
+  * \code
+  * {
+  *   ei_declare_aligned_stack_constructed_variable(float,data,size,0);
+  *   // use data[0] to data[size-1]
+  * }
+  * \endcode
+  * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token.
+  *
+  * The macro ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) is analogue to
+  * \code
+  *   typename internal::nested_eval<XPRT_T,N>::type NAME(XPR);
+  * \endcode
+  * with the advantage of using aligned stack allocation even if the maximal size of XPR at compile time is unknown.
+  * This is accomplished through alloca if this later is supported and if the required number of bytes
+  * is below EIGEN_STACK_ALLOCATION_LIMIT.
+  */
+#ifdef EIGEN_ALLOCA
+
+  #if EIGEN_DEFAULT_ALIGN_BYTES>0
+    // We always manually re-align the result of EIGEN_ALLOCA.
+    // If alloca is already aligned, the compiler should be smart enough to optimize away the re-alignment.
+    #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((internal::UIntPtr(EIGEN_ALLOCA(SIZE+EIGEN_DEFAULT_ALIGN_BYTES-1)) + EIGEN_DEFAULT_ALIGN_BYTES-1) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1)))
+  #else
+    #define EIGEN_ALIGNED_ALLOCA(SIZE) EIGEN_ALLOCA(SIZE)
+  #endif
+
+  #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \
+    Eigen::internal::check_size_for_overflow<TYPE>(SIZE); \
+    TYPE* NAME = (BUFFER)!=0 ? (BUFFER) \
+               : reinterpret_cast<TYPE*>( \
+                      (sizeof(TYPE)*SIZE<=EIGEN_STACK_ALLOCATION_LIMIT) ? EIGEN_ALIGNED_ALLOCA(sizeof(TYPE)*SIZE) \
+                    : Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE) );  \
+    Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,sizeof(TYPE)*SIZE>EIGEN_STACK_ALLOCATION_LIMIT)
+
+
+  #define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) \
+    Eigen::internal::local_nested_eval_wrapper<XPR_T,N> EIGEN_CAT(NAME,_wrapper)(XPR, reinterpret_cast<typename XPR_T::Scalar*>( \
+      ( (Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::NeedExternalBuffer) && ((sizeof(typename XPR_T::Scalar)*XPR.size())<=EIGEN_STACK_ALLOCATION_LIMIT) ) \
+        ? EIGEN_ALIGNED_ALLOCA( sizeof(typename XPR_T::Scalar)*XPR.size() ) : 0 ) ) ; \
+    typename Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::ObjectType NAME(EIGEN_CAT(NAME,_wrapper).object)
+
+#else
+
+  #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \
+    Eigen::internal::check_size_for_overflow<TYPE>(SIZE); \
+    TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast<TYPE*>(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE));    \
+    Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,true)
+
+
+#define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) typename Eigen::internal::nested_eval<XPR_T,N>::type NAME(XPR)
+
+#endif
+
+
+/*****************************************************************************
+*** Implementation of EIGEN_MAKE_ALIGNED_OPERATOR_NEW [_IF]                ***
+*****************************************************************************/
+
+#if EIGEN_HAS_CXX17_OVERALIGN
+
+// C++17 -> no need to bother about alignment anymore :)
+
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign)
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size)
+
+#else
+
+// HIP does not support new/delete on device.
+#if EIGEN_MAX_ALIGN_BYTES!=0 && !defined(EIGEN_HIP_DEVICE_COMPILE)
+  #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
+      EIGEN_DEVICE_FUNC \
+      void* operator new(std::size_t size, const std::nothrow_t&) EIGEN_NO_THROW { \
+        EIGEN_TRY { return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); } \
+        EIGEN_CATCH (...) { return 0; } \
+      }
+  #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) \
+      EIGEN_DEVICE_FUNC \
+      void *operator new(std::size_t size) { \
+        return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
+      } \
+      EIGEN_DEVICE_FUNC \
+      void *operator new[](std::size_t size) { \
+        return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
+      } \
+      EIGEN_DEVICE_FUNC \
+      void operator delete(void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
+      EIGEN_DEVICE_FUNC \
+      void operator delete[](void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
+      EIGEN_DEVICE_FUNC \
+      void operator delete(void * ptr, std::size_t /* sz */) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
+      EIGEN_DEVICE_FUNC \
+      void operator delete[](void * ptr, std::size_t /* sz */) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
+      /* in-place new and delete. since (at least afaik) there is no actual   */ \
+      /* memory allocated we can safely let the default implementation handle */ \
+      /* this particular case. */ \
+      EIGEN_DEVICE_FUNC \
+      static void *operator new(std::size_t size, void *ptr) { return ::operator new(size,ptr); } \
+      EIGEN_DEVICE_FUNC \
+      static void *operator new[](std::size_t size, void* ptr) { return ::operator new[](size,ptr); } \
+      EIGEN_DEVICE_FUNC \
+      void operator delete(void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete(memory,ptr); } \
+      EIGEN_DEVICE_FUNC \
+      void operator delete[](void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete[](memory,ptr); } \
+      /* nothrow-new (returns zero instead of std::bad_alloc) */ \
+      EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
+      EIGEN_DEVICE_FUNC \
+      void operator delete(void *ptr, const std::nothrow_t&) EIGEN_NO_THROW { \
+        Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \
+      } \
+      typedef void eigen_aligned_operator_new_marker_type;
+#else
+  #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
+#endif
+
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true)
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size)                        \
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(                                                             \
+        ((Size)!=Eigen::Dynamic) &&                                                                    \
+        (((EIGEN_MAX_ALIGN_BYTES>=16) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES  )==0)) ||    \
+         ((EIGEN_MAX_ALIGN_BYTES>=32) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES/2)==0)) ||    \
+         ((EIGEN_MAX_ALIGN_BYTES>=64) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES/4)==0))   )))
+
+#endif
+
+/****************************************************************************/
+
+/** \class aligned_allocator
+* \ingroup Core_Module
+*
+* \brief STL compatible allocator to use with types requiring a non standrad alignment.
+*
+* The memory is aligned as for dynamically aligned matrix/array types such as MatrixXd.
+* By default, it will thus provide at least 16 bytes alignment and more in following cases:
+*  - 32 bytes alignment if AVX is enabled.
+*  - 64 bytes alignment if AVX512 is enabled.
+*
+* This can be controlled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented
+* \link TopicPreprocessorDirectivesPerformance there \endlink.
+*
+* Example:
+* \code
+* // Matrix4f requires 16 bytes alignment:
+* std::map< int, Matrix4f, std::less<int>,
+*           aligned_allocator<std::pair<const int, Matrix4f> > > my_map_mat4;
+* // Vector3f does not require 16 bytes alignment, no need to use Eigen's allocator:
+* std::map< int, Vector3f > my_map_vec3;
+* \endcode
+*
+* \sa \blank \ref TopicStlContainers.
+*/
+template<class T>
+class aligned_allocator : public std::allocator<T>
+{
+public:
+  typedef std::size_t     size_type;
+  typedef std::ptrdiff_t  difference_type;
+  typedef T*              pointer;
+  typedef const T*        const_pointer;
+  typedef T&              reference;
+  typedef const T&        const_reference;
+  typedef T               value_type;
+
+  template<class U>
+  struct rebind
+  {
+    typedef aligned_allocator<U> other;
+  };
+
+  aligned_allocator() : std::allocator<T>() {}
+
+  aligned_allocator(const aligned_allocator& other) : std::allocator<T>(other) {}
+
+  template<class U>
+  aligned_allocator(const aligned_allocator<U>& other) : std::allocator<T>(other) {}
+
+  ~aligned_allocator() {}
+
+  #if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_LEAST(7,0)
+  // In gcc std::allocator::max_size() is bugged making gcc triggers a warning:
+  // eigen/Eigen/src/Core/util/Memory.h:189:12: warning: argument 1 value '18446744073709551612' exceeds maximum object size 9223372036854775807
+  // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87544
+  size_type max_size() const {
+    return (std::numeric_limits<std::ptrdiff_t>::max)()/sizeof(T);
+  }
+  #endif
+
+  pointer allocate(size_type num, const void* /*hint*/ = 0)
+  {
+    internal::check_size_for_overflow<T>(num);
+    return static_cast<pointer>( internal::aligned_malloc(num * sizeof(T)) );
+  }
+
+  void deallocate(pointer p, size_type /*num*/)
+  {
+    internal::aligned_free(p);
+  }
+};
+
+//---------- Cache sizes ----------
+
+#if !defined(EIGEN_NO_CPUID)
+#  if EIGEN_COMP_GNUC && EIGEN_ARCH_i386_OR_x86_64
+#    if defined(__PIC__) && EIGEN_ARCH_i386
+       // Case for x86 with PIC
+#      define EIGEN_CPUID(abcd,func,id) \
+         __asm__ __volatile__ ("xchgl %%ebx, %k1;cpuid; xchgl %%ebx,%k1": "=a" (abcd[0]), "=&r" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) : "a" (func), "c" (id));
+#    elif defined(__PIC__) && EIGEN_ARCH_x86_64
+       // Case for x64 with PIC. In theory this is only a problem with recent gcc and with medium or large code model, not with the default small code model.
+       // However, we cannot detect which code model is used, and the xchg overhead is negligible anyway.
+#      define EIGEN_CPUID(abcd,func,id) \
+        __asm__ __volatile__ ("xchg{q}\t{%%}rbx, %q1; cpuid; xchg{q}\t{%%}rbx, %q1": "=a" (abcd[0]), "=&r" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) : "0" (func), "2" (id));
+#    else
+       // Case for x86_64 or x86 w/o PIC
+#      define EIGEN_CPUID(abcd,func,id) \
+         __asm__ __volatile__ ("cpuid": "=a" (abcd[0]), "=b" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) : "0" (func), "2" (id) );
+#    endif
+#  elif EIGEN_COMP_MSVC
+#    if (EIGEN_COMP_MSVC > 1500) && EIGEN_ARCH_i386_OR_x86_64
+#      define EIGEN_CPUID(abcd,func,id) __cpuidex((int*)abcd,func,id)
+#    endif
+#  endif
+#endif
+
+namespace internal {
+
+#ifdef EIGEN_CPUID
+
+inline bool cpuid_is_vendor(int abcd[4], const int vendor[3])
+{
+  return abcd[1]==vendor[0] && abcd[3]==vendor[1] && abcd[2]==vendor[2];
+}
+
+inline void queryCacheSizes_intel_direct(int& l1, int& l2, int& l3)
+{
+  int abcd[4];
+  l1 = l2 = l3 = 0;
+  int cache_id = 0;
+  int cache_type = 0;
+  do {
+    abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
+    EIGEN_CPUID(abcd,0x4,cache_id);
+    cache_type  = (abcd[0] & 0x0F) >> 0;
+    if(cache_type==1||cache_type==3) // data or unified cache
+    {
+      int cache_level = (abcd[0] & 0xE0) >> 5;  // A[7:5]
+      int ways        = (abcd[1] & 0xFFC00000) >> 22; // B[31:22]
+      int partitions  = (abcd[1] & 0x003FF000) >> 12; // B[21:12]
+      int line_size   = (abcd[1] & 0x00000FFF) >>  0; // B[11:0]
+      int sets        = (abcd[2]);                    // C[31:0]
+
+      int cache_size = (ways+1) * (partitions+1) * (line_size+1) * (sets+1);
+
+      switch(cache_level)
+      {
+        case 1: l1 = cache_size; break;
+        case 2: l2 = cache_size; break;
+        case 3: l3 = cache_size; break;
+        default: break;
+      }
+    }
+    cache_id++;
+  } while(cache_type>0 && cache_id<16);
+}
+
+inline void queryCacheSizes_intel_codes(int& l1, int& l2, int& l3)
+{
+  int abcd[4];
+  abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
+  l1 = l2 = l3 = 0;
+  EIGEN_CPUID(abcd,0x00000002,0);
+  unsigned char * bytes = reinterpret_cast<unsigned char *>(abcd)+2;
+  bool check_for_p2_core2 = false;
+  for(int i=0; i<14; ++i)
+  {
+    switch(bytes[i])
+    {
+      case 0x0A: l1 = 8; break;   // 0Ah   data L1 cache, 8 KB, 2 ways, 32 byte lines
+      case 0x0C: l1 = 16; break;  // 0Ch   data L1 cache, 16 KB, 4 ways, 32 byte lines
+      case 0x0E: l1 = 24; break;  // 0Eh   data L1 cache, 24 KB, 6 ways, 64 byte lines
+      case 0x10: l1 = 16; break;  // 10h   data L1 cache, 16 KB, 4 ways, 32 byte lines (IA-64)
+      case 0x15: l1 = 16; break;  // 15h   code L1 cache, 16 KB, 4 ways, 32 byte lines (IA-64)
+      case 0x2C: l1 = 32; break;  // 2Ch   data L1 cache, 32 KB, 8 ways, 64 byte lines
+      case 0x30: l1 = 32; break;  // 30h   code L1 cache, 32 KB, 8 ways, 64 byte lines
+      case 0x60: l1 = 16; break;  // 60h   data L1 cache, 16 KB, 8 ways, 64 byte lines, sectored
+      case 0x66: l1 = 8; break;   // 66h   data L1 cache, 8 KB, 4 ways, 64 byte lines, sectored
+      case 0x67: l1 = 16; break;  // 67h   data L1 cache, 16 KB, 4 ways, 64 byte lines, sectored
+      case 0x68: l1 = 32; break;  // 68h   data L1 cache, 32 KB, 4 ways, 64 byte lines, sectored
+      case 0x1A: l2 = 96; break;   // code and data L2 cache, 96 KB, 6 ways, 64 byte lines (IA-64)
+      case 0x22: l3 = 512; break;   // code and data L3 cache, 512 KB, 4 ways (!), 64 byte lines, dual-sectored
+      case 0x23: l3 = 1024; break;   // code and data L3 cache, 1024 KB, 8 ways, 64 byte lines, dual-sectored
+      case 0x25: l3 = 2048; break;   // code and data L3 cache, 2048 KB, 8 ways, 64 byte lines, dual-sectored
+      case 0x29: l3 = 4096; break;   // code and data L3 cache, 4096 KB, 8 ways, 64 byte lines, dual-sectored
+      case 0x39: l2 = 128; break;   // code and data L2 cache, 128 KB, 4 ways, 64 byte lines, sectored
+      case 0x3A: l2 = 192; break;   // code and data L2 cache, 192 KB, 6 ways, 64 byte lines, sectored
+      case 0x3B: l2 = 128; break;   // code and data L2 cache, 128 KB, 2 ways, 64 byte lines, sectored
+      case 0x3C: l2 = 256; break;   // code and data L2 cache, 256 KB, 4 ways, 64 byte lines, sectored
+      case 0x3D: l2 = 384; break;   // code and data L2 cache, 384 KB, 6 ways, 64 byte lines, sectored
+      case 0x3E: l2 = 512; break;   // code and data L2 cache, 512 KB, 4 ways, 64 byte lines, sectored
+      case 0x40: l2 = 0; break;   // no integrated L2 cache (P6 core) or L3 cache (P4 core)
+      case 0x41: l2 = 128; break;   // code and data L2 cache, 128 KB, 4 ways, 32 byte lines
+      case 0x42: l2 = 256; break;   // code and data L2 cache, 256 KB, 4 ways, 32 byte lines
+      case 0x43: l2 = 512; break;   // code and data L2 cache, 512 KB, 4 ways, 32 byte lines
+      case 0x44: l2 = 1024; break;   // code and data L2 cache, 1024 KB, 4 ways, 32 byte lines
+      case 0x45: l2 = 2048; break;   // code and data L2 cache, 2048 KB, 4 ways, 32 byte lines
+      case 0x46: l3 = 4096; break;   // code and data L3 cache, 4096 KB, 4 ways, 64 byte lines
+      case 0x47: l3 = 8192; break;   // code and data L3 cache, 8192 KB, 8 ways, 64 byte lines
+      case 0x48: l2 = 3072; break;   // code and data L2 cache, 3072 KB, 12 ways, 64 byte lines
+      case 0x49: if(l2!=0) l3 = 4096; else {check_for_p2_core2=true; l3 = l2 = 4096;} break;// code and data L3 cache, 4096 KB, 16 ways, 64 byte lines (P4) or L2 for core2
+      case 0x4A: l3 = 6144; break;   // code and data L3 cache, 6144 KB, 12 ways, 64 byte lines
+      case 0x4B: l3 = 8192; break;   // code and data L3 cache, 8192 KB, 16 ways, 64 byte lines
+      case 0x4C: l3 = 12288; break;   // code and data L3 cache, 12288 KB, 12 ways, 64 byte lines
+      case 0x4D: l3 = 16384; break;   // code and data L3 cache, 16384 KB, 16 ways, 64 byte lines
+      case 0x4E: l2 = 6144; break;   // code and data L2 cache, 6144 KB, 24 ways, 64 byte lines
+      case 0x78: l2 = 1024; break;   // code and data L2 cache, 1024 KB, 4 ways, 64 byte lines
+      case 0x79: l2 = 128; break;   // code and data L2 cache, 128 KB, 8 ways, 64 byte lines, dual-sectored
+      case 0x7A: l2 = 256; break;   // code and data L2 cache, 256 KB, 8 ways, 64 byte lines, dual-sectored
+      case 0x7B: l2 = 512; break;   // code and data L2 cache, 512 KB, 8 ways, 64 byte lines, dual-sectored
+      case 0x7C: l2 = 1024; break;   // code and data L2 cache, 1024 KB, 8 ways, 64 byte lines, dual-sectored
+      case 0x7D: l2 = 2048; break;   // code and data L2 cache, 2048 KB, 8 ways, 64 byte lines
+      case 0x7E: l2 = 256; break;   // code and data L2 cache, 256 KB, 8 ways, 128 byte lines, sect. (IA-64)
+      case 0x7F: l2 = 512; break;   // code and data L2 cache, 512 KB, 2 ways, 64 byte lines
+      case 0x80: l2 = 512; break;   // code and data L2 cache, 512 KB, 8 ways, 64 byte lines
+      case 0x81: l2 = 128; break;   // code and data L2 cache, 128 KB, 8 ways, 32 byte lines
+      case 0x82: l2 = 256; break;   // code and data L2 cache, 256 KB, 8 ways, 32 byte lines
+      case 0x83: l2 = 512; break;   // code and data L2 cache, 512 KB, 8 ways, 32 byte lines
+      case 0x84: l2 = 1024; break;   // code and data L2 cache, 1024 KB, 8 ways, 32 byte lines
+      case 0x85: l2 = 2048; break;   // code and data L2 cache, 2048 KB, 8 ways, 32 byte lines
+      case 0x86: l2 = 512; break;   // code and data L2 cache, 512 KB, 4 ways, 64 byte lines
+      case 0x87: l2 = 1024; break;   // code and data L2 cache, 1024 KB, 8 ways, 64 byte lines
+      case 0x88: l3 = 2048; break;   // code and data L3 cache, 2048 KB, 4 ways, 64 byte lines (IA-64)
+      case 0x89: l3 = 4096; break;   // code and data L3 cache, 4096 KB, 4 ways, 64 byte lines (IA-64)
+      case 0x8A: l3 = 8192; break;   // code and data L3 cache, 8192 KB, 4 ways, 64 byte lines (IA-64)
+      case 0x8D: l3 = 3072; break;   // code and data L3 cache, 3072 KB, 12 ways, 128 byte lines (IA-64)
+
+      default: break;
+    }
+  }
+  if(check_for_p2_core2 && l2 == l3)
+    l3 = 0;
+  l1 *= 1024;
+  l2 *= 1024;
+  l3 *= 1024;
+}
+
+inline void queryCacheSizes_intel(int& l1, int& l2, int& l3, int max_std_funcs)
+{
+  if(max_std_funcs>=4)
+    queryCacheSizes_intel_direct(l1,l2,l3);
+  else if(max_std_funcs>=2)
+    queryCacheSizes_intel_codes(l1,l2,l3);
+  else
+    l1 = l2 = l3 = 0;
+}
+
+inline void queryCacheSizes_amd(int& l1, int& l2, int& l3)
+{
+  int abcd[4];
+  abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
+  
+  // First query the max supported function.
+  EIGEN_CPUID(abcd,0x80000000,0);
+  if(static_cast<numext::uint32_t>(abcd[0]) >= static_cast<numext::uint32_t>(0x80000006))
+  {
+    EIGEN_CPUID(abcd,0x80000005,0);
+    l1 = (abcd[2] >> 24) * 1024; // C[31:24] = L1 size in KB
+    abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
+    EIGEN_CPUID(abcd,0x80000006,0);
+    l2 = (abcd[2] >> 16) * 1024; // C[31;16] = l2 cache size in KB
+    l3 = ((abcd[3] & 0xFFFC000) >> 18) * 512 * 1024; // D[31;18] = l3 cache size in 512KB
+  }
+  else
+  {
+    l1 = l2 = l3 = 0;
+  }
+}
+#endif
+
+/** \internal
+ * Queries and returns the cache sizes in Bytes of the L1, L2, and L3 data caches respectively */
+inline void queryCacheSizes(int& l1, int& l2, int& l3)
+{
+  #ifdef EIGEN_CPUID
+  int abcd[4];
+  const int GenuineIntel[] = {0x756e6547, 0x49656e69, 0x6c65746e};
+  const int AuthenticAMD[] = {0x68747541, 0x69746e65, 0x444d4163};
+  const int AMDisbetter_[] = {0x69444d41, 0x74656273, 0x21726574}; // "AMDisbetter!"
+
+  // identify the CPU vendor
+  EIGEN_CPUID(abcd,0x0,0);
+  int max_std_funcs = abcd[0];
+  if(cpuid_is_vendor(abcd,GenuineIntel))
+    queryCacheSizes_intel(l1,l2,l3,max_std_funcs);
+  else if(cpuid_is_vendor(abcd,AuthenticAMD) || cpuid_is_vendor(abcd,AMDisbetter_))
+    queryCacheSizes_amd(l1,l2,l3);
+  else
+    // by default let's use Intel's API
+    queryCacheSizes_intel(l1,l2,l3,max_std_funcs);
+
+  // here is the list of other vendors:
+//   ||cpuid_is_vendor(abcd,"VIA VIA VIA ")
+//   ||cpuid_is_vendor(abcd,"CyrixInstead")
+//   ||cpuid_is_vendor(abcd,"CentaurHauls")
+//   ||cpuid_is_vendor(abcd,"GenuineTMx86")
+//   ||cpuid_is_vendor(abcd,"TransmetaCPU")
+//   ||cpuid_is_vendor(abcd,"RiseRiseRise")
+//   ||cpuid_is_vendor(abcd,"Geode by NSC")
+//   ||cpuid_is_vendor(abcd,"SiS SiS SiS ")
+//   ||cpuid_is_vendor(abcd,"UMC UMC UMC ")
+//   ||cpuid_is_vendor(abcd,"NexGenDriven")
+  #else
+  l1 = l2 = l3 = -1;
+  #endif
+}
+
+/** \internal
+ * \returns the size in Bytes of the L1 data cache */
+inline int queryL1CacheSize()
+{
+  int l1(-1), l2, l3;
+  queryCacheSizes(l1,l2,l3);
+  return l1;
+}
+
+/** \internal
+ * \returns the size in Bytes of the L2 or L3 cache if this later is present */
+inline int queryTopLevelCacheSize()
+{
+  int l1, l2(-1), l3(-1);
+  queryCacheSizes(l1,l2,l3);
+  return (std::max)(l2,l3);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MEMORY_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/util/Meta.h b/src/3rdparty/eigen/Eigen/src/Core/util/Meta.h
new file mode 100644
index 000000000..81ae2a32d
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/util/Meta.h
@@ -0,0 +1,812 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_META_H
+#define EIGEN_META_H
+
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+
+ #include <cfloat>
+
+ #if defined(EIGEN_CUDA_ARCH)
+  #include <math_constants.h>
+ #endif
+
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+  #include "Eigen/src/Core/arch/HIP/hcc/math_constants.h"
+  #endif
+
+#endif
+
+// Recent versions of ICC require <cstdint> for pointer types below.
+#define EIGEN_ICC_NEEDS_CSTDINT (EIGEN_COMP_ICC>=1600 && EIGEN_COMP_CXXVER >= 11)
+
+// Define portable (u)int{32,64} types
+#if EIGEN_HAS_CXX11 || EIGEN_ICC_NEEDS_CSTDINT
+#include <cstdint>
+namespace Eigen {
+namespace numext {
+typedef std::uint8_t  uint8_t;
+typedef std::int8_t   int8_t;
+typedef std::uint16_t uint16_t;
+typedef std::int16_t  int16_t;
+typedef std::uint32_t uint32_t;
+typedef std::int32_t  int32_t;
+typedef std::uint64_t uint64_t;
+typedef std::int64_t  int64_t;
+}
+}
+#else
+// Without c++11, all compilers able to compile Eigen also
+// provide the C99 stdint.h header file.
+#include <stdint.h>
+namespace Eigen {
+namespace numext {
+typedef ::uint8_t  uint8_t;
+typedef ::int8_t   int8_t;
+typedef ::uint16_t uint16_t;
+typedef ::int16_t  int16_t;
+typedef ::uint32_t uint32_t;
+typedef ::int32_t  int32_t;
+typedef ::uint64_t uint64_t;
+typedef ::int64_t  int64_t;
+}
+}
+#endif
+
+namespace Eigen {
+
+typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE DenseIndex;
+
+/**
+ * \brief The Index type as used for the API.
+ * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
+ * \sa \blank \ref TopicPreprocessorDirectives, StorageIndex.
+ */
+
+typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE Index;
+
+namespace internal {
+
+/** \internal
+  * \file Meta.h
+  * This file contains generic metaprogramming classes which are not specifically related to Eigen.
+  * \note In case you wonder, yes we're aware that Boost already provides all these features,
+  * we however don't want to add a dependency to Boost.
+  */
+
+// Only recent versions of ICC complain about using ptrdiff_t to hold pointers,
+// and older versions do not provide *intptr_t types.
+#if EIGEN_ICC_NEEDS_CSTDINT
+typedef std::intptr_t  IntPtr;
+typedef std::uintptr_t UIntPtr;
+#else
+typedef std::ptrdiff_t IntPtr;
+typedef std::size_t UIntPtr;
+#endif
+#undef EIGEN_ICC_NEEDS_CSTDINT
+
+struct true_type {  enum { value = 1 }; };
+struct false_type { enum { value = 0 }; };
+
+template<bool Condition>
+struct bool_constant;
+
+template<>
+struct bool_constant<true> : true_type {};
+
+template<>
+struct bool_constant<false> : false_type {};
+
+template<bool Condition, typename Then, typename Else>
+struct conditional { typedef Then type; };
+
+template<typename Then, typename Else>
+struct conditional <false, Then, Else> { typedef Else type; };
+
+template<typename T> struct remove_reference { typedef T type; };
+template<typename T> struct remove_reference<T&> { typedef T type; };
+
+template<typename T> struct remove_pointer { typedef T type; };
+template<typename T> struct remove_pointer<T*> { typedef T type; };
+template<typename T> struct remove_pointer<T*const> { typedef T type; };
+
+template <class T> struct remove_const { typedef T type; };
+template <class T> struct remove_const<const T> { typedef T type; };
+template <class T> struct remove_const<const T[]> { typedef T type[]; };
+template <class T, unsigned int Size> struct remove_const<const T[Size]> { typedef T type[Size]; };
+
+template<typename T> struct remove_all { typedef T type; };
+template<typename T> struct remove_all<const T>   { typedef typename remove_all<T>::type type; };
+template<typename T> struct remove_all<T const&>  { typedef typename remove_all<T>::type type; };
+template<typename T> struct remove_all<T&>        { typedef typename remove_all<T>::type type; };
+template<typename T> struct remove_all<T const*>  { typedef typename remove_all<T>::type type; };
+template<typename T> struct remove_all<T*>        { typedef typename remove_all<T>::type type; };
+
+template<typename T> struct is_arithmetic      { enum { value = false }; };
+template<> struct is_arithmetic<float>         { enum { value = true }; };
+template<> struct is_arithmetic<double>        { enum { value = true }; };
+template<> struct is_arithmetic<long double>   { enum { value = true }; };
+template<> struct is_arithmetic<bool>          { enum { value = true }; };
+template<> struct is_arithmetic<char>          { enum { value = true }; };
+template<> struct is_arithmetic<signed char>   { enum { value = true }; };
+template<> struct is_arithmetic<unsigned char> { enum { value = true }; };
+template<> struct is_arithmetic<signed short>  { enum { value = true }; };
+template<> struct is_arithmetic<unsigned short>{ enum { value = true }; };
+template<> struct is_arithmetic<signed int>    { enum { value = true }; };
+template<> struct is_arithmetic<unsigned int>  { enum { value = true }; };
+template<> struct is_arithmetic<signed long>   { enum { value = true }; };
+template<> struct is_arithmetic<unsigned long> { enum { value = true }; };
+
+template<typename T, typename U> struct is_same { enum { value = 0 }; };
+template<typename T> struct is_same<T,T> { enum { value = 1 }; };
+
+template< class T >
+struct is_void : is_same<void, typename remove_const<T>::type> {};
+
+#if EIGEN_HAS_CXX11
+template<> struct is_arithmetic<signed long long>   { enum { value = true }; };
+template<> struct is_arithmetic<unsigned long long> { enum { value = true }; };
+using std::is_integral;
+#else
+template<typename T> struct is_integral               { enum { value = false }; };
+template<> struct is_integral<bool>                   { enum { value = true }; };
+template<> struct is_integral<char>                   { enum { value = true }; };
+template<> struct is_integral<signed char>            { enum { value = true }; };
+template<> struct is_integral<unsigned char>          { enum { value = true }; };
+template<> struct is_integral<signed short>           { enum { value = true }; };
+template<> struct is_integral<unsigned short>         { enum { value = true }; };
+template<> struct is_integral<signed int>             { enum { value = true }; };
+template<> struct is_integral<unsigned int>           { enum { value = true }; };
+template<> struct is_integral<signed long>            { enum { value = true }; };
+template<> struct is_integral<unsigned long>          { enum { value = true }; };
+#if EIGEN_COMP_MSVC
+template<> struct is_integral<signed __int64>         { enum { value = true }; };
+template<> struct is_integral<unsigned __int64>       { enum { value = true }; };
+#endif
+#endif
+
+#if EIGEN_HAS_CXX11
+using std::make_unsigned;
+#else
+// TODO: Possibly improve this implementation of make_unsigned.
+// It is currently used only by
+// template<typename Scalar> struct random_default_impl<Scalar, false, true>.
+template<typename> struct make_unsigned;
+template<> struct make_unsigned<char>             { typedef unsigned char type; };
+template<> struct make_unsigned<signed char>      { typedef unsigned char type; };
+template<> struct make_unsigned<unsigned char>    { typedef unsigned char type; };
+template<> struct make_unsigned<signed short>     { typedef unsigned short type; };
+template<> struct make_unsigned<unsigned short>   { typedef unsigned short type; };
+template<> struct make_unsigned<signed int>       { typedef unsigned int type; };
+template<> struct make_unsigned<unsigned int>     { typedef unsigned int type; };
+template<> struct make_unsigned<signed long>      { typedef unsigned long type; };
+template<> struct make_unsigned<unsigned long>    { typedef unsigned long type; };
+#if EIGEN_COMP_MSVC
+template<> struct make_unsigned<signed __int64>   { typedef unsigned __int64 type; };
+template<> struct make_unsigned<unsigned __int64> { typedef unsigned __int64 type; };
+#endif
+
+// Some platforms define int64_t as `long long` even for C++03, where
+// `long long` is not guaranteed by the standard. In this case we are missing
+// the definition for make_unsigned. If we just define it, we run into issues
+// where `long long` doesn't exist in some compilers for C++03. We therefore add
+// the specialization for these platforms only.
+#if EIGEN_OS_MAC || EIGEN_COMP_MINGW
+template<> struct make_unsigned<unsigned long long> { typedef unsigned long long type; };
+template<> struct make_unsigned<long long>          { typedef unsigned long long type; };
+#endif
+#endif
+
+template <typename T> struct add_const { typedef const T type; };
+template <typename T> struct add_const<T&> { typedef T& type; };
+
+template <typename T> struct is_const { enum { value = 0 }; };
+template <typename T> struct is_const<T const> { enum { value = 1 }; };
+
+template<typename T> struct add_const_on_value_type            { typedef const T type;  };
+template<typename T> struct add_const_on_value_type<T&>        { typedef T const& type; };
+template<typename T> struct add_const_on_value_type<T*>        { typedef T const* type; };
+template<typename T> struct add_const_on_value_type<T* const>  { typedef T const* const type; };
+template<typename T> struct add_const_on_value_type<T const* const>  { typedef T const* const type; };
+
+#if EIGEN_HAS_CXX11
+
+using std::is_convertible;
+
+#else
+
+template<typename From, typename To>
+struct is_convertible_impl
+{
+private:
+  struct any_conversion
+  {
+    template <typename T> any_conversion(const volatile T&);
+    template <typename T> any_conversion(T&);
+  };
+  struct yes {int a[1];};
+  struct no  {int a[2];};
+
+  template<typename T>
+  static yes test(T, int);
+
+  template<typename T>
+  static no  test(any_conversion, ...);
+
+public:
+  static typename internal::remove_reference<From>::type* ms_from;
+#ifdef __INTEL_COMPILER
+  #pragma warning push
+  #pragma warning ( disable : 2259 )
+#endif
+  enum { value = sizeof(test<To>(*ms_from, 0))==sizeof(yes) };
+#ifdef __INTEL_COMPILER
+  #pragma warning pop
+#endif
+};
+
+template<typename From, typename To>
+struct is_convertible
+{
+  enum { value = is_convertible_impl<From,To>::value };
+};
+
+template<typename T>
+struct is_convertible<T,T&> { enum { value = false }; };
+
+template<typename T>
+struct is_convertible<const T,const T&> { enum { value = true }; };
+
+#endif
+
+/** \internal Allows to enable/disable an overload
+  * according to a compile time condition.
+  */
+template<bool Condition, typename T=void> struct enable_if;
+
+template<typename T> struct enable_if<true,T>
+{ typedef T type; };
+
+#if defined(EIGEN_GPU_COMPILE_PHASE) && !EIGEN_HAS_CXX11
+#if !defined(__FLT_EPSILON__)
+#define __FLT_EPSILON__ FLT_EPSILON
+#define __DBL_EPSILON__ DBL_EPSILON
+#endif
+
+namespace device {
+
+template<typename T> struct numeric_limits
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_CONSTEXPR T epsilon() { return 0; }
+  static T (max)() { assert(false && "Highest not supported for this type"); }
+  static T (min)() { assert(false && "Lowest not supported for this type"); }
+  static T infinity() { assert(false && "Infinity not supported for this type"); }
+  static T quiet_NaN() { assert(false && "quiet_NaN not supported for this type"); }
+};
+template<> struct numeric_limits<float>
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static float epsilon() { return __FLT_EPSILON__; }
+  EIGEN_DEVICE_FUNC
+  static float (max)() {
+  #if defined(EIGEN_CUDA_ARCH)
+    return CUDART_MAX_NORMAL_F;
+  #else
+    return HIPRT_MAX_NORMAL_F;
+  #endif
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static float (min)() { return FLT_MIN; }
+  EIGEN_DEVICE_FUNC
+  static float infinity() {
+  #if defined(EIGEN_CUDA_ARCH)
+    return CUDART_INF_F;
+  #else
+    return HIPRT_INF_F;
+  #endif
+  }
+  EIGEN_DEVICE_FUNC
+  static float quiet_NaN() {
+  #if defined(EIGEN_CUDA_ARCH)
+    return CUDART_NAN_F;
+  #else
+    return HIPRT_NAN_F;
+  #endif
+  }
+};
+template<> struct numeric_limits<double>
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static double epsilon() { return __DBL_EPSILON__; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static double (max)() { return DBL_MAX; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static double (min)() { return DBL_MIN; }
+  EIGEN_DEVICE_FUNC
+  static double infinity() {
+  #if defined(EIGEN_CUDA_ARCH)
+    return CUDART_INF;
+  #else
+    return HIPRT_INF;
+  #endif
+  }
+  EIGEN_DEVICE_FUNC
+  static double quiet_NaN() {
+  #if defined(EIGEN_CUDA_ARCH)
+    return CUDART_NAN;
+  #else
+    return HIPRT_NAN;
+  #endif
+  }
+};
+template<> struct numeric_limits<int>
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static int epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static int (max)() { return INT_MAX; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static int (min)() { return INT_MIN; }
+};
+template<> struct numeric_limits<unsigned int>
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static unsigned int epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static unsigned int (max)() { return UINT_MAX; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static unsigned int (min)() { return 0; }
+};
+template<> struct numeric_limits<long>
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static long epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static long (max)() { return LONG_MAX; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static long (min)() { return LONG_MIN; }
+};
+template<> struct numeric_limits<unsigned long>
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static unsigned long epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static unsigned long (max)() { return ULONG_MAX; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static unsigned long (min)() { return 0; }
+};
+template<> struct numeric_limits<long long>
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static long long epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static long long (max)() { return LLONG_MAX; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static long long (min)() { return LLONG_MIN; }
+};
+template<> struct numeric_limits<unsigned long long>
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static unsigned long long epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static unsigned long long (max)() { return ULLONG_MAX; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static unsigned long long (min)() { return 0; }
+};
+template<> struct numeric_limits<bool>
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static bool epsilon() { return false; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static bool (max)() { return true; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR 
+  static bool (min)() { return false; }
+};
+
+}
+
+#endif // defined(EIGEN_GPU_COMPILE_PHASE) && !EIGEN_HAS_CXX11
+
+/** \internal
+  * A base class do disable default copy ctor and copy assignment operator.
+  */
+class noncopyable
+{
+  EIGEN_DEVICE_FUNC noncopyable(const noncopyable&);
+  EIGEN_DEVICE_FUNC const noncopyable& operator=(const noncopyable&);
+protected:
+  EIGEN_DEVICE_FUNC noncopyable() {}
+  EIGEN_DEVICE_FUNC ~noncopyable() {}
+};
+
+/** \internal
+  * Provides access to the number of elements in the object of as a compile-time constant expression.
+  * It "returns" Eigen::Dynamic if the size cannot be resolved at compile-time (default).
+  *
+  * Similar to std::tuple_size, but more general.
+  *
+  * It currently supports:
+  *  - any types T defining T::SizeAtCompileTime
+  *  - plain C arrays as T[N]
+  *  - std::array (c++11)
+  *  - some internal types such as SingleRange and AllRange
+  *
+  * The second template parameter eases SFINAE-based specializations.
+  */
+template<typename T, typename EnableIf = void> struct array_size {
+  enum { value = Dynamic };
+};
+
+template<typename T> struct array_size<T,typename internal::enable_if<((T::SizeAtCompileTime&0)==0)>::type> {
+  enum { value = T::SizeAtCompileTime };
+};
+
+template<typename T, int N> struct array_size<const T (&)[N]> {
+  enum { value = N };
+};
+template<typename T, int N> struct array_size<T (&)[N]> {
+  enum { value = N };
+};
+
+#if EIGEN_HAS_CXX11
+template<typename T, std::size_t N> struct array_size<const std::array<T,N> > {
+  enum { value = N };
+};
+template<typename T, std::size_t N> struct array_size<std::array<T,N> > {
+  enum { value = N };
+};
+#endif
+
+/** \internal
+  * Analogue of the std::size free function.
+  * It returns the size of the container or view \a x of type \c T
+  *
+  * It currently supports:
+  *  - any types T defining a member T::size() const
+  *  - plain C arrays as T[N]
+  *
+  */
+template<typename T>
+EIGEN_CONSTEXPR Index size(const T& x) { return x.size(); }
+
+template<typename T,std::size_t N>
+EIGEN_CONSTEXPR Index size(const T (&) [N]) { return N; }
+
+/** \internal
+  * Convenient struct to get the result type of a nullary, unary, binary, or
+  * ternary functor.
+  * 
+  * Pre C++11:
+  * Supports both a Func::result_type member and templated
+  * Func::result<Func(ArgTypes...)>::type member.
+  * 
+  * If none of these members is provided, then the type of the first
+  * argument is returned.
+  * 
+  * Post C++11:
+  * This uses std::result_of. However, note the `type` member removes
+  * const and converts references/pointers to their corresponding value type.
+  */
+#if EIGEN_HAS_STD_INVOKE_RESULT
+template<typename T> struct result_of;
+
+template<typename F, typename... ArgTypes>
+struct result_of<F(ArgTypes...)> {
+  typedef typename std::invoke_result<F, ArgTypes...>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
+#elif EIGEN_HAS_STD_RESULT_OF
+template<typename T> struct result_of {
+  typedef typename std::result_of<T>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
+#else
+template<typename T> struct result_of { };
+
+struct has_none {int a[1];};
+struct has_std_result_type {int a[2];};
+struct has_tr1_result {int a[3];};
+
+template<typename Func, int SizeOf>
+struct nullary_result_of_select {};
+
+template<typename Func>
+struct nullary_result_of_select<Func, sizeof(has_std_result_type)> {typedef typename Func::result_type type;};
+
+template<typename Func>
+struct nullary_result_of_select<Func, sizeof(has_tr1_result)> {typedef typename Func::template result<Func()>::type type;};
+
+template<typename Func>
+struct result_of<Func()> {
+    template<typename T>
+    static has_std_result_type    testFunctor(T const *, typename T::result_type const * = 0);
+    template<typename T>
+    static has_tr1_result         testFunctor(T const *, typename T::template result<T()>::type const * = 0);
+    static has_none               testFunctor(...);
+
+    // note that the following indirection is needed for gcc-3.3
+    enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
+    typedef typename nullary_result_of_select<Func, FunctorType>::type type;
+};
+
+template<typename Func, typename ArgType, int SizeOf=sizeof(has_none)>
+struct unary_result_of_select {typedef typename internal::remove_all<ArgType>::type type;};
+
+template<typename Func, typename ArgType>
+struct unary_result_of_select<Func, ArgType, sizeof(has_std_result_type)> {typedef typename Func::result_type type;};
+
+template<typename Func, typename ArgType>
+struct unary_result_of_select<Func, ArgType, sizeof(has_tr1_result)> {typedef typename Func::template result<Func(ArgType)>::type type;};
+
+template<typename Func, typename ArgType>
+struct result_of<Func(ArgType)> {
+    template<typename T>
+    static has_std_result_type    testFunctor(T const *, typename T::result_type const * = 0);
+    template<typename T>
+    static has_tr1_result         testFunctor(T const *, typename T::template result<T(ArgType)>::type const * = 0);
+    static has_none               testFunctor(...);
+
+    // note that the following indirection is needed for gcc-3.3
+    enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
+    typedef typename unary_result_of_select<Func, ArgType, FunctorType>::type type;
+};
+
+template<typename Func, typename ArgType0, typename ArgType1, int SizeOf=sizeof(has_none)>
+struct binary_result_of_select {typedef typename internal::remove_all<ArgType0>::type type;};
+
+template<typename Func, typename ArgType0, typename ArgType1>
+struct binary_result_of_select<Func, ArgType0, ArgType1, sizeof(has_std_result_type)>
+{typedef typename Func::result_type type;};
+
+template<typename Func, typename ArgType0, typename ArgType1>
+struct binary_result_of_select<Func, ArgType0, ArgType1, sizeof(has_tr1_result)>
+{typedef typename Func::template result<Func(ArgType0,ArgType1)>::type type;};
+
+template<typename Func, typename ArgType0, typename ArgType1>
+struct result_of<Func(ArgType0,ArgType1)> {
+    template<typename T>
+    static has_std_result_type    testFunctor(T const *, typename T::result_type const * = 0);
+    template<typename T>
+    static has_tr1_result         testFunctor(T const *, typename T::template result<T(ArgType0,ArgType1)>::type const * = 0);
+    static has_none               testFunctor(...);
+
+    // note that the following indirection is needed for gcc-3.3
+    enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
+    typedef typename binary_result_of_select<Func, ArgType0, ArgType1, FunctorType>::type type;
+};
+
+template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2, int SizeOf=sizeof(has_none)>
+struct ternary_result_of_select {typedef typename internal::remove_all<ArgType0>::type type;};
+
+template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2>
+struct ternary_result_of_select<Func, ArgType0, ArgType1, ArgType2, sizeof(has_std_result_type)>
+{typedef typename Func::result_type type;};
+
+template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2>
+struct ternary_result_of_select<Func, ArgType0, ArgType1, ArgType2, sizeof(has_tr1_result)>
+{typedef typename Func::template result<Func(ArgType0,ArgType1,ArgType2)>::type type;};
+
+template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2>
+struct result_of<Func(ArgType0,ArgType1,ArgType2)> {
+    template<typename T>
+    static has_std_result_type    testFunctor(T const *, typename T::result_type const * = 0);
+    template<typename T>
+    static has_tr1_result         testFunctor(T const *, typename T::template result<T(ArgType0,ArgType1,ArgType2)>::type const * = 0);
+    static has_none               testFunctor(...);
+
+    // note that the following indirection is needed for gcc-3.3
+    enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
+    typedef typename ternary_result_of_select<Func, ArgType0, ArgType1, ArgType2, FunctorType>::type type;
+};
+
+#endif
+
+#if EIGEN_HAS_STD_INVOKE_RESULT
+template<typename F, typename... ArgTypes>
+struct invoke_result {
+  typedef typename std::invoke_result<F, ArgTypes...>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
+#elif EIGEN_HAS_CXX11
+template<typename F, typename... ArgTypes>
+struct invoke_result {
+  typedef typename result_of<F(ArgTypes...)>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
+#else
+template<typename F, typename ArgType0 = void, typename ArgType1 = void, typename ArgType2 = void>
+struct invoke_result {
+  typedef typename result_of<F(ArgType0, ArgType1, ArgType2)>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
+
+template<typename F>
+struct invoke_result<F, void, void, void> {
+  typedef typename result_of<F()>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
+
+template<typename F, typename ArgType0>
+struct invoke_result<F, ArgType0, void, void> {
+  typedef typename result_of<F(ArgType0)>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
+
+template<typename F, typename ArgType0, typename ArgType1>
+struct invoke_result<F, ArgType0, ArgType1, void> {
+  typedef typename result_of<F(ArgType0, ArgType1)>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
+#endif
+
+struct meta_yes { char a[1]; };
+struct meta_no  { char a[2]; };
+
+// Check whether T::ReturnType does exist
+template <typename T>
+struct has_ReturnType
+{
+  template <typename C> static meta_yes testFunctor(C const *, typename C::ReturnType const * = 0);
+  template <typename C> static meta_no  testFunctor(...);
+
+  enum { value = sizeof(testFunctor<T>(static_cast<T*>(0))) == sizeof(meta_yes) };
+};
+
+template<typename T> const T* return_ptr();
+
+template <typename T, typename IndexType=Index>
+struct has_nullary_operator
+{
+  template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr<C>()->operator()())>0)>::type * = 0);
+  static meta_no testFunctor(...);
+
+  enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
+};
+
+template <typename T, typename IndexType=Index>
+struct has_unary_operator
+{
+  template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr<C>()->operator()(IndexType(0)))>0)>::type * = 0);
+  static meta_no testFunctor(...);
+
+  enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
+};
+
+template <typename T, typename IndexType=Index>
+struct has_binary_operator
+{
+  template <typename C> static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr<C>()->operator()(IndexType(0),IndexType(0)))>0)>::type * = 0);
+  static meta_no testFunctor(...);
+
+  enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
+};
+
+/** \internal In short, it computes int(sqrt(\a Y)) with \a Y an integer.
+  * Usage example: \code meta_sqrt<1023>::ret \endcode
+  */
+template<int Y,
+         int InfX = 0,
+         int SupX = ((Y==1) ? 1 : Y/2),
+         bool Done = ((SupX-InfX)<=1 ? true : ((SupX*SupX <= Y) && ((SupX+1)*(SupX+1) > Y))) >
+                                // use ?: instead of || just to shut up a stupid gcc 4.3 warning
+class meta_sqrt
+{
+    enum {
+      MidX = (InfX+SupX)/2,
+      TakeInf = MidX*MidX > Y ? 1 : 0,
+      NewInf = int(TakeInf) ? InfX : int(MidX),
+      NewSup = int(TakeInf) ? int(MidX) : SupX
+    };
+  public:
+    enum { ret = meta_sqrt<Y,NewInf,NewSup>::ret };
+};
+
+template<int Y, int InfX, int SupX>
+class meta_sqrt<Y, InfX, SupX, true> { public:  enum { ret = (SupX*SupX <= Y) ? SupX : InfX }; };
+
+
+/** \internal Computes the least common multiple of two positive integer A and B
+  * at compile-time. 
+  */
+template<int A, int B, int K=1, bool Done = ((A*K)%B)==0, bool Big=(A>=B)>
+struct meta_least_common_multiple
+{
+  enum { ret = meta_least_common_multiple<A,B,K+1>::ret };
+};
+template<int A, int B, int K, bool Done>
+struct meta_least_common_multiple<A,B,K,Done,false>
+{
+  enum { ret = meta_least_common_multiple<B,A,K>::ret };
+};
+template<int A, int B, int K>
+struct meta_least_common_multiple<A,B,K,true,true>
+{
+  enum { ret = A*K };
+};
+
+
+/** \internal determines whether the product of two numeric types is allowed and what the return type is */
+template<typename T, typename U> struct scalar_product_traits
+{
+  enum { Defined = 0 };
+};
+
+// FIXME quick workaround around current limitation of result_of
+// template<typename Scalar, typename ArgType0, typename ArgType1>
+// struct result_of<scalar_product_op<Scalar>(ArgType0,ArgType1)> {
+// typedef typename scalar_product_traits<typename remove_all<ArgType0>::type, typename remove_all<ArgType1>::type>::ReturnType type;
+// };
+
+/** \internal Obtains a POD type suitable to use as storage for an object of a size
+  * of at most Len bytes, aligned as specified by \c Align.
+  */
+template<unsigned Len, unsigned Align>
+struct aligned_storage {
+  struct type {
+    EIGEN_ALIGN_TO_BOUNDARY(Align) unsigned char data[Len];
+  };
+};
+
+} // end namespace internal
+
+namespace numext {
+
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+template<typename T> EIGEN_DEVICE_FUNC   void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; }
+#else
+template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); }
+#endif
+
+#if defined(EIGEN_GPU_COMPILE_PHASE) && !EIGEN_HAS_CXX11
+using internal::device::numeric_limits;
+#else
+using std::numeric_limits;
+#endif
+
+// Integer division with rounding up.
+// T is assumed to be an integer type with a>=0, and b>0
+template<typename T>
+EIGEN_DEVICE_FUNC
+T div_ceil(const T &a, const T &b)
+{
+  return (a+b-1) / b;
+}
+
+// The aim of the following functions is to bypass -Wfloat-equal warnings
+// when we really want a strict equality comparison on floating points.
+template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+bool equal_strict(const X& x,const Y& y) { return x == y; }
+
+#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+bool equal_strict(const float& x,const float& y) { return std::equal_to<float>()(x,y); }
+
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+bool equal_strict(const double& x,const double& y) { return std::equal_to<double>()(x,y); }
+#endif
+
+template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+bool not_equal_strict(const X& x,const Y& y) { return x != y; }
+
+#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to<float>()(x,y); }
+
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+bool not_equal_strict(const double& x,const double& y) { return std::not_equal_to<double>()(x,y); }
+#endif
+
+} // end namespace numext
+
+} // end namespace Eigen
+
+#endif // EIGEN_META_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/util/NonMPL2.h b/src/3rdparty/eigen/Eigen/src/Core/util/NonMPL2.h
new file mode 100644
index 000000000..1af67cf18
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/util/NonMPL2.h
@@ -0,0 +1,3 @@
+#ifdef EIGEN_MPL2_ONLY
+#error Including non-MPL2 code in EIGEN_MPL2_ONLY mode
+#endif
diff --git a/src/3rdparty/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h b/src/3rdparty/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h
new file mode 100644
index 000000000..1ce6fd1b0
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h
@@ -0,0 +1,31 @@
+#ifdef EIGEN_WARNINGS_DISABLED_2
+// "DisableStupidWarnings.h" was included twice recursively: Do not reenable warnings yet!
+#  undef EIGEN_WARNINGS_DISABLED_2
+
+#elif defined(EIGEN_WARNINGS_DISABLED)
+#undef EIGEN_WARNINGS_DISABLED
+
+#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+  #ifdef _MSC_VER
+    #pragma warning( pop )
+  #elif defined __INTEL_COMPILER
+    #pragma warning pop
+  #elif defined __clang__
+    #pragma clang diagnostic pop
+  #elif defined __GNUC__  &&  (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
+    #pragma GCC diagnostic pop
+  #endif
+
+  #if defined __NVCC__
+//    Don't reenable the diagnostic messages, as it turns out these messages need
+//    to be disabled at the point of the template instantiation (i.e the user code)
+//    otherwise they'll be triggered by nvcc.
+//    #pragma diag_default code_is_unreachable
+//    #pragma diag_default initialization_not_reachable
+//    #pragma diag_default 2651
+//    #pragma diag_default 2653
+  #endif
+
+#endif
+
+#endif // EIGEN_WARNINGS_DISABLED
diff --git a/src/3rdparty/eigen/Eigen/src/Core/util/ReshapedHelper.h b/src/3rdparty/eigen/Eigen/src/Core/util/ReshapedHelper.h
new file mode 100644
index 000000000..412432132
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/util/ReshapedHelper.h
@@ -0,0 +1,51 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_RESHAPED_HELPER_H
+#define EIGEN_RESHAPED_HELPER_H
+
+namespace Eigen {
+
+enum AutoSize_t   { AutoSize };
+const int AutoOrder = 2;
+
+namespace internal {
+
+template<typename SizeType,typename OtherSize, int TotalSize>
+struct get_compiletime_reshape_size {
+  enum { value = get_fixed_value<SizeType>::value };
+};
+
+template<typename SizeType>
+Index get_runtime_reshape_size(SizeType size, Index /*other*/, Index /*total*/) {
+  return internal::get_runtime_value(size);
+}
+
+template<typename OtherSize, int TotalSize>
+struct get_compiletime_reshape_size<AutoSize_t,OtherSize,TotalSize> {
+  enum {
+    other_size = get_fixed_value<OtherSize>::value,
+    value = (TotalSize==Dynamic || other_size==Dynamic) ? Dynamic : TotalSize / other_size };
+};
+
+inline Index get_runtime_reshape_size(AutoSize_t /*size*/, Index other, Index total) {
+  return total/other;
+}
+
+template<int Flags, int Order>
+struct get_compiletime_reshape_order {
+  enum { value = Order == AutoOrder ? Flags & RowMajorBit : Order };
+};
+
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_RESHAPED_HELPER_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/util/StaticAssert.h b/src/3rdparty/eigen/Eigen/src/Core/util/StaticAssert.h
new file mode 100644
index 000000000..c45de5901
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/util/StaticAssert.h
@@ -0,0 +1,221 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_STATIC_ASSERT_H
+#define EIGEN_STATIC_ASSERT_H
+
+/* Some notes on Eigen's static assertion mechanism:
+ *
+ *  - in EIGEN_STATIC_ASSERT(CONDITION,MSG) the parameter CONDITION must be a compile time boolean
+ *    expression, and MSG an enum listed in struct internal::static_assertion<true>
+ *
+ *  - define EIGEN_NO_STATIC_ASSERT to disable them (and save compilation time)
+ *    in that case, the static assertion is converted to the following runtime assert:
+ *      eigen_assert(CONDITION && "MSG")
+ *
+ *  - currently EIGEN_STATIC_ASSERT can only be used in function scope
+ *
+ */
+
+#ifndef EIGEN_STATIC_ASSERT
+#ifndef EIGEN_NO_STATIC_ASSERT
+
+  #if EIGEN_MAX_CPP_VER>=11 && (__has_feature(cxx_static_assert) || (EIGEN_COMP_CXXVER >= 11) || (EIGEN_COMP_MSVC >= 1600))
+
+    // if native static_assert is enabled, let's use it
+    #define EIGEN_STATIC_ASSERT(X,MSG) static_assert(X,#MSG);
+
+  #else // not CXX0X
+
+    namespace Eigen {
+
+    namespace internal {
+
+    template<bool condition>
+    struct static_assertion {};
+
+    template<>
+    struct static_assertion<true>
+    {
+      enum {
+        YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX=1,
+        YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES=1,
+        YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES=1,
+        THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE=1,
+        THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE=1,
+        THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE=1,
+        OUT_OF_RANGE_ACCESS=1,
+        YOU_MADE_A_PROGRAMMING_MISTAKE=1,
+        EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT=1,
+        EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE=1,
+        YOU_CALLED_A_FIXED_SIZE_METHOD_ON_A_DYNAMIC_SIZE_MATRIX_OR_VECTOR=1,
+        YOU_CALLED_A_DYNAMIC_SIZE_METHOD_ON_A_FIXED_SIZE_MATRIX_OR_VECTOR=1,
+        UNALIGNED_LOAD_AND_STORE_OPERATIONS_UNIMPLEMENTED_ON_ALTIVEC=1,
+        THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES=1,
+        FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED=1,
+        NUMERIC_TYPE_MUST_BE_REAL=1,
+        COEFFICIENT_WRITE_ACCESS_TO_SELFADJOINT_NOT_SUPPORTED=1,
+        WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED=1,
+        THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE=1,
+        INVALID_MATRIX_PRODUCT=1,
+        INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS=1,
+        INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION=1,
+        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY=1,
+        THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES=1,
+        THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES=1,
+        INVALID_MATRIX_TEMPLATE_PARAMETERS=1,
+        INVALID_MATRIXBASE_TEMPLATE_PARAMETERS=1,
+        BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER=1,
+        THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX=1,
+        THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE=1,
+        THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES=1,
+        YOU_ALREADY_SPECIFIED_THIS_STRIDE=1,
+        INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION=1,
+        THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD=1,
+        PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1=1,
+        THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS=1,
+        YOU_CANNOT_MIX_ARRAYS_AND_MATRICES=1,
+        YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION=1,
+        THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY=1,
+        YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT=1,
+        THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS=1,
+        THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS=1,
+        THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL=1,
+        THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES=1,
+        YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED=1,
+        YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED=1,
+        THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE=1,
+        THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH=1,
+        OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG=1,
+        IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY=1,
+        STORAGE_LAYOUT_DOES_NOT_MATCH=1,
+        EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE=1,
+        THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS=1,
+        MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY=1,
+        THIS_TYPE_IS_NOT_SUPPORTED=1,
+        STORAGE_KIND_MUST_MATCH=1,
+        STORAGE_INDEX_MUST_MATCH=1,
+        CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1,
+        SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1,
+        INVALID_TEMPLATE_PARAMETER=1,
+        GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS=1,
+        THE_ARRAY_SIZE_SHOULD_EQUAL_WITH_PACKET_SIZE=1
+      };
+    };
+
+    } // end namespace internal
+
+    } // end namespace Eigen
+
+    // Specialized implementation for MSVC to avoid "conditional
+    // expression is constant" warnings.  This implementation doesn't
+    // appear to work under GCC, hence the multiple implementations.
+    #if EIGEN_COMP_MSVC
+
+      #define EIGEN_STATIC_ASSERT(CONDITION,MSG) \
+        {Eigen::internal::static_assertion<bool(CONDITION)>::MSG;}
+
+    #else
+      // In some cases clang interprets bool(CONDITION) as function declaration
+      #define EIGEN_STATIC_ASSERT(CONDITION,MSG) \
+        if (Eigen::internal::static_assertion<static_cast<bool>(CONDITION)>::MSG) {}
+
+    #endif
+
+  #endif // not CXX0X
+
+#else // EIGEN_NO_STATIC_ASSERT
+
+  #define EIGEN_STATIC_ASSERT(CONDITION,MSG) eigen_assert((CONDITION) && #MSG);
+
+#endif // EIGEN_NO_STATIC_ASSERT
+#endif // EIGEN_STATIC_ASSERT
+
+// static assertion failing if the type \a TYPE is not a vector type
+#define EIGEN_STATIC_ASSERT_VECTOR_ONLY(TYPE) \
+  EIGEN_STATIC_ASSERT(TYPE::IsVectorAtCompileTime, \
+                      YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX)
+
+// static assertion failing if the type \a TYPE is not fixed-size
+#define EIGEN_STATIC_ASSERT_FIXED_SIZE(TYPE) \
+  EIGEN_STATIC_ASSERT(TYPE::SizeAtCompileTime!=Eigen::Dynamic, \
+                      YOU_CALLED_A_FIXED_SIZE_METHOD_ON_A_DYNAMIC_SIZE_MATRIX_OR_VECTOR)
+
+// static assertion failing if the type \a TYPE is not dynamic-size
+#define EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(TYPE) \
+  EIGEN_STATIC_ASSERT(TYPE::SizeAtCompileTime==Eigen::Dynamic, \
+                      YOU_CALLED_A_DYNAMIC_SIZE_METHOD_ON_A_FIXED_SIZE_MATRIX_OR_VECTOR)
+
+// static assertion failing if the type \a TYPE is not a vector type of the given size
+#define EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(TYPE, SIZE) \
+  EIGEN_STATIC_ASSERT(TYPE::IsVectorAtCompileTime && TYPE::SizeAtCompileTime==SIZE, \
+                      THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE)
+
+// static assertion failing if the type \a TYPE is not a vector type of the given size
+#define EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(TYPE, ROWS, COLS) \
+  EIGEN_STATIC_ASSERT(TYPE::RowsAtCompileTime==ROWS && TYPE::ColsAtCompileTime==COLS, \
+                      THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE)
+
+// static assertion failing if the two vector expression types are not compatible (same fixed-size or dynamic size)
+#define EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(TYPE0,TYPE1) \
+  EIGEN_STATIC_ASSERT( \
+      (int(TYPE0::SizeAtCompileTime)==Eigen::Dynamic \
+    || int(TYPE1::SizeAtCompileTime)==Eigen::Dynamic \
+    || int(TYPE0::SizeAtCompileTime)==int(TYPE1::SizeAtCompileTime)),\
+    YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES)
+
+#define EIGEN_PREDICATE_SAME_MATRIX_SIZE(TYPE0,TYPE1) \
+     ( \
+        (int(Eigen::internal::size_of_xpr_at_compile_time<TYPE0>::ret)==0 && int(Eigen::internal::size_of_xpr_at_compile_time<TYPE1>::ret)==0) \
+    || (\
+          (int(TYPE0::RowsAtCompileTime)==Eigen::Dynamic \
+        || int(TYPE1::RowsAtCompileTime)==Eigen::Dynamic \
+        || int(TYPE0::RowsAtCompileTime)==int(TYPE1::RowsAtCompileTime)) \
+      &&  (int(TYPE0::ColsAtCompileTime)==Eigen::Dynamic \
+        || int(TYPE1::ColsAtCompileTime)==Eigen::Dynamic \
+        || int(TYPE0::ColsAtCompileTime)==int(TYPE1::ColsAtCompileTime))\
+       ) \
+     )
+
+#define EIGEN_STATIC_ASSERT_NON_INTEGER(TYPE) \
+    EIGEN_STATIC_ASSERT(!Eigen::NumTraits<TYPE>::IsInteger, THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
+
+
+// static assertion failing if it is guaranteed at compile-time that the two matrix expression types have different sizes
+#define EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(TYPE0,TYPE1) \
+  EIGEN_STATIC_ASSERT( \
+     EIGEN_PREDICATE_SAME_MATRIX_SIZE(TYPE0,TYPE1),\
+    YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES)
+
+#define EIGEN_STATIC_ASSERT_SIZE_1x1(TYPE) \
+      EIGEN_STATIC_ASSERT((TYPE::RowsAtCompileTime == 1 || TYPE::RowsAtCompileTime == Eigen::Dynamic) && \
+                          (TYPE::ColsAtCompileTime == 1 || TYPE::ColsAtCompileTime == Eigen::Dynamic), \
+                          THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS)
+
+#define EIGEN_STATIC_ASSERT_LVALUE(Derived) \
+      EIGEN_STATIC_ASSERT(Eigen::internal::is_lvalue<Derived>::value, \
+                          THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY)
+
+#define EIGEN_STATIC_ASSERT_ARRAYXPR(Derived) \
+      EIGEN_STATIC_ASSERT((Eigen::internal::is_same<typename Eigen::internal::traits<Derived>::XprKind, ArrayXpr>::value), \
+                          THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES)
+
+#define EIGEN_STATIC_ASSERT_SAME_XPR_KIND(Derived1, Derived2) \
+      EIGEN_STATIC_ASSERT((Eigen::internal::is_same<typename Eigen::internal::traits<Derived1>::XprKind, \
+                                             typename Eigen::internal::traits<Derived2>::XprKind \
+                                            >::value), \
+                          YOU_CANNOT_MIX_ARRAYS_AND_MATRICES)
+
+// Check that a cost value is positive, and that is stay within a reasonable range
+// TODO this check could be enabled for internal debugging only
+#define EIGEN_INTERNAL_CHECK_COST_VALUE(C) \
+      EIGEN_STATIC_ASSERT((C)>=0 && (C)<=HugeCost*HugeCost, EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE);
+
+#endif // EIGEN_STATIC_ASSERT_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/util/SymbolicIndex.h b/src/3rdparty/eigen/Eigen/src/Core/util/SymbolicIndex.h
new file mode 100644
index 000000000..354dd9add
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/util/SymbolicIndex.h
@@ -0,0 +1,293 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SYMBOLIC_INDEX_H
+#define EIGEN_SYMBOLIC_INDEX_H
+
+namespace Eigen {
+
+/** \namespace Eigen::symbolic
+  * \ingroup Core_Module
+  *
+  * This namespace defines a set of classes and functions to build and evaluate symbolic expressions of scalar type Index.
+  * Here is a simple example:
+  *
+  * \code
+  * // First step, defines symbols:
+  * struct x_tag {};  static const symbolic::SymbolExpr<x_tag> x;
+  * struct y_tag {};  static const symbolic::SymbolExpr<y_tag> y;
+  * struct z_tag {};  static const symbolic::SymbolExpr<z_tag> z;
+  *
+  * // Defines an expression:
+  * auto expr = (x+3)/y+z;
+  *
+  * // And evaluate it: (c++14)
+  * std::cout << expr.eval(x=6,y=3,z=-13) << "\n";
+  *
+  * // In c++98/11, only one symbol per expression is supported for now:
+  * auto expr98 = (3-x)/2;
+  * std::cout << expr98.eval(x=6) << "\n";
+  * \endcode
+  *
+  * It is currently only used internally to define and manipulate the Eigen::last and Eigen::lastp1 symbols in Eigen::seq and Eigen::seqN.
+  *
+  */
+namespace symbolic {
+
+template<typename Tag> class Symbol;
+template<typename Arg0> class NegateExpr;
+template<typename Arg1,typename Arg2> class AddExpr;
+template<typename Arg1,typename Arg2> class ProductExpr;
+template<typename Arg1,typename Arg2> class QuotientExpr;
+
+// A simple wrapper around an integral value to provide the eval method.
+// We could also use a free-function symbolic_eval...
+template<typename IndexType=Index>
+class ValueExpr {
+public:
+  ValueExpr(IndexType val) : m_value(val) {}
+  template<typename T>
+  IndexType eval_impl(const T&) const { return m_value; }
+protected:
+  IndexType m_value;
+};
+
+// Specialization for compile-time value,
+// It is similar to ValueExpr(N) but this version helps the compiler to generate better code.
+template<int N>
+class ValueExpr<internal::FixedInt<N> > {
+public:
+  ValueExpr() {}
+  template<typename T>
+  EIGEN_CONSTEXPR Index eval_impl(const T&) const { return N; }
+};
+
+
+/** \class BaseExpr
+  * \ingroup Core_Module
+  * Common base class of any symbolic expressions
+  */
+template<typename Derived>
+class BaseExpr
+{
+public:
+  const Derived& derived() const { return *static_cast<const Derived*>(this); }
+
+  /** Evaluate the expression given the \a values of the symbols.
+    *
+    * \param values defines the values of the symbols, it can either be a SymbolValue or a std::tuple of SymbolValue
+    *               as constructed by SymbolExpr::operator= operator.
+    *
+    */
+  template<typename T>
+  Index eval(const T& values) const { return derived().eval_impl(values); }
+
+#if EIGEN_HAS_CXX14
+  template<typename... Types>
+  Index eval(Types&&... values) const { return derived().eval_impl(std::make_tuple(values...)); }
+#endif
+
+  NegateExpr<Derived> operator-() const { return NegateExpr<Derived>(derived()); }
+
+  AddExpr<Derived,ValueExpr<> > operator+(Index b) const
+  { return AddExpr<Derived,ValueExpr<> >(derived(),  b); }
+  AddExpr<Derived,ValueExpr<> > operator-(Index a) const
+  { return AddExpr<Derived,ValueExpr<> >(derived(), -a); }
+  ProductExpr<Derived,ValueExpr<> > operator*(Index a) const
+  { return ProductExpr<Derived,ValueExpr<> >(derived(),a); }
+  QuotientExpr<Derived,ValueExpr<> > operator/(Index a) const
+  { return QuotientExpr<Derived,ValueExpr<> >(derived(),a); }
+
+  friend AddExpr<Derived,ValueExpr<> > operator+(Index a, const BaseExpr& b)
+  { return AddExpr<Derived,ValueExpr<> >(b.derived(), a); }
+  friend AddExpr<NegateExpr<Derived>,ValueExpr<> > operator-(Index a, const BaseExpr& b)
+  { return AddExpr<NegateExpr<Derived>,ValueExpr<> >(-b.derived(), a); }
+  friend ProductExpr<ValueExpr<>,Derived> operator*(Index a, const BaseExpr& b)
+  { return ProductExpr<ValueExpr<>,Derived>(a,b.derived()); }
+  friend QuotientExpr<ValueExpr<>,Derived> operator/(Index a, const BaseExpr& b)
+  { return QuotientExpr<ValueExpr<>,Derived>(a,b.derived()); }
+
+  template<int N>
+  AddExpr<Derived,ValueExpr<internal::FixedInt<N> > > operator+(internal::FixedInt<N>) const
+  { return AddExpr<Derived,ValueExpr<internal::FixedInt<N> > >(derived(), ValueExpr<internal::FixedInt<N> >()); }
+  template<int N>
+  AddExpr<Derived,ValueExpr<internal::FixedInt<-N> > > operator-(internal::FixedInt<N>) const
+  { return AddExpr<Derived,ValueExpr<internal::FixedInt<-N> > >(derived(), ValueExpr<internal::FixedInt<-N> >()); }
+  template<int N>
+  ProductExpr<Derived,ValueExpr<internal::FixedInt<N> > > operator*(internal::FixedInt<N>) const
+  { return ProductExpr<Derived,ValueExpr<internal::FixedInt<N> > >(derived(),ValueExpr<internal::FixedInt<N> >()); }
+  template<int N>
+  QuotientExpr<Derived,ValueExpr<internal::FixedInt<N> > > operator/(internal::FixedInt<N>) const
+  { return QuotientExpr<Derived,ValueExpr<internal::FixedInt<N> > >(derived(),ValueExpr<internal::FixedInt<N> >()); }
+
+  template<int N>
+  friend AddExpr<Derived,ValueExpr<internal::FixedInt<N> > > operator+(internal::FixedInt<N>, const BaseExpr& b)
+  { return AddExpr<Derived,ValueExpr<internal::FixedInt<N> > >(b.derived(), ValueExpr<internal::FixedInt<N> >()); }
+  template<int N>
+  friend AddExpr<NegateExpr<Derived>,ValueExpr<internal::FixedInt<N> > > operator-(internal::FixedInt<N>, const BaseExpr& b)
+  { return AddExpr<NegateExpr<Derived>,ValueExpr<internal::FixedInt<N> > >(-b.derived(), ValueExpr<internal::FixedInt<N> >()); }
+  template<int N>
+  friend ProductExpr<ValueExpr<internal::FixedInt<N> >,Derived> operator*(internal::FixedInt<N>, const BaseExpr& b)
+  { return ProductExpr<ValueExpr<internal::FixedInt<N> >,Derived>(ValueExpr<internal::FixedInt<N> >(),b.derived()); }
+  template<int N>
+  friend QuotientExpr<ValueExpr<internal::FixedInt<N> >,Derived> operator/(internal::FixedInt<N>, const BaseExpr& b)
+  { return QuotientExpr<ValueExpr<internal::FixedInt<N> > ,Derived>(ValueExpr<internal::FixedInt<N> >(),b.derived()); }
+
+#if (!EIGEN_HAS_CXX14)
+  template<int N>
+  AddExpr<Derived,ValueExpr<internal::FixedInt<N> > > operator+(internal::FixedInt<N> (*)()) const
+  { return AddExpr<Derived,ValueExpr<internal::FixedInt<N> > >(derived(), ValueExpr<internal::FixedInt<N> >()); }
+  template<int N>
+  AddExpr<Derived,ValueExpr<internal::FixedInt<-N> > > operator-(internal::FixedInt<N> (*)()) const
+  { return AddExpr<Derived,ValueExpr<internal::FixedInt<-N> > >(derived(), ValueExpr<internal::FixedInt<-N> >()); }
+  template<int N>
+  ProductExpr<Derived,ValueExpr<internal::FixedInt<N> > > operator*(internal::FixedInt<N> (*)()) const
+  { return ProductExpr<Derived,ValueExpr<internal::FixedInt<N> > >(derived(),ValueExpr<internal::FixedInt<N> >()); }
+  template<int N>
+  QuotientExpr<Derived,ValueExpr<internal::FixedInt<N> > > operator/(internal::FixedInt<N> (*)()) const
+  { return QuotientExpr<Derived,ValueExpr<internal::FixedInt<N> > >(derived(),ValueExpr<internal::FixedInt<N> >()); }
+
+  template<int N>
+  friend AddExpr<Derived,ValueExpr<internal::FixedInt<N> > > operator+(internal::FixedInt<N> (*)(), const BaseExpr& b)
+  { return AddExpr<Derived,ValueExpr<internal::FixedInt<N> > >(b.derived(), ValueExpr<internal::FixedInt<N> >()); }
+  template<int N>
+  friend AddExpr<NegateExpr<Derived>,ValueExpr<internal::FixedInt<N> > > operator-(internal::FixedInt<N> (*)(), const BaseExpr& b)
+  { return AddExpr<NegateExpr<Derived>,ValueExpr<internal::FixedInt<N> > >(-b.derived(), ValueExpr<internal::FixedInt<N> >()); }
+  template<int N>
+  friend ProductExpr<ValueExpr<internal::FixedInt<N> >,Derived> operator*(internal::FixedInt<N> (*)(), const BaseExpr& b)
+  { return ProductExpr<ValueExpr<internal::FixedInt<N> >,Derived>(ValueExpr<internal::FixedInt<N> >(),b.derived()); }
+  template<int N>
+  friend QuotientExpr<ValueExpr<internal::FixedInt<N> >,Derived> operator/(internal::FixedInt<N> (*)(), const BaseExpr& b)
+  { return QuotientExpr<ValueExpr<internal::FixedInt<N> > ,Derived>(ValueExpr<internal::FixedInt<N> >(),b.derived()); }
+#endif
+
+
+  template<typename OtherDerived>
+  AddExpr<Derived,OtherDerived> operator+(const BaseExpr<OtherDerived> &b) const
+  { return AddExpr<Derived,OtherDerived>(derived(),  b.derived()); }
+
+  template<typename OtherDerived>
+  AddExpr<Derived,NegateExpr<OtherDerived> > operator-(const BaseExpr<OtherDerived> &b) const
+  { return AddExpr<Derived,NegateExpr<OtherDerived> >(derived(), -b.derived()); }
+
+  template<typename OtherDerived>
+  ProductExpr<Derived,OtherDerived> operator*(const BaseExpr<OtherDerived> &b) const
+  { return ProductExpr<Derived,OtherDerived>(derived(), b.derived()); }
+
+  template<typename OtherDerived>
+  QuotientExpr<Derived,OtherDerived> operator/(const BaseExpr<OtherDerived> &b) const
+  { return QuotientExpr<Derived,OtherDerived>(derived(), b.derived()); }
+};
+
+template<typename T>
+struct is_symbolic {
+  // BaseExpr has no conversion ctor, so we only have to check whether T can be statically cast to its base class BaseExpr<T>.
+  enum { value = internal::is_convertible<T,BaseExpr<T> >::value };
+};
+
+/** Represents the actual value of a symbol identified by its tag
+  *
+  * It is the return type of SymbolValue::operator=, and most of the time this is only way it is used.
+  */
+template<typename Tag>
+class SymbolValue
+{
+public:
+  /** Default constructor from the value \a val */
+  SymbolValue(Index val) : m_value(val) {}
+
+  /** \returns the stored value of the symbol */
+  Index value() const { return m_value; }
+protected:
+  Index m_value;
+};
+
+/** Expression of a symbol uniquely identified by the template parameter type \c tag */
+template<typename tag>
+class SymbolExpr : public BaseExpr<SymbolExpr<tag> >
+{
+public:
+  /** Alias to the template parameter \c tag */
+  typedef tag Tag;
+
+  SymbolExpr() {}
+
+  /** Associate the value \a val to the given symbol \c *this, uniquely identified by its \c Tag.
+    *
+    * The returned object should be passed to ExprBase::eval() to evaluate a given expression with this specified runtime-time value.
+    */
+  SymbolValue<Tag> operator=(Index val) const {
+    return SymbolValue<Tag>(val);
+  }
+
+  Index eval_impl(const SymbolValue<Tag> &values) const { return values.value(); }
+
+#if EIGEN_HAS_CXX14
+  // C++14 versions suitable for multiple symbols
+  template<typename... Types>
+  Index eval_impl(const std::tuple<Types...>& values) const { return std::get<SymbolValue<Tag> >(values).value(); }
+#endif
+};
+
+template<typename Arg0>
+class NegateExpr : public BaseExpr<NegateExpr<Arg0> >
+{
+public:
+  NegateExpr(const Arg0& arg0) : m_arg0(arg0) {}
+
+  template<typename T>
+  Index eval_impl(const T& values) const { return -m_arg0.eval_impl(values); }
+protected:
+  Arg0 m_arg0;
+};
+
+template<typename Arg0, typename Arg1>
+class AddExpr : public BaseExpr<AddExpr<Arg0,Arg1> >
+{
+public:
+  AddExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {}
+
+  template<typename T>
+  Index eval_impl(const T& values) const { return m_arg0.eval_impl(values) + m_arg1.eval_impl(values); }
+protected:
+  Arg0 m_arg0;
+  Arg1 m_arg1;
+};
+
+template<typename Arg0, typename Arg1>
+class ProductExpr : public BaseExpr<ProductExpr<Arg0,Arg1> >
+{
+public:
+  ProductExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {}
+
+  template<typename T>
+  Index eval_impl(const T& values) const { return m_arg0.eval_impl(values) * m_arg1.eval_impl(values); }
+protected:
+  Arg0 m_arg0;
+  Arg1 m_arg1;
+};
+
+template<typename Arg0, typename Arg1>
+class QuotientExpr : public BaseExpr<QuotientExpr<Arg0,Arg1> >
+{
+public:
+  QuotientExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {}
+
+  template<typename T>
+  Index eval_impl(const T& values) const { return m_arg0.eval_impl(values) / m_arg1.eval_impl(values); }
+protected:
+  Arg0 m_arg0;
+  Arg1 m_arg1;
+};
+
+} // end namespace symbolic
+
+} // end namespace Eigen
+
+#endif // EIGEN_SYMBOLIC_INDEX_H
diff --git a/src/3rdparty/eigen/Eigen/src/Core/util/XprHelper.h b/src/3rdparty/eigen/Eigen/src/Core/util/XprHelper.h
new file mode 100644
index 000000000..71c32b8a1
--- /dev/null
+++ b/src/3rdparty/eigen/Eigen/src/Core/util/XprHelper.h
@@ -0,0 +1,856 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_XPRHELPER_H
+#define EIGEN_XPRHELPER_H
+
+// just a workaround because GCC seems to not really like empty structs
+// FIXME: gcc 4.3 generates bad code when strict-aliasing is enabled
+// so currently we simply disable this optimization for gcc 4.3
+#if EIGEN_COMP_GNUC && !EIGEN_GNUC_AT(4,3)
+  #define EIGEN_EMPTY_STRUCT_CTOR(X) \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE X() {} \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE X(const X& ) {}
+#else
+  #define EIGEN_EMPTY_STRUCT_CTOR(X)
+#endif
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename IndexDest, typename IndexSrc>
+EIGEN_DEVICE_FUNC
+inline IndexDest convert_index(const IndexSrc& idx) {
+  // for sizeof(IndexDest)>=sizeof(IndexSrc) compilers should be able to optimize this away:
+  eigen_internal_assert(idx <= NumTraits<IndexDest>::highest() && "Index value to big for target type");
+  return IndexDest(idx);
+}
+
+// true if T can be considered as an integral index (i.e., and integral type or enum)
+template<typename T> struct is_valid_index_type
+{
+  enum { value =
+#if EIGEN_HAS_TYPE_TRAITS
+    internal::is_integral<T>::value || std::is_enum<T>::value
+#elif EIGEN_COMP_MSVC
+    internal::is_integral<T>::value || __is_enum(T)
+#else
+    // without C++11, we use is_convertible to Index instead of is_integral in order to treat enums as Index.
+    internal::is_convertible<T,Index>::value && !internal::is_same<T,float>::value && !is_same<T,double>::value
+#endif
+  };
+};
+
+// true if both types are not valid index types
+template<typename RowIndices, typename ColIndices>
+struct valid_indexed_view_overload {
+  enum { value = !(internal::is_valid_index_type<RowIndices>::value && internal::is_valid_index_type<ColIndices>::value) };
+};
+
+// promote_scalar_arg is an helper used in operation between an expression and a scalar, like:
+//    expression * scalar
+// Its role is to determine how the type T of the scalar operand should be promoted given the scalar type ExprScalar of the given expression.
+// The IsSupported template parameter must be provided by the caller as: internal::has_ReturnType<ScalarBinaryOpTraits<ExprScalar,T,op> >::value using the proper order for ExprScalar and T.
+// Then the logic is as follows:
+//  - if the operation is natively supported as defined by IsSupported, then the scalar type is not promoted, and T is returned.
+//  - otherwise, NumTraits<ExprScalar>::Literal is returned if T is implicitly convertible to NumTraits<ExprScalar>::Literal AND that this does not imply a float to integer conversion.
+//  - otherwise, ExprScalar is returned if T is implicitly convertible to ExprScalar AND that this does not imply a float to integer conversion.
+//  - In all other cases, the promoted type is not defined, and the respective operation is thus invalid and not available (SFINAE).
+template<typename ExprScalar,typename T, bool IsSupported>
+struct promote_scalar_arg;
+
+template<typename S,typename T>
+struct promote_scalar_arg<S,T,true>
+{
+  typedef T type;
+};
+
+// Recursively check safe conversion to PromotedType, and then ExprScalar if they are different.
+template<typename ExprScalar,typename T,typename PromotedType,
+  bool ConvertibleToLiteral = internal::is_convertible<T,PromotedType>::value,
+  bool IsSafe = NumTraits<T>::IsInteger || !NumTraits<PromotedType>::IsInteger>
+struct promote_scalar_arg_unsupported;
+
+// Start recursion with NumTraits<ExprScalar>::Literal
+template<typename S,typename T>
+struct promote_scalar_arg<S,T,false> : promote_scalar_arg_unsupported<S,T,typename NumTraits<S>::Literal> {};
+
+// We found a match!
+template<typename S,typename T, typename PromotedType>
+struct promote_scalar_arg_unsupported<S,T,PromotedType,true,true>
+{
+  typedef PromotedType type;
+};
+
+// No match, but no real-to-integer issues, and ExprScalar and current PromotedType are different,
+// so let's try to promote to ExprScalar
+template<typename ExprScalar,typename T, typename PromotedType>
+struct promote_scalar_arg_unsupported<ExprScalar,T,PromotedType,false,true>
+   : promote_scalar_arg_unsupported<ExprScalar,T,ExprScalar>
+{};
+
+// Unsafe real-to-integer, let's stop.
+template<typename S,typename T, typename PromotedType, bool ConvertibleToLiteral>
+struct promote_scalar_arg_unsupported<S,T,PromotedType,ConvertibleToLiteral,false> {};
+
+// T is not even convertible to ExprScalar, let's stop.
+template<typename S,typename T>
+struct promote_scalar_arg_unsupported<S,T,S,false,true> {};
+
+//classes inheriting no_assignment_operator don't generate a default operator=.
+class no_assignment_operator
+{
+  private:
+    no_assignment_operator& operator=(const no_assignment_operator&);
+  protected:
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(no_assignment_operator)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(no_assignment_operator)
+};
+
+/** \internal return the index type with the largest number of bits */
+template<typename I1, typename I2>
+struct promote_index_type
+{
+  typedef typename conditional<(sizeof(I1)<sizeof(I2)), I2, I1>::type type;
+};
+
+/** \internal If the template parameter Value is Dynamic, this class is just a wrapper around a T variable that
+  * can be accessed using value() and setValue().
+  * Otherwise, this class is an empty structure and value() just returns the template parameter Value.
+  */
+template<typename T, int Value> class variable_if_dynamic
+{
+  public:
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(variable_if_dynamic)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); }
+    EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    T value() { return T(Value); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    operator T() const { return T(Value); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    void setValue(T v) const { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); }
+};
+
+template<typename T> class variable_if_dynamic<T, Dynamic>
+{
+    T m_value;
+  public:
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value = 0) EIGEN_NO_THROW : m_value(value) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T value() const { return m_value; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator T() const { return m_value; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; }
+};
+
+/** \internal like variable_if_dynamic but for DynamicIndex
+  */
+template<typename T, int Value> class variable_if_dynamicindex
+{
+  public:
+    EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamicindex)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamicindex(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); }
+    EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    T value() { return T(Value); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    void setValue(T) {}
+};
+
+template<typename T> class variable_if_dynamicindex<T, DynamicIndex>
+{
+    T m_value;
+    EIGEN_DEVICE_FUNC variable_if_dynamicindex() { eigen_assert(false); }
+  public:
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamicindex(T value) : m_value(value) {}
+    EIGEN_DEVICE_FUNC T EIGEN_STRONG_INLINE value() const { return m_value; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; }
+};
+
+template<typename T> struct functor_traits
+{
+  enum
+  {
+    Cost = 10,
+    PacketAccess = false,
+    IsRepeatable = false
+  };
+};
+
+template<typename T> struct packet_traits;
+
+template<typename T> struct unpacket_traits;
+
+template<int Size, typename PacketType,
+         bool Stop = Size==Dynamic || (Size%unpacket_traits<PacketType>::size)==0 || is_same<PacketType,typename unpacket_traits<PacketType>::half>::value>
+struct find_best_packet_helper;
+
+template< int Size, typename PacketType>
+struct find_best_packet_helper<Size,PacketType,true>
+{
+  typedef PacketType type;
+};
+
+template<int Size, typename PacketType>
+struct find_best_packet_helper<Size,PacketType,false>
+{
+  typedef typename find_best_packet_helper<Size,typename unpacket_traits<PacketType>::half>::type type;
+};
+
+template<typename T, int Size>
+struct find_best_packet
+{
+  typedef typename find_best_packet_helper<Size,typename packet_traits<T>::type>::type type;
+};
+
+#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
+template<int ArrayBytes, int AlignmentBytes,
+         bool Match     =  bool((ArrayBytes%AlignmentBytes)==0),
+         bool TryHalf   =  bool(EIGEN_MIN_ALIGN_BYTES<AlignmentBytes) >
+struct compute_default_alignment_helper
+{
+  enum { value = 0 };
+};
+
+template<int ArrayBytes, int AlignmentBytes, bool TryHalf>
+struct compute_default_alignment_helper<ArrayBytes, AlignmentBytes, true, TryHalf> // Match
+{
+  enum { value = AlignmentBytes };
+};
+
+template<int ArrayBytes, int AlignmentBytes>
+struct compute_default_alignment_helper<ArrayBytes, AlignmentBytes, false, true> // Try-half
+{
+  // current packet too large, try with an half-packet
+  enum { value = compute_default_alignment_helper<ArrayBytes, AlignmentBytes/2>::value };
+};
+#else
+// If static alignment is disabled, no need to bother.
+// This also avoids a division by zero in "bool Match =  bool((ArrayBytes%AlignmentBytes)==0)"
+template<int ArrayBytes, int AlignmentBytes>
+struct compute_default_alignment_helper
+{
+  enum { value = 0 };
+};
+#endif
+
+template<typename T, int Size> struct compute_default_alignment {
+  enum { value = compute_default_alignment_helper<Size*sizeof(T),EIGEN_MAX_STATIC_ALIGN_BYTES>::value };
+};
+
+template<typename T> struct compute_default_alignment<T,Dynamic> {
+  enum { value = EIGEN_MAX_ALIGN_BYTES };
+};
+
+template<typename _Scalar, int _Rows, int _Cols,
+         int _Options = AutoAlign |
+                          ( (_Rows==1 && _Cols!=1) ? RowMajor
+                          : (_Cols==1 && _Rows!=1) ? ColMajor
+                          : EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION ),
+         int _MaxRows = _Rows,
+         int _MaxCols = _Cols
+> class make_proper_matrix_type
+{
+    enum {
+      IsColVector = _Cols==1 && _Rows!=1,
+      IsRowVector = _Rows==1 && _Cols!=1,
+      Options = IsColVector ? (_Options | ColMajor) & ~RowMajor
+              : IsRowVector ? (_Options | RowMajor) & ~ColMajor
+              : _Options
+    };
+  public:
+    typedef Matrix<_Scalar, _Rows, _Cols, Options, _MaxRows, _MaxCols> type;
+};
+
+template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+class compute_matrix_flags
+{
+    enum { row_major_bit = Options&RowMajor ? RowMajorBit : 0 };
+  public:
+    // FIXME currently we still have to handle DirectAccessBit at the expression level to handle DenseCoeffsBase<>
+    // and then propagate this information to the evaluator's flags.
+    // However, I (Gael) think that DirectAccessBit should only matter at the evaluation stage.
+    enum { ret = DirectAccessBit | LvalueBit | NestByRefBit | row_major_bit };
+};
+
+template<int _Rows, int _Cols> struct size_at_compile_time
+{
+  enum { ret = (_Rows==Dynamic || _Cols==Dynamic) ? Dynamic : _Rows * _Cols };
+};
+
+template<typename XprType> struct size_of_xpr_at_compile_time
+{
+  enum { ret = size_at_compile_time<traits<XprType>::RowsAtCompileTime,traits<XprType>::ColsAtCompileTime>::ret };
+};
+
+/* plain_matrix_type : the difference from eval is that plain_matrix_type is always a plain matrix type,
+ * whereas eval is a const reference in the case of a matrix
+ */
+
+template<typename T, typename StorageKind = typename traits<T>::StorageKind> struct plain_matrix_type;
+template<typename T, typename BaseClassType, int Flags> struct plain_matrix_type_dense;
+template<typename T> struct plain_matrix_type<T,Dense>
+{
+  typedef typename plain_matrix_type_dense<T,typename traits<T>::XprKind, traits<T>::Flags>::type type;
+};
+template<typename T> struct plain_matrix_type<T,DiagonalShape>
+{
+  typedef typename T::PlainObject type;
+};
+
+template<typename T, int Flags> struct plain_matrix_type_dense<T,MatrixXpr,Flags>
+{
+  typedef Matrix<typename traits<T>::Scalar,
+                traits<T>::RowsAtCompileTime,
+                traits<T>::ColsAtCompileTime,
+                AutoAlign | (Flags&RowMajorBit ? RowMajor : ColMajor),
+                traits<T>::MaxRowsAtCompileTime,
+                traits<T>::MaxColsAtCompileTime
+          > type;
+};
+
+template<typename T, int Flags> struct plain_matrix_type_dense<T,ArrayXpr,Flags>
+{
+  typedef Array<typename traits<T>::Scalar,
+                traits<T>::RowsAtCompileTime,
+                traits<T>::ColsAtCompileTime,
+                AutoAlign | (Flags&RowMajorBit ? RowMajor : ColMajor),
+                traits<T>::MaxRowsAtCompileTime,
+                traits<T>::MaxColsAtCompileTime
+          > type;
+};
+
+/* eval : the return type of eval(). For matrices, this is just a const reference
+ * in order to avoid a useless copy
+ */
+
+template<typename T, typename StorageKind = typename traits<T>::StorageKind> struct eval;
+
+template<typename T> struct eval<T,Dense>
+{
+  typedef typename plain_matrix_type<T>::type type;
+//   typedef typename T::PlainObject type;
+//   typedef T::Matrix<typename traits<T>::Scalar,
+//                 traits<T>::RowsAtCompileTime,
+//                 traits<T>::ColsAtCompileTime,
+//                 AutoAlign | (traits<T>::Flags&RowMajorBit ? RowMajor : ColMajor),
+//                 traits<T>::MaxRowsAtCompileTime,
+//                 traits<T>::MaxColsAtCompileTime
+//           > type;
+};
+
+template<typename T> struct eval<T,DiagonalShape>
+{
+  typedef typename plain_matrix_type<T>::type type;
+};
+
+// for matrices, no need to evaluate, just use a const reference to avoid a useless copy
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct eval<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>, Dense>
+{
+  typedef const Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>& type;
+};
+
+template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
+struct eval<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>, Dense>
+{
+  typedef const Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>& type;
+};
+
+
+/* similar to plain_matrix_type, but using the evaluator's Flags */
+template<typename T, typename StorageKind = typename traits<T>::StorageKind> struct plain_object_eval;
+
+template<typename T>
+struct plain_object_eval<T,Dense>
+{
+  typedef typename plain_matrix_type_dense<T,typename traits<T>::XprKind, evaluator<T>::Flags>::type type;
+};
+
+
+/* plain_matrix_type_column_major : same as plain_matrix_type but guaranteed to be column-major
+ */
+template<typename T> struct plain_matrix_type_column_major
+{
+  enum { Rows = traits<T>::RowsAtCompileTime,
+         Cols = traits<T>::ColsAtCompileTime,
+         MaxRows = traits<T>::MaxRowsAtCompileTime,
+         MaxCols = traits<T>::MaxColsAtCompileTime
+  };
+  typedef Matrix<typename traits<T>::Scalar,
+                Rows,
+                Cols,
+                (MaxRows==1&&MaxCols!=1) ? RowMajor : ColMajor,
+                MaxRows,
+                MaxCols
+          > type;
+};
+
+/* plain_matrix_type_row_major : same as plain_matrix_type but guaranteed to be row-major
+ */
+template<typename T> struct plain_matrix_type_row_major
+{
+  enum { Rows = traits<T>::RowsAtCompileTime,
+         Cols = traits<T>::ColsAtCompileTime,
+         MaxRows = traits<T>::MaxRowsAtCompileTime,
+         MaxCols = traits<T>::MaxColsAtCompileTime
+  };
+  typedef Matrix<typename traits<T>::Scalar,
+                Rows,
+                Cols,
+                (MaxCols==1&&MaxRows!=1) ? ColMajor : RowMajor,
+                MaxRows,
+                MaxCols
+          > type;
+};
+
+/** \internal The reference selector for template expressions. The idea is that we don't
+  * need to use references for expressions since they are light weight proxy
+  * objects which should generate no copying overhead. */
+template <typename T>
+struct ref_selector
+{
+  typedef typename conditional<
+    bool(traits<T>::Flags & NestByRefBit),
+    T const&,
+    const T
+  >::type type;
+
+  typedef typename conditional<
+    bool(traits<T>::Flags & NestByRefBit),
+    T &,
+    T
+  >::type non_const_type;
+};
+
+/** \internal Adds the const qualifier on the value-type of T2 if and only if T1 is a const type */
+template<typename T1, typename T2>
+struct transfer_constness
+{
+  typedef typename conditional<
+    bool(internal::is_const<T1>::value),
+    typename internal::add_const_on_value_type<T2>::type,
+    T2
+  >::type type;
+};
+
+
+// However, we still need a mechanism to detect whether an expression which is evaluated multiple time
+// has to be evaluated into a temporary.
+// That's the purpose of this new nested_eval helper:
+/** \internal Determines how a given expression should be nested when evaluated multiple times.
+  * For example, when you do a * (b+c), Eigen will determine how the expression b+c should be
+  * evaluated into the bigger product expression. The choice is between nesting the expression b+c as-is, or
+  * evaluating that expression b+c into a temporary variable d, and nest d so that the resulting expression is
+  * a*d. Evaluating can be beneficial for example if every coefficient access in the resulting expression causes
+  * many coefficient accesses in the nested expressions -- as is the case with matrix product for example.
+  *
+  * \tparam T the type of the expression being nested.
+  * \tparam n the number of coefficient accesses in the nested expression for each coefficient access in the bigger expression.
+  * \tparam PlainObject the type of the temporary if needed.
+  */
+template<typename T, int n, typename PlainObject = typename plain_object_eval<T>::type> struct nested_eval
+{
+  enum {
+    ScalarReadCost = NumTraits<typename traits<T>::Scalar>::ReadCost,
+    CoeffReadCost = evaluator<T>::CoeffReadCost,  // NOTE What if an evaluator evaluate itself into a temporary?
+                                                  //      Then CoeffReadCost will be small (e.g., 1) but we still have to evaluate, especially if n>1.
+                                                  //      This situation is already taken care by the EvalBeforeNestingBit flag, which is turned ON
+                                                  //      for all evaluator creating a temporary. This flag is then propagated by the parent evaluators.
+                                                  //      Another solution could be to count the number of temps?
+    NAsInteger = n == Dynamic ? HugeCost : n,
+    CostEval   = (NAsInteger+1) * ScalarReadCost + CoeffReadCost,
+    CostNoEval = NAsInteger * CoeffReadCost,
+    Evaluate = (int(evaluator<T>::Flags) & EvalBeforeNestingBit) || (int(CostEval) < int(CostNoEval))
+  };
+
+  typedef typename conditional<Evaluate, PlainObject, typename ref_selector<T>::type>::type type;
+};
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+inline T* const_cast_ptr(const T* ptr)
+{
+  return const_cast<T*>(ptr);
+}
+
+template<typename Derived, typename XprKind = typename traits<Derived>::XprKind>
+struct dense_xpr_base
+{
+  /* dense_xpr_base should only ever be used on dense expressions, thus falling either into the MatrixXpr or into the ArrayXpr cases */
+};
+
+template<typename Derived>
+struct dense_xpr_base<Derived, MatrixXpr>
+{
+  typedef MatrixBase<Derived> type;
+};
+
+template<typename Derived>
+struct dense_xpr_base<Derived, ArrayXpr>
+{
+  typedef ArrayBase<Derived> type;
+};
+
+template<typename Derived, typename XprKind = typename traits<Derived>::XprKind, typename StorageKind = typename traits<Derived>::StorageKind>
+struct generic_xpr_base;
+
+template<typename Derived, typename XprKind>
+struct generic_xpr_base<Derived, XprKind, Dense>
+{
+  typedef typename dense_xpr_base<Derived,XprKind>::type type;
+};
+
+template<typename XprType, typename CastType> struct cast_return_type
+{
+  typedef typename XprType::Scalar CurrentScalarType;
+  typedef typename remove_all<CastType>::type _CastType;
+  typedef typename _CastType::Scalar NewScalarType;
+  typedef typename conditional<is_same<CurrentScalarType,NewScalarType>::value,
+                              const XprType&,CastType>::type type;
+};
+
+template <typename A, typename B> struct promote_storage_type;
+
+template <typename A> struct promote_storage_type<A,A>
+{
+  typedef A ret;
+};
+template <typename A> struct promote_storage_type<A, const A>
+{
+  typedef A ret;
+};
+template <typename A> struct promote_storage_type<const A, A>
+{
+  typedef A ret;
+};
+
+/** \internal Specify the "storage kind" of applying a coefficient-wise
+  * binary operations between two expressions of kinds A and B respectively.
+  * The template parameter Functor permits to specialize the resulting storage kind wrt to
+  * the functor.
+  * The default rules are as follows:
+  * \code
+  * A      op A      -> A
+  * A      op dense  -> dense
+  * dense  op B      -> dense
+  * sparse op dense  -> sparse
+  * dense  op sparse -> sparse
+  * \endcode
+  */
+template <typename A, typename B, typename Functor> struct cwise_promote_storage_type;
+
+template <typename A, typename Functor>                   struct cwise_promote_storage_type<A,A,Functor>                                      { typedef A      ret; };
+template <typename Functor>                               struct cwise_promote_storage_type<Dense,Dense,Functor>                              { typedef Dense  ret; };
+template <typename A, typename Functor>                   struct cwise_promote_storage_type<A,Dense,Functor>                                  { typedef Dense  ret; };
+template <typename B, typename Functor>                   struct cwise_promote_storage_type<Dense,B,Functor>                                  { typedef Dense  ret; };
+template <typename Functor>                               struct cwise_promote_storage_type<Sparse,Dense,Functor>                             { typedef Sparse ret; };
+template <typename Functor>                               struct cwise_promote_storage_type<Dense,Sparse,Functor>                             { typedef Sparse ret; };
+
+template <typename LhsKind, typename RhsKind, int LhsOrder, int RhsOrder> struct cwise_promote_storage_order {
+  enum { value = LhsOrder };
+};
+
+template <typename LhsKind, int LhsOrder, int RhsOrder>   struct cwise_promote_storage_order<LhsKind,Sparse,LhsOrder,RhsOrder>                { enum { value = RhsOrder }; };
+template <typename RhsKind, int LhsOrder, int RhsOrder>   struct cwise_promote_storage_order<Sparse,RhsKind,LhsOrder,RhsOrder>                { enum { value = LhsOrder }; };
+template <int Order>                                      struct cwise_promote_storage_order<Sparse,Sparse,Order,Order>                       { enum { value = Order }; };
+
+
+/** \internal Specify the "storage kind" of multiplying an expression of kind A with kind B.
+  * The template parameter ProductTag permits to specialize the resulting storage kind wrt to
+  * some compile-time properties of the product: GemmProduct, GemvProduct, OuterProduct, InnerProduct.
+  * The default rules are as follows:
+  * \code
+  *  K * K            -> K
+  *  dense * K        -> dense
+  *  K * dense        -> dense
+  *  diag * K         -> K
+  *  K * diag         -> K
+  *  Perm * K         -> K
+  * K * Perm          -> K
+  * \endcode
+  */
+template <typename A, typename B, int ProductTag> struct product_promote_storage_type;
+
+template <typename A, int ProductTag> struct product_promote_storage_type<A,                  A,                  ProductTag> { typedef A     ret;};
+template <int ProductTag>             struct product_promote_storage_type<Dense,              Dense,              ProductTag> { typedef Dense ret;};
+template <typename A, int ProductTag> struct product_promote_storage_type<A,                  Dense,              ProductTag> { typedef Dense ret; };
+template <typename B, int ProductTag> struct product_promote_storage_type<Dense,              B,                  ProductTag> { typedef Dense ret; };
+
+template <typename A, int ProductTag> struct product_promote_storage_type<A,                  DiagonalShape,      ProductTag> { typedef A ret; };
+template <typename B, int ProductTag> struct product_promote_storage_type<DiagonalShape,      B,                  ProductTag> { typedef B ret; };
+template <int ProductTag>             struct product_promote_storage_type<Dense,              DiagonalShape,      ProductTag> { typedef Dense ret; };
+template <int ProductTag>             struct product_promote_storage_type<DiagonalShape,      Dense,              ProductTag> { typedef Dense ret; };
+
+template <typename A, int ProductTag> struct product_promote_storage_type<A,                  PermutationStorage, ProductTag> { typedef A ret; };
+template <typename B, int ProductTag> struct product_promote_storage_type<PermutationStorage, B,                  ProductTag> { typedef B ret; };
+template <int ProductTag>             struct product_promote_storage_type<Dense,              PermutationStorage, ProductTag> { typedef Dense ret; };
+template <int ProductTag>             struct product_promote_storage_type<PermutationStorage, Dense,              ProductTag> { typedef Dense ret; };
+
+/** \internal gives the plain matrix or array type to store a row/column/diagonal of a matrix type.
+  * \tparam Scalar optional parameter allowing to pass a different scalar type than the one of the MatrixType.
+  */
+template<typename ExpressionType, typename Scalar = typename ExpressionType::Scalar>
+struct plain_row_type
+{
+  typedef Matrix<Scalar, 1, ExpressionType::ColsAtCompileTime,
+                 int(ExpressionType::PlainObject::Options) | int(RowMajor), 1, ExpressionType::MaxColsAtCompileTime> MatrixRowType;
+  typedef Array<Scalar, 1, ExpressionType::ColsAtCompileTime,
+                 int(ExpressionType::PlainObject::Options) | int(RowMajor), 1, ExpressionType::MaxColsAtCompileTime> ArrayRowType;
+
+  typedef typename conditional<
+    is_same< typename traits<ExpressionType>::XprKind, MatrixXpr >::value,
+    MatrixRowType,
+    ArrayRowType
+  >::type type;
+};
+
+template<typename ExpressionType, typename Scalar = typename ExpressionType::Scalar>
+struct plain_col_type
+{
+  typedef Matrix<Scalar, ExpressionType::RowsAtCompileTime, 1,
+                 ExpressionType::PlainObject::Options & ~RowMajor, ExpressionType::MaxRowsAtCompileTime, 1> MatrixColType;
+  typedef Array<Scalar, ExpressionType::RowsAtCompileTime, 1,
+                 ExpressionType::PlainObject::Options & ~RowMajor, ExpressionType::MaxRowsAtCompileTime, 1> ArrayColType;
+
+  typedef typename conditional<
+    is_same< typename traits<ExpressionType>::XprKind, MatrixXpr >::value,
+    MatrixColType,
+    ArrayColType
+  >::type type;
+};
+
+template<typename ExpressionType, typename Scalar = typename ExpressionType::Scalar>
+struct plain_diag_type
+{
+  enum { diag_size = EIGEN_SIZE_MIN_PREFER_DYNAMIC(ExpressionType::RowsAtCompileTime, ExpressionType::ColsAtCompileTime),
+         max_diag_size = EIGEN_SIZE_MIN_PREFER_FIXED(ExpressionType::MaxRowsAtCompileTime, ExpressionType::MaxColsAtCompileTime)
+  };
+  typedef Matrix<Scalar, diag_size, 1, ExpressionType::PlainObject::Options & ~RowMajor, max_diag_size, 1> MatrixDiagType;
+  typedef Array<Scalar, diag_size, 1, ExpressionType::PlainObject::Options & ~RowMajor, max_diag_size, 1> ArrayDiagType;
+
+  typedef typename conditional<
+    is_same< typename traits<ExpressionType>::XprKind, MatrixXpr >::value,
+    MatrixDiagType,
+    ArrayDiagType
+  >::type type;
+};
+
+template<typename Expr,typename Scalar = typename Expr::Scalar>
+struct plain_constant_type
+{
+  enum { Options = (traits<Expr>::Flags&RowMajorBit)?RowMajor:0 };
+
+  typedef Array<Scalar,  traits<Expr>::RowsAtCompileTime,   traits<Expr>::ColsAtCompileTime,
+                Options, traits<Expr>::MaxRowsAtCompileTime,traits<Expr>::MaxColsAtCompileTime> array_type;
+
+  typedef Matrix<Scalar,  traits<Expr>::RowsAtCompileTime,   traits<Expr>::ColsAtCompileTime,
+                 Options, traits<Expr>::MaxRowsAtCompileTime,traits<Expr>::MaxColsAtCompileTime> matrix_type;
+
+  typedef CwiseNullaryOp<scalar_constant_op<Scalar>, const typename conditional<is_same< typename traits<Expr>::XprKind, MatrixXpr >::value, matrix_type, array_type>::type > type;
+};
+
+template<typename ExpressionType>
+struct is_lvalue
+{
+  enum { value = (!bool(is_const<ExpressionType>::value)) &&
+                 bool(traits<ExpressionType>::Flags & LvalueBit) };
+};
+
+template<typename T> struct is_diagonal
+{ enum { ret = false }; };
+
+template<typename T> struct is_diagonal<DiagonalBase<T> >
+{ enum { ret = true }; };
+
+template<typename T> struct is_diagonal<DiagonalWrapper<T> >
+{ enum { ret = true }; };
+
+template<typename T, int S> struct is_diagonal<DiagonalMatrix<T,S> >
+{ enum { ret = true }; };
+
+
+template<typename T> struct is_identity
+{ enum { value = false }; };
+
+template<typename T> struct is_identity<CwiseNullaryOp<internal::scalar_identity_op<typename T::Scalar>, T> >
+{ enum { value = true }; };
+
+
+template<typename S1, typename S2> struct glue_shapes;
+template<> struct glue_shapes<DenseShape,TriangularShape> { typedef TriangularShape type;  };
+
+template<typename T1, typename T2>
+struct possibly_same_dense {
+  enum { value = has_direct_access<T1>::ret && has_direct_access<T2>::ret && is_same<typename T1::Scalar,typename T2::Scalar>::value };
+};
+
+template<typename T1, typename T2>
+EIGEN_DEVICE_FUNC
+bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if<possibly_same_dense<T1,T2>::value>::type * = 0)
+{
+  return (mat1.data()==mat2.data()) && (mat1.innerStride()==mat2.innerStride()) && (mat1.outerStride()==mat2.outerStride());
+}
+
+template<typename T1, typename T2>
+EIGEN_DEVICE_FUNC
+bool is_same_dense(const T1 &, const T2 &, typename enable_if<!possibly_same_dense<T1,T2>::value>::type * = 0)
+{
+  return false;
+}
+
+// Internal helper defining the cost of a scalar division for the type T.
+// The default heuristic can be specialized for each scalar type and architecture.
+template<typename T,bool Vectorized=false,typename EnableIf = void>
+struct scalar_div_cost {
+  enum { value = 8*NumTraits<T>::MulCost };
+};
+
+template<typename T,bool Vectorized>
+struct scalar_div_cost<std::complex<T>, Vectorized> {
+  enum { value = 2*scalar_div_cost<T>::value
+               + 6*NumTraits<T>::MulCost
+               + 3*NumTraits<T>::AddCost
+  };
+};
+
+
+template<bool Vectorized>
+struct scalar_div_cost<signed long,Vectorized,typename conditional<sizeof(long)==8,void,false_type>::type> { enum { value = 24 }; };
+template<bool Vectorized>
+struct scalar_div_cost<unsigned long,Vectorized,typename conditional<sizeof(long)==8,void,false_type>::type> { enum { value = 21 }; };
+
+
+#ifdef EIGEN_DEBUG_ASSIGN
+std::string demangle_traversal(int t)
+{
+  if(t==DefaultTraversal) return "DefaultTraversal";
+  if(t==LinearTraversal) return "LinearTraversal";
+  if(t==InnerVectorizedTraversal) return "InnerVectorizedTraversal";
+  if(t==LinearVectorizedTraversal) return "LinearVectorizedTraversal";
+  if(t==SliceVectorizedTraversal) return "SliceVectorizedTraversal";
+  return "?";
+}
+std::string demangle_unrolling(int t)
+{
+  if(t==NoUnrolling) return "NoUnrolling";
+  if(t==InnerUnrolling) return "InnerUnrolling";
+  if(t==CompleteUnrolling) return "CompleteUnrolling";
+  return "?";
+}
+std::string demangle_flags(int f)
+{
+  std::string res;
+  if(f&RowMajorBit)                 res += " | RowMajor";
+  if(f&PacketAccessBit)             res += " | Packet";
+  if(f&LinearAccessBit)             res += " | Linear";
+  if(f&LvalueBit)                   res += " | Lvalue";
+  if(f&DirectAccessBit)             res += " | Direct";
+  if(f&NestByRefBit)                res += " | NestByRef";
+  if(f&NoPreferredStorageOrderBit)  res += " | NoPreferredStorageOrderBit";
+
+  return res;
+}
+#endif
+
+} // end namespace internal
+
+
+/** \class ScalarBinaryOpTraits
+  * \ingroup Core_Module
+  *
+  * \brief Determines whether the given binary operation of two numeric types is allowed and what the scalar return type is.
+  *
+  * This class permits to control the scalar return type of any binary operation performed on two different scalar types through (partial) template specializations.
+  *
+  * For instance, let \c U1, \c U2 and \c U3 be three user defined scalar types for which most operations between instances of \c U1 and \c U2 returns an \c U3.
+  * You can let %Eigen knows that by defining:
+    \code
+    template<typename BinaryOp>
+    struct ScalarBinaryOpTraits<U1,U2,BinaryOp> { typedef U3 ReturnType;  };
+    template<typename BinaryOp>
+    struct ScalarBinaryOpTraits<U2,U1,BinaryOp> { typedef U3 ReturnType;  };
+    \endcode
+  * You can then explicitly disable some particular operations to get more explicit error messages:
+    \code
+    template<>
+    struct ScalarBinaryOpTraits<U1,U2,internal::scalar_max_op<U1,U2> > {};
+    \endcode
+  * Or customize the return type for individual operation:
+    \code
+    template<>
+    struct ScalarBinaryOpTraits<U1,U2,internal::scalar_sum_op<U1,U2> > { typedef U1 ReturnType; };
+    \endcode
+  *
+  * By default, the following generic combinations are supported:
+  <table class="manual">
+  <tr><th>ScalarA</th><th>ScalarB</th><th>BinaryOp</th><th>ReturnType</th><th>Note</th></tr>
+  <tr            ><td>\c T </td><td>\c T </td><td>\c * </td><td>\c T </td><td></td></tr>
+  <tr class="alt"><td>\c NumTraits<T>::Real </td><td>\c T </td><td>\c * </td><td>\c T </td><td>Only if \c NumTraits<T>::IsComplex </td></tr>
+  <tr            ><td>\c T </td><td>\c NumTraits<T>::Real </td><td>\c * </td><td>\c T </td><td>Only if \c NumTraits<T>::IsComplex </td></tr>
+  </table>
+  *
+  * \sa CwiseBinaryOp
+  */
+template<typename ScalarA, typename ScalarB, typename BinaryOp=internal::scalar_product_op<ScalarA,ScalarB> >
+struct ScalarBinaryOpTraits
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  // for backward compatibility, use the hints given by the (deprecated) internal::scalar_product_traits class.
+  : internal::scalar_product_traits<ScalarA,ScalarB>
+#endif // EIGEN_PARSED_BY_DOXYGEN
+{};
+
+template<typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<T,T,BinaryOp>
+{
+  typedef T ReturnType;
+};
+
+template <typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<T, typename NumTraits<typename internal::enable_if<NumTraits<T>::IsComplex,T>::type>::Real, BinaryOp>
+{
+  typedef T ReturnType;
+};
+template <typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<typename NumTraits<typename internal::enable_if<NumTraits<T>::IsComplex,T>::type>::Real, T, BinaryOp>
+{
+  typedef T ReturnType;
+};
+
+// For Matrix * Permutation
+template<typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<T,void,BinaryOp>
+{
+  typedef T ReturnType;
+};
+
+// For Permutation * Matrix
+template<typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<void,T,BinaryOp>
+{
+  typedef T ReturnType;
+};
+
+// for Permutation*Permutation
+template<typename BinaryOp>
+struct ScalarBinaryOpTraits<void,void,BinaryOp>
+{
+  typedef void ReturnType;
+};
+
+// We require Lhs and Rhs to have "compatible" scalar types.
+// It is tempting to always allow mixing different types but remember that this is often impossible in the vectorized paths.
+// So allowing mixing different types gives very unexpected errors when enabling vectorization, when the user tries to
+// add together a float matrix and a double matrix.
+#define EIGEN_CHECK_BINARY_COMPATIBILIY(BINOP,LHS,RHS) \
+  EIGEN_STATIC_ASSERT((Eigen::internal::has_ReturnType<ScalarBinaryOpTraits<LHS, RHS,BINOP> >::value), \
+    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+
+} // end namespace Eigen
+
+#endif // EIGEN_XPRHELPER_H