diff --git a/CMake/NeighborhoodSearch.cmake b/CMake/NeighborhoodSearch.cmake
index bb9631f9..f208d002 100644
--- a/CMake/NeighborhoodSearch.cmake
+++ b/CMake/NeighborhoodSearch.cmake
@@ -8,7 +8,7 @@ else(BUILD_SHARED_LIBS)
 	set(LIB_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX})
 endif(BUILD_SHARED_LIBS)
 
-option(USE_GPU_NEIGHBORHOOD_SEARCH "Use GPU neighborhood search" OFF)
+option(USE_GPU_NEIGHBORHOOD_SEARCH "Use GPU neighborhood search" ON)
 
 if(USE_GPU_NEIGHBORHOOD_SEARCH)
@@ -24,8 +24,8 @@ if(USE_GPU_NEIGHBORHOOD_SEARCH)
 	ExternalProject_Add(
 		Ext_NeighborhoodSearch
 		PREFIX "${CMAKE_SOURCE_DIR}/extern/cuNSearch"
-		GIT_REPOSITORY https://github.com/InteractiveComputerGraphics/cuNSearch.git
-		GIT_TAG "aba3da18cb4f45cd05d729465d1725891ffc33da"
+		GIT_REPOSITORY https://gitlab.com/R.Baumgartner/cunsearch_update.git
+		GIT_TAG "9cd6d64c03a1b60d6eb99dcaf4fe647f90fe7020"
 		INSTALL_DIR ${ExternalInstallDir}/NeighborhoodSearch
 		CMAKE_ARGS -DCMAKE_BUILD_TYPE=${EXT_CMAKE_BUILD_TYPE} -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_INSTALL_PREFIX:PATH=${ExternalInstallDir}/NeighborhoodSearch -DCUNSEARCH_USE_DOUBLE_PRECISION:BOOL=${USE_DOUBLE_PRECISION} -DBUILD_DEMO:BOOL=OFF
 	)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ba0538b4..89825170 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -82,7 +82,8 @@ ExternalProject_Add(
 	Ext_Discregrid
 	PREFIX "${CMAKE_SOURCE_DIR}/extern/Discregrid"
 	GIT_REPOSITORY https://github.com/InteractiveComputerGraphics/Discregrid.git
-	GIT_TAG "c0fb5aeac4c8a83e9f37c720315f13a834409b81"
+	#GIT_TAG "c0fb5aeac4c8a83e9f37c720315f13a834409b81"
+	GIT_TAG "c992835894e32427f9d4d7262f301df7454150c9"
 	INSTALL_DIR ${ExternalInstallDir}/Discregrid
 	CMAKE_ARGS -DCMAKE_BUILD_TYPE:STRING=${EXT_CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX:PATH=${ExternalInstallDir}/Discregrid -DBUILD_CMD_EXECUTABLE:BOOL=0 -DEIGEN3_INCLUDE_DIR:PATH=${EIGEN3_INCLUDE_DIR}
 )
diff --git a/SPlisHSPlasH/BoundaryModel_Akinci2012.h b/SPlisHSPlasH/BoundaryModel_Akinci2012.h
index 1021a2f0..ac8634ca 100644
--- a/SPlisHSPlasH/BoundaryModel_Akinci2012.h
+++ b/SPlisHSPlasH/BoundaryModel_Akinci2012.h
@@ -46,6 +46,26 @@ namespace SPH
 			virtual void loadState(BinaryFileReader &binReader);
 
 			void initModel(RigidBodyObject *rbo, const unsigned int numBoundaryParticles, Vector3r *boundaryParticles);
+
+			FORCE_INLINE const Vector3r &getRigidBodyPosition()
+			{
+				return m_rigidBody->getPosition();
+			}
+
+			FORCE_INLINE const bool isDynamic()
+			{
+				return m_rigidBody->isDynamic();
+			}
+
+			FORCE_INLINE std::vector<Vector3r> &getForcesPerThread()
+			{
+				return m_forcePerThread;
+			}
+
+			FORCE_INLINE std::vector<Vector3r> &getTorquesPerThread()
+			{
+				return m_torquePerThread;
+			}
 
 			FORCE_INLINE Vector3r &getPosition0(const unsigned int i)
 			{
@@ -77,6 +97,11 @@ namespace SPH
 				m_x[i] = pos;
 			}
 
+			FORCE_INLINE std::vector<Vector3r> &getVelocities()
+			{
+				return m_v;
+			}
+
 			FORCE_INLINE Vector3r &getVelocity(const unsigned int i)
 			{
 				return m_v[i];
@@ -92,6 +117,11 @@ namespace SPH
 				m_v[i] = vel;
 			}
 
+			FORCE_INLINE std::vector<Real>& getVolumes()
+			{
+				return m_V;
+			}
+
 			FORCE_INLINE const Real& getVolume(const unsigned int i) const
 			{
 				return m_V[i];
diff --git a/SPlisHSPlasH/CMakeLists.txt b/SPlisHSPlasH/CMakeLists.txt
index 4b727451..dd2ef482 100644
--- a/SPlisHSPlasH/CMakeLists.txt
+++ b/SPlisHSPlasH/CMakeLists.txt
@@ -1,11 +1,15 @@
 set(WCSPH_HEADER_FILES
 	WCSPH/SimulationDataWCSPH.h
 	WCSPH/TimeStepWCSPH.h
+
+	WCSPH/TimeStepWCSPHGPU.h
 )
 
 set(WCSPH_SOURCE_FILES
 	WCSPH/SimulationDataWCSPH.cpp
 	WCSPH/TimeStepWCSPH.cpp
+
+	WCSPH/TimeStepWCSPHGPU.cu
 )
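# Note: the GPU time step sources (WCSPH/TimeStepWCSPHGPU.cu above and the DFSPH
# counterpart below) are only compiled because the SPlisHSPlasH target later in
# this file is created with cuda_add_library() instead of add_library(). This
# presumes the FindCUDA module (find_package(CUDA)) has already been loaded by the
# top-level CMakeLists.txt before this file is processed.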
set(PCISPH_HEADER_FILES @@ -43,11 +47,15 @@ set(IISPH_SOURCE_FILES set(DFSPH_HEADER_FILES DFSPH/SimulationDataDFSPH.h DFSPH/TimeStepDFSPH.h + + DFSPH/TimeStepDFSPHGPU.h ) set(DFSPH_SOURCE_FILES DFSPH/SimulationDataDFSPH.cpp DFSPH/TimeStepDFSPH.cpp + + DFSPH/TimeStepDFSPHGPU.cu ) set(PF_HEADER_FILES @@ -59,6 +67,11 @@ set(PF_SOURCE_FILES PF/SimulationDataPF.cpp PF/TimeStepPF.cpp ) + +set(GPU_UTILS + UtilitiesGPU/Kernels.cuh + UtilitiesGPU/Kernels.cu + ) set(SURFACETENSION_HEADER_FILES SurfaceTension/SurfaceTensionBase.h @@ -178,7 +191,7 @@ include_directories(${PROJECT_PATH}/extern/install/Discregrid/include) ############################################################ include_directories(${PROJECT_PATH}/extern/install/GenericParameters/include) -add_library(SPlisHSPlasH +cuda_add_library(SPlisHSPlasH Common.h NeighborhoodSearch.h @@ -251,6 +264,8 @@ add_library(SPlisHSPlasH ${UTILS_HEADER_FILES} ${UTILS_SOURCE_FILES} + + ${GPU_UTILS} ) add_dependencies(SPlisHSPlasH Ext_NeighborhoodSearch) diff --git a/SPlisHSPlasH/DFSPH/TimeStepDFSPHGPU.cu b/SPlisHSPlasH/DFSPH/TimeStepDFSPHGPU.cu new file mode 100644 index 00000000..8ac082ea --- /dev/null +++ b/SPlisHSPlasH/DFSPH/TimeStepDFSPHGPU.cu @@ -0,0 +1,949 @@ +#include "TimeStepDFSPHGPU.h" +#include "SPlisHSPlasH/TimeManager.h" +#include "SPlisHSPlasH/SPHKernels.h" +#include "SimulationDataDFSPH.h" +#include +#include "Utilities/Timing.h" +#include "Utilities/Counting.h" +#include "SPlisHSPlasH/Simulation.h" +#include "SPlisHSPlasH/BoundaryModel_Akinci2012.h" +#include "SPlisHSPlasH/BoundaryModel_Koschier2017.h" +#include "SPlisHSPlasH/BoundaryModel_Bender2019.h" + +#include "../../extern/cuNSearch/src/Ext_NeighborhoodSearch/src/PointSetImplementation.h" + + +using namespace SPH; +using namespace std; +using namespace GenParam; +using namespace cuNSearch; + +#define USE_CORRECTED_FORMULATION + +int TimeStepDFSPHGPU::SOLVER_ITERATIONS_V = -1; +int TimeStepDFSPHGPU::MAX_ITERATIONS_V = -1; +int TimeStepDFSPHGPU::MAX_ERROR_V = -1; +int TimeStepDFSPHGPU::USE_DIVERGENCE_SOLVER = -1; + + +TimeStepDFSPHGPU::TimeStepDFSPHGPU() : + TimeStep(), + m_simulationData() +{ + m_simulationData.init(); + m_counter = 0; + m_iterationsV = 0; + m_enableDivergenceSolver = true; + m_maxIterationsV = 100; + m_maxErrorV = 0.1; + + CudaHelper::CudaMalloc(&d_kernelData, 1); + + Simulation *sim = Simulation::getCurrent(); + const unsigned int nModels = sim->numberOfFluidModels(); + for (unsigned int fluidModelIndex = 0; fluidModelIndex < nModels; fluidModelIndex++) + { + FluidModel *model = sim->getFluidModel(fluidModelIndex); + model->addField({ "factor", FieldType::Scalar, [this, fluidModelIndex](const unsigned int i) -> Real* { return &m_simulationData.getFactor(fluidModelIndex, i); } }); + model->addField({ "advected density", FieldType::Scalar, [this, fluidModelIndex](const unsigned int i) -> Real* { return &m_simulationData.getDensityAdv(fluidModelIndex, i); } }); + model->addField({ "kappa", FieldType::Scalar, [this, fluidModelIndex](const unsigned int i) -> Real* { return &m_simulationData.getKappa(fluidModelIndex, i); }, true }); + model->addField({ "kappa_v", FieldType::Scalar, [this, fluidModelIndex](const unsigned int i) -> Real* { return &m_simulationData.getKappaV(fluidModelIndex, i); }, true }); + } +} + +TimeStepDFSPHGPU::~TimeStepDFSPHGPU(void) +{ + CudaHelper::CudaFree(d_kernelData); + CudaHelper::CudaFree(d_neighbors); + CudaHelper::CudaFree(d_neighborCounts); + CudaHelper::CudaFree(d_neighborOffsets); + CudaHelper::CudaFree(d_neighborPointsetIndices); + + 
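	// d_kernelData is allocated once in the constructor, while the flattened
	// neighbor arrays freed above are re-allocated on every call to prepareData().
	// The remaining device buffers are thrust::device_vectors and release their
	// memory automatically when this time step object is destroyed.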
Simulation *sim = Simulation::getCurrent(); + const unsigned int nModels = sim->numberOfFluidModels(); + for (unsigned int fluidModelIndex = 0; fluidModelIndex < nModels; fluidModelIndex++) + { + FluidModel *model = sim->getFluidModel(fluidModelIndex); + model->removeFieldByName("factor"); + model->removeFieldByName("advected density"); + model->removeFieldByName("kappa"); + model->removeFieldByName("kappa_v"); + } +} + +void TimeStepDFSPHGPU::initParameters() +{ + TimeStep::initParameters(); + + SOLVER_ITERATIONS_V = createNumericParameter("iterationsV", "Iterations (divergence)", &m_iterationsV); + setGroup(SOLVER_ITERATIONS_V, "DFSPH"); + setDescription(SOLVER_ITERATIONS_V, "Iterations required by the divergence solver."); + getParameter(SOLVER_ITERATIONS_V)->setReadOnly(true); + + MAX_ITERATIONS_V = createNumericParameter("maxIterationsV", "Max. iterations (divergence)", &m_maxIterationsV); + setGroup(MAX_ITERATIONS_V, "DFSPH"); + setDescription(MAX_ITERATIONS_V, "Maximal number of iterations of the divergence solver."); + static_cast*>(getParameter(MAX_ITERATIONS_V))->setMinValue(1); + + MAX_ERROR_V = createNumericParameter("maxErrorV", "Max. divergence error(%)", &m_maxErrorV); + setGroup(MAX_ERROR_V, "DFSPH"); + setDescription(MAX_ERROR_V, "Maximal divergence error (%)."); + static_cast(getParameter(MAX_ERROR_V))->setMinValue(1e-6); + + USE_DIVERGENCE_SOLVER = createBoolParameter("enableDivergenceSolver", "Enable divergence solver", &m_enableDivergenceSolver); + setGroup(USE_DIVERGENCE_SOLVER, "DFSPH"); + setDescription(USE_DIVERGENCE_SOLVER, "Turn divergence solver on/off."); +} + +void TimeStepDFSPHGPU::initCUDA() +{ // sim init in static boundary simulator + Simulation *sim = Simulation::getCurrent(); + const unsigned int nModels = sim->numberOfFluidModels(); + + std::vector &pointSets = sim->getCurrent()->getPointSets(); + d_particles.resize(pointSets.size()); + for(int i = 0 ; i < pointSets.size() ; ++i) + { + d_particles[i] = CudaHelper::GetPointer(pointSets[i].getPointSetImplementation()->getParticles()); + } + + d_volumes.resize(nModels); + d_densities0.resize(nModels); + for(unsigned int pid = 0; pid < nModels; pid++) + { + FluidModel *fm = sim->getFluidModel(pid); + d_volumes[pid] = fm->getVolume(0); // TODO: ask Prof. 
Bender regarding scalability + d_densities0[pid] = fm->getDensity0(); + } + + d_fmIndices.resize(nModels); + + d_rigidBodyPositions.resize(sim->numberOfPointSets() - nModels); + d_isDynamic.resize(sim->numberOfPointSets() - nModels); + d_forcesPerThreadIndices.resize(sim->numberOfPointSets() - nModels); + d_torquesPerThreadIndices.resize(sim->numberOfPointSets() - nModels); +} + +void TimeStepDFSPHGPU::step() +{ + Simulation *sim = Simulation::getCurrent(); + TimeManager *tm = TimeManager::getCurrent(); + const Real h = tm->getTimeStepSize(); + const unsigned int nModels = sim->numberOfFluidModels(); + const unsigned int nPointSets = sim->numberOfPointSets(); + + performNeighborhoodSearch(); + + if(!isInitialized) + { + initCUDA(); // TODO: shift this in init or constructor + } + + prepareData(); + + // re-compute the precomputed kernel if necessary + if( sim->getSupportRadius() != PrecomputedKernel::getRadius() || !isInitialized) + { + PrecomputedKernel::setRadius(sim->getSupportRadius()); + updateKernelData(kernelData); + CudaHelper::MemcpyHostToDevice(&kernelData, d_kernelData, 1); + + isInitialized = true; + } + + START_TIMING("compute the densities"); + unsigned int sumActiveParticles = 0; + for (unsigned int fluidModelIndex = 0; fluidModelIndex < nModels; fluidModelIndex++) + { + FluidModel *model = sim->getFluidModel(fluidModelIndex); + const unsigned int numParticles = model->numActiveParticles(); + std::vector &pointSets = sim->getCurrent()->getPointSets(); + PointSetImplementation *impl = pointSets[fluidModelIndex].getPointSetImplementation(); + const Real W_zero = sim->W_zero(); + + //computeDensities(fluidModelIndex); + computeDensitiesGPU<<getNumberOfBlocks(), impl->getThreadsPerBlock(), impl->getThreadsPerBlock() * sizeof(Real)>>>( d_fmDensities, CudaHelper::GetPointer(d_volumes), CudaHelper::GetPointer(d_boundaryVolumes), + CudaHelper::GetPointer(d_boundaryVolumeIndices), CudaHelper::GetPointer(d_fmIndices), CudaHelper::GetPointer(d_densities0), W_zero, d_kernelData, + CudaHelper::GetPointer(d_particles), d_neighbors, d_neighborCounts, d_neighborOffsets, d_neighborPointsetIndices, nModels, + nPointSets, fluidModelIndex, numParticles); + + CudaHelper::CheckLastError(); + CudaHelper::DeviceSynchronize(); + + CudaHelper::MemcpyDeviceToHost(d_fmDensities + sumActiveParticles, &(model->getDensity(0)), sumParticles); + sumActiveParticles += numParticles; + } + STOP_TIMING_AVG; + + START_TIMING("computeDFSPHFactor"); + CudaHelper::CudaMalloc(&d_factors, sumParticles); + + for (unsigned int fluidModelIndex = 0; fluidModelIndex < nModels; fluidModelIndex++) + { + FluidModel *model = sim->getFluidModel(fluidModelIndex); + const unsigned int numParticles = model->numActiveParticles(); + std::vector &pointSets = sim->getCurrent()->getPointSets(); + PointSetImplementation *impl = pointSets[fluidModelIndex].getPointSetImplementation(); + + computeDFSPHFactors<<getNumberOfBlocks(), impl->getThreadsPerBlock()>>>( d_factors, CudaHelper::GetPointer(d_boundaryVolumes), CudaHelper::GetPointer(d_boundaryVolumeIndices), d_kernelData, + CudaHelper::GetPointer(d_fmIndices), CudaHelper::GetPointer(d_volumes), m_eps, CudaHelper::GetPointer(d_particles), + d_neighbors, d_neighborCounts, d_neighborOffsets, d_neighborPointsetIndices, nModels, + nPointSets, fluidModelIndex, numParticles); + + CudaHelper::CheckLastError(); + CudaHelper::DeviceSynchronize(); + } + + STOP_TIMING_AVG; + + if (m_enableDivergenceSolver) + { + for(unsigned int pid = 0; pid < nModels; pid++) + { + FluidModel *fm_neighbor = 
sim->getFluidModel(pid); + d_fmVelocities.insert(d_fmVelocities.end(), fm_neighbor->getVelocities().begin(), fm_neighbor->getVelocities().begin() + fm_neighbor->numActiveParticles()); + } + + START_TIMING("divergenceSolve"); + divergenceSolve(); + STOP_TIMING_AVG + + sumActiveParticles = 0; + for(unsigned int fluidModelIndex = 0; fluidModelIndex < sim->numberOfFluidModels(); fluidModelIndex++) + { + FluidModel *fm_neighbor = sim->getFluidModel(fluidModelIndex); + CudaHelper::MemcpyDeviceToHost( CudaHelper::GetPointer(d_fmVelocities) + sumActiveParticles, &(fm_neighbor->getVelocity(0)), sim->getFluidModel(fluidModelIndex)->numActiveParticles()); + sumActiveParticles += fm_neighbor->numActiveParticles(); + } + } + + else + m_iterationsV = 0; + + // Compute accelerations: a(t) + for (unsigned int fluidModelIndex = 0; fluidModelIndex < nModels; fluidModelIndex++) + clearAccelerations(fluidModelIndex); + + sim->computeNonPressureForces(); + + sim->updateTimeStepSize(); + + // compute new velocities only considering non-pressure forces + for (unsigned int m = 0; m < nModels; m++) + { + FluidModel *fm = sim->getFluidModel(m); + const unsigned int numParticles = fm->numActiveParticles(); + #pragma omp parallel default(shared) + { + #pragma omp for schedule(static) + for (int i = 0; i < (int)numParticles; i++) + { + if (fm->getParticleState(i) == ParticleState::Active) + { + Vector3r &vel = fm->getVelocity(i); + vel += h * fm->getAcceleration(i); + } + } + } + } + + // put velocities on GPU again + d_fmVelocities.clear(); d_fmVelocities.shrink_to_fit(); + for(unsigned int pid = 0; pid < nModels; pid++) + { + FluidModel *fm_neighbor = sim->getFluidModel(pid); + d_fmVelocities.insert(d_fmVelocities.end(), fm_neighbor->getVelocities().begin(), fm_neighbor->getVelocities().begin() + fm_neighbor->numActiveParticles()); + } + + START_TIMING("pressureSolve"); + pressureSolve(); + STOP_TIMING_AVG; + + START_TIMING("Copy data back from GPU"); + getDataBack(); + STOP_TIMING_AVG; + + // compute final positions + for (unsigned int m = 0; m < nModels; m++) + { + FluidModel *fm = sim->getFluidModel(m); + const unsigned int numParticles = fm->numActiveParticles(); + #pragma omp parallel default(shared) + { + #pragma omp for schedule(static) + for (int i = 0; i < (int)numParticles; i++) + { + if (fm->getParticleState(i) == ParticleState::Active) + { + Vector3r &xi = fm->getPosition(i); + const Vector3r &vi = fm->getVelocity(i); + xi += h * vi; + } + } + } + } + + sim->emitParticles(); + sim->animateParticles(); + + // Compute new time + tm->setTime (tm->getTime () + h); +} + +#ifdef USE_WARMSTART +void TimeStepDFSPHGPU::warmstartPressureSolve(const unsigned int fluidModelIndex) +{ + const Real h = TimeManager::getCurrent()->getTimeStepSize(); + const Real h2 = h*h; + const Real invH = static_cast(1.0) / h; + const Real invH2 = static_cast(1.0) / h2; + Simulation *sim = Simulation::getCurrent(); + FluidModel *model = sim->getFluidModel(fluidModelIndex); + const Real density0 = model->getDensity0(); + const int numParticles = (int)model->numActiveParticles(); + const unsigned int nPointSets = sim->numberOfPointSets(); + std::vector &pointSets = sim->getCurrent()->getPointSets(); + PointSetImplementation *impl = pointSets[fluidModelIndex].getPointSetImplementation(); + if (numParticles == 0) + return; + + const unsigned int nFluids = sim->numberOfFluidModels(); + const unsigned int nBoundaries = sim->numberOfBoundaryModels(); + + ////////////////////////////////////////////////////////////////////////// + // Divide 
by h^2, the time step size has been removed in + // the last step to make the stiffness value independent + // of the time step size + ////////////////////////////////////////////////////////////////////////// + warmstartPressureSolveKappa<<getNumberOfBlocks(), impl->getThreadsPerBlock()>>>(d_kappa, CudaHelper::GetPointer(d_fmIndices), CudaHelper::GetPointer(d_densities0), invH2, fluidModelIndex, numParticles); + + CudaHelper::CheckLastError(); + + pressureSolveWarmstart<<getNumberOfBlocks(), impl->getThreadsPerBlock()>>>( CudaHelper::GetPointer(d_fmVelocities), CudaHelper::GetPointer(d_forcesPerThread), CudaHelper::GetPointer(d_torquesPerThread), + CudaHelper::GetPointer(d_forcesPerThreadIndices), CudaHelper::GetPointer(d_torquesPerThreadIndices), CudaHelper::GetPointer(d_rigidBodyPositions), d_kappa, + d_densitiesAdv, CudaHelper::GetPointer(d_masses), CudaHelper::GetPointer(d_volumes), CudaHelper::GetPointer(d_fmIndices), CudaHelper::GetPointer(d_boundaryVolumes), + CudaHelper::GetPointer(d_boundaryVolumeIndices), CudaHelper::GetPointer(d_densities0), CudaHelper::GetPointer(d_isDynamic), omp_get_thread_num(), h, m_eps, d_kernelData, + CudaHelper::GetPointer(d_particles), d_neighbors, d_neighborCounts, d_neighborOffsets, d_neighborPointsetIndices, nFluids, + nPointSets, fluidModelIndex, numParticles); + + CudaHelper::CheckLastError(); +} +#endif + +void TimeStepDFSPHGPU::pressureSolve() +{ + const Real h = TimeManager::getCurrent()->getTimeStepSize(); + const Real h2 = h*h; + const Real invH = static_cast(1.0) / h; + const Real invH2 = static_cast(1.0) / h2; + Simulation *sim = Simulation::getCurrent(); + const unsigned int nFluids = sim->numberOfFluidModels(); + const unsigned int nPointSets = sim->numberOfPointSets(); + unsigned int sumActiveParticles = 0; + +#ifdef USE_WARMSTART + CudaHelper::CudaMalloc(&d_kappa, sumParticles); + + sumActiveParticles = 0; + for (unsigned int fluidModelIndex = 0; fluidModelIndex < nFluids; fluidModelIndex++) + { + CudaHelper::MemcpyHostToDevice( &(m_simulationData.getKappa(fluidModelIndex, 0)), d_kappa + sumActiveParticles, sim->getFluidModel(fluidModelIndex)->numActiveParticles()); + sumActiveParticles += sim->getFluidModel(fluidModelIndex)->numActiveParticles(); + } + + for (unsigned int fluidModelIndex = 0; fluidModelIndex < nFluids; fluidModelIndex++) + warmstartPressureSolve(fluidModelIndex); +#endif + + ////////////////////////////////////////////////////////////////////////// + // Compute rho_adv + ////////////////////////////////////////////////////////////////////////// + + for (unsigned int fluidModelIndex = 0; fluidModelIndex < nFluids; fluidModelIndex++) + { + FluidModel *model = sim->getFluidModel(fluidModelIndex); + const int numParticles = (int)model->numActiveParticles(); + std::vector &pointSets = sim->getCurrent()->getPointSets(); + PointSetImplementation *impl = pointSets[fluidModelIndex].getPointSetImplementation(); + + computeDensityAdvs<<getNumberOfBlocks(), impl->getThreadsPerBlock()>>>(d_densitiesAdv, d_fmDensities, CudaHelper::GetPointer(d_fmVelocities), CudaHelper::GetPointer(d_bmVelocities), + CudaHelper::GetPointer(d_fmIndices), CudaHelper::GetPointer(d_volumes), CudaHelper::GetPointer(d_boundaryVolumes), CudaHelper::GetPointer(d_boundaryVolumeIndices), + CudaHelper::GetPointer(d_densities0), h, d_kernelData, CudaHelper::GetPointer(d_particles), d_neighbors, d_neighborCounts, d_neighborOffsets, d_neighborPointsetIndices, nFluids, + nPointSets, fluidModelIndex, numParticles); + + CudaHelper::CheckLastError(); + 
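		// computeDensityAdvs stores the predicted density ratio rho*_i / rho0 at the
		// end of the step; the DFSPH factor alpha_i is then pre-multiplied by 1/h^2
		// below, so that each solver iteration can form the pressure stiffness
		//   kappa_i = (rho*_i / rho0 - 1) * alpha_i / h^2
		// as a plain product of the stored quantities.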
CudaHelper::DeviceSynchronize(); + + multiplyRealWithConstant<<getNumberOfBlocks(), impl->getThreadsPerBlock()>>>( d_factors, CudaHelper::GetPointer(d_fmIndices), invH2, fluidModelIndex, numParticles); + + CudaHelper::CheckLastError(); + + #ifdef USE_WARMSTART + setRealToZero<<getNumberOfBlocks(), impl->getThreadsPerBlock()>>>(d_kappa, CudaHelper::GetPointer(d_fmIndices), fluidModelIndex, numParticles); + CudaHelper::CheckLastError(); + #endif + } + + m_iterations = 0; + + ////////////////////////////////////////////////////////////////////////// + // Start solver + ////////////////////////////////////////////////////////////////////////// + + Real avg_density_err = 0.0; + bool chk = false; + + + while ((!chk || (m_iterations < m_minIterations)) && (m_iterations < m_maxIterations)) + { + chk = true; + for (unsigned int i = 0; i < nFluids; i++) + { + FluidModel *model = sim->getFluidModel(i); + const Real density0 = model->getDensity0(); + + avg_density_err = 0.0; + pressureSolveIteration(i, avg_density_err); + + // Maximal allowed density fluctuation + const Real eta = m_maxError * static_cast(0.01) * density0; // maxError is given in percent + chk = chk && (avg_density_err <= eta); + } + + m_iterations++; + } + + INCREASE_COUNTER("DFSPH - iterations", static_cast(m_iterations)); + +#ifdef USE_WARMSTART + ////////////////////////////////////////////////////////////////////////// + // Multiply by h^2, the time step size has to be removed + // to make the stiffness value independent + // of the time step size + ////////////////////////////////////////////////////////////////////////// + for (unsigned int fluidModelIndex = 0; fluidModelIndex < nFluids; fluidModelIndex++) + { + FluidModel *model = sim->getFluidModel(fluidModelIndex); + std::vector &pointSets = sim->getCurrent()->getPointSets(); + PointSetImplementation *impl = pointSets[fluidModelIndex].getPointSetImplementation(); + const int numParticles = (int)model->numActiveParticles(); + + multiplyRealWithConstant<<getNumberOfBlocks(), impl->getThreadsPerBlock()>>>( d_kappa, CudaHelper::GetPointer(d_fmIndices), h2, fluidModelIndex, numParticles); + CudaHelper::CheckLastError(); + } + + sumActiveParticles = 0; + for(unsigned int fluidModelIndex = 0; fluidModelIndex < nFluids; fluidModelIndex++) + { + CudaHelper::MemcpyDeviceToHost(d_kappa + sumActiveParticles, &(m_simulationData.getKappa(fluidModelIndex, 0)), sim->getFluidModel(fluidModelIndex)->numActiveParticles()); + sumActiveParticles += sim->getFluidModel(fluidModelIndex)->numActiveParticles(); + } + CudaHelper::CudaFree(d_kappa); +#endif +} + +void TimeStepDFSPHGPU::pressureSolveIteration(const unsigned int fluidModelIndex, Real &avg_density_err) +{ + Simulation *sim = Simulation::getCurrent(); + FluidModel *model = sim->getFluidModel(fluidModelIndex); + const Real density0 = model->getDensity0(); + const int numParticles = (int)model->numActiveParticles(); + const unsigned int nPointSets = sim->numberOfPointSets(); + std::vector &pointSets = sim->getCurrent()->getPointSets(); + PointSetImplementation *impl = pointSets[fluidModelIndex].getPointSetImplementation(); + if (numParticles == 0) + return; + + const unsigned int nFluids = sim->numberOfFluidModels(); + const Real h = TimeManager::getCurrent()->getTimeStepSize(); + const Real invH = static_cast(1.0) / h; + //Real density_error = 0.0; + + Real density_error = 0.0, *d_density_error; + CudaHelper::CudaMalloc(&d_density_error, 1); + CudaHelper::MemcpyHostToDevice( &density_error, d_density_error, 1); + +#ifdef USE_WARMSTART + 
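	// One Jacobi iteration of the constant-density solve: the kernel below updates
	// the fluid velocities with the pressure impulse derived from kappa_i of
	// particle i and of its fluid neighbors; for Akinci2012 boundary neighbors it
	// also accumulates per-thread forces and torques on dynamic rigid bodies.
	// Afterwards the predicted densities and the summed density error are
	// recomputed on the device and the scalar error is copied back so the caller
	// can check convergence.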
pressureSolveUpdateFluidVelocities<<getNumberOfBlocks(), impl->getThreadsPerBlock()>>>( CudaHelper::GetPointer(d_fmVelocities), d_kappa, CudaHelper::GetPointer(d_forcesPerThread), + CudaHelper::GetPointer(d_torquesPerThread), CudaHelper::GetPointer(d_forcesPerThreadIndices), CudaHelper::GetPointer(d_torquesPerThreadIndices), + CudaHelper::GetPointer(d_rigidBodyPositions), d_densitiesAdv, d_factors, CudaHelper::GetPointer(d_fmIndices), CudaHelper::GetPointer(d_masses), + CudaHelper::GetPointer(d_volumes), CudaHelper::GetPointer(d_boundaryVolumes), CudaHelper::GetPointer(d_boundaryVolumeIndices), + CudaHelper::GetPointer(d_densities0), CudaHelper::GetPointer(d_isDynamic), omp_get_thread_num(), h, invH, d_kernelData, m_eps, + CudaHelper::GetPointer(d_particles), d_neighbors, d_neighborCounts, d_neighborOffsets, d_neighborPointsetIndices, nFluids, + nPointSets, fluidModelIndex, numParticles); +#else + pressureSolveUpdateFluidVelocities<<getNumberOfBlocks(), impl->getThreadsPerBlock()>>>( CudaHelper::GetPointer(d_fmVelocities), CudaHelper::GetPointer(d_forcesPerThread), + CudaHelper::GetPointer(d_torquesPerThread), CudaHelper::GetPointer(d_forcesPerThreadIndices), CudaHelper::GetPointer(d_torquesPerThreadIndices), + CudaHelper::GetPointer(d_rigidBodyPositions), d_densitiesAdv, d_factors, CudaHelper::GetPointer(d_fmIndices), CudaHelper::GetPointer(d_masses), + CudaHelper::GetPointer(d_volumes), CudaHelper::GetPointer(d_boundaryVolumes), CudaHelper::GetPointer(d_boundaryVolumeIndices), + CudaHelper::GetPointer(d_densities0), CudaHelper::GetPointer(d_isDynamic), omp_get_thread_num(), h, invH, d_kernelData, m_eps, + CudaHelper::GetPointer(d_particles), d_neighbors, d_neighborCounts, d_neighborOffsets, d_neighborPointsetIndices, nFluids, + nPointSets, fluidModelIndex, numParticles); +#endif + + CudaHelper::CheckLastError(); + + computeDensityAdvs<<getNumberOfBlocks(), impl->getThreadsPerBlock()>>>(d_densitiesAdv, d_fmDensities, CudaHelper::GetPointer(d_fmVelocities), CudaHelper::GetPointer(d_bmVelocities), + CudaHelper::GetPointer(d_fmIndices), CudaHelper::GetPointer(d_volumes), CudaHelper::GetPointer(d_boundaryVolumes), CudaHelper::GetPointer(d_boundaryVolumeIndices), + CudaHelper::GetPointer(d_densities0), h, d_kernelData, CudaHelper::GetPointer(d_particles), d_neighbors, d_neighborCounts, + d_neighborOffsets, d_neighborPointsetIndices, nFluids, nPointSets, fluidModelIndex, numParticles); + + CudaHelper::CheckLastError(); + CudaHelper::DeviceSynchronize(); + + updateDensityErrorPressureSolve<<getNumberOfBlocks(), impl->getThreadsPerBlock()>>>( d_density_error, d_densitiesAdv, CudaHelper::GetPointer(d_densities0), + CudaHelper::GetPointer(d_fmIndices), fluidModelIndex, numParticles); + + CudaHelper::CheckLastError(); + CudaHelper::DeviceSynchronize(); + + CudaHelper::MemcpyDeviceToHost(d_density_error, &density_error, 1); + CudaHelper::CudaFree(d_density_error); + + avg_density_err = density_error / numParticles; +} + +#ifdef USE_WARMSTART_V +void TimeStepDFSPHGPU::warmstartDivergenceSolve(const unsigned int fluidModelIndex) +{ + const Real h = TimeManager::getCurrent()->getTimeStepSize(); + const Real invH = static_cast(1.0) / h; + Simulation *sim = Simulation::getCurrent(); + FluidModel *model = sim->getFluidModel(fluidModelIndex); + const Real density0 = model->getDensity0(); + const int numParticles = (int)model->numActiveParticles(); + const unsigned int nPointSets = sim->numberOfPointSets(); + std::vector &pointSets = sim->getCurrent()->getPointSets(); + PointSetImplementation *impl = 
pointSets[fluidModelIndex].getPointSetImplementation(); + if (numParticles == 0) + return; + + const unsigned int nFluids = sim->numberOfFluidModels(); + const unsigned int nBoundaries = sim->numberOfBoundaryModels(); + + warmstartDivergenceSolveKappaV<<getNumberOfBlocks(), impl->getThreadsPerBlock()>>>(d_kappaV, CudaHelper::GetPointer(d_fmIndices), CudaHelper::GetPointer(d_densities0), invH, fluidModelIndex, numParticles); + + CudaHelper::CheckLastError(); + + computeDensityChanges<<getNumberOfBlocks(), impl->getThreadsPerBlock()>>>(d_densitiesAdv, CudaHelper::GetPointer(d_fmVelocities), CudaHelper::GetPointer(d_bmVelocities), + CudaHelper::GetPointer(d_fmIndices), CudaHelper::GetPointer(d_volumes), CudaHelper::GetPointer(d_boundaryVolumes), CudaHelper::GetPointer(d_boundaryVolumeIndices), + d_kernelData, CudaHelper::GetPointer(d_particles), d_neighbors, d_neighborCounts, d_neighborOffsets, d_neighborPointsetIndices, nFluids, + nPointSets, fluidModelIndex, numParticles); + + CudaHelper::CheckLastError(); + + divergenceSolveWarmstart<<getNumberOfBlocks(), impl->getThreadsPerBlock()>>>( CudaHelper::GetPointer(d_fmVelocities), CudaHelper::GetPointer(d_forcesPerThread), + CudaHelper::GetPointer(d_torquesPerThread), CudaHelper::GetPointer(d_forcesPerThreadIndices), CudaHelper::GetPointer(d_torquesPerThreadIndices), + CudaHelper::GetPointer(d_rigidBodyPositions), d_kappaV, CudaHelper::GetPointer(d_fmIndices), CudaHelper::GetPointer(d_masses), + CudaHelper::GetPointer(d_volumes), CudaHelper::GetPointer(d_boundaryVolumes), CudaHelper::GetPointer(d_boundaryVolumeIndices), + CudaHelper::GetPointer(d_densities0), CudaHelper::GetPointer(d_isDynamic), omp_get_thread_num(), h, d_kernelData, m_eps, + CudaHelper::GetPointer(d_particles), d_neighbors, d_neighborCounts, d_neighborOffsets, d_neighborPointsetIndices, nFluids, + nPointSets, fluidModelIndex, numParticles); + + CudaHelper::CheckLastError(); +} +#endif + +void TimeStepDFSPHGPU::divergenceSolve() +{ + ////////////////////////////////////////////////////////////////////////// + // Init parameters + ////////////////////////////////////////////////////////////////////////// + + const Real h = TimeManager::getCurrent()->getTimeStepSize(); + const Real invH = static_cast(1.0) / h; + Simulation *sim = Simulation::getCurrent(); + const unsigned int maxIter = m_maxIterationsV; + const Real maxError = m_maxErrorV; + const unsigned int nFluids = sim->numberOfFluidModels(); + const unsigned int nPointSets = sim->numberOfPointSets(); + unsigned int sumActiveParticles = 0; // helper for data transfers + +#ifdef USE_WARMSTART_V + CudaHelper::CudaMalloc( &d_kappaV, sumParticles); + + sumActiveParticles = 0; + for (unsigned int fluidModelIndex = 0; fluidModelIndex < nFluids; fluidModelIndex++) + { + CudaHelper::MemcpyHostToDevice( &(m_simulationData.getKappaV(fluidModelIndex, 0)), d_kappaV + sumActiveParticles, sim->getFluidModel(fluidModelIndex)->numActiveParticles()); + sumActiveParticles += sim->getFluidModel(fluidModelIndex)->numActiveParticles(); + } + + for(unsigned int fluidModelIndex =0; fluidModelIndex < nFluids; fluidModelIndex++) + warmstartDivergenceSolve(fluidModelIndex); +#endif + + ////////////////////////////////////////////////////////////////////////// + // Compute velocity of density change + ////////////////////////////////////////////////////////////////////////// + for (unsigned int fluidModelIndex = 0; fluidModelIndex < nFluids; fluidModelIndex++) + { + FluidModel *model = sim->getFluidModel(fluidModelIndex); + std::vector &pointSets = 
sim->getCurrent()->getPointSets(); + PointSetImplementation *impl = pointSets[fluidModelIndex].getPointSetImplementation(); + const int numParticles = (int)model->numActiveParticles(); + + computeDensityChanges<<getNumberOfBlocks(), impl->getThreadsPerBlock()>>>(d_densitiesAdv, CudaHelper::GetPointer(d_fmVelocities), CudaHelper::GetPointer(d_bmVelocities), + CudaHelper::GetPointer(d_fmIndices), CudaHelper::GetPointer(d_volumes), CudaHelper::GetPointer(d_boundaryVolumes), CudaHelper::GetPointer(d_boundaryVolumeIndices), + d_kernelData, CudaHelper::GetPointer(d_particles), d_neighbors, d_neighborCounts, d_neighborOffsets, d_neighborPointsetIndices, nFluids, + nPointSets, fluidModelIndex, numParticles); + + CudaHelper::CheckLastError(); + + multiplyRealWithConstant<<getNumberOfBlocks(), impl->getThreadsPerBlock()>>>( d_factors, CudaHelper::GetPointer(d_fmIndices), invH, fluidModelIndex, numParticles); + + CudaHelper::CheckLastError(); + +#ifdef USE_WARMSTART_V + setRealToZero<<getNumberOfBlocks(), impl->getThreadsPerBlock()>>>(d_kappaV, CudaHelper::GetPointer(d_fmIndices), fluidModelIndex, numParticles); + CudaHelper::CheckLastError(); +#endif + } + + m_iterationsV = 0; + + ////////////////////////////////////////////////////////////////////////// + // Start solver + ////////////////////////////////////////////////////////////////////////// + + Real avg_density_err = 0.0; + bool chk = false; + + while ((!chk || (m_iterationsV < 1)) && (m_iterationsV < maxIter)) + { + chk = true; + for (unsigned int fluidModelIndex = 0; fluidModelIndex < nFluids; fluidModelIndex++) + { + FluidModel *model = sim->getFluidModel(fluidModelIndex); + const Real density0 = model->getDensity0(); + + avg_density_err = 0.0; + divergenceSolveIteration(fluidModelIndex, avg_density_err); + + // Maximal allowed density fluctuation + // use maximal density error divided by time step size + const Real eta = (static_cast(1.0) / h) * maxError * static_cast(0.01) * density0; // maxError is given in percent + chk = chk && (avg_density_err <= eta); + } + + m_iterationsV++; + } + + INCREASE_COUNTER("DFSPH - iterationsV", static_cast(m_iterationsV)); + + ////////////////////////////////////////////////////////////////////////// + // Multiply by h, the time step size has to be removed + // to make the stiffness value independent + // of the time step size + ////////////////////////////////////////////////////////////////////////// + for (unsigned int fluidModelIndex = 0; fluidModelIndex < nFluids; fluidModelIndex++) + { + FluidModel *model = sim->getFluidModel(fluidModelIndex); + std::vector &pointSets = sim->getCurrent()->getPointSets(); + PointSetImplementation *impl = pointSets[fluidModelIndex].getPointSetImplementation(); + const int numParticles = (int)model->numActiveParticles(); + +#ifdef USE_WARMSTART_V + multiplyRealWithConstant<<getNumberOfBlocks(), impl->getThreadsPerBlock()>>>( d_kappaV, CudaHelper::GetPointer(d_fmIndices), h, fluidModelIndex, numParticles); + CudaHelper::CheckLastError(); +#endif + + multiplyRealWithConstant<<getNumberOfBlocks(), impl->getThreadsPerBlock()>>>( d_factors, CudaHelper::GetPointer(d_fmIndices), h, fluidModelIndex, numParticles); + CudaHelper::CheckLastError(); + } + +#ifdef USE_WARMSTART_V + sumActiveParticles = 0; + for(unsigned int fluidModelIndex = 0; fluidModelIndex < nFluids; fluidModelIndex++) + { + CudaHelper::MemcpyDeviceToHost(d_kappaV + sumActiveParticles, &(m_simulationData.getKappaV(fluidModelIndex, 0)), sim->getFluidModel(fluidModelIndex)->numActiveParticles()); + 
sumActiveParticles += sim->getFluidModel(fluidModelIndex)->numActiveParticles(); + } + CudaHelper::CudaFree(d_kappaV); +#endif +} + +void TimeStepDFSPHGPU::divergenceSolveIteration(const unsigned int fluidModelIndex, Real &avg_density_err) +{ + Simulation *sim = Simulation::getCurrent(); + FluidModel *model = sim->getFluidModel(fluidModelIndex); + const Real density0 = model->getDensity0(); + const int numParticles = (int)model->numActiveParticles(); + const unsigned int nPointSets = sim->numberOfPointSets(); + std::vector &pointSets = sim->getCurrent()->getPointSets(); + PointSetImplementation *impl = pointSets[fluidModelIndex].getPointSetImplementation(); + if (numParticles == 0) + return; + + const unsigned int nFluids = sim->numberOfFluidModels(); + const unsigned int nBoundaries = sim->numberOfBoundaryModels(); + const Real h = TimeManager::getCurrent()->getTimeStepSize(); + const Real invH = static_cast(1.0) / h; + Real density_error = 0.0; +/* Real density_error = 0.0, *d_density_error; + + CudaHelper::CudaMalloc(&d_density_error, 1); + CudaHelper::MemcpyHostToDevice( &density_error, d_density_error, 1); */ + + ////////////////////////////////////////////////////////////////////////// + // Perform Jacobi iteration over all blocks + ////////////////////////////////////////////////////////////////////////// +#ifdef USE_WARMSTART_V + divergenceSolveUpdateFluidVelocities<<getNumberOfBlocks(), impl->getThreadsPerBlock()>>>( CudaHelper::GetPointer(d_fmVelocities), d_kappaV, CudaHelper::GetPointer(d_forcesPerThread), + CudaHelper::GetPointer(d_torquesPerThread), CudaHelper::GetPointer(d_forcesPerThreadIndices), CudaHelper::GetPointer(d_torquesPerThreadIndices), + CudaHelper::GetPointer(d_rigidBodyPositions), d_densitiesAdv, d_factors, CudaHelper::GetPointer(d_fmIndices), CudaHelper::GetPointer(d_masses), + CudaHelper::GetPointer(d_volumes), CudaHelper::GetPointer(d_boundaryVolumes), CudaHelper::GetPointer(d_boundaryVolumeIndices), + CudaHelper::GetPointer(d_densities0), CudaHelper::GetPointer(d_isDynamic), omp_get_thread_num(), h, invH, d_kernelData, m_eps, + CudaHelper::GetPointer(d_particles), d_neighbors, d_neighborCounts, d_neighborOffsets, d_neighborPointsetIndices, nFluids, + nPointSets, fluidModelIndex, numParticles); +#else + divergenceSolveUpdateFluidVelocities<<getNumberOfBlocks(), impl->getThreadsPerBlock()>>>( CudaHelper::GetPointer(d_fmVelocities), CudaHelper::GetPointer(d_forcesPerThread), + CudaHelper::GetPointer(d_torquesPerThread), CudaHelper::GetPointer(d_forcesPerThreadIndices), CudaHelper::GetPointer(d_torquesPerThreadIndices), + CudaHelper::GetPointer(d_rigidBodyPositions), d_densitiesAdv, d_factors, CudaHelper::GetPointer(d_fmIndices), CudaHelper::GetPointer(d_masses), + CudaHelper::GetPointer(d_volumes), CudaHelper::GetPointer(d_boundaryVolumes), CudaHelper::GetPointer(d_boundaryVolumeIndices), + CudaHelper::GetPointer(d_densities0), CudaHelper::GetPointer(d_isDynamic), omp_get_thread_num(), h, invH, d_kernelData, m_eps, + CudaHelper::GetPointer(d_particles), d_neighbors, d_neighborCounts, d_neighborOffsets, d_neighborPointsetIndices, nFluids, + nPointSets, fluidModelIndex, numParticles); +#endif + + CudaHelper::CheckLastError(); + CudaHelper::DeviceSynchronize(); + + computeDensityChanges<<getNumberOfBlocks(), impl->getThreadsPerBlock()>>>(d_densitiesAdv, CudaHelper::GetPointer(d_fmVelocities), CudaHelper::GetPointer(d_bmVelocities), + CudaHelper::GetPointer(d_fmIndices), CudaHelper::GetPointer(d_volumes), CudaHelper::GetPointer(d_boundaryVolumes), 
CudaHelper::GetPointer(d_boundaryVolumeIndices), + d_kernelData, CudaHelper::GetPointer(d_particles), d_neighbors, d_neighborCounts, d_neighborOffsets, d_neighborPointsetIndices, nFluids, + nPointSets, fluidModelIndex, numParticles); + + CudaHelper::CheckLastError(); + CudaHelper::DeviceSynchronize(); + + unsigned int sumActiveParticles = 0; + for(unsigned int fluidModelIndex = 0; fluidModelIndex < nFluids; fluidModelIndex++) + { + CudaHelper::MemcpyDeviceToHost( d_densitiesAdv + sumActiveParticles, &(m_simulationData.getDensityAdv(fluidModelIndex, 0)), sim->getFluidModel(fluidModelIndex)->numActiveParticles()); + sumActiveParticles += sim->getFluidModel(fluidModelIndex)->numActiveParticles(); + } + + #pragma omp parallel default(shared) + { + #pragma omp for reduction(+:density_error) schedule(static) + for (int i = 0; i < (int)numParticles; i++) + { + density_error += density0 * m_simulationData.getDensityAdv(fluidModelIndex, i); + } + } + + sumActiveParticles = 0; + for(unsigned int fluidModelIndex = 0; fluidModelIndex < nFluids; fluidModelIndex++) + { + CudaHelper::MemcpyHostToDevice( &(m_simulationData.getDensityAdv(fluidModelIndex, 0)), d_densitiesAdv + sumActiveParticles, sim->getFluidModel(fluidModelIndex)->numActiveParticles()); + sumActiveParticles += sim->getFluidModel(fluidModelIndex)->numActiveParticles(); + } + +/* updateDensityErrorDivergence<<getNumberOfBlocks(), impl->getThreadsPerBlock()>>>( d_density_error, d_densitiesAdv, CudaHelper::GetPointer(d_densities0), + CudaHelper::GetPointer(d_fmIndices), fluidModelIndex, numParticles); + + CudaHelper::CheckLastError(); + CudaHelper::DeviceSynchronize(); + + CudaHelper::MemcpyDeviceToHost(d_density_error, &density_error, 1); + CudaHelper::CudaFree(d_density_error); */ + + avg_density_err = density_error/numParticles; +} + + +void TimeStepDFSPHGPU::prepareData() +{ + Simulation *sim = Simulation::getCurrent(); + const unsigned int nPointSets = sim->numberOfPointSets(); + const unsigned int nFluids = sim->numberOfFluidModels(); + unsigned int sumActiveParticles = 0; + + ////////////////////////////////////////////////////////////////////////////// + // Common data + ////////////////////////////////////////////////////////////////////////////// + + if(isInitialized) + { + CudaHelper::CudaFree(d_neighbors); + CudaHelper::CudaFree(d_neighborCounts); + CudaHelper::CudaFree(d_neighborOffsets); + CudaHelper::CudaFree(d_neighborPointsetIndices); + } + + std::vector &pointSets = sim->getCurrent()->getPointSets(); + + CudaHelper::CudaMalloc(&d_neighborPointsetIndices, nPointSets); + unsigned int neighborPointsetIndices_tmp[nPointSets]; + + unsigned int neighborsetCount = 0; + for(int i = 0 ; i < nPointSets ; ++i) + { + neighborPointsetIndices_tmp[i] = neighborsetCount; + neighborsetCount += pointSets[i].n_neighborsets(); + } + + CudaHelper::MemcpyHostToDevice(neighborPointsetIndices_tmp, d_neighborPointsetIndices, nPointSets); + + // flattened out the structures for efficiency + CudaHelper::CudaMalloc(&d_neighbors, neighborsetCount); + CudaHelper::CudaMalloc(&d_neighborCounts, neighborsetCount); + CudaHelper::CudaMalloc(&d_neighborOffsets, neighborsetCount); + + for(int i = 0 ; i < nPointSets ; ++i) + { + const unsigned int nNeighborsets = pointSets[i].n_neighborsets(); + + uint* neighbors_tmp[nNeighborsets]; + uint* neighborCounts_tmp[nNeighborsets]; + uint* neighborOffsets_tmp[nNeighborsets]; + + for(int j = 0; j < nNeighborsets; j++) + { + neighbors_tmp[j] = pointSets[i].neighbor_indices(j); + neighborCounts_tmp[j] = 
pointSets[i].neighbor_counts(j); + neighborOffsets_tmp[j] = pointSets[i].neighbor_offsets(j); + } + + CudaHelper::MemcpyHostToDevice(neighbors_tmp, d_neighbors + neighborPointsetIndices_tmp[i], nNeighborsets); + CudaHelper::MemcpyHostToDevice(neighborCounts_tmp, d_neighborCounts + neighborPointsetIndices_tmp[i], nNeighborsets); + CudaHelper::MemcpyHostToDevice(neighborOffsets_tmp, d_neighborOffsets + neighborPointsetIndices_tmp[i], nNeighborsets); + } + + // for computeDensities and computePressureAccels + d_boundaryVolumeIndices.resize(nPointSets - nFluids); + unsigned int sumBoundaryVolumes = 0; + for(unsigned int pid = nFluids; pid < nPointSets; pid++) + { + BoundaryModel_Akinci2012 *bm_neighbor = static_cast(sim->getBoundaryModelFromPointSet(pid)); + d_boundaryVolumes.insert(d_boundaryVolumes.end(), bm_neighbor->getVolumes().begin(), bm_neighbor->getVolumes().end()); + + d_boundaryVolumeIndices[pid - nFluids] = sumBoundaryVolumes; + sumBoundaryVolumes += bm_neighbor->getVolumes().size(); + } + + sumParticles = 0; // for indexing + for(unsigned int fluidModelIndex = 0; fluidModelIndex < nFluids; fluidModelIndex++) + { + FluidModel *model = sim->getFluidModel(fluidModelIndex); + + //d_fmDensities.insert(d_fmDensities.end(), &(model->getDensity(0)), &(model->getDensity(0)) + model->numActiveParticles()); + d_fmIndices[fluidModelIndex] = sumParticles; + sumParticles += model->numActiveParticles(); + + } + + CudaHelper::CudaMalloc(&d_fmDensities, sumParticles); + + //////////////////////////////////////////////////////////////////////////// + // DFPSH specific + //////////////////////////////////////////////////////////////////////////// + + CudaHelper::CudaMalloc(&d_densitiesAdv, sumParticles); + + for(unsigned int fluidModelIndex = 0; fluidModelIndex < nFluids; fluidModelIndex++) + { + FluidModel *model = sim->getFluidModel(fluidModelIndex); + d_masses.insert(d_masses.end(), model->getMasses().begin(), model->getMasses().begin() + model->numActiveParticles()); + } + + // for correct indexing + int sumForcesPerThread = 0; + int sumTorquesPerThread = 0; + for (unsigned int boundaryModelIndex = nFluids; boundaryModelIndex < nPointSets; boundaryModelIndex++) + { + BoundaryModel_Akinci2012 *bm_neighbor = static_cast(sim->getBoundaryModelFromPointSet(boundaryModelIndex)); + d_forcesPerThread.insert(d_forcesPerThread.end(), bm_neighbor->getForcesPerThread().begin(), bm_neighbor->getForcesPerThread().end()); + d_torquesPerThread.insert(d_torquesPerThread.end(), bm_neighbor->getTorquesPerThread().begin(), bm_neighbor->getTorquesPerThread().end()); + + d_forcesPerThreadIndices[boundaryModelIndex - nFluids] = sumForcesPerThread; + d_torquesPerThreadIndices[boundaryModelIndex - nFluids] = sumTorquesPerThread; + + sumForcesPerThread += bm_neighbor->getForcesPerThread().size(); + sumTorquesPerThread += bm_neighbor->getTorquesPerThread().size(); + + d_rigidBodyPositions[boundaryModelIndex - nFluids] = bm_neighbor->getRigidBodyPosition(); + d_isDynamic[boundaryModelIndex - nFluids] = bm_neighbor->isDynamic(); + d_bmVelocities.insert(d_bmVelocities.end(), bm_neighbor->getVelocities().begin(), bm_neighbor->getVelocities().end()); + } +} + +void TimeStepDFSPHGPU::getDataBack() +{ + Simulation *sim = Simulation::getCurrent(); + const unsigned int nPointSets = sim->numberOfPointSets(); + const unsigned int nFluids = sim->numberOfFluidModels(); + unsigned int sumActiveParticles = 0; + + for(unsigned int fluidModelIndex = 0; fluidModelIndex < nFluids; fluidModelIndex++) + { + FluidModel *fm_neighbor = 
sim->getFluidModel(fluidModelIndex); + // CudaHelper::MemcpyDeviceToHost( d_factors + sumActiveParticles, &(m_simulationData.getFactor(fluidModelIndex, 0)), sim->getFluidModel(fluidModelIndex)->numActiveParticles()); + // CudaHelper::MemcpyDeviceToHost( d_densitiesAdv + sumActiveParticles, &(m_simulationData.getDensityAdv(fluidModelIndex, 0)), sim->getFluidModel(fluidModelIndex)->numActiveParticles()); + CudaHelper::MemcpyDeviceToHost( CudaHelper::GetPointer(d_fmVelocities) + sumActiveParticles, &(fm_neighbor->getVelocity(0)), sim->getFluidModel(fluidModelIndex)->numActiveParticles()); + sumActiveParticles += sim->getFluidModel(fluidModelIndex)->numActiveParticles(); + } + + CudaHelper::CudaFree(d_factors); + CudaHelper::CudaFree(d_densitiesAdv); + + unsigned int sumForcesPerThread = 0; + unsigned int sumTorquesPerThread = 0; + sumActiveParticles = 0; + for (unsigned int boundaryModelIndex = nFluids; boundaryModelIndex < sim->numberOfPointSets(); boundaryModelIndex++) + { + BoundaryModel_Akinci2012 *bm_neighbor = static_cast(sim->getBoundaryModelFromPointSet(boundaryModelIndex)); + + CudaHelper::MemcpyDeviceToHost( CudaHelper::GetPointer(d_forcesPerThread) + sumForcesPerThread, &(bm_neighbor->getForcesPerThread()[0]), bm_neighbor->getForcesPerThread().size()); + CudaHelper::MemcpyDeviceToHost( CudaHelper::GetPointer(d_torquesPerThread) + sumTorquesPerThread, &(bm_neighbor->getTorquesPerThread()[0]), bm_neighbor->getTorquesPerThread().size()); + + sumForcesPerThread += bm_neighbor->getForcesPerThread().size(); + sumTorquesPerThread += bm_neighbor->getTorquesPerThread().size(); + } + + d_forcesPerThread.clear(); d_forcesPerThread.shrink_to_fit(); + d_torquesPerThread.clear(); d_torquesPerThread.shrink_to_fit(); + + d_fmVelocities.clear(); d_fmVelocities.shrink_to_fit(); + d_bmVelocities.clear(); d_bmVelocities.shrink_to_fit(); + d_boundaryVolumes.clear(); d_boundaryVolumes.shrink_to_fit(); + d_boundaryVolumeIndices.clear(); d_boundaryVolumeIndices.shrink_to_fit(); + + CudaHelper::CudaFree(d_fmDensities); + d_masses.clear(); d_masses.shrink_to_fit(); +} + +void TimeStepDFSPHGPU::reset() +{ + TimeStep::reset(); + m_simulationData.reset(); + m_counter = 0; + m_iterations = 0; + m_iterationsV = 0; +} + +void TimeStepDFSPHGPU::performNeighborhoodSearch() +{ + if (Simulation::getCurrent()->zSortEnabled()) + { + if (m_counter % 500 == 0) + { + Simulation::getCurrent()->performNeighborhoodSearchSort(); + m_simulationData.performNeighborhoodSearchSort(); + } + m_counter++; + } + + Simulation::getCurrent()->performNeighborhoodSearch(); +} + +void TimeStepDFSPHGPU::emittedParticles(FluidModel *model, const unsigned int startIndex) +{ + m_simulationData.emittedParticles(model, startIndex); +} + +void TimeStepDFSPHGPU::resize() +{ + m_simulationData.init(); +} diff --git a/SPlisHSPlasH/DFSPH/TimeStepDFSPHGPU.h b/SPlisHSPlasH/DFSPH/TimeStepDFSPHGPU.h new file mode 100644 index 00000000..44083906 --- /dev/null +++ b/SPlisHSPlasH/DFSPH/TimeStepDFSPHGPU.h @@ -0,0 +1,117 @@ +#ifndef __TimeStepDFSPHGPU_h__ +#define __TimeStepDFSPHGPU_h__ + +#include "SPlisHSPlasH/Common.h" +#include "SPlisHSPlasH/TimeStep.h" +#include "SimulationDataDFSPH.h" +#include "SPlisHSPlasH/SPHKernels.h" +#include "SPlisHSPlasH/UtilitiesGPU/Kernels.cuh" + +#include + +#define USE_WARMSTART +#define USE_WARMSTART_V + +namespace SPH +{ + class SimulationDataDFSPH; + + /** \brief This class implements the Divergence-free Smoothed Particle Hydrodynamics approach introduced + * by Bender and Koschier \cite Bender:2015, \cite Bender2017, 
\cite KBST19. + */ + class TimeStepDFSPHGPU : public TimeStep + { + protected: +/* const unsigned int KERNEL_RESOLUTION = 10000; + typedef PrecomputedKernel PrecomputedCubicKernel; TODO: why is this leading to compiler errors sometimes? */ + + SimulationDataDFSPH m_simulationData; + unsigned int m_counter; + const Real m_eps = 1.0e-5; + bool m_enableDivergenceSolver; + unsigned int m_iterationsV; + Real m_maxErrorV; + unsigned int m_maxIterationsV; + + bool isInitialized = false; + + KernelData *d_kernelData, kernelData; + + thrust::device_vector d_particles; // particle positions + uint **d_neighbors; + uint **d_neighborCounts; + uint **d_neighborOffsets; + uint *d_neighborPointsetIndices; // indexing the above + + thrust::device_vector d_volumes; + thrust::device_vector d_densities0; + Real *d_fmDensities; + thrust::device_vector d_fmVelocities, d_bmVelocities; + thrust::device_vector d_boundaryVolumes; + thrust::device_vector d_boundaryVolumeIndices; + thrust::device_vector d_fmIndices; + + thrust::device_vector d_masses; + thrust::device_vector d_rigidBodyPositions; + thrust::device_vector d_isDynamic; + thrust::device_vector d_forcesPerThread; + thrust::device_vector d_torquesPerThread; + thrust::device_vector d_forcesPerThreadIndices; + thrust::device_vector d_torquesPerThreadIndices; + + unsigned int sumParticles; + + Real *d_densitiesAdv; + Real *d_factors; + + #ifdef USE_WARMSTART_V + Real *d_kappaV; + #endif + + #ifdef USE_WARMSTART + Real *d_kappa; + #endif + + void initCUDA(); + void prepareData(); + void getDataBack(); + + void pressureSolve(); + void pressureSolveIteration(const unsigned int fluidModelIndex, Real &avg_density_err); + void divergenceSolveIterationDummy(const unsigned int fluidModelIndex, Real &avg_density_err); + void divergenceSolve(); + void divergenceSolveIteration(const unsigned int fluidModelIndex, Real &avg_density_err); + void computeDensityAdv(const unsigned int fluidModelIndex, const unsigned int index, const int numParticles, const Real h, const Real density0); + void computeDensityChange(const unsigned int fluidModelIndex, const unsigned int index, const Real h); + +#ifdef USE_WARMSTART_V + void warmstartDivergenceSolve(const unsigned int fluidModelIndex); +#endif +#ifdef USE_WARMSTART + void warmstartPressureSolve(const unsigned int fluidModelIndex); +#endif + + /** Perform the neighborhood search for all fluid particles. 
+ */ + void performNeighborhoodSearch(); + virtual void emittedParticles(FluidModel *model, const unsigned int startIndex); + + virtual void initParameters(); + + public: + static int SOLVER_ITERATIONS_V; + static int MAX_ITERATIONS_V; + static int MAX_ERROR_V; + static int USE_DIVERGENCE_SOLVER; + + TimeStepDFSPHGPU(); + virtual ~TimeStepDFSPHGPU(void); + + virtual void step(); + virtual void reset(); + + virtual void resize(); + }; +} + +#endif \ No newline at end of file diff --git a/SPlisHSPlasH/FluidModel.h b/SPlisHSPlasH/FluidModel.h index 83164398..58fffd60 100644 --- a/SPlisHSPlasH/FluidModel.h +++ b/SPlisHSPlasH/FluidModel.h @@ -238,6 +238,11 @@ namespace SPH m_x[i] = pos; } + FORCE_INLINE std::vector &getVelocities() + { + return m_v; + } + FORCE_INLINE Vector3r &getVelocity(const unsigned int i) { return m_v[i]; @@ -283,6 +288,11 @@ namespace SPH m_a[i] = accel; } + FORCE_INLINE std::vector& getMasses() + { + return m_masses; + } + FORCE_INLINE const Real getMass(const unsigned int i) const { return m_masses[i]; diff --git a/SPlisHSPlasH/NeighborhoodSearch.h b/SPlisHSPlasH/NeighborhoodSearch.h index bbbc9eba..efa0d42d 100644 --- a/SPlisHSPlasH/NeighborhoodSearch.h +++ b/SPlisHSPlasH/NeighborhoodSearch.h @@ -5,10 +5,10 @@ #define CUNSEARCH_USE_DOUBLE_PRECISION #endif -#ifdef GPU_NEIGHBORHOOD_SEARCH +//#ifdef GPU_NEIGHBORHOOD_SEARCH #include "cuNSearch.h" typedef cuNSearch::NeighborhoodSearch NeighborhoodSearch; -#else - #include "CompactNSearch.h" - typedef CompactNSearch::NeighborhoodSearch NeighborhoodSearch; -#endif +//#else +// #include "CompactNSearch.h" +// typedef CompactNSearch::NeighborhoodSearch NeighborhoodSearch; +//#endif diff --git a/SPlisHSPlasH/SPHKernels.h b/SPlisHSPlasH/SPHKernels.h index c14f8298..d62ee113 100644 --- a/SPlisHSPlasH/SPHKernels.h +++ b/SPlisHSPlasH/SPHKernels.h @@ -608,6 +608,9 @@ namespace SPH static Real m_invStepSize; static Real m_W_zero; public: + static Real* getWeightField() { return m_W; } + static Real* getGradField() { return m_gradW; } + static Real getInvStepSize() { return m_invStepSize; } static Real getRadius() { return m_radius; } static void setRadius(Real val) { diff --git a/SPlisHSPlasH/Simulation.cpp b/SPlisHSPlasH/Simulation.cpp index 866f5037..b80dc408 100644 --- a/SPlisHSPlasH/Simulation.cpp +++ b/SPlisHSPlasH/Simulation.cpp @@ -4,10 +4,12 @@ #include "TimeStep.h" #include "EmitterSystem.h" #include "SPlisHSPlasH/WCSPH/TimeStepWCSPH.h" +#include "SPlisHSPlasH/WCSPH/TimeStepWCSPHGPU.h" #include "SPlisHSPlasH/PCISPH/TimeStepPCISPH.h" #include "SPlisHSPlasH/PBF/TimeStepPBF.h" #include "SPlisHSPlasH/IISPH/TimeStepIISPH.h" #include "SPlisHSPlasH/DFSPH/TimeStepDFSPH.h" +#include "SPlisHSPlasH/DFSPH/TimeStepDFSPHGPU.h" #include "SPlisHSPlasH/PF/TimeStepPF.h" #include "BoundaryModel_Akinci2012.h" #include "BoundaryModel_Bender2019.h" @@ -58,6 +60,7 @@ int Simulation::ENUM_AKINCI2012 = -1; int Simulation::ENUM_KOSCHIER2017 = -1; int Simulation::ENUM_BENDER2019 = -1; +bool TIMESTEP_GPU = true; // TODO: only for benchmarking purposes Simulation::Simulation () { @@ -499,7 +502,11 @@ void Simulation::setSimulationMethod(const int val) if (method == SimulationMethods::WCSPH) { - m_timeStep = new TimeStepWCSPH(); + if (TIMESTEP_GPU) + m_timeStep = new TimeStepWCSPHGPU(); + else + m_timeStep = new TimeStepWCSPH(); + m_timeStep->init(); setValue(Simulation::KERNEL_METHOD, Simulation::ENUM_KERNEL_CUBIC); setValue(Simulation::GRAD_KERNEL_METHOD, Simulation::ENUM_GRADKERNEL_CUBIC); @@ -527,7 +534,11 @@ void 
Simulation::setSimulationMethod(const int val) } else if (method == SimulationMethods::DFSPH) { - m_timeStep = new TimeStepDFSPH(); + if (TIMESTEP_GPU) + m_timeStep = new TimeStepDFSPHGPU(); + else + m_timeStep = new TimeStepDFSPH(); + m_timeStep->init(); setValue(Simulation::KERNEL_METHOD, Simulation::ENUM_KERNEL_PRECOMPUTED_CUBIC); setValue(Simulation::GRAD_KERNEL_METHOD, Simulation::ENUM_GRADKERNEL_PRECOMPUTED_CUBIC); diff --git a/SPlisHSPlasH/Simulation.h b/SPlisHSPlasH/Simulation.h index 078458cf..171ad09c 100644 --- a/SPlisHSPlasH/Simulation.h +++ b/SPlisHSPlasH/Simulation.h @@ -51,6 +51,37 @@ for (unsigned int pid = nFluids; pid < sim->numberOfPointSets(); pid++) \ } \ } +/** Loop over the fluid neighbors of all fluid phases. +* constructGpuData must have been called before. outer_loop for computing indices needed for inner_loop. +*/ +#define forall_fluid_neighborsGPU(code) \ +for(uint pid = 0; pid < nFluids; pid++) \ +{ \ + const uint neighborsetIndex = neighborPointsetIndices[fluidModelIndex] + pid; \ + for(uint j = 0; j < neighborCounts[neighborsetIndex][i]; j++) \ + { \ + const uint neighborIndex = neighbors[neighborsetIndex][neighborOffsets[neighborsetIndex][i] + j]; \ + const double3 &xj = particles[pid][neighborIndex]; \ + code \ + } \ +} + + +/** Loop over the boundary neighbors of all fluid phases. +* constructGpuData must have been called before. +*/ +#define forall_boundary_neighborsGPU(code) \ +for (unsigned int pid = nFluids; pid < nPointSets; pid++) \ +{ \ + const uint neighborsetIndex = neighborPointsetIndices[fluidModelIndex] + pid; \ + for(unsigned int j = 0; j < neighborCounts[neighborsetIndex][i]; j++) \ + { \ + const uint neighborIndex = neighbors[neighborsetIndex][neighborOffsets[neighborsetIndex][i] + j]; \ + const double3 &xj = particles[pid][neighborIndex]; \ + code \ + } \ +} + /** Loop over the boundary density maps. * Simulation *sim, unsigned int nBoundaries and unsigned int fluidModelIndex must be defined. 
*/ @@ -254,6 +285,11 @@ namespace SPH return static_cast(m_neighborhoodSearch->point_set(pointSetIndex).n_neighbors(neighborPointSetIndex, index)); } + FORCE_INLINE std::vector &getPointSets() const + { + return m_neighborhoodSearch->point_sets(); + } + FORCE_INLINE unsigned int getNeighbor(const unsigned int pointSetIndex, const unsigned int neighborPointSetIndex, const unsigned int index, const unsigned int k) const { return m_neighborhoodSearch->point_set(pointSetIndex).neighbor(neighborPointSetIndex, index, k); @@ -261,11 +297,11 @@ namespace SPH FORCE_INLINE const unsigned int * getNeighborList(const unsigned int pointSetIndex, const unsigned int neighborPointSetIndex, const unsigned int index) const { - #ifdef GPU_NEIGHBORHOOD_SEARCH +// #ifdef GPU_NEIGHBORHOOD_SEARCH return m_neighborhoodSearch->point_set(pointSetIndex).neighbor_list(neighborPointSetIndex, index); - #else +/* #else return m_neighborhoodSearch->point_set(pointSetIndex).neighbor_list(neighborPointSetIndex, index).data(); - #endif + #endif */ } }; } diff --git a/SPlisHSPlasH/UtilitiesGPU/Kernels.cu b/SPlisHSPlasH/UtilitiesGPU/Kernels.cu new file mode 100644 index 00000000..be14f1b6 --- /dev/null +++ b/SPlisHSPlasH/UtilitiesGPU/Kernels.cu @@ -0,0 +1,775 @@ +#include "Kernels.cuh" +#include "../Simulation.h" + +using namespace SPH; + +////////////////////////////////////////////////////////////////// +// Helper host methods +////////////////////////////////////////////////////////////////// + +KernelData::KernelData() +{ + CudaHelper::CudaMalloc(&d_W, PRECOMPUTED_KERNEL_SIZE); + CudaHelper::CudaMalloc(&d_gradW, PRECOMPUTED_KERNEL_SIZE + 1); +} + +KernelData::~KernelData() +{ + CudaHelper::CudaFree(d_W); + CudaHelper::CudaFree(d_gradW); +} + +void updateKernelData(KernelData &data) +{ + data.radius = PrecomputedKernel::getRadius(); + data.invStepSize = PrecomputedKernel::getInvStepSize(); + data.radius2 = data.radius * data.radius; + + CudaHelper::MemcpyHostToDevice(PrecomputedKernel::getWeightField(), data.d_W, PRECOMPUTED_KERNEL_SIZE); + CudaHelper::MemcpyHostToDevice(PrecomputedKernel::getGradField(), data.d_gradW, PRECOMPUTED_KERNEL_SIZE + 1); +} + +////////////////////////////////////////////////////////////////// +//Kernels for all methods +////////////////////////////////////////////////////////////////// + +__device__ +Real kernelWeightPrecomputed(const Vector3r &r, const KernelData* const data) +{ + Real res = 0.0; + const Real r2 = r.squaredNorm(); + if (r2 <= data->radius2) + { + const Real rl = sqrt(r2); + //const unsigned int pos = std::min((unsigned int)(rl * data->invStepSize), PRECOMPUTED_KERNEL_SIZE-2u); + unsigned int pos = 0; + if(static_cast(rl * data->invStepSize) < PRECOMPUTED_KERNEL_SIZE-2u) + pos = static_cast(rl * data->invStepSize); + else + pos = PRECOMPUTED_KERNEL_SIZE-2u; + res = static_cast(0.5)*(data->d_W[pos] + data->d_W[pos+1]); + } + return res; +} + +__device__ +Vector3r gradKernelWeightPrecomputed(const Vector3r &r, const KernelData* const data) +{ + Vector3r res; + const Real rl = r.norm(); // rl / radius = > 0 - 1, texturSpeicher + if (rl <= data->radius) + { + //const Real rl = sqrt(r2); + //const unsigned int pos = static_cast(fminf(static_cast(rl * data->invStepSize), PRECOMPUTED_KERNEL_SIZE-1u)); + unsigned int pos = 0; + if(static_cast(rl * data->invStepSize) < PRECOMPUTED_KERNEL_SIZE-1u) + pos = static_cast(rl * data->invStepSize); + else + pos = PRECOMPUTED_KERNEL_SIZE-1u; + res = 0.5*(data->d_gradW[pos] + data->d_gradW[pos + 1]) * r; // ersetzbar + } + else + res.setZero(); + 
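	// Note: d_gradW is copied from PrecomputedKernel::getGradField(), which stores
	// the scalar kernel gradient divided by the distance (gradW(r)/r), so
	// multiplying the averaged table entries by the displacement vector r yields
	// the gradient vector. The table holds PRECOMPUTED_KERNEL_SIZE + 1 values, the
	// last one being a zero guard entry for the clamped lookup index.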
+ return res; +} + +__device__ +Real kernelWeight(const Vector3r& rin, const Real m_radius) +{ + const Real r = sqrt(rin[0] * rin[0] + rin[1] * rin[1] + rin[2] * rin[2]); + const Real pi = 3.14159265358979323846; + + const Real h3 = m_radius*m_radius*m_radius; + Real m_k = static_cast(8.0) / (pi*h3); + Real m_l = static_cast(48.0) / (pi*h3); + + Real res = 0.0; + const Real q = r / m_radius; + + if (q <= 1.0) + { + if (q <= 0.5) + { + const Real q2 = q*q; + const Real q3 = q2*q; + res = m_k * (static_cast(6.0)*q3 - static_cast(6.0)*q2 + static_cast(1.0)); + } + else + { + res = m_k * (static_cast(2.0)*pow(static_cast(1.0) - q, 3)); + } + } + return res; +} + +__device__ +Vector3r gradKernelWeight(const Vector3r &rin, const Real m_radius) +{ + + const Real pi = 3.14159265358979323846; + const Real h3 = m_radius*m_radius*m_radius; + const Real m_l = static_cast(48.0) / (pi*h3); + + Vector3r res; + const Real rl = sqrt(rin[0] * rin[0] + rin[1] * rin[1] + rin[2] * rin[2]); + const Real q = rl / m_radius; + if ((rl > 1.0e-6) && (q <= 1.0)) + { + const Vector3r gradq = rin * (static_cast(1.0) / (rl*m_radius)); + if (q <= 0.5) + { + res = m_l*q*((Real) 3.0*q - static_cast(2.0))*gradq; + } + else + { + const Real factor = static_cast(1.0) - q; + res = m_l*(-factor*factor)*gradq; + } + } + else + res.setZero(); + + return res; +} + + +__device__ +void addForce(const Vector3r &pos, const Vector3r &f, /* output */ Vector3r* const forcesPerThread, /* output */ Vector3r* const torquesPerThread, + const Vector3r* const rigidBodyPositions, const uint* const forcesPerThreadIndices, const uint* const torquesPerThreadIndices, const uint index, const int id) +{ + #ifdef _OPENMP + int tid = id; + #else + int tid = 0; + #endif + forcesPerThread[forcesPerThreadIndices[index] + tid] += f; + torquesPerThread[torquesPerThreadIndices[index] + tid] += (pos - rigidBodyPositions[index]).cross(f); +} + + +__global__ +void computeDensitiesGPU(/*out*/ Real* const densities, const Real* const volumes, const Real* const boundaryVolumes, const uint* const boundaryVolumeIndices, + const uint* const fmIndices, const Real* const densities0, const Real W_zero, const KernelData* const kernelData, + /*start of forall-parameters*/ double3** particles, uint** neighbors, uint** neighborCounts, uint** neighborOffsets, + uint* neighborPointsetIndices, const uint nFluids, const uint nPointSets, const uint fluidModelIndex, const uint numParticles) +{ + // Boundary: Akinci2012 + const uint i = blockIdx.x * blockDim.x + threadIdx.x; + + if(i >= numParticles) + return; + + extern __shared__ Real densities_tmp[]; + + Real &density = densities_tmp[threadIdx.x]; + + density = volumes[fluidModelIndex] * W_zero; + const double3 &xi = particles[fluidModelIndex][i]; + + ////////////////////////////////////////////////////////////////////////// + // Fluid + ////////////////////////////////////////////////////////////////////////// + forall_fluid_neighborsGPU( + density += volumes[pid] * kernelWeightPrecomputed(Vector3r(xi.x - xj.x, xi.y - xj.y, xi.z - xj.z), kernelData); + ) + + + ////////////////////////////////////////////////////////////////////////// + // Boundary + ////////////////////////////////////////////////////////////////////////// + forall_boundary_neighborsGPU( + density += boundaryVolumes[boundaryVolumeIndices[pid - nFluids] + neighborIndex] * kernelWeightPrecomputed(Vector3r(xi.x - xj.x, xi.y - xj.y, xi.z - xj.z), kernelData); + ) + + density *= densities0[fluidModelIndex]; + + densities[fmIndices[fluidModelIndex] + i] = 
densities_tmp[threadIdx.x]; +} + + +////////////////////////////////////////////////////////////////// +//Kernels for the WCPSH method +////////////////////////////////////////////////////////////////// + +__global__ +void clearAccelerationsGPU(Real* masses, Vector3r* accelerations, const Vector3r grav, const uint numActiveParticles) +{ + int i = blockIdx.x*blockDim.x + threadIdx.x; + + if(i >= numActiveParticles) + return; + + // Clear accelerations of dynamic particles + if (masses[i] != 0.0) + { + Vector3r &a = accelerations[i]; + a = grav; + } +} + +__global__ +void updatePressureGPU(Real* const densities, const uint* const fmIndices, Real* const pressures, const Real* const densities0, const Real m_stiffness, const Real m_exponent, + const uint fluidModelIndex, const uint numParticles) +{ + int i = blockIdx.x*blockDim.x + threadIdx.x; + + if(i >= numParticles) + return; + + Real &density = densities[fmIndices[fluidModelIndex] + i]; + density = max(density, densities0[fluidModelIndex]); + pressures[fmIndices[fluidModelIndex] + i] = m_stiffness * (pow(density / densities0[fluidModelIndex], m_exponent) - static_cast(1.0)); +} + +__global__ +void computePressureAccelsGPU( /* output */ Vector3r* const pressureAccels, /* output */ Vector3r* const forcesPerThread, /* output */ Vector3r* const torquesPerThread, const uint* const forcesPerThreadIndices, + const uint* const torquesPerThreadIndices, const Real* const densities, const Real* const densities0, const uint* const fmIndices, const Real* const pressures, const Real* const masses, + const Vector3r* const rigidBodyPositions, const Real* const volumes, const Real* const boundaryVolumes, const uint* const boundaryVolumeIndices, const bool* const isDynamic, const int tid, const KernelData* kernelData, + /*start of forall-parameters*/ double3** particles, uint** neighbors, uint** neighborCounts, uint** neighborOffsets, + uint* neighborPointsetIndices, const uint nFluids, const uint nPointSets, const uint fluidModelIndex, const uint numParticles) +{ + const uint i = blockIdx.x*blockDim.x + threadIdx.x; + + if(i >= numParticles) + return; + + extern __shared__ Vector3r pressureAccels_tmp[]; + + const double3 &xi = particles[fluidModelIndex][i]; + + const Real density_i = densities[fmIndices[fluidModelIndex] + i]; + + pressureAccels_tmp[threadIdx.x] = Vector3r(0, 0, 0); + Vector3r &ai = pressureAccels_tmp[threadIdx.x]; + + const Real dpi = pressures[fmIndices[fluidModelIndex] + i] / (density_i*density_i); + ////////////////////////////////////////////////////////////////////////// + // Fluid + ////////////////////////////////////////////////////////////////////////// + forall_fluid_neighborsGPU( + const Real density_j = densities[fmIndices[pid] + neighborIndex] * densities0[fluidModelIndex] / densities0[pid]; + const Real dpj = pressures[fmIndices[pid] + neighborIndex] / (density_j*density_j); + ai -= densities0[fluidModelIndex] * volumes[pid] * (dpi + dpj) * gradKernelWeightPrecomputed(Vector3r(xi.x - xj.x, xi.y - xj.y, xi.z - xj.z), kernelData); + ) + + ////////////////////////////////////////////////////////////////////////// + // Boundary + ////////////////////////////////////////////////////////////////////////// + const Real dpj = pressures[fmIndices[fluidModelIndex] + i] / (densities0[fluidModelIndex] * densities0[fluidModelIndex]); + forall_boundary_neighborsGPU( + const Vector3r a = densities0[fluidModelIndex] * boundaryVolumes[fmIndices[pid - nFluids] + neighborIndex] * (dpi + dpj) * gradKernelWeightPrecomputed(Vector3r(xi.x - xj.x, 
xi.y - xj.y, xi.z - xj.z), kernelData); + ai -= a; + if(isDynamic[pid - nFluids]) + { + addForce(Vector3r(xj.x, xj.y, xj.z), masses[i] * a, forcesPerThread, torquesPerThread, rigidBodyPositions, forcesPerThreadIndices, torquesPerThreadIndices, pid - nFluids, tid); + } + ) + + pressureAccels[i] = pressureAccels_tmp[threadIdx.x]; +} + +__global__ +void updatePosPressureAccelPressureAccel(Vector3r* const positions, Vector3r* const velocities, Vector3r* const accelerations, + const Vector3r* const pressureAccels, const Real h, const uint numParticles) +{ + int i = blockIdx.x*blockDim.x + threadIdx.x; + + if(i >= numParticles) + return; + + accelerations[i] += pressureAccels[i]; + velocities[i] += accelerations[i] * h; + positions[i] += velocities[i] * h; + +} + + +////////////////////////////////////////////////////////////////// +//Kernels for the DFSPH method +////////////////////////////////////////////////////////////////// + +__global__ +void computeDFSPHFactors(/* out */ Real* factors, const Real* const boundaryVolumes, const uint* const boundaryVolumeIndices, const KernelData* const kernelData, + const unsigned int* fmIndices, const Real* fmVolumes, const Real eps, + /*start of forall-parameters*/ double3** particles, uint** neighbors, uint** neighborCounts, uint** neighborOffsets, + uint* neighborPointsetIndices, const uint nFluids, const uint nPointSets, const uint fluidModelIndex, const uint numParticles) +{ + int i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= numParticles) + return; + + Real &factor = factors[fmIndices[fluidModelIndex] + i]; + factor = 0.0; + + ////////////////////////////////////////////////////////////////////////// + // Compute gradient dp_i/dx_j * (1/k) and dp_j/dx_j * (1/k) + ////////////////////////////////////////////////////////////////////////// + + const double3 xi = particles[fluidModelIndex][i]; + Real sum_grad_p_k = 0.0; + Vector3r grad_p_i; + grad_p_i.setZero(); + + ////////////////////////////////////////////////////////////////////////// + // Fluid + ////////////////////////////////////////////////////////////////////////// +forall_fluid_neighborsGPU( + const Vector3r grad_p_j = -fmVolumes[fluidModelIndex] * gradKernelWeightPrecomputed(Vector3r(xi.x - xj.x, xi.y - xj.y, xi.z - xj.z), kernelData); + sum_grad_p_k += grad_p_j.squaredNorm(); + grad_p_i -= grad_p_j; +) + + ////////////////////////////////////////////////////////////////////////// + // Boundary + ////////////////////////////////////////////////////////////////////////// + forall_boundary_neighborsGPU( + const Vector3r grad_p_j = -boundaryVolumes[boundaryVolumeIndices[pid - nFluids] + neighborIndex] * gradKernelWeightPrecomputed(Vector3r(xi.x - xj.x, xi.y - xj.y, xi.z - xj.z), kernelData); + grad_p_i -= grad_p_j; + ) + + sum_grad_p_k += grad_p_i.squaredNorm(); + + ////////////////////////////////////////////////////////////////////////// + // Compute pressure stiffness denominator + ////////////////////////////////////////////////////////////////////////// + if (sum_grad_p_k > eps) + factor = -static_cast(1.0) / (sum_grad_p_k); + else + factor = 0.0; +} + + + __global__ +void computeDensityChanges(/*out*/ Real* const densitiesAdv, const Vector3r* const fmVelocities, const Vector3r* const bmVelocities, const uint* const fmIndices, + const Real* const fmVolumes, const Real* const boundaryVolumes, const uint* const boundaryVolumeIndices, const KernelData* const kernelData, + /*start of forall-parameters*/ double3** particles, uint** neighbors, uint** neighborCounts, uint** neighborOffsets, 
+ uint* neighborPointsetIndices, const uint nFluids, const uint nPointSets, const uint fluidModelIndex, const uint numParticles) +{ + int i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= numParticles) + return; + + Real &densityAdv = densitiesAdv[fmIndices[fluidModelIndex] + i]; + const double3 &xi = particles[fluidModelIndex][i]; + const Vector3r &vi = fmVelocities[fmIndices[fluidModelIndex] + i]; + + densityAdv = 0.0; + unsigned int numNeighbors = 0; + + ////////////////////////////////////////////////////////////////////////// + // Fluid + ////////////////////////////////////////////////////////////////////////// + forall_fluid_neighborsGPU( + const Vector3r &vj = fmVelocities[fmIndices[pid] + neighborIndex]; + densityAdv += fmVolumes[pid] * (vi - vj).dot(gradKernelWeightPrecomputed(Vector3r(xi.x - xj.x, xi.y - xj.y, xi.z - xj.z), kernelData)); + ) + + ////////////////////////////////////////////////////////////////////////// + // Boundary + ////////////////////////////////////////////////////////////////////////// + forall_boundary_neighborsGPU( + const Vector3r &vj = bmVelocities[boundaryVolumeIndices[pid - nFluids] + neighborIndex]; + densityAdv += boundaryVolumes[boundaryVolumeIndices[pid - nFluids] + neighborIndex] * (vi - vj).dot(gradKernelWeightPrecomputed(Vector3r(xi.x - xj.x, xi.y - xj.y, xi.z - xj.z), kernelData)); + ) + + // only correct positive divergence + densityAdv = max(densityAdv, static_cast(0.0)); + + for (unsigned int pid = 0; pid < nPointSets; pid++) + { + const uint neighborsetIndex = neighborPointsetIndices[fluidModelIndex] + pid; + numNeighbors += neighborCounts[neighborsetIndex][i]; + } + + // in case of particle deficiency do not perform a divergence solve + if (numNeighbors < 20) + densityAdv = 0.0; +} + +__global__ +void computeDensityAdvs(/*out*/ Real* const densitiesAdv, const Real* const fmDensities, const Vector3r* const fmVelocities, const Vector3r* const bmVelocities, const uint* const fmIndices, + const Real* const fmVolumes, const Real* const boundaryVolumes, const uint* const boundaryVolumeIndices, const Real* const densities0, const Real h, const KernelData* const kernelData, + /*start of forall-parameters*/ double3** particles, uint** neighbors, uint** neighborCounts, uint** neighborOffsets, + uint* neighborPointsetIndices, const uint nFluids, const uint nPointSets, const uint fluidModelIndex, const uint numParticles) +{ + int i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= numParticles) + return; + + Real &densityAdv = densitiesAdv[fmIndices[fluidModelIndex] + i]; + const Real &density = fmDensities[fmIndices[fluidModelIndex] + i]; + const double3 &xi = particles[fluidModelIndex][i]; + const Vector3r &vi = fmVelocities[fmIndices[fluidModelIndex] + i]; + Real delta = 0.0; + + ////////////////////////////////////////////////////////////////////////// + // Fluid + ////////////////////////////////////////////////////////////////////////// + forall_fluid_neighborsGPU( + const Vector3r &vj = fmVelocities[fmIndices[pid] + neighborIndex]; + delta += fmVolumes[pid] * (vi - vj).dot(gradKernelWeightPrecomputed(Vector3r(xi.x - xj.x, xi.y - xj.y, xi.z - xj.z), kernelData)); + ) + + ////////////////////////////////////////////////////////////////////////// + // Boundary + ////////////////////////////////////////////////////////////////////////// + forall_boundary_neighborsGPU( + const Vector3r &vj = bmVelocities[boundaryVolumeIndices[pid - nFluids] + neighborIndex]; + delta += boundaryVolumes[boundaryVolumeIndices[pid - nFluids] + neighborIndex] * (vi - 
vj).dot(gradKernelWeightPrecomputed(Vector3r(xi.x - xj.x, xi.y - xj.y, xi.z - xj.z), kernelData)); + ) + + densityAdv = density / densities0[fluidModelIndex] + h*delta; + densityAdv = max(densityAdv, static_cast(1.0)); +} + +__global__ +void warmstartDivergenceSolveKappaV(/*out*/ Real* const kappaV, const uint* const fmIndices, const Real* const densities0, const Real invH, const uint fluidModelIndex, const uint numParticles) +{ + int i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= numParticles) + return; + + kappaV[fmIndices[fluidModelIndex] + i] = static_cast(0.5) * max( kappaV[fmIndices[fluidModelIndex] + i] * invH, -static_cast(0.5) * densities0[fluidModelIndex] * densities0[fluidModelIndex]); +} + +__global__ +void divergenceSolveWarmstart( /*out*/ Vector3r* const fmVelocities, /* output */ Vector3r* const forcesPerThread, /* output */ Vector3r* const torquesPerThread, + const uint* const forcesPerThreadIndices, const uint* const torquesPerThreadIndices, const Vector3r* const rigidBodyPositions, const Real* const kappaV, + const uint* const fmIndices, const Real* const masses, const Real* const fmVolumes, const Real* const boundaryVolumes, const uint* const boundaryVolumeIndices, + const Real* const densities0, const bool* const isDynamic, const int tid, const Real h, const KernelData* const kernelData, const Real eps, + /*start of forall-parameters*/ double3** particles, uint** neighbors, uint** neighborCounts, uint** neighborOffsets, + uint* neighborPointsetIndices, const uint nFluids, const uint nPointSets, const uint fluidModelIndex, const uint numParticles) +{ + int i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= numParticles || numParticles == 0) + return; + + const Real invH = static_cast(1.0) / h; + + Vector3r &vel = fmVelocities[fmIndices[fluidModelIndex] + i]; + const Real ki = kappaV[fmIndices[fluidModelIndex] + i]; + const double3 &xi = particles[fluidModelIndex][i]; + + ////////////////////////////////////////////////////////////////////////// + // Fluid + ////////////////////////////////////////////////////////////////////////// + forall_fluid_neighborsGPU( + const Real kj = kappaV[fmIndices[pid] + neighborIndex]; + + const Real kSum = (ki + densities0[pid] / densities0[fluidModelIndex] * kj); + if (fabsf(kSum) > eps) + { + const Vector3r grad_p_j = -fmVolumes[pid] * gradKernelWeightPrecomputed(Vector3r(xi.x - xj.x, xi.y - xj.y, xi.z - xj.z), kernelData); + vel -= h * kSum * grad_p_j; // ki, kj already contain inverse density + } + ) + + ////////////////////////////////////////////////////////////////////////// + // Boundary + ////////////////////////////////////////////////////////////////////////// + if (fabsf(ki) > eps) + { + forall_boundary_neighborsGPU( + const Vector3r grad_p_j = -boundaryVolumes[boundaryVolumeIndices[pid - nFluids] + neighborIndex] * gradKernelWeightPrecomputed(Vector3r(xi.x - xj.x, xi.y - xj.y, xi.z - xj.z), kernelData); + const Vector3r velChange = -h * (Real) 1.0 * ki * grad_p_j; // kj already contains inverse density + vel += velChange; + addForce(Vector3r(xj.x, xj.y, xj.z), -masses[fmIndices[fluidModelIndex] + i] * velChange * invH, forcesPerThread, torquesPerThread, rigidBodyPositions, forcesPerThreadIndices, torquesPerThreadIndices, pid - nFluids, tid); + ) + } +} + + +__global__ +void multiplyRealWithConstant(/*out*/ Real* const input, const uint* const fmIndices, const Real f, const uint fluidModelIndex, const uint numParticles) +{ + int i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= numParticles) + return; + + 
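+	// Generic helper: scales one per-particle quantity of a fluid model (e.g. the warm-start kappa fields) by a constant factor.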
input[fmIndices[fluidModelIndex] + i] *= f; +} + +__global__ +void setRealToZero(/*out*/ Real* const input, const uint* const fmIndices, const uint fluidModelIndex, const uint numParticles) +{ + int i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= numParticles) + return; + + input[fmIndices[fluidModelIndex] + i] = 0.0; +} + +__global__ +void divergenceSolveUpdateFluidVelocities( /*out*/ Vector3r* const fmVelocities, /*out*/ Real* const kappaV, /* output */ Vector3r* const forcesPerThread, /* output */ Vector3r* const torquesPerThread, + const uint* const forcesPerThreadIndices, const uint* const torquesPerThreadIndices, const Vector3r* const rigidBodyPositions, const Real* const densitiesAdv, const Real* const factors, + const uint* const fmIndices, const Real* const masses, const Real* const fmVolumes, const Real* const boundaryVolumes, const uint* const boundaryVolumeIndices, + const Real* const densities0, const bool* const isDynamic, const int tid, const Real h, const Real invH, const KernelData* const kernelData, const Real eps, + /*start of forall-parameters*/ double3** particles, uint** neighbors, uint** neighborCounts, uint** neighborOffsets, + uint* neighborPointsetIndices, const uint nFluids, const uint nPointSets, const uint fluidModelIndex, const uint numParticles) +{ + int i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= numParticles) + return; + + const Real b_i = densitiesAdv[fmIndices[fluidModelIndex] + i]; + const Real ki = b_i * factors[fmIndices[fluidModelIndex] + i]; + kappaV[fmIndices[fluidModelIndex] + i] += ki; + + Vector3r &v_i = fmVelocities[fmIndices[fluidModelIndex] + i]; + const double3 &xi = particles[fluidModelIndex][i]; + + ////////////////////////////////////////////////////////////////////////// + // Fluid + ////////////////////////////////////////////////////////////////////////// + forall_fluid_neighborsGPU( + const Real b_j = densitiesAdv[fmIndices[pid] + neighborIndex]; + const Real kj = b_j * factors[fmIndices[pid] + neighborIndex]; + + const Real kSum = ki + densities0[pid] / densities0[fluidModelIndex] * kj; + if(fabsf(kSum) > eps) + { + const Vector3r grad_p_j = -fmVolumes[pid] * gradKernelWeightPrecomputed(Vector3r(xi.x - xj.x, xi.y - xj.y, xi.z - xj.z), kernelData); + v_i -= h * kSum * grad_p_j; // ki, kj already contain inverse density + } + ) + + ////////////////////////////////////////////////////////////////////////// + // Boundary + ////////////////////////////////////////////////////////////////////////// + if(fabsf(ki) > eps) + { + forall_boundary_neighborsGPU( + const Vector3r grad_p_j = -boundaryVolumes[boundaryVolumeIndices[pid - nFluids] + neighborIndex] * gradKernelWeightPrecomputed(Vector3r(xi.x - xj.x, xi.y - xj.y, xi.z - xj.z), kernelData); + const Vector3r velChange = -h * (Real) 1.0 * ki * grad_p_j; // kj already contains inverse density + v_i += velChange; + addForce(Vector3r(xj.x, xj.y, xj.z), -masses[fmIndices[fluidModelIndex] + i] * velChange * invH, forcesPerThread, torquesPerThread, rigidBodyPositions, forcesPerThreadIndices, torquesPerThreadIndices, pid - nFluids, tid); + ) + } +} + +__global__ +void divergenceSolveUpdateFluidVelocities( /*out*/ Vector3r* const fmVelocities, /* output */ Vector3r* const forcesPerThread, /* output */ Vector3r* const torquesPerThread, + const uint* const forcesPerThreadIndices, const uint* const torquesPerThreadIndices, const Vector3r* const rigidBodyPositions, const Real* const densitiesAdv, const Real* const factors, + const uint* const fmIndices, const Real* const masses, const Real* 
const fmVolumes, const Real* const boundaryVolumes, const uint* const boundaryVolumeIndices, + const Real* const densities0, const bool* const isDynamic, const int tid, const Real h, const Real invH, const KernelData* const kernelData, const Real eps, + /*start of forall-parameters*/ double3** particles, uint** neighbors, uint** neighborCounts, uint** neighborOffsets, + uint* neighborPointsetIndices, const uint nFluids, const uint nPointSets, const uint fluidModelIndex, const uint numParticles) +{ + int i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= numParticles) + return; + + const Real b_i = densitiesAdv[fmIndices[fluidModelIndex] + i]; + const Real ki = b_i * factors[fmIndices[fluidModelIndex] + i]; + + Vector3r &v_i = fmVelocities[fmIndices[fluidModelIndex] + i]; + const double3 &xi = particles[fluidModelIndex][i]; + + ////////////////////////////////////////////////////////////////////////// + // Fluid + ////////////////////////////////////////////////////////////////////////// + forall_fluid_neighborsGPU( + const Real b_j = densitiesAdv[fmIndices[pid] + neighborIndex]; + const Real kj = b_j * factors[fmIndices[pid] + neighborIndex]; + + const Real kSum = ki + densities0[pid] / densities0[fluidModelIndex] * kj; + if(fabsf(kSum) > eps) + { + const Vector3r grad_p_j = -fmVolumes[pid] * gradKernelWeightPrecomputed(Vector3r(xi.x - xj.x, xi.y - xj.y, xi.z - xj.z), kernelData); + v_i -= h * kSum * grad_p_j; // ki, kj already contain inverse density + } + ) + + ////////////////////////////////////////////////////////////////////////// + // Boundary + ////////////////////////////////////////////////////////////////////////// + if(fabsf(ki) > eps) + { + forall_boundary_neighborsGPU( + const Vector3r grad_p_j = -boundaryVolumes[boundaryVolumeIndices[pid - nFluids] + neighborIndex] * gradKernelWeightPrecomputed(Vector3r(xi.x - xj.x, xi.y - xj.y, xi.z - xj.z), kernelData); + const Vector3r velChange = -h * (Real) 1.0 * ki * grad_p_j; // kj already contains inverse density + v_i += velChange; + addForce(Vector3r(xj.x, xj.y, xj.z), -masses[fmIndices[fluidModelIndex] + i] * velChange * invH, forcesPerThread, torquesPerThread, rigidBodyPositions, forcesPerThreadIndices, torquesPerThreadIndices, pid - nFluids, tid); + ) + } +} + +__global__ +void pressureSolveUpdateFluidVelocities( /*out*/ Vector3r* const fmVelocities, /*out*/ Real* const kappa, /* output */ Vector3r* const forcesPerThread, /* output */ Vector3r* const torquesPerThread, + const uint* const forcesPerThreadIndices, const uint* const torquesPerThreadIndices, const Vector3r* const rigidBodyPositions, const Real* const densitiesAdv, const Real* const factors, + const uint* const fmIndices, const Real* const masses, const Real* const fmVolumes, const Real* const boundaryVolumes, const uint* const boundaryVolumeIndices, + const Real* const densities0, const bool* const isDynamic, const int tid, const Real h, const Real invH, const KernelData* const kernelData, const Real eps, + /*start of forall-parameters*/ double3** particles, uint** neighbors, uint** neighborCounts, uint** neighborOffsets, + uint* neighborPointsetIndices, const uint nFluids, const uint nPointSets, const uint fluidModelIndex, const uint numParticles) +{ + int i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= numParticles) + return; + + const Real b_i = densitiesAdv[fmIndices[fluidModelIndex] + i] - static_cast(1.0); + const Real ki = b_i * factors[fmIndices[fluidModelIndex] + i]; + + kappa[fmIndices[fluidModelIndex] + i] += ki; + + Vector3r &v_i = 
fmVelocities[fmIndices[fluidModelIndex] + i]; + const double3 &xi = particles[fluidModelIndex][i]; + + ////////////////////////////////////////////////////////////////////////// + // Fluid + ////////////////////////////////////////////////////////////////////////// + forall_fluid_neighborsGPU( + const Real b_j = densitiesAdv[fmIndices[pid] + neighborIndex] - static_cast(1.0); + const Real kj = b_j * factors[fmIndices[pid] + neighborIndex]; + + const Real kSum = ki + densities0[pid] / densities0[fluidModelIndex] * kj; + if(fabsf(kSum) > eps) + { + const Vector3r grad_p_j = -fmVolumes[pid] * gradKernelWeightPrecomputed(Vector3r(xi.x - xj.x, xi.y - xj.y, xi.z - xj.z), kernelData); + v_i -= h * kSum * grad_p_j; // ki, kj already contain inverse density + } + ) + + ////////////////////////////////////////////////////////////////////////// + // Boundary + ////////////////////////////////////////////////////////////////////////// + if(fabsf(ki) > eps) + { + forall_boundary_neighborsGPU( + const Vector3r grad_p_j = -boundaryVolumes[boundaryVolumeIndices[pid - nFluids] + neighborIndex] * gradKernelWeightPrecomputed(Vector3r(xi.x - xj.x, xi.y - xj.y, xi.z - xj.z), kernelData); + const Vector3r velChange = -h * (Real) 1.0 * ki * grad_p_j; // kj already contains inverse density + v_i += velChange; + addForce(Vector3r(xj.x, xj.y, xj.z), -masses[fmIndices[fluidModelIndex] + i] * velChange * invH, forcesPerThread, torquesPerThread, rigidBodyPositions, forcesPerThreadIndices, torquesPerThreadIndices, pid - nFluids, tid); + ) + } +} + +__global__ +void pressureSolveUpdateFluidVelocities( /*out*/ Vector3r* const fmVelocities, /* output */ Vector3r* const forcesPerThread, /* output */ Vector3r* const torquesPerThread, + const uint* const forcesPerThreadIndices, const uint* const torquesPerThreadIndices, const Vector3r* const rigidBodyPositions, const Real* const densitiesAdv, const Real* const factors, + const uint* const fmIndices, const Real* const masses, const Real* const fmVolumes, const Real* const boundaryVolumes, const uint* const boundaryVolumeIndices, + const Real* const densities0, const bool* const isDynamic, const int tid, const Real h, const Real invH, const KernelData* const kernelData, const Real eps, + /*start of forall-parameters*/ double3** particles, uint** neighbors, uint** neighborCounts, uint** neighborOffsets, + uint* neighborPointsetIndices, const uint nFluids, const uint nPointSets, const uint fluidModelIndex, const uint numParticles) +{ + int i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= numParticles) + return; + + const Real b_i = densitiesAdv[fmIndices[fluidModelIndex] + i] - static_cast(1.0); + const Real ki = b_i * factors[fmIndices[fluidModelIndex] + i]; + + Vector3r &v_i = fmVelocities[fmIndices[fluidModelIndex] + i]; + const double3 &xi = particles[fluidModelIndex][i]; + + ////////////////////////////////////////////////////////////////////////// + // Fluid + ////////////////////////////////////////////////////////////////////////// + forall_fluid_neighborsGPU( + const Real b_j = densitiesAdv[fmIndices[pid] + neighborIndex] - static_cast(1.0); + const Real kj = b_j * factors[fmIndices[pid] + neighborIndex]; + + const Real kSum = ki + densities0[pid] / densities0[fluidModelIndex] * kj; + if(fabsf(kSum) > eps) + { + const Vector3r grad_p_j = -fmVolumes[pid] * gradKernelWeightPrecomputed(Vector3r(xi.x - xj.x, xi.y - xj.y, xi.z - xj.z), kernelData); + v_i -= h * kSum * grad_p_j; // ki, kj already contain inverse density + } + ) + + 
////////////////////////////////////////////////////////////////////////// + // Boundary + ////////////////////////////////////////////////////////////////////////// + if(fabsf(ki) > eps) + { + forall_boundary_neighborsGPU( + const Vector3r grad_p_j = -boundaryVolumes[boundaryVolumeIndices[pid - nFluids] + neighborIndex] * gradKernelWeightPrecomputed(Vector3r(xi.x - xj.x, xi.y - xj.y, xi.z - xj.z), kernelData); + const Vector3r velChange = -h * (Real) 1.0 * ki * grad_p_j; // kj already contains inverse density + v_i += velChange; + addForce(Vector3r(xj.x, xj.y, xj.z), -masses[fmIndices[fluidModelIndex] + i] * velChange * invH, forcesPerThread, torquesPerThread, rigidBodyPositions, forcesPerThreadIndices, torquesPerThreadIndices, pid - nFluids, tid); + ) + } +} + +__global__ +void updateDensityErrorDivergence(/*out*/ Real* const density_errors, const Real* const densitiesAdv, const Real* const densities0, const uint* const fmIndices, + const uint fluidModelIndex, const uint numParticles) +{ + int i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= numParticles) + return; + + //density_errors[fluidModelIndex] += densities0[fluidModelIndex] * densitiesAdv[fmIndices[fluidModelIndex] + i]; + density_errors[0] += densities0[fluidModelIndex] * densitiesAdv[fmIndices[fluidModelIndex] + i]; +} + +__global__ +void warmstartPressureSolveKappa(/*out*/ Real* kappa, const uint* const fmIndices, const Real* const densities0, const Real invH2, const uint fluidModelIndex, const uint numParticles) +{ + int i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= numParticles) + return; + + kappa[fmIndices[fluidModelIndex] + i] = max( kappa[fmIndices[fluidModelIndex] + i] * invH2, -static_cast(0.5) * densities0[fluidModelIndex] * densities0[fluidModelIndex]); +} + +__global__ +void pressureSolveWarmstart(/*out*/ Vector3r* const fmVelocities , /* output */ Vector3r* const forcesPerThread, /* output */ Vector3r* const torquesPerThread, + const uint* const forcesPerThreadIndices, const uint* const torquesPerThreadIndices, const Vector3r* const rigidBodyPositions,const Real* const kappa, + const Real* const densitiesAdv, const Real* const masses, const Real* const fmVolumes, const uint* const fmIndices, const Real* const boundaryVolumes, + const uint* const boundaryVolumeIndices, const Real* const densities0, const bool* const isDynamic, const int tid, const Real h, const Real eps, const KernelData* const kernelData, + /*start of forall-parameters*/ double3** particles, uint** neighbors, uint** neighborCounts, uint** neighborOffsets, + uint* neighborPointsetIndices, const uint nFluids, const uint nPointSets, const uint fluidModelIndex, const uint numParticles) +{ + int i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= numParticles) + return; + + if(densitiesAdv[fmIndices[fluidModelIndex] + i] > densities0[fluidModelIndex]) + { + const Real invH = static_cast(1.0) / h; + + Vector3r &vel = fmVelocities[fmIndices[fluidModelIndex] + i]; + const Real &ki = kappa[fmIndices[fluidModelIndex] + i]; + const double3 &xi = particles[fluidModelIndex][i]; + + ////////////////////////////////////////////////////////////////////////// + // Fluid + ////////////////////////////////////////////////////////////////////////// + forall_fluid_neighborsGPU( + const Real kj = kappa[fmIndices[pid] + neighborIndex]; + + const Real kSum = (ki + densities0[pid] / densities0[fluidModelIndex] * kj); + if (fabsf(kSum) > eps) + { + const Vector3r grad_p_j = -fmVolumes[pid] * gradKernelWeightPrecomputed(Vector3r(xi.x - xj.x, xi.y - xj.y, xi.z - xj.z), 
kernelData); + vel -= h * kSum * grad_p_j; // ki, kj already contain inverse density + } + ) + + ////////////////////////////////////////////////////////////////////////// + // Boundary + ////////////////////////////////////////////////////////////////////////// + if (fabsf(ki) > eps) + { + forall_boundary_neighborsGPU( + const Vector3r grad_p_j = -boundaryVolumes[boundaryVolumeIndices[pid - nFluids] + neighborIndex] * gradKernelWeightPrecomputed(Vector3r(xi.x - xj.x, xi.y - xj.y, xi.z - xj.z), kernelData); + const Vector3r velChange = -h * (Real) 1.0 * ki * grad_p_j; // kj already contains inverse density + vel += velChange; + addForce(Vector3r(xj.x, xj.y, xj.z), -masses[fmIndices[fluidModelIndex] + i] * velChange * invH, forcesPerThread, torquesPerThread, rigidBodyPositions, forcesPerThreadIndices, torquesPerThreadIndices, pid - nFluids, tid); + ) + } + } +} + +__global__ +void updateDensityErrorPressureSolve(/*out*/ Real* const density_error, const Real* const densitiesAdv, const Real* const densities0, const uint* const fmIndices, + const uint fluidModelIndex, const uint numParticles) +{ + int i = blockIdx.x*blockDim.x + threadIdx.x; + if(i >= numParticles) + return; + + //density_errors[fluidModelIndex] += densities0[fluidModelIndex] * densitiesAdv[fmIndices[fluidModelIndex] + i]; + density_error[0] += densities0[fluidModelIndex] * densitiesAdv[fmIndices[fluidModelIndex] + i] - densities0[fluidModelIndex]; +} \ No newline at end of file diff --git a/SPlisHSPlasH/UtilitiesGPU/Kernels.cuh b/SPlisHSPlasH/UtilitiesGPU/Kernels.cuh new file mode 100644 index 00000000..60093866 --- /dev/null +++ b/SPlisHSPlasH/UtilitiesGPU/Kernels.cuh @@ -0,0 +1,164 @@ +#ifndef __WCSPHKernels_h__ +#define __WCSPHKernels_h__ + +#include +#include "SPlisHSPlasH/NeighborhoodSearch.h" +#include "SPlisHSPlasH/Common.h" +#include "SPlisHSPlasH/SPHKernels.h" + +const unsigned int PRECOMPUTED_KERNEL_SIZE = 10000; + +////////////////////////////////////////////////////////////////// +// Helper class +////////////////////////////////////////////////////////////////// + +struct KernelData{ + Real *d_W, *d_gradW; + Real radius, radius2, invStepSize; + + KernelData(); + ~KernelData(); +}; + +void updateKernelData(KernelData &data); + +////////////////////////////////////////////////////////////////// +//Kernels for all methods +////////////////////////////////////////////////////////////////// + +__device__ +Real kernelWeightPrecomputed(const Vector3r &r, const KernelData* const data); + +__device__ +Vector3r gradKernelWeightPrecomputed(const Vector3r &r, const KernelData* const data); + +__device__ +Real kernelWeight(const Vector3r& rin, const Real m_radius); + +__device__ +Vector3r gradKernelWeight(const Vector3r &rin, const Real m_radius); + +__device__ +void addForce(const Vector3r &pos, const Vector3r &f, /* output */ Vector3r* const forcesPerThread, /* output */ Vector3r* const torquesPerThread, + const Vector3r* const rigidBodyPositions, const uint* const forcesPerThreadIndices, const uint* const torquesPerThreadIndices, const uint index, const int id); + + +__global__ +void computeDensitiesGPU(/*out*/ Real* const densities, const Real* const volumes, const Real* const boundaryVolumes, const uint* const boundaryVolumeIndices, + const uint* const fmIndices, const Real* const densities0, const Real W_zero, const KernelData* const kernelData, /*start of forall-parameters*/ double3** particles, uint** neighbors, uint** neighborCounts, uint** neighborOffsets, + uint* neighborPointsetIndices, const uint nFluids, const 
uint nPointSets, const uint fluidModelIndex, const uint numParticles); + +////////////////////////////////////////////////////////////////// +//Kernels for WCSPH method +////////////////////////////////////////////////////////////////// + +__global__ +void clearAccelerationsGPU(Real* masses, Vector3r* accelerations, const Vector3r grav, const unsigned int numActiveParticles); + +__global__ +void updatePressureGPU(Real* const densities, const uint* const fmIndices, Real* pressures, const Real* const densities0, const Real m_stiffness, const Real m_exponent, + const uint fluidModelIndex, const uint numParticles); + + __global__ +void computePressureAccelsGPU( /* output */ Vector3r* const pressureAccels, /* output */ Vector3r* const forcesPerThread, /* output */ Vector3r* const torquesPerThread, const uint* const forcesPerThreadIndices, + const uint* const torquesPerThreadIndices, const Real* const densities, const Real* const densities0, const uint* const fmIndices, const Real* const pressures, const Real* const masses, + const Vector3r* const rigidBodyPositions, const Real* const volumes, const Real* const boundaryVolumes, const uint* const boundaryVolumeIndices, const bool* const isDynamic, const int tid, const KernelData* kernelData, + /*start of forall-parameters*/ double3** particles, uint** neighbors, uint** neighborCounts, uint** neighborOffsets, + uint* neighborPointsetIndices, const uint nFluids, const uint nPointSets, const uint fluidModelIndex, const uint numParticles); + +__global__ +void updatePosPressureAccelPressureAccel(Vector3r* const positions, Vector3r* const velocities, Vector3r* const accelerations, + const Vector3r* const pressureAccels, const Real h, const uint numParticles); + + +////////////////////////////////////////////////////////////////// +//Kernels for the DFSPH method +////////////////////////////////////////////////////////////////// + +__global__ +void computeDFSPHFactors(/* out */ Real* factors, const Real* const boundaryVolumes, const uint* const boundaryVolumeIndices, const KernelData* const kernelData, + const unsigned int* fmIndices, const Real* fmVolumes, const Real eps, + /*start of forall-parameters*/ double3** particles, uint** neighbors, uint** neighborCounts, uint** neighborOffsets, + uint* neighborPointsetIndices, const uint nFluids, const uint nPointSets, const uint fluidModelIndex, const uint numParticles); + + __global__ +void computeDensityChanges(/* out */ Real* const densitiesAdv, const Vector3r* const fmVelocities, const Vector3r* const bmVelocities, const uint* const fmIndices, + const Real* const fmVolumes, const Real* const boundaryVolumes, const uint* const boundaryVolumeIndices, const KernelData* const kernelData, + /*start of forall-parameters*/ double3** particles, uint** neighbors, uint** neighborCounts, uint** neighborOffsets, + uint* neighborPointsetIndices, const uint nFluids, const uint nPointSets, const uint fluidModelIndex, const uint numParticles); + +__global__ +void computeDensityAdvs(/*out*/ Real* const densitiesAdv, const Real* const fmDensities, const Vector3r* const fmVelocities, const Vector3r* const bmVelocities, const uint* const fmIndices, + const Real* const fmVolumes, const Real* const boundaryVolumes, const uint* const boundaryVolumeIndices, const Real* const densities0, const Real h, const KernelData* const kernelData, + /*start of forall-parameters*/ double3** particles, uint** neighbors, uint** neighborCounts, uint** neighborOffsets, + uint* neighborPointsetIndices, const uint nFluids, const uint nPointSets, 
const uint fluidModelIndex, const uint numParticles); + +__global__ +void warmstartDivergenceSolveKappaV(/*out*/ Real* const kappaV, const uint* const fmIndices, const Real* const densities0, const Real invH, const uint fluidModelIndex, const uint numParticles); + +__global__ +void divergenceSolveWarmstart( /*out*/ Vector3r* const fmVelocities, /* output */ Vector3r* const forcesPerThread, /* output */ Vector3r* const torquesPerThread, + const uint* const forcesPerThreadIndices, const uint* const torquesPerThreadIndices, const Vector3r* const rigidBodyPositions, const Real* const kappaV, + const uint* const fmIndices, const Real* const masses, const Real* const fmVolumes, const Real* const boundaryVolumes, const uint* const boundaryVolumeIndices, + const Real* const densities0, const bool* const isDynamic, const int tid, const Real h, const KernelData* const kernelData, const Real eps, + /*start of forall-parameters*/ double3** particles, uint** neighbors, uint** neighborCounts, uint** neighborOffsets, + uint* neighborPointsetIndices, const uint nFluids, const uint nPointSets, const uint fluidModelIndex, const uint numParticles); + +__global__ +void multiplyRealWithConstant(/*out*/ Real* const input, const uint* const fmIndices, const Real f, const uint fluidModelIndex, const uint numParticles); + +__global__ +void setRealToZero(/*out*/ Real* const input, const uint* const fmIndices, const uint fluidModelIndex, const uint numParticles); + +__global__ +void divergenceSolveUpdateFluidVelocities( /*out*/ Vector3r* const fmVelocities, /*out*/ Real* const kappaV, /* output */ Vector3r* const forcesPerThread, /* output */ Vector3r* const torquesPerThread, + const uint* const forcesPerThreadIndices, const uint* const torquesPerThreadIndices, const Vector3r* const rigidBodyPositions, const Real* const densitiesAdv, const Real* const factors, + const uint* const fmIndices, const Real* const masses, const Real* const fmVolumes, const Real* const boundaryVolumes, const uint* const boundaryVolumeIndices, + const Real* const densities0, const bool* const isDynamic, const int tid, const Real h, const Real invH, const KernelData* const kernelData, const Real eps, + /*start of forall-parameters*/ double3** particles, uint** neighbors, uint** neighborCounts, uint** neighborOffsets, + uint* neighborPointsetIndices, const uint nFluids, const uint nPointSets, const uint fluidModelIndex, const uint numParticles); + +__global__ +void divergenceSolveUpdateFluidVelocities( /*out*/ Vector3r* const fmVelocities, /* output */ Vector3r* const forcesPerThread, /* output */ Vector3r* const torquesPerThread, + const uint* const forcesPerThreadIndices, const uint* const torquesPerThreadIndices, const Vector3r* const rigidBodyPositions, const Real* const densitiesAdv, const Real* const factors, + const uint* const fmIndices, const Real* const masses, const Real* const fmVolumes, const Real* const boundaryVolumes, const uint* const boundaryVolumeIndices, + const Real* const densities0, const bool* const isDynamic, const int tid, const Real h, const Real invH, const KernelData* const kernelData, const Real eps, + /*start of forall-parameters*/ double3** particles, uint** neighbors, uint** neighborCounts, uint** neighborOffsets, + uint* neighborPointsetIndices, const uint nFluids, const uint nPointSets, const uint fluidModelIndex, const uint numParticles); + +__global__ +void pressureSolveUpdateFluidVelocities( /*out*/ Vector3r* const fmVelocities, /*out*/ Vector3r* const forcesPerThread, /*out*/ Vector3r* const 
torquesPerThread, + const uint* const forcesPerThreadIndices, const uint* const torquesPerThreadIndices, const Vector3r* const rigidBodyPositions, const Real* const densitiesAdv, const Real* const factors, + const uint* const fmIndices, const Real* const masses, const Real* const fmVolumes, const Real* const boundaryVolumes, const uint* const boundaryVolumeIndices, + const Real* const densities0, const bool* const isDynamic, const int tid, const Real h, const Real invH, const KernelData* const kernelData, const Real eps, + /*start of forall-parameters*/ double3** particles, uint** neighbors, uint** neighborCounts, uint** neighborOffsets, + uint* neighborPointsetIndices, const uint nFluids, const uint nPointSets, const uint fluidModelIndex, const uint numParticles); + +__global__ +void pressureSolveUpdateFluidVelocities( /*out*/ Vector3r* const fmVelocities, /*out*/ Real* const kappa, /*out*/ Vector3r* const forcesPerThread, /*out*/ Vector3r* const torquesPerThread, + const uint* const forcesPerThreadIndices, const uint* const torquesPerThreadIndices, const Vector3r* const rigidBodyPositions, const Real* const densitiesAdv, const Real* const factors, + const uint* const fmIndices, const Real* const masses, const Real* const fmVolumes, const Real* const boundaryVolumes, const uint* const boundaryVolumeIndices, + const Real* const densities0, const bool* const isDynamic, const int tid, const Real h, const Real invH, const KernelData* const kernelData, const Real eps, + /*start of forall-parameters*/ double3** particles, uint** neighbors, uint** neighborCounts, uint** neighborOffsets, + uint* neighborPointsetIndices, const uint nFluids, const uint nPointSets, const uint fluidModelIndex, const uint numParticles); + +__global__ +void updateDensityErrorDivergence(/* out */ Real* const density_errors, const Real* const densitiesAdv, const Real* const densities0, const uint* const fmIndices, + const uint fluidModelIndex, const uint numParticles); + +__global__ +void warmstartPressureSolveKappa(/*out*/ Real* kappa, const uint* const fmIndices, const Real* const densities0, const Real invH2, const uint fluidModelIndex, const uint numParticles); + +__global__ +void pressureSolveWarmstart(/*out*/ Vector3r* const fmVelocities , /* output */ Vector3r* const forcesPerThread, /* output */ Vector3r* const torquesPerThread, + const uint* const forcesPerThreadIndices, const uint* const torquesPerThreadIndices, const Vector3r* const rigidBodyPositions,const Real* const kappa, + const Real* const densitiesAdv, const Real* const masses, const Real* const fmVolumes, const uint* const fmIndices, const Real* const boundaryVolumes, + const uint* const boundaryVolumeIndices, const Real* const densities0, const bool* const isDynamic, const int tid, const Real h, const Real eps, const KernelData* const kernelData, + /*start of forall-parameters*/ double3** particles, uint** neighbors, uint** neighborCounts, uint** neighborOffsets, + uint* neighborPointsetIndices, const uint nFluids, const uint nPointSets, const uint fluidModelIndex, const uint numParticles); + +__global__ +void updateDensityErrorPressureSolve(/*out*/ Real* const density_error, const Real* const densitiesAdv, const Real* const densities0, const uint* const fmIndices, + const uint fluidModelIndex, const uint numParticles); + +#endif \ No newline at end of file diff --git a/SPlisHSPlasH/WCSPH/SimulationDataWCSPH.h b/SPlisHSPlasH/WCSPH/SimulationDataWCSPH.h index 5133be2e..49828c8d 100644 --- a/SPlisHSPlasH/WCSPH/SimulationDataWCSPH.h +++ 
b/SPlisHSPlasH/WCSPH/SimulationDataWCSPH.h @@ -54,7 +54,7 @@ namespace SPH { m_pressure[fluidIndex][i] = p; } - + FORCE_INLINE Vector3r &getPressureAccel(const unsigned int fluidIndex, const unsigned int i) { return m_pressureAccel[fluidIndex][i]; diff --git a/SPlisHSPlasH/WCSPH/TimeStepWCSPHGPU.cu b/SPlisHSPlasH/WCSPH/TimeStepWCSPHGPU.cu new file mode 100644 index 00000000..44579cdc --- /dev/null +++ b/SPlisHSPlasH/WCSPH/TimeStepWCSPHGPU.cu @@ -0,0 +1,397 @@ +#include "TimeStepWCSPHGPU.h" +#include "SPlisHSPlasH/TimeManager.h" +#include "SPlisHSPlasH/SPHKernels.h" +#include "SimulationDataWCSPH.h" +#include +#include "Utilities/Timing.h" +#include "../Simulation.h" +#include "SPlisHSPlasH/BoundaryModel_Akinci2012.h" + +#include "../../extern/cuNSearch/src/Ext_NeighborhoodSearch/src/PointSetImplementation.h" + +using namespace SPH; +using namespace std; +using namespace GenParam; +using namespace cuNSearch; + +int TimeStepWCSPHGPU::STIFFNESS = -1; +int TimeStepWCSPHGPU::EXPONENT = -1; + +TimeStepWCSPHGPU::TimeStepWCSPHGPU() : + TimeStep() +{ + m_simulationData.init(); + m_counter = 0; + m_stiffness = 50.0; + m_exponent = 7.0; + + CudaHelper::CudaMalloc(&d_kernelData, 1); + + Simulation *sim = Simulation::getCurrent(); + const unsigned int nModels = sim->numberOfFluidModels(); + for (unsigned int fluidModelIndex = 0; fluidModelIndex < nModels; fluidModelIndex++) + { + FluidModel *model = sim->getFluidModel(fluidModelIndex); + model->addField({ "pressure", FieldType::Scalar, [this, fluidModelIndex](const unsigned int i) -> Real* { return &m_simulationData.getPressure(fluidModelIndex, i); } }); + model->addField({ "pressure acceleration", FieldType::Vector3, [this, fluidModelIndex](const unsigned int i) -> Real* { return &m_simulationData.getPressureAccel(fluidModelIndex, i)[0]; } }); + } +} + +TimeStepWCSPHGPU::~TimeStepWCSPHGPU(void) +{ + CudaHelper::CudaFree(d_kernelData); + CudaHelper::CudaFree(d_neighbors); + CudaHelper::CudaFree(d_neighborCounts); + CudaHelper::CudaFree(d_neighborOffsets); + CudaHelper::CudaFree(d_neighborPointsetIndices); + + Simulation *sim = Simulation::getCurrent(); + const unsigned int nModels = sim->numberOfFluidModels(); + for (unsigned int fluidModelIndex = 0; fluidModelIndex < nModels; fluidModelIndex++) + { + FluidModel *model = sim->getFluidModel(fluidModelIndex); + model->removeFieldByName("pressure"); + model->removeFieldByName("pressure acceleration"); + } +} + +void TimeStepWCSPHGPU::initParameters() +{ + TimeStep::initParameters(); + + STIFFNESS = createNumericParameter("stiffness", "Stiffness", &m_stiffness); + setGroup(STIFFNESS, "WCSPH"); + setDescription(STIFFNESS, "Stiffness coefficient of EOS."); + static_cast(getParameter(STIFFNESS))->setMinValue(1e-6); + + EXPONENT = createNumericParameter("exponent", "Exponent (gamma)", &m_exponent); + setGroup(EXPONENT, "WCSPH"); + setDescription(EXPONENT, "Exponent of EOS."); + static_cast(getParameter(EXPONENT))->setMinValue(1e-6); +} + +void TimeStepWCSPHGPU::initCUDA() // TODO: shift this into constructor or at best spot +{ + Simulation *sim = Simulation::getCurrent(); + const unsigned int nModels = sim->numberOfFluidModels(); + + std::vector &pointSets = sim->getCurrent()->getPointSets(); + d_particles.resize(pointSets.size()); + for(int i = 0 ; i < pointSets.size() ; ++i) + { + d_particles[i] = CudaHelper::GetPointer(pointSets[i].getPointSetImplementation()->getParticles()); + } + + d_volumes.resize(nModels); + d_densities0.resize(nModels); + for(unsigned int pid = 0; pid < nModels; pid++) + { + 
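+		// Cache each fluid model's (uniform) particle volume and rest density; this implementation treats both as constant on the device.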
FluidModel *fm = sim->getFluidModel(pid); + d_volumes[pid] = fm->getVolume(0); + d_densities0[pid] = fm->getDensity0(); + } + + d_fmIndices.resize(nModels); + fmIndices.resize(nModels); // helper to copy data back + + d_rigidBodyPositions.resize(sim->numberOfPointSets() - nModels); + d_isDynamic.resize(sim->numberOfPointSets() - nModels); + d_forcesPerThreadIndices.resize(sim->numberOfPointSets() - nModels); + d_torquesPerThreadIndices.resize(sim->numberOfPointSets() - nModels); +} + +void TimeStepWCSPHGPU::step() +{ + Simulation *sim = Simulation::getCurrent(); + const unsigned int nModels = sim->numberOfFluidModels(); + const unsigned int nPointSets = sim->numberOfPointSets(); + TimeManager *tm = TimeManager::getCurrent(); + const Real h = tm->getTimeStepSize(); + + performNeighborhoodSearch(); + + if(!isInitialized) + { + initCUDA(); + } + + prepareData(); + + // re-compute the precomputed kernel if necessary + if( sim->getSupportRadius() != PrecomputedKernel::getRadius() || !isInitialized) + { + PrecomputedKernel::setRadius(sim->getSupportRadius()); + updateKernelData(kernelData); + CudaHelper::MemcpyHostToDevice(&kernelData, d_kernelData, 1); + + isInitialized = true; + } + + // for computeDensities and computePressureAccels + d_boundaryVolumeIndices.resize(sim->numberOfPointSets() - nModels); + unsigned int sumBoundaryVolumes = 0; + + for(unsigned int pid = nModels; pid < sim->numberOfPointSets(); pid++) + { + BoundaryModel_Akinci2012 *bm_neighbor = static_cast(sim->getBoundaryModelFromPointSet(pid)); + d_boundaryVolumeIndices[pid - nModels] = sumBoundaryVolumes; + sumBoundaryVolumes += bm_neighbor->getVolumes().size(); + + d_boundaryVolumes.insert(d_boundaryVolumes.end(), bm_neighbor->getVolumes().begin(), bm_neighbor->getVolumes().end()); + } + + unsigned int sumParticles = 0; + // for indexing + for(unsigned int fluidModelIndex = 0; fluidModelIndex < nModels; fluidModelIndex++) + { + FluidModel *model = sim->getFluidModel(fluidModelIndex); + FluidModel *fm = sim->getFluidModelFromPointSet(fluidModelIndex); + + d_fmIndices[fluidModelIndex] = sumParticles; + + fmIndices[fluidModelIndex] = sumParticles; + sumParticles += fm->numActiveParticles(); + } + + Real *d_densities; + CudaHelper::CudaMalloc( &d_densities, sumParticles); + + // Compute accelerations: a(t) + unsigned int sumActiveParticles = 0; + for (unsigned int fluidModelIndex = 0; fluidModelIndex < nModels; fluidModelIndex++) + { + START_TIMING("Clearing accelerations"); + clearAccelerations(fluidModelIndex); + STOP_TIMING_AVG; + START_TIMING("Computing desities"); + + FluidModel *model = sim->getFluidModel(fluidModelIndex); + std::vector &pointSets = sim->getCurrent()->getPointSets(); + PointSetImplementation *impl = pointSets[fluidModelIndex].getPointSetImplementation(); + const Real density0 = model->getDensity0(); + const unsigned int numParticles = model->numActiveParticles(); + const Real W_zero = sim->W_zero(); + + computeDensitiesGPU<<getNumberOfBlocks(), impl->getThreadsPerBlock(), impl->getThreadsPerBlock() * sizeof(Real)>>>(d_densities, CudaHelper::GetPointer(d_volumes), CudaHelper::GetPointer(d_boundaryVolumes), + CudaHelper::GetPointer(d_boundaryVolumeIndices), CudaHelper::GetPointer(d_fmIndices), CudaHelper::GetPointer(d_densities0), W_zero, d_kernelData, + CudaHelper::GetPointer(d_particles), d_neighbors, d_neighborCounts, d_neighborOffsets, d_neighborPointsetIndices, nModels, + nPointSets, fluidModelIndex, numParticles); + + CudaHelper::CheckLastError(); + CudaHelper::DeviceSynchronize(); + + 
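+		// Copy this model's densities back to the host so that the CPU-side non-pressure forces work on up-to-date values.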
CudaHelper::MemcpyDeviceToHost(d_densities + sumActiveParticles, &(model->getDensity(0)), numParticles); + sumActiveParticles += numParticles; + + STOP_TIMING_AVG; + } + + sim->computeNonPressureForces(); + + // for correct indexing + int sumForcesPerThread = 0; + int sumTorquesPerThread = 0; + for (unsigned int pid = nModels; pid < sim->numberOfPointSets(); pid++) + { + BoundaryModel_Akinci2012 *bm_neighbor = static_cast(sim->getBoundaryModelFromPointSet(pid)); + + d_forcesPerThread.insert(d_forcesPerThread.end(), bm_neighbor->getForcesPerThread().begin(), bm_neighbor->getForcesPerThread().end()); + d_torquesPerThread.insert(d_torquesPerThread.end(), bm_neighbor->getTorquesPerThread().begin(), bm_neighbor->getTorquesPerThread().end()); + + d_forcesPerThreadIndices[pid - nModels] = sumForcesPerThread; + d_torquesPerThreadIndices[pid - nModels] = sumTorquesPerThread; + + sumForcesPerThread += bm_neighbor->getForcesPerThread().size(); + sumTorquesPerThread += bm_neighbor->getTorquesPerThread().size(); + + d_rigidBodyPositions[pid - nModels] = bm_neighbor->getRigidBodyPosition(); + d_isDynamic[pid - nModels] = bm_neighbor->isDynamic(); + } + + Real *d_pressures; + CudaHelper::CudaMalloc(&d_pressures, sumParticles); + + for (unsigned int fluidModelIndex = 0; fluidModelIndex < nModels; fluidModelIndex++) + { + START_TIMING("Update pressure"); + FluidModel *model = sim->getFluidModel(fluidModelIndex); + std::vector &pointSets = sim->getCurrent()->getPointSets(); + PointSetImplementation *impl = pointSets[fluidModelIndex].getPointSetImplementation(); + const unsigned int numParticles = model->numActiveParticles(); + const unsigned int nPointSets = sim->numberOfPointSets(); + + updatePressureGPU<<getNumberOfBlocks(), impl->getThreadsPerBlock()>>>( d_densities, CudaHelper::GetPointer(d_fmIndices), d_pressures, + CudaHelper::GetPointer(d_densities0), m_stiffness, m_exponent, fluidModelIndex, numParticles); + + CudaHelper::CheckLastError(); + CudaHelper::DeviceSynchronize(); + + STOP_TIMING_AVG; + + START_TIMING("Compute pressure accels"); + + // Pressure Accelerations + Vector3r *d_pressureAccels; + CudaHelper::CudaMalloc(&d_pressureAccels, numParticles); + + Real *d_masses; + CudaHelper::CudaMalloc(&d_masses, numParticles); + CudaHelper::MemcpyHostToDevice(&(model->getMass(0)), d_masses, numParticles); + + computePressureAccelsGPU<<getNumberOfBlocks(), impl->getThreadsPerBlock(), impl->getThreadsPerBlock() * sizeof(Vector3r)>>>( d_pressureAccels, CudaHelper::GetPointer(d_forcesPerThread), CudaHelper::GetPointer(d_torquesPerThread), CudaHelper::GetPointer(d_forcesPerThreadIndices), + CudaHelper::GetPointer(d_torquesPerThreadIndices), d_densities, CudaHelper::GetPointer(d_densities0), CudaHelper::GetPointer(d_fmIndices), + d_pressures, d_masses, CudaHelper::GetPointer(d_rigidBodyPositions), CudaHelper::GetPointer(d_volumes), CudaHelper::GetPointer(d_boundaryVolumes), + CudaHelper::GetPointer(d_boundaryVolumeIndices), CudaHelper::GetPointer(d_isDynamic), omp_get_thread_num(), d_kernelData, CudaHelper::GetPointer(d_particles), d_neighbors, + d_neighborCounts, d_neighborOffsets, d_neighborPointsetIndices, nModels, nPointSets, fluidModelIndex, numParticles); + + CudaHelper::CheckLastError(); + CudaHelper::DeviceSynchronize(); + + + CudaHelper::MemcpyDeviceToHost( d_pressures + fmIndices[fluidModelIndex], &(m_simulationData.getPressure(fluidModelIndex, 0)), numParticles); + CudaHelper::MemcpyDeviceToHost( d_pressureAccels, &(m_simulationData.getPressureAccel(fluidModelIndex, 0)), numParticles); + 
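+		// Release the per-model scratch buffers before the next fluid model is processed.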
CudaHelper::CudaFree(d_pressureAccels); + CudaHelper::CudaFree(d_masses); + + STOP_TIMING_AVG; + } + + CudaHelper::CudaFree(d_pressures); + CudaHelper::CudaFree(d_densities); + + sumForcesPerThread = 0; + sumTorquesPerThread = 0; + + for (unsigned int pid = nModels; pid < sim->numberOfPointSets(); pid++) + { + BoundaryModel_Akinci2012 *bm_neighbor = static_cast(sim->getBoundaryModelFromPointSet(pid)); + + CudaHelper::MemcpyDeviceToHost( CudaHelper::GetPointer(d_forcesPerThread) + sumForcesPerThread, &(bm_neighbor->getForcesPerThread()[0]), bm_neighbor->getForcesPerThread().size()); + CudaHelper::MemcpyDeviceToHost( CudaHelper::GetPointer(d_torquesPerThread) + sumTorquesPerThread, &(bm_neighbor->getTorquesPerThread()[0]), bm_neighbor->getTorquesPerThread().size()); + + sumForcesPerThread += bm_neighbor->getForcesPerThread().size(); + sumTorquesPerThread += bm_neighbor->getTorquesPerThread().size(); + } + + sim->updateTimeStepSize(); + + START_TIMING("Update some quantities"); + for (unsigned int fluidModelIndex = 0; fluidModelIndex < nModels; fluidModelIndex++) + { + FluidModel *model = sim->getFluidModel(fluidModelIndex); + #pragma omp parallel default(shared) + { + #pragma omp for schedule(static) + for (int i = 0; i < (int)model->numActiveParticles(); i++) + { + if (model->getParticleState(i) == ParticleState::Active) + { + Vector3r &pos = model->getPosition(i); + Vector3r &vel = model->getVelocity(i); + Vector3r &accel = model->getAcceleration(i); + accel += m_simulationData.getPressureAccel(fluidModelIndex, i); + vel += accel * h; + pos += vel * h; + } + } + } + } + STOP_TIMING_AVG; + + sim->emitParticles(); + sim->animateParticles(); + + // Compute new time + tm->setTime (tm->getTime () + h); + + d_boundaryVolumes.clear(); d_boundaryVolumes.shrink_to_fit(); + d_forcesPerThread.clear(); d_forcesPerThread.shrink_to_fit(); + d_torquesPerThread.clear(); d_torquesPerThread.shrink_to_fit(); +} + +void TimeStepWCSPHGPU::prepareData() +{ + Simulation *sim = Simulation::getCurrent(); + const unsigned int nPointSets = sim->numberOfPointSets(); + + if(isInitialized) + { + CudaHelper::CudaFree(d_neighbors); + CudaHelper::CudaFree(d_neighborCounts); + CudaHelper::CudaFree(d_neighborOffsets); + CudaHelper::CudaFree(d_neighborPointsetIndices); + } + + std::vector &pointSets = sim->getCurrent()->getPointSets(); + + CudaHelper::CudaMalloc(&d_neighborPointsetIndices, nPointSets); + unsigned int neighborPointsetIndices_tmp[nPointSets]; + + unsigned int neighborsetCount = 0; + for(int i = 0 ; i < nPointSets ; ++i) + { + neighborPointsetIndices_tmp[i] = neighborsetCount; + neighborsetCount += pointSets[i].n_neighborsets(); + } + + CudaHelper::MemcpyHostToDevice(neighborPointsetIndices_tmp, d_neighborPointsetIndices, nPointSets); + + // flattened out the structures for efficiency + CudaHelper::CudaMalloc(&d_neighbors, neighborsetCount); + CudaHelper::CudaMalloc(&d_neighborCounts, neighborsetCount); + CudaHelper::CudaMalloc(&d_neighborOffsets, neighborsetCount); + + for(int i = 0 ; i < nPointSets ; ++i) + { + const unsigned int nNeighborsets = pointSets[i].n_neighborsets(); + + uint* neighbors_tmp[nNeighborsets]; + uint* neighborCounts_tmp[nNeighborsets]; + uint* neighborOffsets_tmp[nNeighborsets]; + + for(int j = 0; j < nNeighborsets; j++) + { + neighbors_tmp[j] = pointSets[i].neighbor_indices(j); + neighborCounts_tmp[j] = pointSets[i].neighbor_counts(j); + neighborOffsets_tmp[j] = pointSets[i].neighbor_offsets(j); + } + + CudaHelper::MemcpyHostToDevice(neighbors_tmp, d_neighbors + 
+
+void TimeStepWCSPHGPU::reset()
+{
+	TimeStep::reset();
+	m_simulationData.reset();
+	m_counter = 0;
+}
+
+void TimeStepWCSPHGPU::performNeighborhoodSearch()
+{
+	if (Simulation::getCurrent()->zSortEnabled())
+	{
+		if (m_counter % 500 == 0)
+		{
+			Simulation::getCurrent()->performNeighborhoodSearchSort();
+			m_simulationData.performNeighborhoodSearchSort();
+		}
+		m_counter++;
+	}
+
+	Simulation::getCurrent()->performNeighborhoodSearch();
+}
+
+void TimeStepWCSPHGPU::emittedParticles(FluidModel *model, const unsigned int startIndex)
+{
+	m_simulationData.emittedParticles(model, startIndex);
+}
+
+void TimeStepWCSPHGPU::resize()
+{
+	m_simulationData.init();
+}
diff --git a/SPlisHSPlasH/WCSPH/TimeStepWCSPHGPU.h b/SPlisHSPlasH/WCSPH/TimeStepWCSPHGPU.h
new file mode 100644
index 00000000..b283881d
--- /dev/null
+++ b/SPlisHSPlasH/WCSPH/TimeStepWCSPHGPU.h
@@ -0,0 +1,93 @@
+#ifndef __TimeStepWCSPHGPU_h__
+#define __TimeStepWCSPHGPU_h__
+
+#include "SPlisHSPlasH/Common.h"
+#include "SPlisHSPlasH/TimeStep.h"
+#include "SimulationDataWCSPH.h"
+#include "SPlisHSPlasH/SPHKernels.h"
+#include "SPlisHSPlasH/UtilitiesGPU/Kernels.cuh"
+//#include "../Common.h"
+
+#include <thrust/device_vector.h>
+
+namespace SPH
+{
+	class SimulationDataWCSPH;
+
+/*	struct KernelDeviceData{
+		Real *m_W;
+		Real *m_gradW;
+		Real m_radius;
+		Real m_radius2;
+		Real m_invStepSize;
+		Real m_W_zero;
+	}; */
+
+	/** \brief This class implements the Weakly Compressible SPH for Free Surface Flows approach introduced
+	 * by Becker and Teschner \cite Becker:2007.
+	 */
+	class TimeStepWCSPHGPU : public TimeStep
+	{
+	protected:
+
+/*		KernelDeviceData kernelData, *d_kernelData;
+		unsigned int kernelResolution; */
+
+		bool isInitialized = false;
+
+		KernelData *d_kernelData, kernelData;
+
+		// device-side copies of per-particle and per-boundary quantities
+		// (element types follow their host-side counterparts)
+		thrust::device_vector<Vector3r> d_particles; // particle positions
+		uint **d_neighbors;
+		uint **d_neighborCounts;
+		uint **d_neighborOffsets;
+		uint *d_neighborPointsetIndices; // indexing the above
+
+		thrust::device_vector<Real> d_boundaryVolumes;
+		thrust::device_vector<unsigned int> d_boundaryVolumeIndices;
+
+		thrust::device_vector<Vector3r> d_rigidBodyPositions;
+		thrust::device_vector<bool> d_isDynamic;
+		thrust::device_vector<Vector3r> d_forcesPerThread;
+		thrust::device_vector<Vector3r> d_torquesPerThread;
+		thrust::device_vector<int> d_forcesPerThreadIndices;
+		thrust::device_vector<int> d_torquesPerThreadIndices;
+
+		thrust::device_vector<unsigned int> d_fmIndices;
+		std::vector<unsigned int> fmIndices;
+
+		thrust::device_vector<Real> d_volumes;
+		thrust::device_vector<Real> d_densities0;
+
+		Real m_stiffness;
+		Real m_exponent;
+
+		SimulationDataWCSPH m_simulationData;
+		unsigned int m_counter;
+
+		/** Perform the neighborhood search for all fluid particles.
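+		 * If z-sorting is enabled, every 500th invocation additionally re-sorts the particles and the simulation data (z-sort) before delegating to Simulation::performNeighborhoodSearch().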
+ */ + void performNeighborhoodSearch(); + + virtual void emittedParticles(FluidModel *model, const unsigned int startIndex); + virtual void initParameters(); + void initCUDA(); + void prepareData(); + + void computePressureAccels(const unsigned int fluidModelIndex); + + public: + static int STIFFNESS; + static int EXPONENT; + + TimeStepWCSPHGPU(); + virtual ~TimeStepWCSPHGPU(void); + + virtual void step(); + virtual void reset(); + virtual void resize(); + }; +} + + +#endif \ No newline at end of file diff --git a/Simulators/DynamicBoundarySimulator/CMakeLists.txt b/Simulators/DynamicBoundarySimulator/CMakeLists.txt index c4df9d4f..f0567592 100644 --- a/Simulators/DynamicBoundarySimulator/CMakeLists.txt +++ b/Simulators/DynamicBoundarySimulator/CMakeLists.txt @@ -64,6 +64,7 @@ set(SIMULATION_LINK_LIBRARIES ${SIMULATION_LINK_LIBRARIES} set(SIMULATION_DEPENDENCIES Ext_PBD ${SIMULATION_DEPENDENCIES}) link_directories(${PROJECT_PATH}/extern/install/PositionBasedDynamics/lib) +include_directories( ${CUDA_INCLUDE_DIRS}) add_executable(DynamicBoundarySimulator main.cpp @@ -92,7 +93,7 @@ set_target_properties(DynamicBoundarySimulator PROPERTIES DEBUG_POSTFIX ${CMAKE_ set_target_properties(DynamicBoundarySimulator PROPERTIES RELWITHDEBINFO_POSTFIX ${CMAKE_RELWITHDEBINFO_POSTFIX}) set_target_properties(DynamicBoundarySimulator PROPERTIES MINSIZEREL_POSTFIX ${CMAKE_MINSIZEREL_POSTFIX}) add_dependencies(DynamicBoundarySimulator ${SIMULATION_DEPENDENCIES}) -target_link_libraries(DynamicBoundarySimulator ${SIMULATION_LINK_LIBRARIES}) +target_link_libraries(DynamicBoundarySimulator ${SIMULATION_LINK_LIBRARIES} ${CUDA_LIBRARIES}) VIS_SOURCE_GROUPS() source_group("Header Files\\PBD" FILES ${PBDWRAPPER_HEADER_FILES}) diff --git a/Simulators/StaticBoundarySimulator/CMakeLists.txt b/Simulators/StaticBoundarySimulator/CMakeLists.txt index 0107d822..c454f64a 100644 --- a/Simulators/StaticBoundarySimulator/CMakeLists.txt +++ b/Simulators/StaticBoundarySimulator/CMakeLists.txt @@ -64,12 +64,14 @@ add_definitions(-DTW_NO_LIB_PRAGMA -DTW_STATIC) include_directories(${PROJECT_PATH}/extern/freeglut/include) include_directories(${PROJECT_PATH}/extern/glew/include) +include_directories( ${CUDA_INCLUDE_DIRS}) set_target_properties(StaticBoundarySimulator PROPERTIES DEBUG_POSTFIX ${CMAKE_DEBUG_POSTFIX}) set_target_properties(StaticBoundarySimulator PROPERTIES RELWITHDEBINFO_POSTFIX ${CMAKE_RELWITHDEBINFO_POSTFIX}) set_target_properties(StaticBoundarySimulator PROPERTIES MINSIZEREL_POSTFIX ${CMAKE_MINSIZEREL_POSTFIX}) add_dependencies(StaticBoundarySimulator ${SIMULATION_DEPENDENCIES}) -target_link_libraries(StaticBoundarySimulator ${SIMULATION_LINK_LIBRARIES}) +target_link_libraries(StaticBoundarySimulator ${SIMULATION_LINK_LIBRARIES} ${CUDA_LIBRARIES}) + VIS_SOURCE_GROUPS() set_target_properties(StaticBoundarySimulator PROPERTIES FOLDER "Simulators") diff --git a/data/Scenes/DoubleDamBreak.json b/data/Scenes/DoubleDamBreak.json index 74a118e4..d9aa3221 100644 --- a/data/Scenes/DoubleDamBreak.json +++ b/data/Scenes/DoubleDamBreak.json @@ -3,7 +3,7 @@ { "cameraPosition": [0,2,5], "cameraLookat": [0,0,0], - "particleRadius": 0.025, + "particleRadius": 0.015, "numberOfStepsPerRenderUpdate": 4, "density0": 1000, "simulationMethod": 4, @@ -19,7 +19,7 @@ "exponent": 7, "velocityUpdateMethod": 0, "enableDivergenceSolver": true, - "boundaryHandlingMethod": 2 + "boundaryHandlingMethod": 0 }, "Fluid": { diff --git a/extern/eigen/CMakeLists.txt b/extern/eigen/CMakeLists.txt index f5840025..2bfb6d56 100644 --- 
a/extern/eigen/CMakeLists.txt +++ b/extern/eigen/CMakeLists.txt @@ -41,10 +41,13 @@ string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen_minor_ set(EIGEN_MINOR_VERSION "${CMAKE_MATCH_1}") set(EIGEN_VERSION_NUMBER ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION}) -# if the mercurial program is absent, this will leave the EIGEN_HG_CHANGESET string empty, -# but won't stop CMake. -execute_process(COMMAND hg tip -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_HGTIP_OUTPUT) -execute_process(COMMAND hg branch -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_BRANCH_OUTPUT) +# if we are not in a mercurial clone +if(IS_DIRECTORY ${CMAKE_SOURCE_DIR}/.hg) + # if the mercurial program is absent or this will leave the EIGEN_HG_CHANGESET string empty, + # but won't stop CMake. + execute_process(COMMAND hg tip -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_HGTIP_OUTPUT) + execute_process(COMMAND hg branch -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_BRANCH_OUTPUT) +endif() # if this is the default (aka development) branch, extract the mercurial changeset number from the hg tip output... if(EIGEN_BRANCH_OUTPUT MATCHES "default") @@ -64,6 +67,33 @@ include(GNUInstallDirs) set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) + +option(EIGEN_TEST_CXX11 "Enable testing with C++11 and C++11 features (e.g. Tensor module)." OFF) + + +macro(ei_add_cxx_compiler_flag FLAG) + string(REGEX REPLACE "-" "" SFLAG1 ${FLAG}) + string(REGEX REPLACE "\\+" "p" SFLAG ${SFLAG1}) + check_cxx_compiler_flag(${FLAG} COMPILER_SUPPORT_${SFLAG}) + if(COMPILER_SUPPORT_${SFLAG}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAG}") + endif() +endmacro(ei_add_cxx_compiler_flag) + +check_cxx_compiler_flag("-std=c++11" EIGEN_COMPILER_SUPPORT_CPP11) + +if(EIGEN_TEST_CXX11) + set(CMAKE_CXX_STANDARD 11) + set(CMAKE_CXX_EXTENSIONS OFF) + if(EIGEN_COMPILER_SUPPORT_CPP11) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + endif() +else() + #set(CMAKE_CXX_STANDARD 03) + #set(CMAKE_CXX_EXTENSIONS OFF) + ei_add_cxx_compiler_flag("-std=c++03") +endif() + ############################################################################# # find how to link to the standard libraries # ############################################################################# @@ -115,15 +145,6 @@ endif() set(EIGEN_TEST_MAX_SIZE "320" CACHE STRING "Maximal matrix/vector size, default is 320") -macro(ei_add_cxx_compiler_flag FLAG) - string(REGEX REPLACE "-" "" SFLAG1 ${FLAG}) - string(REGEX REPLACE "\\+" "p" SFLAG ${SFLAG1}) - check_cxx_compiler_flag(${FLAG} COMPILER_SUPPORT_${SFLAG}) - if(COMPILER_SUPPORT_${SFLAG}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAG}") - endif() -endmacro(ei_add_cxx_compiler_flag) - if(NOT MSVC) # We assume that other compilers are partly compatible with GNUCC @@ -359,8 +380,6 @@ if(EIGEN_TEST_NO_EXCEPTIONS) message(STATUS "Disabling exceptions in tests/examples") endif() -option(EIGEN_TEST_CXX11 "Enable testing with C++11 and C++11 features (e.g. Tensor module)." OFF) - set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture level to target when compiling CUDA code") include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) @@ -416,16 +435,15 @@ add_subdirectory(Eigen) add_subdirectory(doc EXCLUDE_FROM_ALL) -include(EigenConfigureTesting) - -# fixme, not sure this line is still needed: -enable_testing() # must be called from the root CMakeLists, see man page +option(BUILD_TESTING "Enable creation of Eigen tests." 
ON) +if(BUILD_TESTING) + include(EigenConfigureTesting) - -if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) - add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest -else() - add_subdirectory(test EXCLUDE_FROM_ALL) + if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) + add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest + else() + add_subdirectory(test EXCLUDE_FROM_ALL) + endif() endif() if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) @@ -461,7 +479,9 @@ endif(NOT WIN32) configure_file(scripts/cdashtesting.cmake.in cdashtesting.cmake @ONLY) -ei_testing_print_summary() +if(BUILD_TESTING) + ei_testing_print_summary() +endif() message(STATUS "") message(STATUS "Configured Eigen ${EIGEN_VERSION_NUMBER}") diff --git a/extern/eigen/COPYING.MINPACK b/extern/eigen/COPYING.MINPACK index ae7984da..11d8a9a6 100644 --- a/extern/eigen/COPYING.MINPACK +++ b/extern/eigen/COPYING.MINPACK @@ -1,52 +1,52 @@ -Minpack Copyright Notice (1999) University of Chicago. All rights reserved - -Redistribution and use in source and binary forms, with or -without modification, are permitted provided that the -following conditions are met: - -1. Redistributions of source code must retain the above -copyright notice, this list of conditions and the following -disclaimer. - -2. Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following -disclaimer in the documentation and/or other materials -provided with the distribution. - -3. The end-user documentation included with the -redistribution, if any, must include the following -acknowledgment: - - "This product includes software developed by the - University of Chicago, as Operator of Argonne National - Laboratory. - -Alternately, this acknowledgment may appear in the software -itself, if and wherever such third-party acknowledgments -normally appear. - -4. WARRANTY DISCLAIMER. THE SOFTWARE IS SUPPLIED "AS IS" -WITHOUT WARRANTY OF ANY KIND. THE COPYRIGHT HOLDER, THE -UNITED STATES, THE UNITED STATES DEPARTMENT OF ENERGY, AND -THEIR EMPLOYEES: (1) DISCLAIM ANY WARRANTIES, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO ANY IMPLIED WARRANTIES -OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE -OR NON-INFRINGEMENT, (2) DO NOT ASSUME ANY LEGAL LIABILITY -OR RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR -USEFULNESS OF THE SOFTWARE, (3) DO NOT REPRESENT THAT USE OF -THE SOFTWARE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS, (4) -DO NOT WARRANT THAT THE SOFTWARE WILL FUNCTION -UNINTERRUPTED, THAT IT IS ERROR-FREE OR THAT ANY ERRORS WILL -BE CORRECTED. - -5. LIMITATION OF LIABILITY. IN NO EVENT WILL THE COPYRIGHT -HOLDER, THE UNITED STATES, THE UNITED STATES DEPARTMENT OF -ENERGY, OR THEIR EMPLOYEES: BE LIABLE FOR ANY INDIRECT, -INCIDENTAL, CONSEQUENTIAL, SPECIAL OR PUNITIVE DAMAGES OF -ANY KIND OR NATURE, INCLUDING BUT NOT LIMITED TO LOSS OF -PROFITS OR LOSS OF DATA, FOR ANY REASON WHATSOEVER, WHETHER -SUCH LIABILITY IS ASSERTED ON THE BASIS OF CONTRACT, TORT -(INCLUDING NEGLIGENCE OR STRICT LIABILITY), OR OTHERWISE, -EVEN IF ANY OF SAID PARTIES HAS BEEN WARNED OF THE -POSSIBILITY OF SUCH LOSS OR DAMAGES. - +Minpack Copyright Notice (1999) University of Chicago. All rights reserved + +Redistribution and use in source and binary forms, with or +without modification, are permitted provided that the +following conditions are met: + +1. Redistributions of source code must retain the above +copyright notice, this list of conditions and the following +disclaimer. + +2. 
Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following +disclaimer in the documentation and/or other materials +provided with the distribution. + +3. The end-user documentation included with the +redistribution, if any, must include the following +acknowledgment: + + "This product includes software developed by the + University of Chicago, as Operator of Argonne National + Laboratory. + +Alternately, this acknowledgment may appear in the software +itself, if and wherever such third-party acknowledgments +normally appear. + +4. WARRANTY DISCLAIMER. THE SOFTWARE IS SUPPLIED "AS IS" +WITHOUT WARRANTY OF ANY KIND. THE COPYRIGHT HOLDER, THE +UNITED STATES, THE UNITED STATES DEPARTMENT OF ENERGY, AND +THEIR EMPLOYEES: (1) DISCLAIM ANY WARRANTIES, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO ANY IMPLIED WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE +OR NON-INFRINGEMENT, (2) DO NOT ASSUME ANY LEGAL LIABILITY +OR RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR +USEFULNESS OF THE SOFTWARE, (3) DO NOT REPRESENT THAT USE OF +THE SOFTWARE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS, (4) +DO NOT WARRANT THAT THE SOFTWARE WILL FUNCTION +UNINTERRUPTED, THAT IT IS ERROR-FREE OR THAT ANY ERRORS WILL +BE CORRECTED. + +5. LIMITATION OF LIABILITY. IN NO EVENT WILL THE COPYRIGHT +HOLDER, THE UNITED STATES, THE UNITED STATES DEPARTMENT OF +ENERGY, OR THEIR EMPLOYEES: BE LIABLE FOR ANY INDIRECT, +INCIDENTAL, CONSEQUENTIAL, SPECIAL OR PUNITIVE DAMAGES OF +ANY KIND OR NATURE, INCLUDING BUT NOT LIMITED TO LOSS OF +PROFITS OR LOSS OF DATA, FOR ANY REASON WHATSOEVER, WHETHER +SUCH LIABILITY IS ASSERTED ON THE BASIS OF CONTRACT, TORT +(INCLUDING NEGLIGENCE OR STRICT LIABILITY), OR OTHERWISE, +EVEN IF ANY OF SAID PARTIES HAS BEEN WARNED OF THE +POSSIBILITY OF SUCH LOSS OR DAMAGES. + diff --git a/extern/eigen/CTestConfig.cmake b/extern/eigen/CTestConfig.cmake new file mode 100644 index 00000000..0039bf8a --- /dev/null +++ b/extern/eigen/CTestConfig.cmake @@ -0,0 +1,13 @@ +## This file should be placed in the root directory of your project. +## Then modify the CMakeLists.txt file in the root directory of your +## project to incorporate the testing dashboard. 
+## # The following are required to uses Dart and the Cdash dashboard +## ENABLE_TESTING() +## INCLUDE(CTest) +set(CTEST_PROJECT_NAME "Eigen 3.3") +set(CTEST_NIGHTLY_START_TIME "00:00:00 UTC") + +set(CTEST_DROP_METHOD "http") +set(CTEST_DROP_SITE "manao.inria.fr") +set(CTEST_DROP_LOCATION "/CDash/submit.php?project=Eigen+3.3") +set(CTEST_DROP_SITE_CDASH TRUE) diff --git a/extern/eigen/CTestCustom.cmake.in b/extern/eigen/CTestCustom.cmake.in new file mode 100644 index 00000000..89e487f0 --- /dev/null +++ b/extern/eigen/CTestCustom.cmake.in @@ -0,0 +1,4 @@ + +set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "2000") +set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS "2000") +list(APPEND CTEST_CUSTOM_ERROR_EXCEPTION @EIGEN_CTEST_ERROR_EXCEPTION@) diff --git a/extern/eigen/Eigen/Cholesky b/extern/eigen/Eigen/Cholesky index 369d1f5e..1332b540 100644 --- a/extern/eigen/Eigen/Cholesky +++ b/extern/eigen/Eigen/Cholesky @@ -9,6 +9,7 @@ #define EIGEN_CHOLESKY_MODULE_H #include "Core" +#include "Jacobi" #include "src/Core/util/DisableStupidWarnings.h" @@ -31,7 +32,11 @@ #include "src/Cholesky/LLT.h" #include "src/Cholesky/LDLT.h" #ifdef EIGEN_USE_LAPACKE +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/Cholesky/LLT_LAPACKE.h" #endif diff --git a/extern/eigen/Eigen/Core b/extern/eigen/Eigen/Core index 0f7fa630..b923b8c0 100644 --- a/extern/eigen/Eigen/Core +++ b/extern/eigen/Eigen/Core @@ -14,6 +14,22 @@ // first thing Eigen does: stop the compiler from committing suicide #include "src/Core/util/DisableStupidWarnings.h" +#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA) + #define EIGEN_CUDACC __CUDACC__ +#endif + +#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA) + #define EIGEN_CUDA_ARCH __CUDA_ARCH__ +#endif + +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) +#define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100)) +#elif defined(__CUDACC_VER__) +#define EIGEN_CUDACC_VER __CUDACC_VER__ +#else +#define EIGEN_CUDACC_VER 0 +#endif + // Handle NVCC/CUDA/SYCL #if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__) // Do not try asserts on CUDA and SYCL! 
@@ -37,9 +53,9 @@ #endif #define EIGEN_DEVICE_FUNC __host__ __device__ - // We need math_functions.hpp to ensure that that EIGEN_USING_STD_MATH macro + // We need cuda_runtime.h to ensure that that EIGEN_USING_STD_MATH macro // works properly on the device side - #include + #include #else #define EIGEN_DEVICE_FUNC #endif @@ -155,6 +171,9 @@ #ifdef __AVX512DQ__ #define EIGEN_VECTORIZE_AVX512DQ #endif + #ifdef __AVX512ER__ + #define EIGEN_VECTORIZE_AVX512ER + #endif #endif // include files @@ -229,7 +248,7 @@ #if defined __CUDACC__ #define EIGEN_VECTORIZE_CUDA #include - #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 + #if EIGEN_CUDACC_VER >= 70500 #define EIGEN_HAS_CUDA_FP16 #endif #endif @@ -352,6 +371,7 @@ using std::ptrdiff_t; #include "src/Core/MathFunctions.h" #include "src/Core/GenericPacketMath.h" #include "src/Core/MathFunctionsImpl.h" +#include "src/Core/arch/Default/ConjHelper.h" #if defined EIGEN_VECTORIZE_AVX512 #include "src/Core/arch/SSE/PacketMath.h" @@ -367,6 +387,7 @@ using std::ptrdiff_t; #include "src/Core/arch/AVX/MathFunctions.h" #include "src/Core/arch/AVX/Complex.h" #include "src/Core/arch/AVX/TypeCasting.h" + #include "src/Core/arch/SSE/TypeCasting.h" #elif defined EIGEN_VECTORIZE_SSE #include "src/Core/arch/SSE/PacketMath.h" #include "src/Core/arch/SSE/MathFunctions.h" diff --git a/extern/eigen/Eigen/Eigenvalues b/extern/eigen/Eigen/Eigenvalues index 009e529e..f3f661b0 100644 --- a/extern/eigen/Eigen/Eigenvalues +++ b/extern/eigen/Eigen/Eigenvalues @@ -45,7 +45,11 @@ #include "src/Eigenvalues/GeneralizedEigenSolver.h" #include "src/Eigenvalues/MatrixBaseEigenvalues.h" #ifdef EIGEN_USE_LAPACKE +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/Eigenvalues/RealSchur_LAPACKE.h" #include "src/Eigenvalues/ComplexSchur_LAPACKE.h" #include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h" diff --git a/extern/eigen/Eigen/LU b/extern/eigen/Eigen/LU index 6f6c5562..6418a86e 100644 --- a/extern/eigen/Eigen/LU +++ b/extern/eigen/Eigen/LU @@ -28,7 +28,11 @@ #include "src/LU/FullPivLU.h" #include "src/LU/PartialPivLU.h" #ifdef EIGEN_USE_LAPACKE +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/LU/PartialPivLU_LAPACKE.h" #endif #include "src/LU/Determinant.h" diff --git a/extern/eigen/Eigen/PardisoSupport b/extern/eigen/Eigen/PardisoSupport old mode 100644 new mode 100755 diff --git a/extern/eigen/Eigen/QR b/extern/eigen/Eigen/QR index 80838e3b..c7e91446 100644 --- a/extern/eigen/Eigen/QR +++ b/extern/eigen/Eigen/QR @@ -36,7 +36,11 @@ #include "src/QR/ColPivHouseholderQR.h" #include "src/QR/CompleteOrthogonalDecomposition.h" #ifdef EIGEN_USE_LAPACKE +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/QR/HouseholderQR_LAPACKE.h" #include "src/QR/ColPivHouseholderQR_LAPACKE.h" #endif diff --git a/extern/eigen/Eigen/QtAlignedMalloc b/extern/eigen/Eigen/QtAlignedMalloc index c6571f12..4f07df02 100644 --- a/extern/eigen/Eigen/QtAlignedMalloc +++ b/extern/eigen/Eigen/QtAlignedMalloc @@ -27,7 +27,7 @@ void qFree(void *ptr) void *qRealloc(void *ptr, std::size_t size) { void* newPtr = Eigen::internal::aligned_malloc(size); - memcpy(newPtr, ptr, size); + std::memcpy(newPtr, ptr, size); Eigen::internal::aligned_free(ptr); return newPtr; } diff --git a/extern/eigen/Eigen/SVD b/extern/eigen/Eigen/SVD index 86143c23..5d0e75f7 100644 --- a/extern/eigen/Eigen/SVD +++ b/extern/eigen/Eigen/SVD @@ -37,7 +37,11 @@ #include 
"src/SVD/JacobiSVD.h" #include "src/SVD/BDCSVD.h" #if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT) +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/SVD/JacobiSVD_LAPACKE.h" #endif diff --git a/extern/eigen/Eigen/src/Cholesky/LDLT.h b/extern/eigen/Eigen/src/Cholesky/LDLT.h index fcee7b2e..15ccf24f 100644 --- a/extern/eigen/Eigen/src/Cholesky/LDLT.h +++ b/extern/eigen/Eigen/src/Cholesky/LDLT.h @@ -248,7 +248,7 @@ template class LDLT /** \brief Reports whether previous computation was successful. * * \returns \c Success if computation was succesful, - * \c NumericalIssue if the matrix.appears to be negative. + * \c NumericalIssue if the factorization failed because of a zero pivot. */ ComputationInfo info() const { @@ -305,7 +305,8 @@ template<> struct ldlt_inplace if (size <= 1) { transpositions.setIdentity(); - if (numext::real(mat.coeff(0,0)) > static_cast(0) ) sign = PositiveSemiDef; + if(size==0) sign = ZeroSign; + else if (numext::real(mat.coeff(0,0)) > static_cast(0) ) sign = PositiveSemiDef; else if (numext::real(mat.coeff(0,0)) < static_cast(0)) sign = NegativeSemiDef; else sign = ZeroSign; return true; @@ -376,6 +377,8 @@ template<> struct ldlt_inplace if((rs>0) && pivot_is_valid) A21 /= realAkk; + else if(rs>0) + ret = ret && (A21.array()==Scalar(0)).all(); if(found_zero_pivot && pivot_is_valid) ret = false; // factorization failed else if(!pivot_is_valid) found_zero_pivot = true; @@ -568,13 +571,14 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons // more precisely, use pseudo-inverse of D (see bug 241) using std::abs; const typename Diagonal::RealReturnType vecD(vectorD()); - // In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon - // as motivated by LAPACK's xGELSS: + // In some previous versions, tolerance was set to the max of 1/highest (or rather numeric_limits::min()) + // and the maximal diagonal entry * epsilon as motivated by LAPACK's xGELSS: // RealScalar tolerance = numext::maxi(vecD.array().abs().maxCoeff() * NumTraits::epsilon(),RealScalar(1) / NumTraits::highest()); // However, LDLT is not rank revealing, and so adjusting the tolerance wrt to the highest // diagonal element is not well justified and leads to numerical issues in some cases. // Moreover, Lapack's xSYTRS routines use 0 for the tolerance. - RealScalar tolerance = RealScalar(1) / NumTraits::highest(); + // Using numeric_limits::min() gives us more robustness to denormals. + RealScalar tolerance = (std::numeric_limits::min)(); for (Index i = 0; i < vecD.size(); ++i) { diff --git a/extern/eigen/Eigen/src/Cholesky/LLT.h b/extern/eigen/Eigen/src/Cholesky/LLT.h index 87ca8d42..e1624d21 100644 --- a/extern/eigen/Eigen/src/Cholesky/LLT.h +++ b/extern/eigen/Eigen/src/Cholesky/LLT.h @@ -24,7 +24,7 @@ template struct LLT_Traits; * * \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition * \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper. - * The other triangular part won't be read. + * The other triangular part won't be read. * * This class performs a LL^T Cholesky decomposition of a symmetric, positive definite * matrix A such that A = LL^* = U^*U, where L is lower triangular. 
@@ -41,14 +41,18 @@ template struct LLT_Traits; * Example: \include LLT_example.cpp * Output: \verbinclude LLT_example.out * + * \b Performance: for best performance, it is recommended to use a column-major storage format + * with the Lower triangular part (the default), or, equivalently, a row-major storage format + * with the Upper triangular part. Otherwise, you might get a 20% slowdown for the full factorization + * step, and rank-updates can be up to 3 times slower. + * * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism. * + * Note that during the decomposition, only the lower (or upper, as defined by _UpLo) triangular part of A is considered. + * Therefore, the strict lower part does not have to store correct values. + * * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT */ - /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH) - * Note that during the decomposition, only the upper triangular part of A is considered. Therefore, - * the strict lower part does not have to store correct values. - */ template class LLT { public: @@ -146,7 +150,7 @@ template class LLT } template - void solveInPlace(MatrixBase &bAndX) const; + void solveInPlace(const MatrixBase &bAndX) const; template LLT& compute(const EigenBase& matrix); @@ -177,7 +181,7 @@ template class LLT /** \brief Reports whether previous computation was successful. * * \returns \c Success if computation was succesful, - * \c NumericalIssue if the matrix.appears to be negative. + * \c NumericalIssue if the matrix.appears not to be positive definite. */ ComputationInfo info() const { @@ -425,7 +429,8 @@ LLT& LLT::compute(const EigenBase eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); m_matrix.resize(size, size); - m_matrix = a.derived(); + if (!internal::is_same_dense(m_matrix, a.derived())) + m_matrix = a.derived(); // Compute matrix L1 norm = max abs column sum. m_l1_norm = RealScalar(0); @@ -485,11 +490,14 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const * * This version avoids a copy when the right hand side matrix b is not needed anymore. * + * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here. + * This function will const_cast it, so constness isn't honored here. 
+ * * \sa LLT::solve(), MatrixBase::llt() */ template template -void LLT::solveInPlace(MatrixBase &bAndX) const +void LLT::solveInPlace(const MatrixBase &bAndX) const { eigen_assert(m_isInitialized && "LLT is not initialized."); eigen_assert(m_matrix.rows()==bAndX.rows()); diff --git a/extern/eigen/Eigen/src/Core/Array.h b/extern/eigen/Eigen/src/Core/Array.h index e10020d4..16770fc7 100644 --- a/extern/eigen/Eigen/src/Core/Array.h +++ b/extern/eigen/Eigen/src/Core/Array.h @@ -153,8 +153,6 @@ class Array : Base(std::move(other)) { Base::_check_template_params(); - if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic) - Base::_set_noalias(other); } EIGEN_DEVICE_FUNC Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable::value) diff --git a/extern/eigen/Eigen/src/Core/AssignEvaluator.h b/extern/eigen/Eigen/src/Core/AssignEvaluator.h index b0ec7b7c..dbe435d8 100644 --- a/extern/eigen/Eigen/src/Core/AssignEvaluator.h +++ b/extern/eigen/Eigen/src/Core/AssignEvaluator.h @@ -39,7 +39,7 @@ struct copy_using_evaluator_traits enum { DstAlignment = DstEvaluator::Alignment, SrcAlignment = SrcEvaluator::Alignment, - DstHasDirectAccess = DstFlags & DirectAccessBit, + DstHasDirectAccess = (DstFlags & DirectAccessBit) == DirectAccessBit, JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment) }; @@ -83,7 +83,7 @@ struct copy_using_evaluator_traits && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0 && (EIGEN_UNALIGNED_VECTORIZE || int(JointAlignment)>=int(InnerRequiredAlignment)), MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit), - MayLinearVectorize = bool(MightVectorize) && MayLinearize && DstHasDirectAccess + MayLinearVectorize = bool(MightVectorize) && bool(MayLinearize) && bool(DstHasDirectAccess) && (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic), /* If the destination isn't aligned, we have to do runtime checks and we don't unroll, so it's only good for large enough sizes. 
*/ diff --git a/extern/eigen/Eigen/src/Core/Assign_MKL.h b/extern/eigen/Eigen/src/Core/Assign_MKL.h old mode 100644 new mode 100755 index 6c2ab926..6866095b --- a/extern/eigen/Eigen/src/Core/Assign_MKL.h +++ b/extern/eigen/Eigen/src/Core/Assign_MKL.h @@ -84,7 +84,8 @@ class vml_assign_traits struct Assignment, SrcXprNested>, assign_op, \ Dense2Dense, typename enable_if::EnableVml>::type> { \ typedef CwiseUnaryOp, SrcXprNested> SrcXprType; \ - static void run(DstXprType &dst, const SrcXprType &src, const assign_op &/*func*/) { \ + static void run(DstXprType &dst, const SrcXprType &src, const assign_op &func) { \ + resize_if_allowed(dst, src, func); \ eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ if(vml_assign_traits::Traversal==LinearTraversal) { \ VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(), \ @@ -144,7 +145,8 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _) Dense2Dense, typename enable_if::EnableVml>::type> { \ typedef CwiseBinaryOp, SrcXprNested, \ const CwiseNullaryOp,Plain> > SrcXprType; \ - static void run(DstXprType &dst, const SrcXprType &src, const assign_op &/*func*/) { \ + static void run(DstXprType &dst, const SrcXprType &src, const assign_op &func) { \ + resize_if_allowed(dst, src, func); \ eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ VMLTYPE exponent = reinterpret_cast(src.rhs().functor().m_other); \ if(vml_assign_traits::Traversal==LinearTraversal) \ diff --git a/extern/eigen/Eigen/src/Core/ConditionEstimator.h b/extern/eigen/Eigen/src/Core/ConditionEstimator.h index aa7efdc7..51a2e5f1 100644 --- a/extern/eigen/Eigen/src/Core/ConditionEstimator.h +++ b/extern/eigen/Eigen/src/Core/ConditionEstimator.h @@ -160,7 +160,7 @@ rcond_estimate_helper(typename Decomposition::RealScalar matrix_norm, const Deco { typedef typename Decomposition::RealScalar RealScalar; eigen_assert(dec.rows() == dec.cols()); - if (dec.rows() == 0) return RealScalar(1); + if (dec.rows() == 0) return NumTraits::infinity(); if (matrix_norm == RealScalar(0)) return RealScalar(0); if (dec.rows() == 1) return RealScalar(1); const RealScalar inverse_matrix_norm = rcond_invmatrix_L1_norm_estimate(dec); diff --git a/extern/eigen/Eigen/src/Core/CoreEvaluators.h b/extern/eigen/Eigen/src/Core/CoreEvaluators.h index f7c1effc..910889ef 100644 --- a/extern/eigen/Eigen/src/Core/CoreEvaluators.h +++ b/extern/eigen/Eigen/src/Core/CoreEvaluators.h @@ -977,7 +977,7 @@ struct evaluator > OuterStrideAtCompileTime = HasSameStorageOrderAsArgType ? int(outer_stride_at_compile_time::ret) : int(inner_stride_at_compile_time::ret), - MaskPacketAccessBit = (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0, + MaskPacketAccessBit = (InnerStrideAtCompileTime == 1 || HasSameStorageOrderAsArgType) ? PacketAccessBit : 0, FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator::Flags&LinearAccessBit))) ? LinearAccessBit : 0, FlagsRowMajorBit = XprType::Flags&RowMajorBit, @@ -987,7 +987,9 @@ struct evaluator > Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit, PacketAlignment = unpacket_traits::alignment, - Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0, + Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) + && (OuterStrideAtCompileTime!=0) + && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? 
int(PacketAlignment) : 0, Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, Alignment0) }; typedef block_evaluator block_evaluator_type; @@ -1018,14 +1020,16 @@ struct unary_evaluator, IndexBa EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block) : m_argImpl(block.nestedExpression()), m_startRow(block.startRow()), - m_startCol(block.startCol()) + m_startCol(block.startCol()), + m_linear_offset(InnerPanel?(XprType::IsRowMajor ? block.startRow()*block.cols() : block.startCol()*block.rows()):0) { } typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; enum { - RowsAtCompileTime = XprType::RowsAtCompileTime + RowsAtCompileTime = XprType::RowsAtCompileTime, + ForwardLinearAccess = InnerPanel && bool(evaluator::Flags&LinearAccessBit) }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -1037,7 +1041,10 @@ struct unary_evaluator, IndexBa EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); + if (ForwardLinearAccess) + return m_argImpl.coeff(m_linear_offset.value() + index); + else + return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -1049,7 +1056,10 @@ struct unary_evaluator, IndexBa EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { - return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); + if (ForwardLinearAccess) + return m_argImpl.coeffRef(m_linear_offset.value() + index); + else + return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); } template @@ -1063,8 +1073,11 @@ struct unary_evaluator, IndexBa EIGEN_STRONG_INLINE PacketType packet(Index index) const { - return packet(RowsAtCompileTime == 1 ? 0 : index, - RowsAtCompileTime == 1 ? index : 0); + if (ForwardLinearAccess) + return m_argImpl.template packet(m_linear_offset.value() + index); + else + return packet(RowsAtCompileTime == 1 ? 0 : index, + RowsAtCompileTime == 1 ? index : 0); } template @@ -1078,15 +1091,19 @@ struct unary_evaluator, IndexBa EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { - return writePacket(RowsAtCompileTime == 1 ? 0 : index, - RowsAtCompileTime == 1 ? index : 0, - x); + if (ForwardLinearAccess) + return m_argImpl.template writePacket(m_linear_offset.value() + index, x); + else + return writePacket(RowsAtCompileTime == 1 ? 0 : index, + RowsAtCompileTime == 1 ? 
index : 0, + x); } protected: evaluator m_argImpl; const variable_if_dynamic m_startRow; const variable_if_dynamic m_startCol; + const variable_if_dynamic m_linear_offset; }; // TODO: This evaluator does not actually use the child evaluator; diff --git a/extern/eigen/Eigen/src/Core/Diagonal.h b/extern/eigen/Eigen/src/Core/Diagonal.h index 49e71125..afcaf357 100644 --- a/extern/eigen/Eigen/src/Core/Diagonal.h +++ b/extern/eigen/Eigen/src/Core/Diagonal.h @@ -70,7 +70,10 @@ template class Diagonal EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal) EIGEN_DEVICE_FUNC - explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {} + explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) + { + eigen_assert( a_index <= m_matrix.cols() && -a_index <= m_matrix.rows() ); + } EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal) diff --git a/extern/eigen/Eigen/src/Core/Dot.h b/extern/eigen/Eigen/src/Core/Dot.h index 06ef18b8..1fe7a84a 100644 --- a/extern/eigen/Eigen/src/Core/Dot.h +++ b/extern/eigen/Eigen/src/Core/Dot.h @@ -31,7 +31,8 @@ struct dot_nocheck typedef scalar_conj_product_op::Scalar,typename traits::Scalar> conj_prod; typedef typename conj_prod::result_type ResScalar; EIGEN_DEVICE_FUNC - static inline ResScalar run(const MatrixBase& a, const MatrixBase& b) + EIGEN_STRONG_INLINE + static ResScalar run(const MatrixBase& a, const MatrixBase& b) { return a.template binaryExpr(b).sum(); } @@ -43,7 +44,8 @@ struct dot_nocheck typedef scalar_conj_product_op::Scalar,typename traits::Scalar> conj_prod; typedef typename conj_prod::result_type ResScalar; EIGEN_DEVICE_FUNC - static inline ResScalar run(const MatrixBase& a, const MatrixBase& b) + EIGEN_STRONG_INLINE + static ResScalar run(const MatrixBase& a, const MatrixBase& b) { return a.transpose().template binaryExpr(b).sum(); } @@ -65,6 +67,7 @@ struct dot_nocheck template template EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE typename ScalarBinaryOpTraits::Scalar,typename internal::traits::Scalar>::ReturnType MatrixBase::dot(const MatrixBase& other) const { @@ -102,7 +105,7 @@ EIGEN_STRONG_INLINE typename NumTraits::Scala * \sa lpNorm(), dot(), squaredNorm() */ template -inline typename NumTraits::Scalar>::Real MatrixBase::norm() const +EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::norm() const { return numext::sqrt(squaredNorm()); } @@ -117,7 +120,7 @@ inline typename NumTraits::Scalar>::Real Matr * \sa norm(), normalize() */ template -inline const typename MatrixBase::PlainObject +EIGEN_STRONG_INLINE const typename MatrixBase::PlainObject MatrixBase::normalized() const { typedef typename internal::nested_eval::type _Nested; @@ -139,7 +142,7 @@ MatrixBase::normalized() const * \sa norm(), normalized() */ template -inline void MatrixBase::normalize() +EIGEN_STRONG_INLINE void MatrixBase::normalize() { RealScalar z = squaredNorm(); // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU @@ -160,7 +163,7 @@ inline void MatrixBase::normalize() * \sa stableNorm(), stableNormalize(), normalized() */ template -inline const typename MatrixBase::PlainObject +EIGEN_STRONG_INLINE const typename MatrixBase::PlainObject MatrixBase::stableNormalized() const { typedef typename internal::nested_eval::type _Nested; @@ -185,7 +188,7 @@ MatrixBase::stableNormalized() const * \sa stableNorm(), stableNormalized(), normalize() */ template -inline void MatrixBase::stableNormalize() +EIGEN_STRONG_INLINE void 
MatrixBase::stableNormalize() { RealScalar w = cwiseAbs().maxCoeff(); RealScalar z = (derived()/w).squaredNorm(); diff --git a/extern/eigen/Eigen/src/Core/GeneralProduct.h b/extern/eigen/Eigen/src/Core/GeneralProduct.h index 0f16cd8e..6f0cc80e 100644 --- a/extern/eigen/Eigen/src/Core/GeneralProduct.h +++ b/extern/eigen/Eigen/src/Core/GeneralProduct.h @@ -24,12 +24,17 @@ template struct product_type_selector; template struct product_size_category { - enum { is_large = MaxSize == Dynamic || - Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD || - (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD), - value = is_large ? Large - : Size == 1 ? 1 - : Small + enum { + #ifndef EIGEN_CUDA_ARCH + is_large = MaxSize == Dynamic || + Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD || + (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD), + #else + is_large = 0, + #endif + value = is_large ? Large + : Size == 1 ? 1 + : Small }; }; @@ -379,8 +384,6 @@ template<> struct gemv_dense_selector * * \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*() */ -#ifndef __CUDACC__ - template template inline const Product @@ -412,8 +415,6 @@ MatrixBase::operator*(const MatrixBase &other) const return Product(derived(), other.derived()); } -#endif // __CUDACC__ - /** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation. * * The returned product will behave like any other expressions: the coefficients of the product will be diff --git a/extern/eigen/Eigen/src/Core/Map.h b/extern/eigen/Eigen/src/Core/Map.h index 06d19670..548bf9a2 100644 --- a/extern/eigen/Eigen/src/Core/Map.h +++ b/extern/eigen/Eigen/src/Core/Map.h @@ -20,11 +20,17 @@ struct traits > { typedef traits TraitsBase; enum { + PlainObjectTypeInnerSize = ((traits::Flags&RowMajorBit)==RowMajorBit) + ? PlainObjectType::ColsAtCompileTime + : PlainObjectType::RowsAtCompileTime, + InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0 ? int(PlainObjectType::InnerStrideAtCompileTime) : int(StrideType::InnerStrideAtCompileTime), OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0 - ? int(PlainObjectType::OuterStrideAtCompileTime) + ? (InnerStrideAtCompileTime==Dynamic || PlainObjectTypeInnerSize==Dynamic + ? Dynamic + : int(InnerStrideAtCompileTime) * int(PlainObjectTypeInnerSize)) : int(StrideType::OuterStrideAtCompileTime), Alignment = int(MapOptions)&int(AlignedMask), Flags0 = TraitsBase::Flags & (~NestByRefBit), @@ -107,10 +113,11 @@ template class Ma EIGEN_DEVICE_FUNC inline Index outerStride() const { - return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer() - : IsVectorAtCompileTime ? this->size() - : int(Flags)&RowMajorBit ? this->cols() - : this->rows(); + return int(StrideType::OuterStrideAtCompileTime) != 0 ? m_stride.outer() + : int(internal::traits::OuterStrideAtCompileTime) != Dynamic ? Index(internal::traits::OuterStrideAtCompileTime) + : IsVectorAtCompileTime ? (this->size() * innerStride()) + : (int(Flags)&RowMajorBit) ? (this->cols() * innerStride()) + : (this->rows() * innerStride()); } /** Constructor in the fixed-size case. 
diff --git a/extern/eigen/Eigen/src/Core/MapBase.h b/extern/eigen/Eigen/src/Core/MapBase.h index 020f939a..668922ff 100644 --- a/extern/eigen/Eigen/src/Core/MapBase.h +++ b/extern/eigen/Eigen/src/Core/MapBase.h @@ -43,6 +43,7 @@ template class MapBase enum { RowsAtCompileTime = internal::traits::RowsAtCompileTime, ColsAtCompileTime = internal::traits::ColsAtCompileTime, + InnerStrideAtCompileTime = internal::traits::InnerStrideAtCompileTime, SizeAtCompileTime = Base::SizeAtCompileTime }; @@ -187,8 +188,11 @@ template class MapBase void checkSanity(typename internal::enable_if<(internal::traits::Alignment>0),void*>::type = 0) const { #if EIGEN_MAX_ALIGN_BYTES>0 + // innerStride() is not set yet when this function is called, so we optimistically assume the lowest plausible value: + const Index minInnerStride = InnerStrideAtCompileTime == Dynamic ? 1 : Index(InnerStrideAtCompileTime); + EIGEN_ONLY_USED_FOR_DEBUG(minInnerStride); eigen_assert(( ((internal::UIntPtr(m_data) % internal::traits::Alignment) == 0) - || (cols() * rows() * innerStride() * sizeof(Scalar)) < internal::traits::Alignment ) && "data is not aligned"); + || (cols() * rows() * minInnerStride * sizeof(Scalar)) < internal::traits::Alignment ) && "data is not aligned"); #endif } diff --git a/extern/eigen/Eigen/src/Core/MathFunctions.h b/extern/eigen/Eigen/src/Core/MathFunctions.h index a648aa0f..b249ce0c 100644 --- a/extern/eigen/Eigen/src/Core/MathFunctions.h +++ b/extern/eigen/Eigen/src/Core/MathFunctions.h @@ -348,31 +348,7 @@ struct norm1_retval * Implementation of hypot * ****************************************************************************/ -template -struct hypot_impl -{ - typedef typename NumTraits::Real RealScalar; - static inline RealScalar run(const Scalar& x, const Scalar& y) - { - EIGEN_USING_STD_MATH(abs); - EIGEN_USING_STD_MATH(sqrt); - RealScalar _x = abs(x); - RealScalar _y = abs(y); - Scalar p, qp; - if(_x>_y) - { - p = _x; - qp = _y / p; - } - else - { - p = _y; - qp = _x / p; - } - if(p==RealScalar(0)) return RealScalar(0); - return p * sqrt(RealScalar(1) + qp*qp); - } -}; +template struct hypot_impl; template struct hypot_retval @@ -495,7 +471,7 @@ namespace std_fallback { typedef typename NumTraits::Real RealScalar; EIGEN_USING_STD_MATH(log); Scalar x1p = RealScalar(1) + x; - return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) ); + return numext::equal_strict(x1p, Scalar(1)) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) ); } } @@ -640,21 +616,28 @@ template struct random_default_impl { static inline Scalar run(const Scalar& x, const Scalar& y) - { - typedef typename conditional::IsSigned,std::ptrdiff_t,std::size_t>::type ScalarX; - if(y=x the result converted to an unsigned long is still correct. - std::size_t range = ScalarX(y)-ScalarX(x); - std::size_t offset = 0; - // rejection sampling - std::size_t divisor = 1; - std::size_t multiplier = 1; - if(range::type ScalarU; + // ScalarX is the widest of ScalarU and unsigned int. + // We'll deal only with ScalarX and unsigned int below thus avoiding signed + // types and arithmetic and signed overflows (which are undefined behavior). + typedef typename conditional<(ScalarU(-1) > unsigned(-1)), ScalarU, unsigned>::type ScalarX; + // The following difference doesn't overflow, provided our integer types are two's + // complement and have the same number of padding bits in signed and unsigned variants. + // This is the case in most modern implementations of C++. 
+ ScalarX range = ScalarX(y) - ScalarX(x); + ScalarX offset = 0; + ScalarX divisor = 1; + ScalarX multiplier = 1; + const unsigned rand_max = RAND_MAX; + if (range <= rand_max) divisor = (rand_max + 1) / (range + 1); + else multiplier = 1 + range / (rand_max + 1); + // Rejection sampling. do { - offset = (std::size_t(std::rand()) * multiplier) / divisor; + offset = (unsigned(std::rand()) * multiplier) / divisor; } while (offset > range); return Scalar(ScalarX(x) + offset); } @@ -1030,7 +1013,8 @@ inline int log2(int x) /** \returns the square root of \a x. * - * It is essentially equivalent to \code using std::sqrt; return sqrt(x); \endcode, + * It is essentially equivalent to + * \code using std::sqrt; return sqrt(x); \endcode * but slightly faster for float/double and some compilers (e.g., gcc), thanks to * specializations when SSE is enabled. * diff --git a/extern/eigen/Eigen/src/Core/MathFunctionsImpl.h b/extern/eigen/Eigen/src/Core/MathFunctionsImpl.h index 3c9ef22f..9c1ceb0e 100644 --- a/extern/eigen/Eigen/src/Core/MathFunctionsImpl.h +++ b/extern/eigen/Eigen/src/Core/MathFunctionsImpl.h @@ -71,6 +71,29 @@ T generic_fast_tanh_float(const T& a_x) return pdiv(p, q); } +template +EIGEN_STRONG_INLINE +RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y) +{ + EIGEN_USING_STD_MATH(sqrt); + RealScalar p, qp; + p = numext::maxi(x,y); + if(p==RealScalar(0)) return RealScalar(0); + qp = numext::mini(y,x) / p; + return p * sqrt(RealScalar(1) + qp*qp); +} + +template +struct hypot_impl +{ + typedef typename NumTraits::Real RealScalar; + static inline RealScalar run(const Scalar& x, const Scalar& y) + { + EIGEN_USING_STD_MATH(abs); + return positive_real_hypot(abs(x), abs(y)); + } +}; + } // end namespace internal } // end namespace Eigen diff --git a/extern/eigen/Eigen/src/Core/Matrix.h b/extern/eigen/Eigen/src/Core/Matrix.h index 90c336d8..7f4a7af9 100644 --- a/extern/eigen/Eigen/src/Core/Matrix.h +++ b/extern/eigen/Eigen/src/Core/Matrix.h @@ -274,8 +274,6 @@ class Matrix : Base(std::move(other)) { Base::_check_template_params(); - if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic) - Base::_set_noalias(other); } EIGEN_DEVICE_FUNC Matrix& operator=(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable::value) diff --git a/extern/eigen/Eigen/src/Core/MatrixBase.h b/extern/eigen/Eigen/src/Core/MatrixBase.h index ce412180..e6c35907 100644 --- a/extern/eigen/Eigen/src/Core/MatrixBase.h +++ b/extern/eigen/Eigen/src/Core/MatrixBase.h @@ -160,20 +160,11 @@ template class MatrixBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const MatrixBase& other); -#ifdef __CUDACC__ template EIGEN_DEVICE_FUNC - const Product - operator*(const MatrixBase &other) const - { return this->lazyProduct(other); } -#else - - template const Product operator*(const MatrixBase &other) const; -#endif - template EIGEN_DEVICE_FUNC const Product @@ -453,16 +444,24 @@ template class MatrixBase ///////// MatrixFunctions module ///////// typedef typename internal::stem_function::type StemFunction; - const MatrixExponentialReturnValue exp() const; +#define EIGEN_MATRIX_FUNCTION(ReturnType, Name, Description) \ + /** \returns an expression of the matrix Description of \c *this. \brief This function requires the unsupported MatrixFunctions module. To compute the coefficient-wise Description use ArrayBase::##Name . 
*/ \ + const ReturnType Name() const; +#define EIGEN_MATRIX_FUNCTION_1(ReturnType, Name, Description, Argument) \ + /** \returns an expression of the matrix Description of \c *this. \brief This function requires the unsupported MatrixFunctions module. To compute the coefficient-wise Description use ArrayBase::##Name . */ \ + const ReturnType Name(Argument) const; + + EIGEN_MATRIX_FUNCTION(MatrixExponentialReturnValue, exp, exponential) + /** \brief Helper function for the unsupported MatrixFunctions module.*/ const MatrixFunctionReturnValue matrixFunction(StemFunction f) const; - const MatrixFunctionReturnValue cosh() const; - const MatrixFunctionReturnValue sinh() const; - const MatrixFunctionReturnValue cos() const; - const MatrixFunctionReturnValue sin() const; - const MatrixSquareRootReturnValue sqrt() const; - const MatrixLogarithmReturnValue log() const; - const MatrixPowerReturnValue pow(const RealScalar& p) const; - const MatrixComplexPowerReturnValue pow(const std::complex& p) const; + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cosh, hyperbolic cosine) + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sinh, hyperbolic sine) + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cos, cosine) + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sin, sine) + EIGEN_MATRIX_FUNCTION(MatrixSquareRootReturnValue, sqrt, square root) + EIGEN_MATRIX_FUNCTION(MatrixLogarithmReturnValue, log, logarithm) + EIGEN_MATRIX_FUNCTION_1(MatrixPowerReturnValue, pow, power to \c p, const RealScalar& p) + EIGEN_MATRIX_FUNCTION_1(MatrixComplexPowerReturnValue, pow, power to \c p, const std::complex& p) protected: EIGEN_DEVICE_FUNC MatrixBase() : Base() {} diff --git a/extern/eigen/Eigen/src/Core/PlainObjectBase.h b/extern/eigen/Eigen/src/Core/PlainObjectBase.h index 77f4f606..1dc7e223 100644 --- a/extern/eigen/Eigen/src/Core/PlainObjectBase.h +++ b/extern/eigen/Eigen/src/Core/PlainObjectBase.h @@ -577,6 +577,10 @@ class PlainObjectBase : public internal::dense_xpr_base::type * while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned * \a data pointers. 
* + * Here is an example using strides: + * \include Matrix_Map_stride.cpp + * Output: \verbinclude Matrix_Map_stride.out + * * \see class Map */ //@{ diff --git a/extern/eigen/Eigen/src/Core/Product.h b/extern/eigen/Eigen/src/Core/Product.h index ae0c94b3..676c4802 100644 --- a/extern/eigen/Eigen/src/Core/Product.h +++ b/extern/eigen/Eigen/src/Core/Product.h @@ -97,8 +97,8 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option, && "if you wanted a coeff-wise or a dot product use the respective explicit functions"); } - EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); } EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; } EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; } @@ -127,7 +127,7 @@ class dense_product_base using Base::derived; typedef typename Base::Scalar Scalar; - operator const Scalar() const + EIGEN_STRONG_INLINE operator const Scalar() const { return internal::evaluator(derived()).coeff(0,0); } @@ -162,7 +162,7 @@ class ProductImpl public: - EIGEN_DEVICE_FUNC Scalar coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const { EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS); eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) ); @@ -170,7 +170,7 @@ class ProductImpl return internal::evaluator(derived()).coeff(row,col); } - EIGEN_DEVICE_FUNC Scalar coeff(Index i) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index i) const { EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS); eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) ); diff --git a/extern/eigen/Eigen/src/Core/ProductEvaluators.h b/extern/eigen/Eigen/src/Core/ProductEvaluators.h index c42725db..9b99bd76 100644 --- a/extern/eigen/Eigen/src/Core/ProductEvaluators.h +++ b/extern/eigen/Eigen/src/Core/ProductEvaluators.h @@ -32,7 +32,7 @@ struct evaluator > typedef Product XprType; typedef product_evaluator Base; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {} }; // Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B" @@ -55,7 +55,7 @@ struct evaluator, const Product > XprType; typedef evaluator > Base; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs()) {} }; @@ -68,7 +68,7 @@ struct evaluator, DiagIndex> > typedef Diagonal, DiagIndex> XprType; typedef evaluator, DiagIndex> > Base; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(Diagonal, DiagIndex>( Product(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()), xpr.index() )) @@ -246,19 +246,19 @@ template struct generic_product_impl { template - static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum(); } template - static inline void addTo(Dst& dst, 
const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum(); } template - static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); } }; @@ -312,25 +312,25 @@ struct generic_product_impl }; template - static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major()); } template - static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major()); } template - static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major()); } template - static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major()); } @@ -785,7 +785,11 @@ struct diagonal_product_evaluator_base _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))), _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0, Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? 
PacketAccessBit : 0), - Alignment = evaluator::Alignment + Alignment = evaluator::Alignment, + + AsScalarProduct = (DiagonalType::SizeAtCompileTime==1) + || (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft) + || (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight) }; diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag) @@ -797,7 +801,10 @@ struct diagonal_product_evaluator_base EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const { - return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx); + if(AsScalarProduct) + return m_diagImpl.coeff(0) * m_matImpl.coeff(idx); + else + return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx); } protected: diff --git a/extern/eigen/Eigen/src/Core/Redux.h b/extern/eigen/Eigen/src/Core/Redux.h index b6e8f888..760e9f86 100644 --- a/extern/eigen/Eigen/src/Core/Redux.h +++ b/extern/eigen/Eigen/src/Core/Redux.h @@ -407,7 +407,7 @@ class redux_evaluator */ template template -typename internal::traits::Scalar +EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::redux(const Func& func) const { eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); diff --git a/extern/eigen/Eigen/src/Core/Ref.h b/extern/eigen/Eigen/src/Core/Ref.h index bdf24f52..9c6e3c5d 100644 --- a/extern/eigen/Eigen/src/Core/Ref.h +++ b/extern/eigen/Eigen/src/Core/Ref.h @@ -95,6 +95,8 @@ template class RefBase template EIGEN_DEVICE_FUNC void construct(Expression& expr) { + EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(PlainObjectType,Expression); + if(PlainObjectType::RowsAtCompileTime==1) { eigen_assert(expr.rows()==1 || expr.cols()==1); diff --git a/extern/eigen/Eigen/src/Core/SelfAdjointView.h b/extern/eigen/Eigen/src/Core/SelfAdjointView.h index 504c98f0..b2e51f37 100644 --- a/extern/eigen/Eigen/src/Core/SelfAdjointView.h +++ b/extern/eigen/Eigen/src/Core/SelfAdjointView.h @@ -71,7 +71,9 @@ template class SelfAdjointView EIGEN_DEVICE_FUNC explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix) - {} + { + EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper,SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY); + } EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); } @@ -189,7 +191,7 @@ template class SelfAdjointView TriangularView >::type(tmp2); } - typedef SelfAdjointView ConjugateReturnType; + typedef SelfAdjointView ConjugateReturnType; /** \sa MatrixBase::conjugate() const */ EIGEN_DEVICE_FUNC inline const ConjugateReturnType conjugate() const diff --git a/extern/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h b/extern/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h index 50099df8..7c89c2e2 100644 --- a/extern/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +++ b/extern/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h @@ -17,7 +17,6 @@ namespace Eigen { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::operator*=(const Scalar& other) { - typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op()); return derived(); } @@ -25,7 +24,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::operator*=(co template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase::operator+=(const Scalar& other) { - typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op()); return derived(); } @@ 
-33,7 +31,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase::operator+=(co template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase::operator-=(const Scalar& other) { - typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op()); return derived(); } @@ -41,7 +38,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase::operator-=(co template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::operator/=(const Scalar& other) { - typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op()); return derived(); } diff --git a/extern/eigen/Eigen/src/Core/SolveTriangular.h b/extern/eigen/Eigen/src/Core/SolveTriangular.h index 049890b2..4652e2e1 100644 --- a/extern/eigen/Eigen/src/Core/SolveTriangular.h +++ b/extern/eigen/Eigen/src/Core/SolveTriangular.h @@ -169,6 +169,9 @@ void TriangularViewImpl::solveInPlace(const MatrixBase::Flags & RowMajorBit) && OtherDerived::IsVectorAtCompileTime && OtherDerived::SizeAtCompileTime!=1}; typedef typename internal::conditional::stableNorm() const typedef typename internal::nested_eval::type DerivedCopy; typedef typename internal::remove_all::type DerivedCopyClean; - DerivedCopy copy(derived()); + const DerivedCopy copy(derived()); enum { CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit) diff --git a/extern/eigen/Eigen/src/Core/Transpositions.h b/extern/eigen/Eigen/src/Core/Transpositions.h index 19c17bb4..86da5af5 100644 --- a/extern/eigen/Eigen/src/Core/Transpositions.h +++ b/extern/eigen/Eigen/src/Core/Transpositions.h @@ -384,7 +384,7 @@ class Transpose > const Product operator*(const MatrixBase& matrix, const Transpose& trt) { - return Product(matrix.derived(), trt.derived()); + return Product(matrix.derived(), trt); } /** \returns the \a matrix with the inverse transpositions applied to the rows. 
diff --git a/extern/eigen/Eigen/src/Core/arch/AVX/Complex.h b/extern/eigen/Eigen/src/Core/arch/AVX/Complex.h
index 99439c8a..7fa61969 100644
--- a/extern/eigen/Eigen/src/Core/arch/AVX/Complex.h
+++ b/extern/eigen/Eigen/src/Core/arch/AVX/Complex.h
@@ -204,23 +204,7 @@ template<> struct conj_helper
 }
 };
 
-template<> struct conj_helper
-{
- EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const
- { return Packet4cf(Eigen::internal::pmul(x, y.v)); }
-};
-
-template<> struct conj_helper
-{
- EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const
- { return Packet4cf(Eigen::internal::pmul(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f)
 
 template<> EIGEN_STRONG_INLINE Packet4cf pdiv(const Packet4cf& a, const Packet4cf& b)
 {
@@ -400,23 +384,7 @@ template<> struct conj_helper
 }
 };
 
-template<> struct conj_helper
-{
- EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const
- { return Packet2cd(Eigen::internal::pmul(x, y.v)); }
-};
-
-template<> struct conj_helper
-{
- EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const
- { return Packet2cd(Eigen::internal::pmul(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d)
 
 template<> EIGEN_STRONG_INLINE Packet2cd pdiv(const Packet2cd& a, const Packet2cd& b)
 {
diff --git a/extern/eigen/Eigen/src/Core/arch/AVX/PacketMath.h b/extern/eigen/Eigen/src/Core/arch/AVX/PacketMath.h
index 195d40fb..923a124b 100644
--- a/extern/eigen/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/extern/eigen/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -159,11 +159,12 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv(const Packet8i& /*a*/, co
 
 #ifdef __FMA__
 template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
-#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
- // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
- // and gcc stupidly generates a vfmadd132ps instruction,
- // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate
- // the result of the product.
+#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
+ // Clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
+ // and even register spilling with clang>=6.0 (bug 1637).
+ // Gcc stupidly generates a vfmadd132ps instruction.
+ // So let's enforce it to generate a vfmadd231ps instruction since the most common use
+ // case is to accumulate the result of the product.
Packet8f res = c; __asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); return res; @@ -172,7 +173,7 @@ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& #endif } template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { -#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) ) +#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) ) // see above Packet4d res = c; __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); @@ -308,9 +309,9 @@ template<> EIGEN_STRONG_INLINE void pstore1(int* to, const int& a) } #ifndef EIGEN_VECTORIZE_AVX512 -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } #endif template<> EIGEN_STRONG_INLINE float pfirst(const Packet8f& a) { @@ -333,9 +334,12 @@ template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a) { __m256d tmp = _mm256_shuffle_pd(a,a,5); return _mm256_permute2f128_pd(tmp, tmp, 1); - + #if 0 + // This version is unlikely to be faster as _mm256_shuffle_ps and _mm256_permute_pd + // exhibit the same latency/throughput, but it is here for future reference/benchmarking... __m256d swap_halves = _mm256_permute2f128_pd(a,a,1); return _mm256_permute_pd(swap_halves,5); + #endif } // pabs should be ok diff --git a/extern/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h b/extern/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h index 399be0ee..9c1717f7 100644 --- a/extern/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/extern/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -88,9 +88,9 @@ plog(const Packet16f& _x) { // x = x + x - 1.0; // } else { x = x - 1.0; } __mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ); - Packet16f tmp = _mm512_mask_blend_ps(mask, x, _mm512_setzero_ps()); + Packet16f tmp = _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), x); x = psub(x, p16f_1); - e = psub(e, _mm512_mask_blend_ps(mask, p16f_1, _mm512_setzero_ps())); + e = psub(e, _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), p16f_1)); x = padd(x, tmp); Packet16f x2 = pmul(x, x); @@ -119,8 +119,9 @@ plog(const Packet16f& _x) { x = padd(x, y2); // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. - return _mm512_mask_blend_ps(iszero_mask, p16f_minus_inf, - _mm512_mask_blend_ps(invalid_mask, p16f_nan, x)); + return _mm512_mask_blend_ps(iszero_mask, + _mm512_mask_blend_ps(invalid_mask, x, p16f_nan), + p16f_minus_inf); } #endif @@ -266,8 +267,7 @@ psqrt(const Packet16f& _x) { // select only the inverse sqrt of positive normal inputs (denormals are // flushed to zero and cause infs as well). 
__mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ); - Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_rsqrt14_ps(_x), - _mm512_setzero_ps()); + Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_setzero_ps(), _mm512_rsqrt14_ps(_x)); // Do a single step of Newton's iteration. x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)); @@ -289,8 +289,7 @@ psqrt(const Packet8d& _x) { // select only the inverse sqrt of positive normal inputs (denormals are // flushed to zero and cause infs as well). __mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ); - Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_rsqrt14_pd(_x), - _mm512_setzero_pd()); + Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_setzero_pd(), _mm512_rsqrt14_pd(_x)); // Do a first step of Newton's iteration. x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); @@ -333,20 +332,18 @@ prsqrt(const Packet16f& _x) { // select only the inverse sqrt of positive normal inputs (denormals are // flushed to zero and cause infs as well). __mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ); - Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), - _mm512_rsqrt14_ps(_x)); + Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps()); // Fill in NaNs and Infs for the negative/zero entries. __mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ); Packet16f infs_and_nans = _mm512_mask_blend_ps( - neg_mask, p16f_nan, - _mm512_mask_blend_ps(le_zero_mask, p16f_inf, _mm512_setzero_ps())); + neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan); // Do a single step of Newton's iteration. x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)); // Insert NaNs and Infs in all the right places. - return _mm512_mask_blend_ps(le_zero_mask, infs_and_nans, x); + return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans); } template <> @@ -363,14 +360,12 @@ prsqrt(const Packet8d& _x) { // select only the inverse sqrt of positive normal inputs (denormals are // flushed to zero and cause infs as well). __mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ); - Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), - _mm512_rsqrt14_pd(_x)); + Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd()); // Fill in NaNs and Infs for the negative/zero entries. __mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ); Packet8d infs_and_nans = _mm512_mask_blend_pd( - neg_mask, p8d_nan, - _mm512_mask_blend_pd(le_zero_mask, p8d_inf, _mm512_setzero_pd())); + neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan); // Do a first step of Newton's iteration. x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); @@ -379,9 +374,9 @@ prsqrt(const Packet8d& _x) { x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); // Insert NaNs and Infs in all the right places. 
- return _mm512_mask_blend_pd(le_zero_mask, infs_and_nans, x); + return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans); } -#else +#elif defined(EIGEN_VECTORIZE_AVX512ER) template <> EIGEN_STRONG_INLINE Packet16f prsqrt(const Packet16f& x) { return _mm512_rsqrt28_ps(x); diff --git a/extern/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h b/extern/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h index f6500a16..5adddc7a 100644 --- a/extern/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/extern/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -618,9 +618,9 @@ EIGEN_STRONG_INLINE void pstore1(int* to, const int& a) { pstore(to, pa); } -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } template <> EIGEN_STRONG_INLINE float pfirst(const Packet16f& a) { @@ -648,13 +648,13 @@ template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a) template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) { // _mm512_abs_ps intrinsic not found, so hack around it - return (__m512)_mm512_and_si512((__m512i)a, _mm512_set1_epi32(0x7fffffff)); + return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff))); } template <> EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) { // _mm512_abs_ps intrinsic not found, so hack around it - return (__m512d)_mm512_and_si512((__m512i)a, - _mm512_set1_epi64(0x7fffffffffffffff)); + return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a), + _mm512_set1_epi64(0x7fffffffffffffff))); } #ifdef EIGEN_VECTORIZE_AVX512DQ diff --git a/extern/eigen/Eigen/src/Core/arch/AltiVec/Complex.h b/extern/eigen/Eigen/src/Core/arch/AltiVec/Complex.h index 67db2f8e..3e665730 100644 --- a/extern/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/extern/eigen/Eigen/src/Core/arch/AltiVec/Complex.h @@ -224,23 +224,7 @@ template<> struct conj_helper } }; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const - { return Packet2cf(internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const - { return Packet2cf(internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { @@ -416,23 +400,8 @@ template<> struct conj_helper return pconj(internal::pmul(a, b)); } }; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE 
Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const - { return Packet1cd(internal::pmul(x, y.v)); } -}; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const - { return Packet1cd(internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { diff --git a/extern/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h b/extern/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h old mode 100644 new mode 100755 index b3f1ea19..08a27d15 --- a/extern/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/extern/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -103,7 +103,7 @@ static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4u static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; #else -static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; +static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; @@ -388,10 +388,28 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, co template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); } template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; } -template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) +{ + #ifdef __VSX__ + Packet4f ret; + __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); + return ret; + #else + return vec_min(a, b); + #endif +} template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) +{ + #ifdef __VSX__ + Packet4f ret; + __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); + return ret; + #else + return vec_max(a, b); + #endif +} template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } @@ -764,7 +782,7 @@ typedef __vector __bool long Packet2bl; static Packet2l p2l_ONE = { 1, 1 }; static Packet2l p2l_ZERO = reinterpret_cast(p4i_ZERO); -static Packet2d p2d_ONE = { 1.0, 1.0 }; +static 
Packet2d p2d_ONE = { 1.0, 1.0 }; static Packet2d p2d_ZERO = reinterpret_cast(p4f_ZERO); static Packet2d p2d_MZERO = { -0.0, -0.0 }; @@ -910,9 +928,19 @@ template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const // for some weird raisons, it has to be overloaded for packet of integers template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } -template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) +{ + Packet2d ret; + __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); + return ret; + } -template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) +{ + Packet2d ret; + __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); + return ret; +} template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } @@ -969,7 +997,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) Packet2d v[2], sum; v[0] = vecs[0] + reinterpret_cast(vec_sld(reinterpret_cast(vecs[0]), reinterpret_cast(vecs[0]), 8)); v[1] = vecs[1] + reinterpret_cast(vec_sld(reinterpret_cast(vecs[1]), reinterpret_cast(vecs[1]), 8)); - + #ifdef _BIG_ENDIAN sum = reinterpret_cast(vec_sld(reinterpret_cast(v[0]), reinterpret_cast(v[1]), 8)); #else @@ -1022,7 +1050,7 @@ ptranspose(PacketBlock& kernel) { template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { Packet2l select = { ifPacket.select[0], ifPacket.select[1] }; - Packet2bl mask = vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p2l_ONE)); + Packet2bl mask = reinterpret_cast( vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p2l_ONE)) ); return vec_sel(elsePacket, thenPacket, mask); } #endif // __VSX__ diff --git a/extern/eigen/Eigen/src/Core/arch/CUDA/Half.h b/extern/eigen/Eigen/src/Core/arch/CUDA/Half.h index 294c517e..755e6209 100644 --- a/extern/eigen/Eigen/src/Core/arch/CUDA/Half.h +++ b/extern/eigen/Eigen/src/Core/arch/CUDA/Half.h @@ -29,7 +29,7 @@ // type Eigen::half (inheriting from CUDA's __half struct) with // operator overloads such that it behaves basically as an arithmetic // type. It will be quite slow on CPUs (so it is recommended to stay -// in fp32 for CPUs, except for simple parameter conversions, I/O +// in float32_bits for CPUs, except for simple parameter conversions, I/O // to disk and the likes), but fast on GPUs. @@ -50,38 +50,45 @@ struct half; namespace half_impl { #if !defined(EIGEN_HAS_CUDA_FP16) - -// Make our own __half definition that is similar to CUDA's. -struct __half { - EIGEN_DEVICE_FUNC __half() {} - explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {} +// Make our own __half_raw definition that is similar to CUDA's. 
+struct __half_raw { + EIGEN_DEVICE_FUNC __half_raw() : x(0) {} + explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {} unsigned short x; }; - +#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 +// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw +typedef __half __half_raw; #endif -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x); -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff); -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h); -struct half_base : public __half { +struct half_base : public __half_raw { EIGEN_DEVICE_FUNC half_base() {} - EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {} - EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {} + EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {} + EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {} +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 + EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {} +#endif }; } // namespace half_impl // Class definition. struct half : public half_impl::half_base { - #if !defined(EIGEN_HAS_CUDA_FP16) - typedef half_impl::__half __half; + #if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000) + typedef half_impl::__half_raw __half_raw; #endif EIGEN_DEVICE_FUNC half() {} - EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} + EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {} EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {} +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 + EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} +#endif explicit EIGEN_DEVICE_FUNC half(bool b) : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 
0x3c00 : 0)) {} @@ -138,71 +145,125 @@ struct half : public half_impl::half_base { } }; +} // end namespace Eigen + +namespace std { +template<> +struct numeric_limits { + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = true; + static const bool has_quiet_NaN = true; + static const bool has_signaling_NaN = true; + static const float_denorm_style has_denorm = denorm_present; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = std::round_to_nearest; + static const bool is_iec559 = false; + static const bool is_bounded = false; + static const bool is_modulo = false; + static const int digits = 11; + static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html + static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html + static const int radix = 2; + static const int min_exponent = -13; + static const int min_exponent10 = -4; + static const int max_exponent = 16; + static const int max_exponent10 = 4; + static const bool traps = true; + static const bool tinyness_before = false; + + static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); } + static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); } + static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); } + static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); } + static Eigen::half round_error() { return Eigen::half(0.5); } + static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); } + static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } + static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } + static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); } +}; + +// If std::numeric_limits is specialized, should also specialize +// std::numeric_limits, std::numeric_limits, and +// std::numeric_limits +// https://stackoverflow.com/a/16519653/ +template<> +struct numeric_limits : numeric_limits {}; +template<> +struct numeric_limits : numeric_limits {}; +template<> +struct numeric_limits : numeric_limits {}; +} // end namespace std + +namespace Eigen { + namespace half_impl { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 // Intrinsics for native fp16 support. Note that on current hardware, -// these are no faster than fp32 arithmetic (you need to use the half2 +// these are no faster than float32_bits arithmetic (you need to use the half2 // versions to get the ALU speed increased), but you do save the // conversion steps back and forth. 
-__device__ half operator + (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) { return __hadd(a, b); } -__device__ half operator * (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) { return __hmul(a, b); } -__device__ half operator - (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) { return __hsub(a, b); } -__device__ half operator / (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) { float num = __half2float(a); float denom = __half2float(b); return __float2half(num / denom); } -__device__ half operator - (const half& a) { +EIGEN_STRONG_INLINE __device__ half operator - (const half& a) { return __hneg(a); } -__device__ half& operator += (half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) { a = a + b; return a; } -__device__ half& operator *= (half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) { a = a * b; return a; } -__device__ half& operator -= (half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) { a = a - b; return a; } -__device__ half& operator /= (half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) { a = a / b; return a; } -__device__ bool operator == (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) { return __heq(a, b); } -__device__ bool operator != (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) { return __hne(a, b); } -__device__ bool operator < (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) { return __hlt(a, b); } -__device__ bool operator <= (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) { return __hle(a, b); } -__device__ bool operator > (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) { return __hgt(a, b); } -__device__ bool operator >= (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) { return __hge(a, b); } #else // Emulate support for half floats // Definitions for CPUs and older CUDA, mostly working through conversion -// to/from fp32. +// to/from float32_bits. EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { return half(float(a) + float(b)); @@ -238,10 +299,10 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) return a; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) { - return float(a) == float(b); + return numext::equal_strict(float(a),float(b)); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) { - return float(a) != float(b); + return numext::not_equal_strict(float(a), float(b)); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { return float(a) < float(b); @@ -269,34 +330,35 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) { // these in hardware. 
If we need more performance on older/other CPUs, they are // also possible to vectorize directly. -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) { - __half h; +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) { + __half_raw h; h.x = x; return h; } -union FP32 { +union float32_bits { unsigned int u; float f; }; -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - return __float2half(ff); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 + __half tmp_ff = __float2half(ff); + return *(__half_raw*)&tmp_ff; #elif defined(EIGEN_HAS_FP16_C) - __half h; + __half_raw h; h.x = _cvtss_sh(ff, 0); return h; #else - FP32 f; f.f = ff; + float32_bits f; f.f = ff; - const FP32 f32infty = { 255 << 23 }; - const FP32 f16max = { (127 + 16) << 23 }; - const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; + const float32_bits f32infty = { 255 << 23 }; + const float32_bits f16max = { (127 + 16) << 23 }; + const float32_bits denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; unsigned int sign_mask = 0x80000000u; - __half o; + __half_raw o; o.x = static_cast(0x0u); unsigned int sign = f.u & sign_mask; @@ -335,17 +397,17 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { #endif } -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 return __half2float(h); #elif defined(EIGEN_HAS_FP16_C) return _cvtsh_ss(h.x); #else - const FP32 magic = { 113 << 23 }; + const float32_bits magic = { 113 << 23 }; const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift - FP32 o; + float32_bits o; o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits unsigned int exp = shifted_exp & o.u; // just the exponent @@ -370,7 +432,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) { return (a.x & 0x7fff) == 0x7c00; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return __hisnan(a); #else return (a.x & 0x7fff) > 0x7c00; @@ -386,11 +448,15 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) { return result; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { - return half(::expf(float(a))); +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 + return half(hexp(a)); +#else + return half(::expf(float(a))); +#endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return Eigen::half(::hlog(a)); +#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 + return half(::hlog(a)); #else return half(::logf(float(a))); #endif @@ -402,7 +468,11 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { return half(::log10f(float(a))); } 
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { - return half(::sqrtf(float(a))); +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 + return half(hsqrt(a)); +#else + return half(::sqrtf(float(a))); +#endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) { return half(::powf(float(a), float(b))); @@ -420,14 +490,22 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) { return half(::tanhf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300 + return half(hfloor(a)); +#else return half(::floorf(float(a))); +#endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300 + return half(hceil(a)); +#else return half(::ceilf(float(a))); +#endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return __hlt(b, a) ? b : a; #else const float f1 = static_cast(a); @@ -436,7 +514,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { #endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return __hlt(a, b) ? b : a; #else const float f1 = static_cast(a); @@ -474,49 +552,6 @@ template<> struct is_arithmetic { enum { value = true }; }; } // end namespace internal -} // end namespace Eigen - -namespace std { -template<> -struct numeric_limits { - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool has_infinity = true; - static const bool has_quiet_NaN = true; - static const bool has_signaling_NaN = true; - static const float_denorm_style has_denorm = denorm_present; - static const bool has_denorm_loss = false; - static const std::float_round_style round_style = std::round_to_nearest; - static const bool is_iec559 = false; - static const bool is_bounded = false; - static const bool is_modulo = false; - static const int digits = 11; - static const int digits10 = 2; - //static const int max_digits10 = ; - static const int radix = 2; - static const int min_exponent = -13; - static const int min_exponent10 = -4; - static const int max_exponent = 16; - static const int max_exponent10 = 4; - static const bool traps = true; - static const bool tinyness_before = false; - - static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); } - static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); } - static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); } - static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); } - static Eigen::half round_error() { return Eigen::half(0.5); } - static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); } - static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } - static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } - static Eigen::half 
denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); } -}; -} - -namespace Eigen { - template<> struct NumTraits : GenericNumTraits { @@ -557,7 +592,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) { return Eigen::half(::expf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return Eigen::half(::hlog(a)); #else return Eigen::half(::logf(float(a))); @@ -591,14 +626,18 @@ struct hash { // Add the missing shfl_xor intrinsic -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { + #if EIGEN_CUDACC_VER < 90000 return static_cast(__shfl_xor(static_cast(var), laneMask, width)); + #else + return static_cast(__shfl_xor_sync(0xFFFFFFFF, static_cast(var), laneMask, width)); + #endif } #endif -// ldg() has an overload for __half, but we also need one for Eigen::half. -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +// ldg() has an overload for __half_raw, but we also need one for Eigen::half. +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { return Eigen::half_impl::raw_uint16_to_half( __ldg(reinterpret_cast(ptr))); @@ -606,7 +645,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) #endif -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_CUDA_ARCH) namespace Eigen { namespace numext { diff --git a/extern/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/extern/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index ae54225f..c66d3846 100644 --- a/extern/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/extern/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -99,7 +99,8 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& template<> __device__ EIGEN_STRONG_INLINE half2 pabs(const half2& a) { half2 result; - result.x = a.x & 0x7FFF7FFF; + unsigned temp = *(reinterpret_cast(&(a))); + *(reinterpret_cast(&(result))) = temp & 0x7FFF7FFF; return result; } @@ -275,7 +276,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { return __floats2half2_rn(r1, r2); } -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530 +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 template<> __device__ EIGEN_STRONG_INLINE half2 plog(const half2& a) { diff --git a/extern/eigen/Eigen/src/Core/arch/Default/ConjHelper.h b/extern/eigen/Eigen/src/Core/arch/Default/ConjHelper.h new file mode 100644 index 00000000..4cfe34e0 --- /dev/null +++ b/extern/eigen/Eigen/src/Core/arch/Default/ConjHelper.h @@ -0,0 +1,29 @@ + +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+
+#ifndef EIGEN_ARCH_CONJ_HELPER_H
+#define EIGEN_ARCH_CONJ_HELPER_H
+
+#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \
+ template<> struct conj_helper { \
+ EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const \
+ { return padd(c, pmul(x,y)); } \
+ EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const \
+ { return PACKET_CPLX(Eigen::internal::pmul(x, y.v)); } \
+ }; \
+ \
+ template<> struct conj_helper { \
+ EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const \
+ { return padd(c, pmul(x,y)); } \
+ EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const \
+ { return PACKET_CPLX(Eigen::internal::pmul(x.v, y)); } \
+ };
+
+#endif // EIGEN_ARCH_CONJ_HELPER_H
diff --git a/extern/eigen/Eigen/src/Core/arch/NEON/Complex.h b/extern/eigen/Eigen/src/Core/arch/NEON/Complex.h
index 57e9b431..306a309b 100644
--- a/extern/eigen/Eigen/src/Core/arch/NEON/Complex.h
+++ b/extern/eigen/Eigen/src/Core/arch/NEON/Complex.h
@@ -67,7 +67,7 @@ template<> struct unpacket_traits { typedef std::complex type;
 
 template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from)
 {
 float32x2_t r64;
- r64 = vld1_f32((float *)&from);
+ r64 = vld1_f32((const float *)&from);
 return Packet2cf(vcombine_f32(r64, r64));
 }
@@ -142,7 +142,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf
 to[stride*1] = std::complex(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
 }
 
-template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((float *)addr); }
+template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((const float *)addr); }
 
 template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a)
 {
@@ -265,6 +265,8 @@ template<> struct conj_helper
 }
 };
 
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b)
 {
 // TODO optimize it for NEON
@@ -275,7 +277,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, con
 s = vmulq_f32(b.v, b.v);
 rev_s = vrev64q_f32(s);
 
- return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s)));
+ return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s)));
 }
 
 EIGEN_DEVICE_FUNC inline void
@@ -381,7 +383,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex<
 template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
 
-template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((double *)addr); }
+template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((const double *)addr); }
 
 template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride)
 {
@@ -456,6 +458,8 @@ template<> struct conj_helper
 }
 };
 
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
+
 template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b)
 {
 // TODO optimize it for NEON
diff --git a/extern/eigen/Eigen/src/Core/arch/NEON/PacketMath.h b/extern/eigen/Eigen/src/Core/arch/NEON/PacketMath.h
index 836fbc0d..3d5ed0d2 100644
--- a/extern/eigen/Eigen/src/Core/arch/NEON/PacketMath.h
+++
b/extern/eigen/Eigen/src/Core/arch/NEON/PacketMath.h @@ -36,12 +36,43 @@ namespace internal { #endif #endif +#if EIGEN_COMP_MSVC + +// In MSVC's arm_neon.h header file, all NEON vector types +// are aliases to the same underlying type __n128. +// We thus have to wrap them to make them different C++ types. +// (See also bug 1428) + +template +struct eigen_packet_wrapper +{ + operator T&() { return m_val; } + operator const T&() const { return m_val; } + eigen_packet_wrapper() {} + eigen_packet_wrapper(const T &v) : m_val(v) {} + eigen_packet_wrapper& operator=(const T &v) { + m_val = v; + return *this; + } + + T m_val; +}; +typedef eigen_packet_wrapper Packet2f; +typedef eigen_packet_wrapper Packet4f; +typedef eigen_packet_wrapper Packet4i; +typedef eigen_packet_wrapper Packet2i; +typedef eigen_packet_wrapper Packet4ui; + +#else + typedef float32x2_t Packet2f; typedef float32x4_t Packet4f; typedef int32x4_t Packet4i; typedef int32x2_t Packet2i; typedef uint32x4_t Packet4ui; +#endif // EIGEN_COMP_MSVC + #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ const Packet4f p4f_##NAME = pset1(X) diff --git a/extern/eigen/Eigen/src/Core/arch/SSE/Complex.h b/extern/eigen/Eigen/src/Core/arch/SSE/Complex.h index 5607fe0a..d075043c 100644 --- a/extern/eigen/Eigen/src/Core/arch/SSE/Complex.h +++ b/extern/eigen/Eigen/src/Core/arch/SSE/Complex.h @@ -128,7 +128,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3))); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { @@ -229,23 +229,7 @@ template<> struct conj_helper } }; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const - { return Packet2cf(Eigen::internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const - { return Packet2cf(Eigen::internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { @@ -340,7 +324,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex< template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { @@ -430,23 +414,7 @@ template<> struct conj_helper } }; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, 
const Packet1cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const - { return Packet1cd(Eigen::internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const - { return Packet1cd(Eigen::internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { diff --git a/extern/eigen/Eigen/src/Core/arch/SSE/PacketMath.h b/extern/eigen/Eigen/src/Core/arch/SSE/PacketMath.h old mode 100644 new mode 100755 index 3832de14..60e2517e --- a/extern/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/extern/eigen/Eigen/src/Core/arch/SSE/PacketMath.h @@ -28,7 +28,7 @@ namespace internal { #endif #endif -#if (defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004) +#if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)) || EIGEN_OS_QNX // With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot // have overloads for both types without linking error. // One solution is to increase ABI version using -fabi-version=4 (or greater). @@ -409,10 +409,16 @@ template<> EIGEN_STRONG_INLINE void pstore1(double* to, const double& pstore(to, Packet2d(vec2d_swizzle1(pa,0,0))); } +#if EIGEN_COMP_PGI +typedef const void * SsePrefetchPtrType; +#else +typedef const char * SsePrefetchPtrType; +#endif + #ifndef EIGEN_VECTORIZE_AVX -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } #endif #if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64 @@ -876,4 +882,14 @@ template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, co } // end namespace Eigen +#if EIGEN_COMP_PGI +// PGI++ does not define the following intrinsics in C++ mode. 
+static inline __m128 _mm_castpd_ps (__m128d x) { return reinterpret_cast<__m128&>(x); } +static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); } +static inline __m128d _mm_castps_pd (__m128 x) { return reinterpret_cast<__m128d&>(x); } +static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); } +static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); } +static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); } +#endif + #endif // EIGEN_PACKET_MATH_SSE_H diff --git a/extern/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h b/extern/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h index c8489323..c6ca8c71 100644 --- a/extern/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +++ b/extern/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h @@ -14,6 +14,7 @@ namespace Eigen { namespace internal { +#ifndef EIGEN_VECTORIZE_AVX template <> struct type_casting_traits { enum { @@ -23,11 +24,6 @@ struct type_casting_traits { }; }; -template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { - return _mm_cvttps_epi32(a); -} - - template <> struct type_casting_traits { enum { @@ -37,11 +33,6 @@ struct type_casting_traits { }; }; -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { - return _mm_cvtepi32_ps(a); -} - - template <> struct type_casting_traits { enum { @@ -51,10 +42,6 @@ struct type_casting_traits { }; }; -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet2d& a, const Packet2d& b) { - return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6)); -} - template <> struct type_casting_traits { enum { @@ -63,6 +50,19 @@ struct type_casting_traits { TgtCoeffRatio = 2 }; }; +#endif + +template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { + return _mm_cvttps_epi32(a); +} + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { + return _mm_cvtepi32_ps(a); +} + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet2d& a, const Packet2d& b) { + return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6)); +} template<> EIGEN_STRONG_INLINE Packet2d pcast(const Packet4f& a) { // Simply discard the second half of the input diff --git a/extern/eigen/Eigen/src/Core/arch/ZVector/Complex.h b/extern/eigen/Eigen/src/Core/arch/ZVector/Complex.h index d39d2d10..1bfb7339 100644 --- a/extern/eigen/Eigen/src/Core/arch/ZVector/Complex.h +++ b/extern/eigen/Eigen/src/Core/arch/ZVector/Complex.h @@ -336,6 +336,9 @@ template<> struct conj_helper } }; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) + template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for AltiVec diff --git a/extern/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h b/extern/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h old mode 100644 new mode 100755 diff --git a/extern/eigen/Eigen/src/Core/functors/BinaryFunctors.h b/extern/eigen/Eigen/src/Core/functors/BinaryFunctors.h index 96747bac..3eae6b8c 100644 --- a/extern/eigen/Eigen/src/Core/functors/BinaryFunctors.h +++ b/extern/eigen/Eigen/src/Core/functors/BinaryFunctors.h @@ -255,7 +255,7 @@ struct scalar_cmp_op : binary_op_base struct scalar_hypot_op : binary_op_base { EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op) -// typedef typename NumTraits::Real result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const + + EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE const Scalar operator() (const Scalar &x, const Scalar &y) const { - EIGEN_USING_STD_MATH(sqrt) - Scalar p, qp; - if(_x>_y) - { - p = _x; - qp = _y / p; - } - else - { - p = _y; - qp = _x / p; - } - return p * sqrt(Scalar(1) + qp*qp); + // This functor is used by hypotNorm only for which it is faster to first apply abs + // on all coefficients prior to reduction through hypot. + // This way we avoid calling abs on positive and real entries, and this also permits + // to seamlessly handle complexes. Otherwise we would have to handle both real and complexes + // through the same functor... + return internal::positive_real_hypot(x,y); } }; template diff --git a/extern/eigen/Eigen/src/Core/functors/StlFunctors.h b/extern/eigen/Eigen/src/Core/functors/StlFunctors.h index 6df3fa50..9c1d7585 100644 --- a/extern/eigen/Eigen/src/Core/functors/StlFunctors.h +++ b/extern/eigen/Eigen/src/Core/functors/StlFunctors.h @@ -83,13 +83,17 @@ struct functor_traits > { enum { Cost = functor_traits::Cost, PacketAccess = false }; }; #endif +#if (__cplusplus < 201703L) && (EIGEN_COMP_MSVC < 1910) +// std::unary_negate is deprecated since c++17 and will be removed in c++20 template struct functor_traits > { enum { Cost = 1 + functor_traits::Cost, PacketAccess = false }; }; +// std::binary_negate is deprecated since c++17 and will be removed in c++20 template struct functor_traits > { enum { Cost = 1 + functor_traits::Cost, PacketAccess = false }; }; +#endif #ifdef EIGEN_STDEXT_SUPPORT diff --git a/extern/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/extern/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 45230bce..e3980f6f 100644 --- a/extern/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/extern/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1197,10 +1197,16 @@ void gebp_kernel=6 without FMA (bug 1637) + #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) + #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1)); + #else + #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND + #endif + #define EIGEN_GEBGP_ONESTEP(K) \ do { \ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \ - EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \ @@ -1212,6 +1218,7 @@ void gebp_kernel::half SResPacketHalf; + const int SResPacketHalfSize = unpacket_traits::half>::size; if ((SwappedTraits::LhsProgress % 4) == 0 && (SwappedTraits::LhsProgress <= 8) && - (SwappedTraits::LhsProgress!=8 || unpacket_traits::size==nr)) + (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr)) { SAccPacket C0, C1, C2, C3; straits.initAcc(C0); diff --git a/extern/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/extern/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h index 41e18ff0..f6f9ebec 100644 --- a/extern/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +++ b/extern/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h @@ -52,7 +52,7 @@ struct general_matrix_matrix_triangular_product& blocking) \ { \ - if ( lhs==rhs && ((UpLo&(Lower|Upper)==UpLo)) ) { \ + if ( lhs==rhs && ((UpLo&(Lower|Upper))==UpLo) ) { \ general_matrix_matrix_rankupdate \ ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \ } else { \ @@ -88,7 +88,7 @@ struct 
general_matrix_matrix_rankupdate(lhsStride), ldc=convert_index(resStride), n=convert_index(size), k=convert_index(depth); \ char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'T':'N'); \ EIGTYPE beta(1); \ - BLASFUNC(&uplo, &trans, &n, &k, &numext::real_ref(alpha), lhs, &lda, &numext::real_ref(beta), res, &ldc); \ + BLASFUNC(&uplo, &trans, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), lhs, &lda, (const BLASTYPE*)&numext::real_ref(beta), res, &ldc); \ } \ }; @@ -125,9 +125,13 @@ struct general_matrix_matrix_rankupdate(b_tmp.outerStride()); \ } else b = _rhs; \ \ - BLASPREFIX##gemm_(&transa, &transb, &m, &n, &k, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ }}; -GEMM_SPECIALIZATION(double, d, double, d) -GEMM_SPECIALIZATION(float, f, float, s) -GEMM_SPECIALIZATION(dcomplex, cd, double, z) -GEMM_SPECIALIZATION(scomplex, cf, float, c) +#ifdef EIGEN_USE_MKL +GEMM_SPECIALIZATION(double, d, double, dgemm) +GEMM_SPECIALIZATION(float, f, float, sgemm) +GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, zgemm) +GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, cgemm) +#else +GEMM_SPECIALIZATION(double, d, double, dgemm_) +GEMM_SPECIALIZATION(float, f, float, sgemm_) +GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_) +GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_) +#endif } // end namespase internal diff --git a/extern/eigen/Eigen/src/Core/products/GeneralMatrixVector.h b/extern/eigen/Eigen/src/Core/products/GeneralMatrixVector.h index 3c1a7fc4..a597c1f4 100644 --- a/extern/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/extern/eigen/Eigen/src/Core/products/GeneralMatrixVector.h @@ -183,8 +183,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product \ struct general_matrix_vector_product_gemv \ { \ @@ -113,14 +113,21 @@ static void run( \ x_ptr=x_tmp.data(); \ incx=1; \ } else x_ptr=rhs; \ - BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \ + BLASFUNC(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \ }\ }; -EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, d) -EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, s) -EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, z) -EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, c) +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv) +EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv) +EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, zgemv) +EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, MKL_Complex8 , cgemv) +#else +EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv_) +EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv_) +EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, zgemv_) +EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, cgemv_) +#endif } // end namespase internal diff --git a/extern/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h b/extern/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h index a45238d6..9a531850 100644 --- a/extern/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +++ 
b/extern/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h @@ -40,7 +40,7 @@ namespace internal { /* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */ -#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ @@ -81,13 +81,13 @@ struct product_selfadjoint_matrix(b_tmp.outerStride()); \ } else b = _rhs; \ \ - BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ @@ -144,20 +144,26 @@ struct product_selfadjoint_matrix(b_tmp.outerStride()); \ } \ \ - BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -EIGEN_BLAS_SYMM_L(double, double, d, d) -EIGEN_BLAS_SYMM_L(float, float, f, s) -EIGEN_BLAS_HEMM_L(dcomplex, double, cd, z) -EIGEN_BLAS_HEMM_L(scomplex, float, cf, c) - +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_SYMM_L(double, double, d, dsymm) +EIGEN_BLAS_SYMM_L(float, float, f, ssymm) +EIGEN_BLAS_HEMM_L(dcomplex, MKL_Complex16, cd, zhemm) +EIGEN_BLAS_HEMM_L(scomplex, MKL_Complex8, cf, chemm) +#else +EIGEN_BLAS_SYMM_L(double, double, d, dsymm_) +EIGEN_BLAS_SYMM_L(float, float, f, ssymm_) +EIGEN_BLAS_HEMM_L(dcomplex, double, cd, zhemm_) +EIGEN_BLAS_HEMM_L(scomplex, float, cf, chemm_) +#endif /* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */ -#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ @@ -197,13 +203,13 @@ struct product_selfadjoint_matrix(b_tmp.outerStride()); \ } else b = _lhs; \ \ - BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ @@ -259,15 +265,21 @@ struct product_selfadjoint_matrix(b_tmp.outerStride()); \ } \ \ - BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ } \ }; -EIGEN_BLAS_SYMM_R(double, double, d, d) -EIGEN_BLAS_SYMM_R(float, float, f, s) -EIGEN_BLAS_HEMM_R(dcomplex, double, cd, z) -EIGEN_BLAS_HEMM_R(scomplex, float, cf, c) - +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_SYMM_R(double, double, d, dsymm) 
+EIGEN_BLAS_SYMM_R(float, float, f, ssymm) +EIGEN_BLAS_HEMM_R(dcomplex, MKL_Complex16, cd, zhemm) +EIGEN_BLAS_HEMM_R(scomplex, MKL_Complex8, cf, chemm) +#else +EIGEN_BLAS_SYMM_R(double, double, d, dsymm_) +EIGEN_BLAS_SYMM_R(float, float, f, ssymm_) +EIGEN_BLAS_HEMM_R(dcomplex, double, cd, zhemm_) +EIGEN_BLAS_HEMM_R(scomplex, float, cf, chemm_) +#endif } // end namespace internal } // end namespace Eigen diff --git a/extern/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h b/extern/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h index 38f23acc..1238345e 100644 --- a/extern/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +++ b/extern/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h @@ -95,14 +95,21 @@ const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \ x_tmp=map_x.conjugate(); \ x_ptr=x_tmp.data(); \ } else x_ptr=_rhs; \ - BLASFUNC(&uplo, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \ + BLASFUNC(&uplo, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \ }\ }; +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv) +EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv) +EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv) +EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, MKL_Complex8, chemv) +#else EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv_) EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv_) EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_) EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float, chemv_) +#endif } // end namespace internal diff --git a/extern/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h b/extern/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h index 6ec5a8a0..f784507e 100644 --- a/extern/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/extern/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -137,7 +137,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix triangularBuffer((internal::constructor_without_unaligned_array_assert())); + // To work around an "error: member reference base type 'Matrix<...> + // (Eigen::internal::constructor_without_unaligned_array_assert (*)())' is + // not a structure or union" compilation error in nvcc (tested V8.0.61), + // create a dummy internal::constructor_without_unaligned_array_assert + // object to pass to the Matrix constructor. 
+ internal::constructor_without_unaligned_array_assert a; + Matrix triangularBuffer(a); triangularBuffer.setZero(); if((Mode&ZeroDiag)==ZeroDiag) triangularBuffer.diagonal().setZero(); @@ -284,7 +290,8 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix triangularBuffer((internal::constructor_without_unaligned_array_assert())); + internal::constructor_without_unaligned_array_assert a; + Matrix triangularBuffer(a); triangularBuffer.setZero(); if((Mode&ZeroDiag)==ZeroDiag) triangularBuffer.diagonal().setZero(); @@ -393,7 +400,9 @@ struct triangular_product_impl { template static void run(Dest& dst, const Lhs &a_lhs, const Rhs &a_rhs, const typename Dest::Scalar& alpha) { - typedef typename Dest::Scalar Scalar; + typedef typename Lhs::Scalar LhsScalar; + typedef typename Rhs::Scalar RhsScalar; + typedef typename Dest::Scalar Scalar; typedef internal::blas_traits LhsBlasTraits; typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; @@ -405,8 +414,9 @@ struct triangular_product_impl typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(a_lhs); typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(a_rhs); - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) - * RhsBlasTraits::extractScalarFactor(a_rhs); + LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(a_lhs); + RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(a_rhs); + Scalar actualAlpha = alpha * lhs_alpha * rhs_alpha; typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar, Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType; @@ -431,6 +441,21 @@ struct triangular_product_impl &dst.coeffRef(0,0), dst.outerStride(), // result info actualAlpha, blocking ); + + // Apply correction if the diagonal is unit and a scalar factor was nested: + if ((Mode&UnitDiag)==UnitDiag) + { + if (LhsIsTriangular && lhs_alpha!=LhsScalar(1)) + { + Index diagSize = (std::min)(lhs.rows(),lhs.cols()); + dst.topRows(diagSize) -= ((lhs_alpha-LhsScalar(1))*a_rhs).topRows(diagSize); + } + else if ((!LhsIsTriangular) && rhs_alpha!=RhsScalar(1)) + { + Index diagSize = (std::min)(rhs.rows(),rhs.cols()); + dst.leftCols(diagSize) -= (rhs_alpha-RhsScalar(1))*a_lhs.leftCols(diagSize); + } + } } }; diff --git a/extern/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h b/extern/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h index aecded6b..a25197ab 100644 --- a/extern/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +++ b/extern/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h @@ -75,7 +75,7 @@ EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, true) EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false) // implements col-major += alpha * op(triangular) * op(general) -#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ @@ -172,7 +172,7 @@ struct product_triangular_matrix_matrix_trmm > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ @@ -180,13 +180,20 @@ struct product_triangular_matrix_matrix_trmm \ @@ -282,7 +289,7 @@ struct product_triangular_matrix_matrix_trmm > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ @@ -290,11 +297,17 @@ struct product_triangular_matrix_matrix_trmm struct trmv_selector typename internal::add_const_on_value_type::type actualLhs = LhsBlasTraits::extract(lhs); typename internal::add_const_on_value_type::type actualRhs = 
RhsBlasTraits::extract(rhs); - ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs) - * RhsBlasTraits::extractScalarFactor(rhs); + LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs); + RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs); + ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha; enum { // FIXME find a way to allow an inner stride on the result if packet_traits::size==1 @@ -274,6 +275,12 @@ template struct trmv_selector else dest = MappedDest(actualDestPtr, dest.size()); } + + if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) ) + { + Index diagSize = (std::min)(lhs.rows(),lhs.cols()); + dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize); + } } }; @@ -295,8 +302,9 @@ template struct trmv_selector typename add_const::type actualLhs = LhsBlasTraits::extract(lhs); typename add_const::type actualRhs = RhsBlasTraits::extract(rhs); - ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs) - * RhsBlasTraits::extractScalarFactor(rhs); + LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs); + RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs); + ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha; enum { DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1 @@ -326,6 +334,12 @@ template struct trmv_selector actualRhsPtr,1, dest.data(),dest.innerStride(), actualAlpha); + + if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) ) + { + Index diagSize = (std::min)(lhs.rows(),lhs.cols()); + dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize); + } } }; diff --git a/extern/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h b/extern/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h index 07bf26ce..3d47a2b9 100644 --- a/extern/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +++ b/extern/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h @@ -71,7 +71,7 @@ EIGEN_BLAS_TRMV_SPECIALIZE(dcomplex) EIGEN_BLAS_TRMV_SPECIALIZE(scomplex) // implements col-major: res += alpha * op(triangular) * vector -#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \ template \ struct triangular_matrix_vector_product_trmv { \ enum { \ @@ -121,10 +121,10 @@ struct triangular_matrix_vector_product_trmv(size); \ n = convert_index(cols-size); \ } \ - BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \ + BLASPREFIX##gemv##BLASPOSTFIX(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \ } \ } \ }; -EIGEN_BLAS_TRMV_CM(double, double, d, d) -EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z) -EIGEN_BLAS_TRMV_CM(float, float, f, s) -EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c) +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_TRMV_CM(double, double, d, d,) +EIGEN_BLAS_TRMV_CM(dcomplex, MKL_Complex16, cd, z,) +EIGEN_BLAS_TRMV_CM(float, float, f, s,) +EIGEN_BLAS_TRMV_CM(scomplex, MKL_Complex8, cf, c,) +#else +EIGEN_BLAS_TRMV_CM(double, double, d, d, _) +EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z, _) +EIGEN_BLAS_TRMV_CM(float, float, f, s, _) +EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c, _) +#endif // implements row-major: res += alpha * op(triangular) * vector -#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define 
EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \ template \ struct triangular_matrix_vector_product_trmv { \ enum { \ @@ -203,10 +210,10 @@ struct triangular_matrix_vector_product_trmv(size); \ n = convert_index(cols-size); \ } \ - BLASPREFIX##gemv_(&trans, &n, &m, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \ + BLASPREFIX##gemv##BLASPOSTFIX(&trans, &n, &m, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \ } \ } \ }; -EIGEN_BLAS_TRMV_RM(double, double, d, d) -EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z) -EIGEN_BLAS_TRMV_RM(float, float, f, s) -EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c) +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_TRMV_RM(double, double, d, d,) +EIGEN_BLAS_TRMV_RM(dcomplex, MKL_Complex16, cd, z,) +EIGEN_BLAS_TRMV_RM(float, float, f, s,) +EIGEN_BLAS_TRMV_RM(scomplex, MKL_Complex8, cf, c,) +#else +EIGEN_BLAS_TRMV_RM(double, double, d, d,_) +EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z,_) +EIGEN_BLAS_TRMV_RM(float, float, f, s,_) +EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c,_) +#endif } // end namespase internal diff --git a/extern/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h b/extern/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h index 88c0fb79..f0775116 100644 --- a/extern/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +++ b/extern/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h @@ -38,7 +38,7 @@ namespace Eigen { namespace internal { // implements LeftSide op(triangular)^-1 * general -#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASPREFIX) \ +#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASFUNC) \ template \ struct triangular_solve_matrix \ { \ @@ -80,18 +80,24 @@ struct triangular_solve_matrix \ struct triangular_solve_matrix \ { \ @@ -133,16 +139,22 @@ struct triangular_solve_matrix=6 +#elif defined __GNUC__ - #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS + #if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) #pragma GCC diagnostic push #endif - #pragma GCC diagnostic ignored "-Wignored-attributes" + // g++ warns about local variables shadowing member functions, which is too strict + #pragma GCC diagnostic ignored "-Wshadow" + #if __GNUC__ == 4 && __GNUC_MINOR__ < 8 + // Until g++-4.7 there are warnings when comparing unsigned int vs 0, even in templated functions: + #pragma GCC diagnostic ignored "-Wtype-limits" + #endif + #if __GNUC__>=6 + #pragma GCC diagnostic ignored "-Wignored-attributes" + #endif #endif diff --git a/extern/eigen/Eigen/src/Core/util/MKL_support.h b/extern/eigen/Eigen/src/Core/util/MKL_support.h old mode 100644 new mode 100755 index 26b59669..b7d6ecc7 --- a/extern/eigen/Eigen/src/Core/util/MKL_support.h +++ b/extern/eigen/Eigen/src/Core/util/MKL_support.h @@ -49,10 +49,11 @@ #define EIGEN_USE_LAPACKE #endif -#if defined(EIGEN_USE_MKL_VML) +#if defined(EIGEN_USE_MKL_VML) && !defined(EIGEN_USE_MKL) #define EIGEN_USE_MKL #endif + #if defined EIGEN_USE_MKL # include /*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/ @@ -108,6 +109,10 @@ #endif #endif +#if defined(EIGEN_USE_BLAS) && !defined(EIGEN_USE_MKL) +#include "../../misc/blas.h" +#endif + namespace Eigen { typedef std::complex dcomplex; @@ -121,8 +126,5 @@ typedef int BlasIndex; } // end namespace Eigen -#if defined(EIGEN_USE_BLAS) -#include 
"../../misc/blas.h" -#endif #endif // EIGEN_MKL_SUPPORT_H diff --git a/extern/eigen/Eigen/src/Core/util/Macros.h b/extern/eigen/Eigen/src/Core/util/Macros.h index 38d6ddb9..aa054a0b 100644 --- a/extern/eigen/Eigen/src/Core/util/Macros.h +++ b/extern/eigen/Eigen/src/Core/util/Macros.h @@ -13,7 +13,7 @@ #define EIGEN_WORLD_VERSION 3 #define EIGEN_MAJOR_VERSION 3 -#define EIGEN_MINOR_VERSION 4 +#define EIGEN_MINOR_VERSION 7 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \ (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \ @@ -399,7 +399,7 @@ // Does the compiler support variadic templates? #ifndef EIGEN_HAS_VARIADIC_TEMPLATES #if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \ - && ( !defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000) ) + && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_CUDACC_VER >= 80000) ) // ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices: // this prevents nvcc from crashing when compiling Eigen on Tegra X1 #define EIGEN_HAS_VARIADIC_TEMPLATES 1 @@ -413,7 +413,7 @@ #ifdef __CUDACC__ // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above -#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500)) +#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_CUDACC_VER >= 70500)) #define EIGEN_HAS_CONSTEXPR 1 #endif #elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \ @@ -487,11 +487,13 @@ // EIGEN_STRONG_INLINE is a stronger version of the inline, using __forceinline on MSVC, // but it still doesn't use GCC's always_inline. This is useful in (common) situations where MSVC needs forceinline // but GCC is still doing fine with just inline. +#ifndef EIGEN_STRONG_INLINE #if EIGEN_COMP_MSVC || EIGEN_COMP_ICC #define EIGEN_STRONG_INLINE __forceinline #else #define EIGEN_STRONG_INLINE inline #endif +#endif // EIGEN_ALWAYS_INLINE is the stronget, it has the effect of making the function inline and adding every possible // attribute to maximize inlining. This should only be used when really necessary: in particular, @@ -812,7 +814,8 @@ namespace Eigen { // just an empty macro ! #define EIGEN_EMPTY -#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || defined(__CUDACC_VER__)) // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324) +#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_CUDACC_VER>0) + // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324) #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) @@ -986,7 +989,13 @@ namespace Eigen { # define EIGEN_NOEXCEPT # define EIGEN_NOEXCEPT_IF(x) # define EIGEN_NO_THROW throw() -# define EIGEN_EXCEPTION_SPEC(X) throw(X) +# if EIGEN_COMP_MSVC + // MSVC does not support exception specifications (warning C4290), + // and they are deprecated in c++11 anyway. 
+# define EIGEN_EXCEPTION_SPEC(X) throw() +# else +# define EIGEN_EXCEPTION_SPEC(X) throw(X) +# endif #endif #endif // EIGEN_MACROS_H diff --git a/extern/eigen/Eigen/src/Core/util/Memory.h b/extern/eigen/Eigen/src/Core/util/Memory.h index c634d7ea..291383c5 100644 --- a/extern/eigen/Eigen/src/Core/util/Memory.h +++ b/extern/eigen/Eigen/src/Core/util/Memory.h @@ -70,7 +70,7 @@ inline void throw_std_bad_alloc() throw std::bad_alloc(); #else std::size_t huge = static_cast(-1); - new int[huge]; + ::operator new(huge); #endif } @@ -493,7 +493,7 @@ template struct smart_copy_helper { IntPtr size = IntPtr(end)-IntPtr(start); if(size==0) return; eigen_internal_assert(start!=0 && end!=0 && target!=0); - memcpy(target, start, size); + std::memcpy(target, start, size); } }; @@ -696,7 +696,15 @@ template void swap(scoped_array &a,scoped_array &b) /** \class aligned_allocator * \ingroup Core_Module * -* \brief STL compatible allocator to use with with 16 byte aligned types +* \brief STL compatible allocator to use with types requiring a non standrad alignment. +* +* The memory is aligned as for dynamically aligned matrix/array types such as MatrixXd. +* By default, it will thus provide at least 16 bytes alignment and more in following cases: +* - 32 bytes alignment if AVX is enabled. +* - 64 bytes alignment if AVX512 is enabled. +* +* This can be controled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented +* \link TopicPreprocessorDirectivesPerformance there \endlink. * * Example: * \code @@ -739,7 +747,15 @@ class aligned_allocator : public std::allocator pointer allocate(size_type num, const void* /*hint*/ = 0) { internal::check_size_for_overflow(num); - return static_cast( internal::aligned_malloc(num * sizeof(T)) ); + size_type size = num * sizeof(T); +#if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_LEAST(7,0) + // workaround gcc bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87544 + // It triggered eigen/Eigen/src/Core/util/Memory.h:189:12: warning: argument 1 value '18446744073709551612' exceeds maximum object size 9223372036854775807 + if(size>=std::size_t((std::numeric_limits::max)())) + return 0; + else +#endif + return static_cast( internal::aligned_malloc(size) ); } void deallocate(pointer p, size_type /*num*/) diff --git a/extern/eigen/Eigen/src/Core/util/Meta.h b/extern/eigen/Eigen/src/Core/util/Meta.h old mode 100644 new mode 100755 index 7f637075..d31e9541 --- a/extern/eigen/Eigen/src/Core/util/Meta.h +++ b/extern/eigen/Eigen/src/Core/util/Meta.h @@ -109,6 +109,28 @@ template<> struct is_integral { enum { value = true }; }; template<> struct is_integral { enum { value = true }; }; template<> struct is_integral { enum { value = true }; }; +#if EIGEN_HAS_CXX11 +using std::make_unsigned; +#else +// TODO: Possibly improve this implementation of make_unsigned. +// It is currently used only by +// template struct random_default_impl. 
+template struct make_unsigned; +template<> struct make_unsigned { typedef unsigned char type; }; +template<> struct make_unsigned { typedef unsigned char type; }; +template<> struct make_unsigned { typedef unsigned char type; }; +template<> struct make_unsigned { typedef unsigned short type; }; +template<> struct make_unsigned { typedef unsigned short type; }; +template<> struct make_unsigned { typedef unsigned int type; }; +template<> struct make_unsigned { typedef unsigned int type; }; +template<> struct make_unsigned { typedef unsigned long type; }; +template<> struct make_unsigned { typedef unsigned long type; }; +#if EIGEN_COMP_MSVC +template<> struct make_unsigned { typedef unsigned __int64 type; }; +template<> struct make_unsigned { typedef unsigned __int64 type; }; +#endif +#endif + template struct add_const { typedef const T type; }; template struct add_const { typedef T& type; }; @@ -485,6 +507,26 @@ T div_ceil(const T &a, const T &b) return (a+b-1) / b; } +// The aim of the following functions is to bypass -Wfloat-equal warnings +// when we really want a strict equality comparison on floating points. +template EIGEN_STRONG_INLINE +bool equal_strict(const X& x,const Y& y) { return x == y; } + +template<> EIGEN_STRONG_INLINE +bool equal_strict(const float& x,const float& y) { return std::equal_to()(x,y); } + +template<> EIGEN_STRONG_INLINE +bool equal_strict(const double& x,const double& y) { return std::equal_to()(x,y); } + +template EIGEN_STRONG_INLINE +bool not_equal_strict(const X& x,const Y& y) { return x != y; } + +template<> EIGEN_STRONG_INLINE +bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to()(x,y); } + +template<> EIGEN_STRONG_INLINE +bool not_equal_strict(const double& x,const double& y) { return std::not_equal_to()(x,y); } + } // end namespace numext } // end namespace Eigen diff --git a/extern/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h b/extern/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h index 86b60f52..ecc82b7c 100644 --- a/extern/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +++ b/extern/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h @@ -8,7 +8,7 @@ #pragma warning pop #elif defined __clang__ #pragma clang diagnostic pop - #elif defined __GNUC__ && __GNUC__>=6 + #elif defined __GNUC__ && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) #pragma GCC diagnostic pop #endif diff --git a/extern/eigen/Eigen/src/Core/util/StaticAssert.h b/extern/eigen/Eigen/src/Core/util/StaticAssert.h index 983361a4..500e4779 100644 --- a/extern/eigen/Eigen/src/Core/util/StaticAssert.h +++ b/extern/eigen/Eigen/src/Core/util/StaticAssert.h @@ -24,6 +24,7 @@ * */ +#ifndef EIGEN_STATIC_ASSERT #ifndef EIGEN_NO_STATIC_ASSERT #if EIGEN_MAX_CPP_VER>=11 && (__has_feature(cxx_static_assert) || (defined(__cplusplus) && __cplusplus >= 201103L) || (EIGEN_COMP_MSVC >= 1600)) @@ -44,64 +45,65 @@ struct static_assertion { enum { - YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX, - YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES, - YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES, - THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE, - THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE, - THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE, - OUT_OF_RANGE_ACCESS, - YOU_MADE_A_PROGRAMMING_MISTAKE, - EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT, - EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE, - YOU_CALLED_A_FIXED_SIZE_METHOD_ON_A_DYNAMIC_SIZE_MATRIX_OR_VECTOR, - YOU_CALLED_A_DYNAMIC_SIZE_METHOD_ON_A_FIXED_SIZE_MATRIX_OR_VECTOR, - 
UNALIGNED_LOAD_AND_STORE_OPERATIONS_UNIMPLEMENTED_ON_ALTIVEC, - THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES, - FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED, - NUMERIC_TYPE_MUST_BE_REAL, - COEFFICIENT_WRITE_ACCESS_TO_SELFADJOINT_NOT_SUPPORTED, - WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED, - THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE, - INVALID_MATRIX_PRODUCT, - INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS, - INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION, - YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY, - THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES, - THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES, - INVALID_MATRIX_TEMPLATE_PARAMETERS, - INVALID_MATRIXBASE_TEMPLATE_PARAMETERS, - BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER, - THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX, - THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE, - THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES, - YOU_ALREADY_SPECIFIED_THIS_STRIDE, - INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION, - THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD, - PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1, - THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS, - YOU_CANNOT_MIX_ARRAYS_AND_MATRICES, - YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION, - THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY, - YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT, - THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS, - THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS, - THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL, - THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES, - YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED, - YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED, - THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE, - THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH, - OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG, - IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY, - STORAGE_LAYOUT_DOES_NOT_MATCH, - EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE, - THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS, - MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY, - THIS_TYPE_IS_NOT_SUPPORTED, - STORAGE_KIND_MUST_MATCH, - STORAGE_INDEX_MUST_MATCH, - CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY + YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX=1, + YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES=1, + YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES=1, + THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE=1, + THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE=1, + THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE=1, + OUT_OF_RANGE_ACCESS=1, + YOU_MADE_A_PROGRAMMING_MISTAKE=1, + EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT=1, + EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE=1, + YOU_CALLED_A_FIXED_SIZE_METHOD_ON_A_DYNAMIC_SIZE_MATRIX_OR_VECTOR=1, + YOU_CALLED_A_DYNAMIC_SIZE_METHOD_ON_A_FIXED_SIZE_MATRIX_OR_VECTOR=1, + UNALIGNED_LOAD_AND_STORE_OPERATIONS_UNIMPLEMENTED_ON_ALTIVEC=1, + THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES=1, + FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED=1, + NUMERIC_TYPE_MUST_BE_REAL=1, + COEFFICIENT_WRITE_ACCESS_TO_SELFADJOINT_NOT_SUPPORTED=1, + WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED=1, + 
THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE=1, + INVALID_MATRIX_PRODUCT=1, + INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS=1, + INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION=1, + YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY=1, + THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES=1, + THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES=1, + INVALID_MATRIX_TEMPLATE_PARAMETERS=1, + INVALID_MATRIXBASE_TEMPLATE_PARAMETERS=1, + BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER=1, + THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX=1, + THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE=1, + THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES=1, + YOU_ALREADY_SPECIFIED_THIS_STRIDE=1, + INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION=1, + THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD=1, + PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1=1, + THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS=1, + YOU_CANNOT_MIX_ARRAYS_AND_MATRICES=1, + YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION=1, + THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY=1, + YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT=1, + THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS=1, + THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS=1, + THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL=1, + THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES=1, + YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED=1, + YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED=1, + THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE=1, + THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH=1, + OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG=1, + IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY=1, + STORAGE_LAYOUT_DOES_NOT_MATCH=1, + EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE=1, + THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS=1, + MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY=1, + THIS_TYPE_IS_NOT_SUPPORTED=1, + STORAGE_KIND_MUST_MATCH=1, + STORAGE_INDEX_MUST_MATCH=1, + CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1, + SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1 }; }; @@ -131,7 +133,7 @@ #define EIGEN_STATIC_ASSERT(CONDITION,MSG) eigen_assert((CONDITION) && #MSG); #endif // EIGEN_NO_STATIC_ASSERT - +#endif // EIGEN_STATIC_ASSERT // static assertion failing if the type \a TYPE is not a vector type #define EIGEN_STATIC_ASSERT_VECTOR_ONLY(TYPE) \ diff --git a/extern/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/extern/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h index 36a91dff..87d789b3 100644 --- a/extern/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +++ b/extern/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h @@ -311,7 +311,6 @@ GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixTyp // Aliases: Map v(reinterpret_cast(m_tmp.data()), size); ComplexVectorType &cv = m_tmp; - const MatrixType &mZ = m_realQZ.matrixZ(); const MatrixType &mS = m_realQZ.matrixS(); const MatrixType &mT = m_realQZ.matrixT(); @@ -351,7 +350,7 @@ GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixTyp } } } - m_eivec.col(i).real().noalias() = mZ.transpose() * v; + m_eivec.col(i).real().noalias() = m_realQZ.matrixZ().transpose() * v; m_eivec.col(i).real().normalize(); 
m_eivec.col(i).imag().setConstant(0); } @@ -400,7 +399,7 @@ GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixTyp / (alpha*mT.coeffRef(j,j) - static_cast(beta*mS.coeffRef(j,j))); } } - m_eivec.col(i+1).noalias() = (mZ.transpose() * cv); + m_eivec.col(i+1).noalias() = (m_realQZ.matrixZ().transpose() * cv); m_eivec.col(i+1).normalize(); m_eivec.col(i) = m_eivec.col(i+1).conjugate(); } diff --git a/extern/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h b/extern/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h index 4fec8af0..e4e42607 100644 --- a/extern/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +++ b/extern/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h @@ -66,7 +66,6 @@ template inline typename MatrixBase::EigenvaluesReturnType MatrixBase::eigenvalues() const { - typedef typename internal::traits::Scalar Scalar; return internal::eigenvalues_selector::IsComplex>::run(derived()); } @@ -88,7 +87,6 @@ template inline typename SelfAdjointView::EigenvaluesReturnType SelfAdjointView::eigenvalues() const { - typedef typename SelfAdjointView::PlainObject PlainObject; PlainObject thisAsMatrix(*this); return SelfAdjointEigenSolver(thisAsMatrix, false).eigenvalues(); } diff --git a/extern/eigen/Eigen/src/Eigenvalues/RealSchur.h b/extern/eigen/Eigen/src/Eigenvalues/RealSchur.h index f5c86041..17ea903f 100644 --- a/extern/eigen/Eigen/src/Eigenvalues/RealSchur.h +++ b/extern/eigen/Eigen/src/Eigenvalues/RealSchur.h @@ -303,7 +303,7 @@ RealSchur& RealSchur::computeFromHessenberg(const HessMa Scalar exshift(0); // sum of exceptional shifts Scalar norm = computeNormOfT(); - if(norm!=0) + if(norm!=Scalar(0)) { while (iu >= 0) { @@ -327,7 +327,7 @@ RealSchur& RealSchur::computeFromHessenberg(const HessMa else // No convergence yet { // The firstHouseholderVector vector has to be initialized to something to get rid of a silly GCC warning (-O1 -Wall -DNDEBUG ) - Vector3s firstHouseholderVector(0,0,0), shiftInfo; + Vector3s firstHouseholderVector = Vector3s::Zero(), shiftInfo; computeShift(iu, iter, exshift, shiftInfo); iter = iter + 1; totalIter = totalIter + 1; diff --git a/extern/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h b/extern/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h index 3891cf88..b0c947dc 100644 --- a/extern/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +++ b/extern/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h @@ -37,7 +37,7 @@ namespace Eigen { /** \internal Specialization for the data types supported by LAPACKe */ -#define EIGEN_LAPACKE_EIG_SELFADJ(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, EIGCOLROW, LAPACKE_COLROW ) \ +#define EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, EIGCOLROW ) \ template<> template inline \ SelfAdjointEigenSolver >& \ SelfAdjointEigenSolver >::compute(const EigenBase& matrix, int options) \ @@ -47,7 +47,7 @@ SelfAdjointEigenSolver >::compute(c && (options&EigVecMask)!=EigVecMask \ && "invalid option parameter"); \ bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors; \ - lapack_int n = internal::convert_index(matrix.cols()), lda, matrix_order, info; \ + lapack_int n = internal::convert_index(matrix.cols()), lda, info; \ m_eivalues.resize(n,1); \ m_subdiag.resize(n-1); \ m_eivec = matrix; \ @@ -63,27 +63,24 @@ SelfAdjointEigenSolver >::compute(c } \ \ lda = internal::convert_index(m_eivec.outerStride()); \ - matrix_order=LAPACKE_COLROW; \ char jobz, uplo='L'/*, range='A'*/; \ jobz = computeEigenvectors ? 
'V' : 'N'; \ \ - info = LAPACKE_##LAPACKE_NAME( matrix_order, jobz, uplo, n, (LAPACKE_TYPE*)m_eivec.data(), lda, (LAPACKE_RTYPE*)m_eivalues.data() ); \ + info = LAPACKE_##LAPACKE_NAME( LAPACK_COL_MAJOR, jobz, uplo, n, (LAPACKE_TYPE*)m_eivec.data(), lda, (LAPACKE_RTYPE*)m_eivalues.data() ); \ m_info = (info==0) ? Success : NoConvergence; \ m_isInitialized = true; \ m_eigenvectorsOk = computeEigenvectors; \ return *this; \ } +#define EIGEN_LAPACKE_EIG_SELFADJ(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME ) \ + EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, ColMajor ) \ + EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, RowMajor ) -EIGEN_LAPACKE_EIG_SELFADJ(double, double, double, dsyev, ColMajor, LAPACK_COL_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(float, float, float, ssyev, ColMajor, LAPACK_COL_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev, ColMajor, LAPACK_COL_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float, float, cheev, ColMajor, LAPACK_COL_MAJOR) - -EIGEN_LAPACKE_EIG_SELFADJ(double, double, double, dsyev, RowMajor, LAPACK_ROW_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(float, float, float, ssyev, RowMajor, LAPACK_ROW_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev, RowMajor, LAPACK_ROW_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float, float, cheev, RowMajor, LAPACK_ROW_MAJOR) +EIGEN_LAPACKE_EIG_SELFADJ(double, double, double, dsyev) +EIGEN_LAPACKE_EIG_SELFADJ(float, float, float, ssyev) +EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev) +EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float, float, cheev) } // end namespace Eigen diff --git a/extern/eigen/Eigen/src/Geometry/AngleAxis.h b/extern/eigen/Eigen/src/Geometry/AngleAxis.h index 0af3c1b0..83ee1be4 100644 --- a/extern/eigen/Eigen/src/Geometry/AngleAxis.h +++ b/extern/eigen/Eigen/src/Geometry/AngleAxis.h @@ -178,7 +178,7 @@ EIGEN_DEVICE_FUNC AngleAxis& AngleAxis::operator=(const Quaterni if (n != Scalar(0)) { m_angle = Scalar(2)*atan2(n, abs(q.w())); - if(q.w() < 0) + if(q.w() < Scalar(0)) n = -n; m_axis = q.vec() / n; } diff --git a/extern/eigen/Eigen/src/Geometry/Quaternion.h b/extern/eigen/Eigen/src/Geometry/Quaternion.h index 3e5a9bad..c3fd8c3e 100644 --- a/extern/eigen/Eigen/src/Geometry/Quaternion.h +++ b/extern/eigen/Eigen/src/Geometry/Quaternion.h @@ -43,6 +43,11 @@ class QuaternionBase : public RotationBase typedef typename internal::traits::Scalar Scalar; typedef typename NumTraits::Real RealScalar; typedef typename internal::traits::Coefficients Coefficients; + typedef typename Coefficients::CoeffReturnType CoeffReturnType; + typedef typename internal::conditional::Flags&LvalueBit), + Scalar&, CoeffReturnType>::type NonConstCoeffReturnType; + + enum { Flags = Eigen::internal::traits::Flags }; @@ -58,22 +63,22 @@ class QuaternionBase : public RotationBase /** \returns the \c x coefficient */ - EIGEN_DEVICE_FUNC inline Scalar x() const { return this->derived().coeffs().coeff(0); } + EIGEN_DEVICE_FUNC inline CoeffReturnType x() const { return this->derived().coeffs().coeff(0); } /** \returns the \c y coefficient */ - EIGEN_DEVICE_FUNC inline Scalar y() const { return this->derived().coeffs().coeff(1); } + EIGEN_DEVICE_FUNC inline CoeffReturnType y() const { return this->derived().coeffs().coeff(1); } /** \returns the \c z coefficient */ - EIGEN_DEVICE_FUNC inline Scalar z() const { return this->derived().coeffs().coeff(2); } + EIGEN_DEVICE_FUNC 
inline CoeffReturnType z() const { return this->derived().coeffs().coeff(2); } /** \returns the \c w coefficient */ - EIGEN_DEVICE_FUNC inline Scalar w() const { return this->derived().coeffs().coeff(3); } - - /** \returns a reference to the \c x coefficient */ - EIGEN_DEVICE_FUNC inline Scalar& x() { return this->derived().coeffs().coeffRef(0); } - /** \returns a reference to the \c y coefficient */ - EIGEN_DEVICE_FUNC inline Scalar& y() { return this->derived().coeffs().coeffRef(1); } - /** \returns a reference to the \c z coefficient */ - EIGEN_DEVICE_FUNC inline Scalar& z() { return this->derived().coeffs().coeffRef(2); } - /** \returns a reference to the \c w coefficient */ - EIGEN_DEVICE_FUNC inline Scalar& w() { return this->derived().coeffs().coeffRef(3); } + EIGEN_DEVICE_FUNC inline CoeffReturnType w() const { return this->derived().coeffs().coeff(3); } + + /** \returns a reference to the \c x coefficient (if Derived is a non-const lvalue) */ + EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType x() { return this->derived().coeffs().x(); } + /** \returns a reference to the \c y coefficient (if Derived is a non-const lvalue) */ + EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType y() { return this->derived().coeffs().y(); } + /** \returns a reference to the \c z coefficient (if Derived is a non-const lvalue) */ + EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType z() { return this->derived().coeffs().z(); } + /** \returns a reference to the \c w coefficient (if Derived is a non-const lvalue) */ + EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType w() { return this->derived().coeffs().w(); } /** \returns a read-only vector expression of the imaginary part (x,y,z) */ EIGEN_DEVICE_FUNC inline const VectorBlock vec() const { return coeffs().template head<3>(); } diff --git a/extern/eigen/Eigen/src/Geometry/Scaling.h b/extern/eigen/Eigen/src/Geometry/Scaling.h old mode 100644 new mode 100755 diff --git a/extern/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/extern/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h index facdaf89..f66c846e 100644 --- a/extern/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +++ b/extern/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h @@ -168,7 +168,7 @@ class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner<_Scalar> { for(Index j=0; jRealScalar(0)) m_invdiag(j) = RealScalar(1)/sum; else diff --git a/extern/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/extern/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index 395daa8e..f7ce4713 100644 --- a/extern/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/extern/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -50,7 +50,8 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, tol_error = 0; return; } - RealScalar threshold = tol*tol*rhsNorm2; + const RealScalar considerAsZero = (std::numeric_limits::min)(); + RealScalar threshold = numext::maxi(tol*tol*rhsNorm2,considerAsZero); RealScalar residualNorm2 = residual.squaredNorm(); if (residualNorm2 < threshold) { @@ -58,7 +59,7 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, tol_error = sqrt(residualNorm2 / rhsNorm2); return; } - + VectorType p(n); p = precond.solve(residual); // initial search direction diff --git a/extern/eigen/Eigen/src/Jacobi/Jacobi.h b/extern/eigen/Eigen/src/Jacobi/Jacobi.h index c30326e1..1998c632 100644 --- a/extern/eigen/Eigen/src/Jacobi/Jacobi.h +++ 
b/extern/eigen/Eigen/src/Jacobi/Jacobi.h @@ -65,11 +65,11 @@ template class JacobiRotation bool makeJacobi(const MatrixBase&, Index p, Index q); bool makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z); - void makeGivens(const Scalar& p, const Scalar& q, Scalar* z=0); + void makeGivens(const Scalar& p, const Scalar& q, Scalar* r=0); protected: - void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::true_type); - void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::false_type); + void makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type); + void makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type); Scalar m_c, m_s; }; @@ -84,7 +84,6 @@ bool JacobiRotation::makeJacobi(const RealScalar& x, const Scalar& y, co { using std::sqrt; using std::abs; - typedef typename NumTraits::Real RealScalar; RealScalar deno = RealScalar(2)*abs(y); if(deno < (std::numeric_limits::min)()) { @@ -133,7 +132,7 @@ inline bool JacobiRotation::makeJacobi(const MatrixBase& m, Ind * \f$ V = \left ( \begin{array}{c} p \\ q \end{array} \right )\f$ yields: * \f$ G^* V = \left ( \begin{array}{c} r \\ 0 \end{array} \right )\f$. * - * The value of \a z is returned if \a z is not null (the default is null). + * The value of \a r is returned if \a r is not null (the default is null). * Also note that G is built such that the cosine is always real. * * Example: \include Jacobi_makeGivens.cpp @@ -146,9 +145,9 @@ inline bool JacobiRotation::makeJacobi(const MatrixBase& m, Ind * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template -void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* z) +void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* r) { - makeGivens(p, q, z, typename internal::conditional::IsComplex, internal::true_type, internal::false_type>::type()); + makeGivens(p, q, r, typename internal::conditional::IsComplex, internal::true_type, internal::false_type>::type()); } @@ -298,61 +297,119 @@ inline void MatrixBase::applyOnTheRight(Index p, Index q, const JacobiR } namespace internal { -template -void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& xpr_y, const JacobiRotation& j) + +template +struct apply_rotation_in_the_plane_selector { - typedef typename VectorX::Scalar Scalar; - enum { - PacketSize = packet_traits::size, - OtherPacketSize = packet_traits::size - }; - typedef typename packet_traits::type Packet; - typedef typename packet_traits::type OtherPacket; - eigen_assert(xpr_x.size() == xpr_y.size()); - Index size = xpr_x.size(); - Index incrx = xpr_x.derived().innerStride(); - Index incry = xpr_y.derived().innerStride(); + static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) + { + for(Index i=0; i +struct apply_rotation_in_the_plane_selector +{ + static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) + { + enum { + PacketSize = packet_traits::size, + OtherPacketSize = packet_traits::size + }; + typedef typename packet_traits::type Packet; + typedef typename packet_traits::type OtherPacket; + + /*** dynamic-size vectorized paths ***/ + if(SizeAtCompileTime == Dynamic && ((incrx==1 && incry==1) || PacketSize == 1)) + { + // both vectors are sequentially stored in memory => vectorization + enum { Peeling = 2 }; - /*** dynamic-size vectorized paths ***/ + Index alignedStart = internal::first_default_aligned(y, 
size); + Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize; - if(VectorX::SizeAtCompileTime == Dynamic && - (VectorX::Flags & VectorY::Flags & PacketAccessBit) && - (PacketSize == OtherPacketSize) && - ((incrx==1 && incry==1) || PacketSize == 1)) - { - // both vectors are sequentially stored in memory => vectorization - enum { Peeling = 2 }; + const OtherPacket pc = pset1(c); + const OtherPacket ps = pset1(s); + conj_helper::IsComplex,false> pcj; + conj_helper pm; - Index alignedStart = internal::first_default_aligned(y, size); - Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize; + for(Index i=0; i(c); - const OtherPacket ps = pset1(s); - conj_helper::IsComplex,false> pcj; - conj_helper pm; + Scalar* EIGEN_RESTRICT px = x + alignedStart; + Scalar* EIGEN_RESTRICT py = y + alignedStart; - for(Index i=0; i(px); + Packet yi = pload(py); + pstore(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstore(py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); + px += PacketSize; + py += PacketSize; + } + } + else + { + Index peelingEnd = alignedStart + ((size-alignedStart)/(Peeling*PacketSize))*(Peeling*PacketSize); + for(Index i=alignedStart; i(px); + Packet xi1 = ploadu(px+PacketSize); + Packet yi = pload (py); + Packet yi1 = pload (py+PacketSize); + pstoreu(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstoreu(px+PacketSize, padd(pm.pmul(pc,xi1),pcj.pmul(ps,yi1))); + pstore (py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); + pstore (py+PacketSize, psub(pcj.pmul(pc,yi1),pm.pmul(ps,xi1))); + px += Peeling*PacketSize; + py += Peeling*PacketSize; + } + if(alignedEnd!=peelingEnd) + { + Packet xi = ploadu(x+peelingEnd); + Packet yi = pload (y+peelingEnd); + pstoreu(x+peelingEnd, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstore (y+peelingEnd, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); + } + } - Scalar* EIGEN_RESTRICT px = x + alignedStart; - Scalar* EIGEN_RESTRICT py = y + alignedStart; + for(Index i=alignedEnd; i0) // FIXME should be compared to the required alignment { - for(Index i=alignedStart; i(c); + const OtherPacket ps = pset1(s); + conj_helper::IsComplex,false> pcj; + conj_helper pm; + Scalar* EIGEN_RESTRICT px = x; + Scalar* EIGEN_RESTRICT py = y; + for(Index i=0; i(px); Packet yi = pload(py); @@ -362,76 +419,40 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x py += PacketSize; } } - else - { - Index peelingEnd = alignedStart + ((size-alignedStart)/(Peeling*PacketSize))*(Peeling*PacketSize); - for(Index i=alignedStart; i(px); - Packet xi1 = ploadu(px+PacketSize); - Packet yi = pload (py); - Packet yi1 = pload (py+PacketSize); - pstoreu(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); - pstoreu(px+PacketSize, padd(pm.pmul(pc,xi1),pcj.pmul(ps,yi1))); - pstore (py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); - pstore (py+PacketSize, psub(pcj.pmul(pc,yi1),pm.pmul(ps,xi1))); - px += Peeling*PacketSize; - py += Peeling*PacketSize; - } - if(alignedEnd!=peelingEnd) - { - Packet xi = ploadu(x+peelingEnd); - Packet yi = pload (y+peelingEnd); - pstoreu(x+peelingEnd, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); - pstore (y+peelingEnd, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); - } - } - for(Index i=alignedEnd; i::run(x,incrx,y,incry,size,c,s); } } +}; - /*** fixed-size vectorized path ***/ - else if(VectorX::SizeAtCompileTime != Dynamic && - (VectorX::Flags & VectorY::Flags & PacketAccessBit) && - (PacketSize == OtherPacketSize) && - (EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, evaluator::Alignment)>0)) // FIXME should be compared to the required alignment - { - const 
OtherPacket pc = pset1(c); - const OtherPacket ps = pset1(s); - conj_helper::IsComplex,false> pcj; - conj_helper pm; - Scalar* EIGEN_RESTRICT px = x; - Scalar* EIGEN_RESTRICT py = y; - for(Index i=0; i(px); - Packet yi = pload(py); - pstore(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); - pstore(py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); - px += PacketSize; - py += PacketSize; - } - } +template +void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& xpr_y, const JacobiRotation& j) +{ + typedef typename VectorX::Scalar Scalar; + const bool Vectorizable = (VectorX::Flags & VectorY::Flags & PacketAccessBit) + && (int(packet_traits::size) == int(packet_traits::size)); - /*** non-vectorized path ***/ - else - { - for(Index i=0; i::Alignment, evaluator::Alignment), + Vectorizable>::run(x,incrx,y,incry,size,c,s); } } // end namespace internal diff --git a/extern/eigen/Eigen/src/LU/InverseImpl.h b/extern/eigen/Eigen/src/LU/InverseImpl.h index 018f99b5..f49f2336 100644 --- a/extern/eigen/Eigen/src/LU/InverseImpl.h +++ b/extern/eigen/Eigen/src/LU/InverseImpl.h @@ -404,7 +404,7 @@ inline void MatrixBase::computeInverseWithCheck( const RealScalar& absDeterminantThreshold ) const { - RealScalar determinant; + Scalar determinant; // i'd love to put some static assertions there, but SFINAE means that they have no effect... eigen_assert(rows() == cols()); computeInverseAndDetWithCheck(inverse,determinant,invertible,absDeterminantThreshold); diff --git a/extern/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h b/extern/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h index d2ebfd7b..160d8a52 100644 --- a/extern/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +++ b/extern/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h @@ -64,28 +64,28 @@ namespace internal typedef typename _MatrixType::StorageIndex StorageIndex; }; - void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, float *vals, int *perm, int * invp, float *x, int nbrhs, int *iparm, double *dparm) + inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, float *vals, int *perm, int * invp, float *x, int nbrhs, int *iparm, double *dparm) { if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; } if (nbrhs == 0) {x = NULL; nbrhs=1;} s_pastix(pastix_data, pastix_comm, n, ptr, idx, vals, perm, invp, x, nbrhs, iparm, dparm); } - void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, double *vals, int *perm, int * invp, double *x, int nbrhs, int *iparm, double *dparm) + inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, double *vals, int *perm, int * invp, double *x, int nbrhs, int *iparm, double *dparm) { if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; } if (nbrhs == 0) {x = NULL; nbrhs=1;} d_pastix(pastix_data, pastix_comm, n, ptr, idx, vals, perm, invp, x, nbrhs, iparm, dparm); } - void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex *vals, int *perm, int * invp, std::complex *x, int nbrhs, int *iparm, double *dparm) + inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex *vals, int *perm, int * invp, std::complex *x, int nbrhs, int *iparm, double *dparm) { if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; } if (nbrhs == 0) {x = NULL; nbrhs=1;} c_pastix(pastix_data, pastix_comm, n, ptr, idx, reinterpret_cast(vals), perm, invp, reinterpret_cast(x), nbrhs, iparm, dparm); } - 
-  void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex<double> *vals, int *perm, int * invp, std::complex<double> *x, int nbrhs, int *iparm, double *dparm)
+  inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex<double> *vals, int *perm, int * invp, std::complex<double> *x, int nbrhs, int *iparm, double *dparm)
   {
     if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; }
     if (nbrhs == 0) {x = NULL; nbrhs=1;}
diff --git a/extern/eigen/Eigen/src/SVD/BDCSVD.h b/extern/eigen/Eigen/src/SVD/BDCSVD.h
index d7a4271c..1134d66e 100644
--- a/extern/eigen/Eigen/src/SVD/BDCSVD.h
+++ b/extern/eigen/Eigen/src/SVD/BDCSVD.h
@@ -11,7 +11,7 @@
 // Copyright (C) 2013 Jean Ceccato
 // Copyright (C) 2013 Pierre Zoppitelli
 // Copyright (C) 2013 Jitse Niesen
-// Copyright (C) 2014-2016 Gael Guennebaud
+// Copyright (C) 2014-2017 Gael Guennebaud
 //
 // Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -696,7 +696,9 @@ typename BDCSVD<MatrixType>::RealScalar BDCSVD<MatrixType>::secularEq(RealScalar
   for(Index i=0; i::computeSingVals(const ArrayRef& col0, const ArrayRef& d
 {
   using std::abs;
   using std::swap;
+  using std::sqrt;
   Index n = col0.size();
   Index actual_n = n;
+  // Note that here actual_n is computed based on col0(i)==0 instead of diag(i)==0 as above
+  // because 1) we have diag(i)==0 => col0(i)==0 and 2) if col0(i)==0, then diag(i) is already a singular value.
   while(actual_n>1 && col0(actual_n-1)==Literal(0)) --actual_n;
   for (Index k = 0; k < n; ++k)
@@ -732,7 +737,9 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
       right = (diag(actual_n-1) + col0.matrix().norm());
     else
     {
-      // Skip deflated singular values
+      // Skip deflated singular values,
+      // recall that at this stage we assume that z[j]!=0 and all entries for which z[j]==0 have been put aside.
+      // This should be equivalent to using perm[]
       Index l = k+1;
       while(col0(l)==Literal(0)) { ++l; eigen_internal_assert(l::computeSingVals(const ArrayRef& col0, const ArrayRef& d
       RealScalar leftShifted, rightShifted;
       if (shift == left)
       {
-        leftShifted = (std::numeric_limits<RealScalar>::min)();
+        // to avoid overflow, we must have mu > max(real_min, |z(k)|/sqrt(real_max)),
+        // the factor 2 is to be more conservative
+        leftShifted = numext::maxi<RealScalar>( (std::numeric_limits<RealScalar>::min)(), Literal(2) * abs(col0(k)) / sqrt((std::numeric_limits<RealScalar>::max)()) );
+
+        // check that we did it right:
+        eigen_internal_assert( (numext::isfinite)( (col0(k)/leftShifted)*(col0(k)/(diag(k)+shift+leftShifted)) ) );
         // I don't understand why the case k==0 would be special there:
-        // if (k == 0) rightShifted = right - left; else
-        rightShifted = (k==actual_n-1) ? right : ((right - left) * RealScalar(0.6)); // theoretically we can take 0.5, but let's be safe
+        // if (k == 0) rightShifted = right - left; else
+        rightShifted = (k==actual_n-1) ? right : ((right - left) * RealScalar(0.51)); // theoretically we can take 0.5, but let's be safe
       }
       else
       {
-        leftShifted = -(right - left) * RealScalar(0.6);
-        rightShifted = -(std::numeric_limits<RealScalar>::min)();
+        leftShifted = -(right - left) * RealScalar(0.51);
+        if(k+1( (std::numeric_limits<RealScalar>::min)(), abs(col0(k+1)) / sqrt((std::numeric_limits<RealScalar>::max)()) );
+        else
+          rightShifted = -(std::numeric_limits<RealScalar>::min)();
       }
       RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift);
@@ -980,7 +995,7 @@ void BDCSVD<MatrixType>::deflation43(Index firstCol, Index shift, Index i, Index
   Index start = firstCol + shift;
   RealScalar c = m_computed(start, start);
   RealScalar s = m_computed(start+i, start);
-  RealScalar r = sqrt(numext::abs2(c) + numext::abs2(s));
+  RealScalar r = numext::hypot(c,s);
   if (r == Literal(0))
   {
     m_computed(start+i, start+i) = Literal(0);
diff --git a/extern/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h b/extern/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h
index 50272154..ff0516f6 100644
--- a/extern/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h
+++ b/extern/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h
@@ -61,9 +61,10 @@ JacobiSVD, ColPiv
     u = (LAPACKE_TYPE*)m_matrixU.data(); \
   } else { ldu=1; u=&dummy; }\
   MatrixType localV; \
-  ldvt = (m_computeFullV) ? internal::convert_index<lapack_int>(m_cols) : (m_computeThinV) ? internal::convert_index<lapack_int>(m_diagSize) : 1; \
+  lapack_int vt_rows = (m_computeFullV) ? internal::convert_index<lapack_int>(m_cols) : (m_computeThinV) ? internal::convert_index<lapack_int>(m_diagSize) : 1; \
   if (computeV()) { \
-    localV.resize(ldvt, m_cols); \
+    localV.resize(vt_rows, m_cols); \
+    ldvt = internal::convert_index<lapack_int>(localV.outerStride()); \
     vt = (LAPACKE_TYPE*)localV.data(); \
   } else { ldvt=1; vt=&dummy; }\
   Matrix superb; superb.resize(m_diagSize, 1); \
diff --git a/extern/eigen/Eigen/src/SVD/SVDBase.h b/extern/eigen/Eigen/src/SVD/SVDBase.h
index cc90a3b7..3d1ef373 100644
--- a/extern/eigen/Eigen/src/SVD/SVDBase.h
+++ b/extern/eigen/Eigen/src/SVD/SVDBase.h
@@ -180,8 +180,10 @@ class SVDBase
   RealScalar threshold() const
   {
     eigen_assert(m_isInitialized || m_usePrescribedThreshold);
+    // this temporary is needed to workaround a MSVC issue
+    Index diagSize = (std::max<Index>)(1,m_diagSize);
     return m_usePrescribedThreshold ? m_prescribedThreshold
-                                    : (std::max<Index>)(1,m_diagSize)*NumTraits<Scalar>::epsilon();
+                                    : diagSize*NumTraits<Scalar>::epsilon();
   }
   /** \returns true if \a U (full or thin) is asked for in this SVD decomposition */
diff --git a/extern/eigen/Eigen/src/SparseCore/AmbiVector.h b/extern/eigen/Eigen/src/SparseCore/AmbiVector.h
index 8a5cc91f..e0295f2a 100644
--- a/extern/eigen/Eigen/src/SparseCore/AmbiVector.h
+++ b/extern/eigen/Eigen/src/SparseCore/AmbiVector.h
@@ -94,7 +94,7 @@ class AmbiVector
       Index allocSize = m_allocatedElements * sizeof(ListEl);
       allocSize = (allocSize + sizeof(Scalar) - 1)/sizeof(Scalar);
       Scalar* newBuffer = new Scalar[allocSize];
-      memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl));
+      std::memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl));
       delete[] m_buffer;
       m_buffer = newBuffer;
     }
diff --git a/extern/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h b/extern/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
index 492eb0a2..9db119b6 100644
--- a/extern/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
+++ b/extern/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
@@ -17,7 +17,9 @@ namespace internal {
 template
 static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res, bool sortedInsertion = false)
 {
-  typedef typename remove_all<Lhs>::type::Scalar Scalar;
+  typedef typename remove_all<Lhs>::type::Scalar LhsScalar;
+  typedef typename remove_all<Rhs>::type::Scalar RhsScalar;
+  typedef typename remove_all<ResultType>::type::Scalar ResScalar;
   // make sure to call innerSize/outerSize since we fake the storage order.
   Index rows = lhs.innerSize();
@@ -25,7 +27,7 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r
   eigen_assert(lhs.outerSize() == rhs.innerSize());
   ei_declare_aligned_stack_constructed_variable(bool, mask, rows, 0);
-  ei_declare_aligned_stack_constructed_variable(Scalar, values, rows, 0);
+  ei_declare_aligned_stack_constructed_variable(ResScalar, values, rows, 0);
   ei_declare_aligned_stack_constructed_variable(Index, indices, rows, 0);
   std::memset(mask,0,sizeof(bool)*rows);
@@ -51,12 +53,12 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r
     Index nnz = 0;
     for (typename evaluator<Rhs>::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt)
     {
-      Scalar y = rhsIt.value();
+      RhsScalar y = rhsIt.value();
       Index k = rhsIt.index();
       for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt)
       {
         Index i = lhsIt.index();
-        Scalar x = lhsIt.value();
+        LhsScalar x = lhsIt.value();
         if(!mask[i])
         {
           mask[i] = true;
@@ -166,11 +168,12 @@ struct conservative_sparse_sparse_product_selector RowMajorMatrix;
-    RowMajorMatrix rhsRow = rhs;
-    RowMajorMatrix resRow(lhs.rows(), rhs.cols());
-    internal::conservative_sparse_sparse_product_impl(rhsRow, lhs, resRow);
-    res = resRow;
+    typedef SparseMatrix RowMajorRhs;
+    typedef SparseMatrix RowMajorRes;
+    RowMajorRhs rhsRow = rhs;
+    RowMajorRes resRow(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl(rhsRow, lhs, resRow);
+    res = resRow;
   }
 };
@@ -179,10 +182,11 @@ struct conservative_sparse_sparse_product_selector RowMajorMatrix;
-    RowMajorMatrix lhsRow = lhs;
-    RowMajorMatrix resRow(lhs.rows(), rhs.cols());
-    internal::conservative_sparse_sparse_product_impl(rhs, lhsRow, resRow);
+    typedef SparseMatrix RowMajorLhs;
+    typedef SparseMatrix RowMajorRes;
+    RowMajorLhs lhsRow = lhs;
+    RowMajorRes resRow(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl(rhs, lhsRow, resRow);
     res = resRow;
   }
 };
@@ -219,10 +223,11 @@ struct conservative_sparse_sparse_product_selector ColMajorMatrix;
-    ColMajorMatrix lhsCol = lhs;
-    ColMajorMatrix resCol(lhs.rows(), rhs.cols());
-    internal::conservative_sparse_sparse_product_impl(lhsCol, rhs, resCol);
+    typedef SparseMatrix ColMajorLhs;
+    typedef SparseMatrix ColMajorRes;
+    ColMajorLhs lhsCol = lhs;
+    ColMajorRes resCol(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl(lhsCol, rhs, resCol);
     res = resCol;
   }
 };
@@ -232,10 +237,11 @@ struct conservative_sparse_sparse_product_selector ColMajorMatrix;
-    ColMajorMatrix rhsCol = rhs;
-    ColMajorMatrix resCol(lhs.rows(), rhs.cols());
-    internal::conservative_sparse_sparse_product_impl(lhs, rhsCol, resCol);
+    typedef SparseMatrix ColMajorRhs;
+    typedef SparseMatrix ColMajorRes;
+    ColMajorRhs rhsCol = rhs;
+    ColMajorRes resCol(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl(lhs, rhsCol, resCol);
     res = resCol;
   }
 };
@@ -263,7 +269,8 @@ namespace internal {
 template
 static void sparse_sparse_to_dense_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res)
 {
-  typedef typename remove_all<Lhs>::type::Scalar Scalar;
+  typedef typename remove_all<Lhs>::type::Scalar LhsScalar;
+  typedef typename remove_all<Rhs>::type::Scalar RhsScalar;
   Index cols = rhs.outerSize();
   eigen_assert(lhs.outerSize() == rhs.innerSize());
@@ -274,12 +281,12 @@ static void sparse_sparse_to_dense_product_impl(const Lhs& lhs, const Rhs& rhs,
   {
     for (typename evaluator<Rhs>::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt)
     {
-      Scalar y = rhsIt.value();
+      RhsScalar y = rhsIt.value();
       Index k = rhsIt.index();
       for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt)
       {
         Index i = lhsIt.index();
-        Scalar x = lhsIt.value();
+        LhsScalar x = lhsIt.value();
         res.coeffRef(i,j) += x * y;
       }
     }
@@ -310,9 +317,9 @@ struct sparse_sparse_to_dense_product_selector ColMajorMatrix;
-    ColMajorMatrix lhsCol(lhs);
-    internal::sparse_sparse_to_dense_product_impl(lhsCol, rhs, res);
+    typedef SparseMatrix ColMajorLhs;
+    ColMajorLhs lhsCol(lhs);
+    internal::sparse_sparse_to_dense_product_impl(lhsCol, rhs, res);
   }
 };
@@ -321,9 +328,9 @@ struct sparse_sparse_to_dense_product_selector ColMajorMatrix;
-    ColMajorMatrix rhsCol(rhs);
-    internal::sparse_sparse_to_dense_product_impl(lhs, rhsCol, res);
+    typedef SparseMatrix ColMajorRhs;
+    ColMajorRhs rhsCol(rhs);
+    internal::sparse_sparse_to_dense_product_impl(lhs, rhsCol, res);
   }
 };
diff --git a/extern/eigen/Eigen/src/SparseCore/SparseMatrix.h b/extern/eigen/Eigen/src/SparseCore/SparseMatrix.h
index 323c2323..0a2490bc 100644
--- a/extern/eigen/Eigen/src/SparseCore/SparseMatrix.h
+++ b/extern/eigen/Eigen/src/SparseCore/SparseMatrix.h
@@ -893,7 +893,7 @@ class SparseMatrix
     Index p = m_outerIndex[outer] + m_innerNonZeros[outer]++;
     m_data.index(p) = convert_index(inner);
-    return (m_data.value(p) = 0);
+    return (m_data.value(p) = Scalar(0));
   }
 private:
@@ -1274,7 +1274,7 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar&
   m_innerNonZeros[outer]++;
   m_data.index(p) = inner;
-  return (m_data.value(p) = 0);
+  return (m_data.value(p) = Scalar(0));
 }
 template
@@ -1381,7 +1381,7 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar&
   }
   m_data.index(p) = inner;
-  return (m_data.value(p) = 0);
+  return (m_data.value(p) = Scalar(0));
 }
 namespace internal {
diff --git a/extern/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h b/extern/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h
index 5ab64f1a..65611b3d 100644
--- a/extern/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h
+++ b/extern/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h
@@ -311,7 +311,7 @@ inline void sparse_selfadjoint_time_dense_product(const SparseLhsType& lhs, cons
       while (i && i.index()::type::Scalar Scalar;
+  typedef typename remove_all<Rhs>::type::Scalar RhsScalar;
+  typedef typename remove_all<ResultType>::type::Scalar ResScalar;
   typedef typename remove_all<Lhs>::type::StorageIndex StorageIndex;
   // make sure to call innerSize/outerSize since we fake the storage order.
@@ -31,7 +32,7 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r
   eigen_assert(lhs.outerSize() == rhs.innerSize());
   // allocate a temporary buffer
-  AmbiVector<Scalar,StorageIndex> tempVector(rows);
+  AmbiVector<ResScalar,StorageIndex> tempVector(rows);
   // mimics a resizeByInnerOuter:
   if(ResultType::IsRowMajor)
@@ -63,14 +64,14 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r
     {
       // FIXME should be written like this: tmp += rhsIt.value() * lhs.col(rhsIt.index())
       tempVector.restart();
-      Scalar x = rhsIt.value();
+      RhsScalar x = rhsIt.value();
       for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, rhsIt.index()); lhsIt; ++lhsIt)
       {
         tempVector.coeffRef(lhsIt.index()) += lhsIt.value() * x;
       }
     }
     res.startVec(j);
-    for (typename AmbiVector<Scalar,StorageIndex>::Iterator it(tempVector,tolerance); it; ++it)
+    for (typename AmbiVector<ResScalar,StorageIndex>::Iterator it(tempVector,tolerance); it; ++it)
       res.insertBackByOuterInner(j,it.index()) = it.value();
   }
   res.finalize();
@@ -85,7 +86,6 @@ struct sparse_sparse_product_with_pruning_selector;
 template
 struct sparse_sparse_product_with_pruning_selector
 {
-  typedef typename traits<typename remove_all<Lhs>::type>::Scalar Scalar;
   typedef typename ResultType::RealScalar RealScalar;
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
@@ -129,8 +129,8 @@ struct sparse_sparse_product_with_pruning_selector ColMajorMatrixLhs;
-    typedef SparseMatrix ColMajorMatrixRhs;
+    typedef SparseMatrix ColMajorMatrixLhs;
+    typedef SparseMatrix ColMajorMatrixRhs;
     ColMajorMatrixLhs colLhs(lhs);
     ColMajorMatrixRhs colRhs(rhs);
     internal::sparse_sparse_product_with_pruning_impl(colLhs, colRhs, res, tolerance);
@@ -149,7 +149,7 @@ struct sparse_sparse_product_with_pruning_selector RowMajorMatrixLhs;
+    typedef SparseMatrix RowMajorMatrixLhs;
     RowMajorMatrixLhs rowLhs(lhs);
     sparse_sparse_product_with_pruning_selector(rowLhs,rhs,res,tolerance);
   }
@@ -161,7 +161,7 @@ struct sparse_sparse_product_with_pruning_selector RowMajorMatrixRhs;
+    typedef SparseMatrix RowMajorMatrixRhs;
     RowMajorMatrixRhs rowRhs(rhs);
     sparse_sparse_product_with_pruning_selector(lhs,rowRhs,res,tolerance);
   }
@@ -173,7 +173,7 @@ struct sparse_sparse_product_with_pruning_selector ColMajorMatrixRhs;
+    typedef SparseMatrix ColMajorMatrixRhs;
     ColMajorMatrixRhs colRhs(rhs);
     internal::sparse_sparse_product_with_pruning_impl(lhs, colRhs, res, tolerance);
   }
@@ -185,7 +185,7 @@ struct sparse_sparse_product_with_pruning_selector ColMajorMatrixLhs;
+    typedef SparseMatrix ColMajorMatrixLhs;
     ColMajorMatrixLhs colLhs(lhs);
     internal::sparse_sparse_product_with_pruning_impl(colLhs, rhs, res, tolerance);
   }
diff --git a/extern/eigen/Eigen/src/SparseLU/SparseLU.h b/extern/eigen/Eigen/src/SparseLU/SparseLU.h
index f883ab38..7104831c 100644
--- a/extern/eigen/Eigen/src/SparseLU/SparseLU.h
+++ b/extern/eigen/Eigen/src/SparseLU/SparseLU.h
@@ -499,8 +499,6 @@ void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix)
   eigen_assert(m_analysisIsOk && "analyzePattern() should be called first");
   eigen_assert((matrix.rows() == matrix.cols()) && "Only for squared matrices");
-  typedef typename IndexVector::Scalar StorageIndex;
-
   m_isInitialized = true;
diff --git a/extern/eigen/Eigen/src/SparseQR/SparseQR.h b/extern/eigen/Eigen/src/SparseQR/SparseQR.h
index 2d4498b0..7409fcae 100644
--- a/extern/eigen/Eigen/src/SparseQR/SparseQR.h
+++ b/extern/eigen/Eigen/src/SparseQR/SparseQR.h
@@ -52,7 +52,7 @@ namespace internal {
   * rank-revealing permutations. Use colsPermutation() to get it.
   *
   * Q is the orthogonal matrix represented as products of Householder reflectors.
-  * Use matrixQ() to get an expression and matrixQ().transpose() to get the transpose.
+  * Use matrixQ() to get an expression and matrixQ().adjoint() to get the adjoint.
   * You can then apply it to a vector.
   *
   * R is the sparse triangular or trapezoidal matrix. The later occurs when A is rank-deficient.
@@ -65,6 +65,7 @@ namespace internal {
   * \implsparsesolverconcept
   *
   * \warning The input sparse matrix A must be in compressed mode (see SparseMatrix::makeCompressed()).
+  * \warning For complex matrices matrixQ().transpose() will actually return the adjoint matrix.
   *
   */
 template
@@ -196,9 +197,9 @@ class SparseQR : public SparseSolverBase >
       Index rank = this->rank();
-      // Compute Q^T * b;
+      // Compute Q^* * b;
       typename Dest::PlainObject y, b;
-      y = this->matrixQ().transpose() * B;
+      y = this->matrixQ().adjoint() * B;
       b = y;
       // Solve with the triangular matrix R
@@ -604,7 +605,7 @@ struct SparseQR_QProduct : ReturnByValue
@@ -668,13 +672,14 @@ struct SparseQRMatrixQReturnType : public EigenBase(m_qr,other.derived(),false);
   }
+  // To use for operations with the adjoint of Q
   SparseQRMatrixQTransposeReturnType<SparseQRType> adjoint() const
   {
     return SparseQRMatrixQTransposeReturnType<SparseQRType>(m_qr);
   }
   inline Index rows() const { return m_qr.rows(); }
-  inline Index cols() const { return (std::min)(m_qr.rows(),m_qr.cols()); }
-  // To use for operations with the transpose of Q
+  inline Index cols() const { return m_qr.rows(); }
+  // To use for operations with the transpose of Q FIXME this is the same as adjoint at the moment
   SparseQRMatrixQTransposeReturnType<SparseQRType> transpose() const
   {
     return SparseQRMatrixQTransposeReturnType<SparseQRType>(m_qr);
@@ -682,6 +687,7 @@ struct SparseQRMatrixQReturnType : public EigenBase
 struct SparseQRMatrixQTransposeReturnType
 {
@@ -712,7 +718,7 @@ struct Assignment, internal:
   typedef typename DstXprType::StorageIndex StorageIndex;
   static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &/*func*/)
   {
-    typename DstXprType::PlainObject idMat(src.m_qr.rows(), src.m_qr.rows());
+    typename DstXprType::PlainObject idMat(src.rows(), src.cols());
     idMat.setIdentity();
     // Sort the sparse householder reflectors if needed
     const_cast<SparseQRType *>(&src.m_qr)->_sort_matrix_Q();
diff --git a/extern/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h b/extern/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h
index 50a69f30..7261c7d0 100644
--- a/extern/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h
+++ b/extern/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h
@@ -297,8 +297,8 @@ SluMatrix asSluMatrix(MatrixType& mat)
 template
 MappedSparseMatrix map_superlu(SluMatrix& sluMat)
 {
-  eigen_assert((Flags&RowMajor)==RowMajor && sluMat.Stype == SLU_NR
-          || (Flags&ColMajor)==ColMajor && sluMat.Stype == SLU_NC);
+  eigen_assert(((Flags&RowMajor)==RowMajor && sluMat.Stype == SLU_NR)
+          || ((Flags&ColMajor)==ColMajor && sluMat.Stype == SLU_NC));
   Index outerSize = (Flags&RowMajor)==RowMajor ? sluMat.ncol : sluMat.nrow;
diff --git a/extern/eigen/Eigen/src/misc/lapacke.h b/extern/eigen/Eigen/src/misc/lapacke.h
old mode 100644
new mode 100755
diff --git a/extern/eigen/INSTALL b/extern/eigen/INSTALL
new file mode 100644
index 00000000..4f717e9c
--- /dev/null
+++ b/extern/eigen/INSTALL
@@ -0,0 +1,35 @@
+Installation instructions for Eigen
+***********************************
+
+Explanation before starting
+***************************
+
+Eigen consists only of header files, hence there is nothing to compile
+before you can use it. Moreover, these header files do not depend on your
+platform, they are the same for everybody.
+
+Method 1. Installing without using CMake
+****************************************
+
+You can use right away the headers in the Eigen/ subdirectory. In order
+to install, just copy this Eigen/ subdirectory to your favorite location.
+If you also want the unsupported features, copy the unsupported/
+subdirectory too.
+
+Method 2. Installing using CMake
+********************************
+
+Let's call this directory 'source_dir' (where this INSTALL file is).
+Before starting, create another directory which we will call 'build_dir'.
+
+Do:
+
+  cd build_dir
+  cmake source_dir
+  make install
+
+The "make install" step may require administrator privileges.
+
+You can adjust the installation destination (the "prefix")
+by passing the -DCMAKE_INSTALL_PREFIX=myprefix option to cmake, as is
+explained in the message that cmake prints at the end.
diff --git a/extern/eigen/README.md b/extern/eigen/README.md
new file mode 100644
index 00000000..4654a81c
--- /dev/null
+++ b/extern/eigen/README.md
@@ -0,0 +1,3 @@
+**Eigen is a C++ template library for linear algebra: matrices, vectors, numerical solvers, and related algorithms.**
+
+For more information go to http://eigen.tuxfamily.org/.
diff --git a/extern/eigen/eigen3.pc.in b/extern/eigen/eigen3.pc.in
new file mode 100644
index 00000000..3368a3aa
--- /dev/null
+++ b/extern/eigen/eigen3.pc.in
@@ -0,0 +1,9 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+
+Name: Eigen3
+Description: A C++ template library for linear algebra: vectors, matrices, and related algorithms
+Requires:
+Version: @EIGEN_VERSION_NUMBER@
+Libs:
+Cflags: -I${prefix}/@INCLUDE_INSTALL_DIR@
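
Editorial note, not part of the patch: the SparseQR.h hunk above changes the documented way of applying Q from matrixQ().transpose() to matrixQ().adjoint(). The following is a minimal, illustrative sketch of that public API; the matrix and vector values are placeholders and only the standard Eigen SparseQR interface is assumed.

    #include <Eigen/Sparse>
    #include <Eigen/SparseQR>
    #include <iostream>

    int main()
    {
      typedef Eigen::SparseMatrix<double> SpMat;

      // Placeholder 4x3 system; the values are arbitrary.
      SpMat A(4, 3);
      A.insert(0, 0) = 2.0;
      A.insert(1, 1) = 1.0;
      A.insert(2, 2) = 3.0;
      A.insert(3, 0) = 1.0;
      A.makeCompressed();                    // SparseQR requires compressed storage

      Eigen::VectorXd b(4);
      b << 1.0, 2.0, 3.0, 4.0;

      Eigen::SparseQR<SpMat, Eigen::COLAMDOrdering<int> > qr(A);
      if (qr.info() != Eigen::Success) return 1;

      // Apply Q^* to a vector; for real scalars this equals the transpose,
      // for complex scalars it is the conjugate transpose (see the new \warning).
      Eigen::VectorXd y = qr.matrixQ().adjoint() * b;

      // Full least-squares solve for comparison.
      Eigen::VectorXd x = qr.solve(b);
      std::cout << "x =\n" << x << "\ny =\n" << y << std::endl;
      return 0;
    }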
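Similarly (editorial, not part of the patch): the Jacobi.h hunks reorganise internal::apply_rotation_in_the_plane into an apply_rotation_in_the_plane_selector that dispatches between the dynamic-size vectorized, fixed-size vectorized, and scalar paths. The usual public entry point into that code is JacobiRotation together with applyOnTheLeft/applyOnTheRight; a short sketch with placeholder values:

    #include <Eigen/Core>
    #include <Eigen/Jacobi>
    #include <iostream>

    int main()
    {
      Eigen::Matrix2f m;
      m << 3.0f, 1.0f,
           1.0f, 2.0f;

      // Givens rotation chosen to zero out m(1,0).
      Eigen::JacobiRotation<float> G;
      G.makeGivens(m(0, 0), m(1, 0));

      // Applying the rotation to the rows of m ends up in
      // internal::apply_rotation_in_the_plane.
      m.applyOnTheLeft(0, 1, G.adjoint());

      std::cout << m << std::endl;  // m(1,0) is now numerically zero
      return 0;
    }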