oomph-lib: linear_solver.cc Source File

Go to the documentation of this file.
// LIC// ====================================================================
// LIC// This file forms part of oomph-lib, the object-oriented,
// LIC// multi-physics finite-element library, available
// LIC// at http://www.oomph-lib.org.
// LIC//
// LIC// Copyright (C) 2006-2024 Matthias Heil and Andrew Hazel
// LIC//
// LIC// This library is free software; you can redistribute it and/or
// LIC// modify it under the terms of the GNU Lesser General Public
// LIC// License as published by the Free Software Foundation; either
// LIC// version 2.1 of the License, or (at your option) any later version.
// LIC//
// LIC// This library is distributed in the hope that it will be useful,
// LIC// but WITHOUT ANY WARRANTY; without even the implied warranty of
// LIC// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// LIC// Lesser General Public License for more details.
// LIC//
// LIC// You should have received a copy of the GNU Lesser General Public
// LIC// License along with this library; if not, write to the Free Software
// LIC// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
// LIC// 02110-1301  USA.
// LIC//
// LIC// The authors may be contacted at oomph-lib@maths.man.ac.uk.
// LIC//
// LIC//====================================================================
// The actual solve functions for dense LU linear solvers.
 
// Config header generated by autoconfig
#ifdef HAVE_CONFIG_H
#include <oomph-lib-config.h>
#endif
 
#ifdef OOMPH_HAS_MPI
#include "mpi.h"
#endif
 
// oomph-lib includes
#include "Vector.h"
#include "linear_solver.h"
#include "matrices.h"
#include "problem.h"
 
 
namespace oomph
{
  //=============================================================================
  /// Solver: Takes pointer to problem and returns the results Vector
  /// which contains the solution of the linear system defined by
  /// the problem's fully assembled Jacobian and residual Vector.
  //=============================================================================
  void DenseLU::solve(Problem* const& problem_pt, DoubleVector& result)
  {
    // Assemble the problem's dense Jacobian and residual vector, pass them
    // to the matrix-based solve(...) overload and finally store the sign of
    // the Jacobian's determinant in the problem. Timings are written to
    // oomph_info when Doc_time is set.

    // Time stamp covering the whole call
    const double overall_begin = TimingHelpers::timer();

    // Size of the linear system
    const unsigned n_dof = problem_pt->ndof();

    // Containers to be filled by the problem: residuals and dense Jacobian
    DoubleVector residuals;
    DenseDoubleMatrix jacobian(n_dof);

    // Assemble Jacobian and residuals; only the assembly itself is timed
    const double assembly_begin = TimingHelpers::timer();
    problem_pt->get_jacobian(residuals, jacobian);
    Jacobian_setup_time = TimingHelpers::timer() - assembly_begin;

    // Document the assembly time on request
    if (Doc_time)
    {
      oomph_info << std::endl
                 << "CPU for setup of Dense Jacobian: "
                 << TimingHelpers::convert_secs_to_formatted_string(
                      Jacobian_setup_time)
                 << std::endl;
    }

    // Factorise and back-substitute via the linear-algebra-type interface
    solve(&jacobian, residuals, result);

    // The factorisation has stored the sign of the determinant; pass it on
    problem_pt->sign_of_jacobian() = Sign_of_determinant_of_matrix;

    // Document the total time (assembly + solve) on request
    const double total_time = TimingHelpers::timer() - overall_begin;
    if (Doc_time)
    {
      oomph_info << "CPU for DenseLU LinearSolver: "
                 << TimingHelpers::convert_secs_to_formatted_string(total_time)
                 << std::endl
                 << std::endl;
    }
  }
 
 
  //=============================================================================
  /// Delete the storage that has been allocated for the LU factors, if
  /// the matrix data is not itself being overwritten.
  //=============================================================================
  void DenseLU::clean_up_memory()
  {
    // The solver's distribution is reset unconditionally, i.e. even when
    // no factors are currently stored
    this->clear_distribution();

    // No factor storage allocated: nothing else to release. (The index
    // storage is not tested separately; it is only released together
    // with the factors.)
    if (LU_factors == 0)
    {
      return;
    }

    // Release the LU factors and reset the pointer
    delete[] LU_factors;
    LU_factors = 0;

    // Release the pivot index storage and reset the pointer
    delete[] Index;
    Index = 0;
  }
 
  //=============================================================================
  /// LU decompose the matrix.
  /// WARNING: this function does not perform any PARANOID checks on its
  /// argument - these are all performed in the solve(...) method.
  //=============================================================================
  void DenseLU::factorise(DoubleMatrixBase* const& matrix_pt)
  {
    // LU-decompose the (assumed square) matrix with implicitly scaled
    // partial (row) pivoting. On return:
    //  - LU_factors holds the n x n factors in row-major order
    //    (entry (i,j) at LU_factors[n * i + j]),
    //  - Index[j] holds the row that was swapped with row j in column j,
    //  - Sign_of_determinant_of_matrix holds the sign of the determinant.
    // Only nrow() is consulted for the size; no check is made here that the
    // matrix is square.
    // Throws OomphLibError ("Singular Matrix") if a row consists entirely
    // of zeros. In that case the previously stored factors are left
    // untouched because the check happens before the storage is reset.

    // Set the number of unknowns
    const unsigned long n = matrix_pt->nrow();

    // Value substituted for a pivot that is exactly zero, so that the
    // subsequent divisions remain defined
    const double small_number = 1.0e-20;

    // Vector scaling stores the implicit scaling of each row
    Vector<double> scaling(n);

    // Integer to store the sign that must multiply the determinant as
    // a consequence of the row/column interchanges
    int signature = 1;

    // Loop over rows to get implicit scaling information
    for (unsigned long i = 0; i < n; i++)
    {
      double largest_entry = 0.0;
      for (unsigned long j = 0; j < n; j++)
      {
        double tmp = std::fabs((*matrix_pt)(i, j));
        if (tmp > largest_entry) largest_entry = tmp;
      }
      if (largest_entry == 0.0)
      {
        throw OomphLibError(
          "Singular Matrix", OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
      }
      // Save the scaling
      scaling[i] = 1.0 / largest_entry;
    }

    // Firstly, we shall delete any previous LU storage.
    // If the user calls this function twice without changing the matrix
    // then it is their own inefficiency, not ours (this time).
    // Note that clean_up_memory() also clears the solver's distribution.
    clean_up_memory();

    // Allocate storage for the LU factors and the index
    LU_factors = new double[n * n];
    Index = new long[n];

    // Now we know that memory has been allocated, copy over
    // the matrix values. The flat position is computed from the
    // unsigned long row/column indices directly; a separate narrower
    // counter could wrap before reaching n * n.
    for (unsigned long i = 0; i < n; i++)
    {
      for (unsigned long j = 0; j < n; j++)
      {
        LU_factors[n * i + j] = (*matrix_pt)(i, j);
      }
    }

    // Loop over columns
    for (unsigned long j = 0; j < n; j++)
    {
      // Initialise imax
      unsigned long imax = 0;

      // Entries above the diagonal in this column (upper factor)
      for (unsigned long i = 0; i < j; i++)
      {
        double sum = LU_factors[n * i + j];
        for (unsigned long k = 0; k < i; k++)
        {
          sum -= LU_factors[n * i + k] * LU_factors[n * k + j];
        }
        LU_factors[n * i + j] = sum;
      }

      // Entries on and below the diagonal; at the same time search for
      // the largest scaled pivot candidate
      double largest_entry = 0.0;
      for (unsigned long i = j; i < n; i++)
      {
        double sum = LU_factors[n * i + j];
        for (unsigned long k = 0; k < j; k++)
        {
          sum -= LU_factors[n * i + k] * LU_factors[n * k + j];
        }
        LU_factors[n * i + j] = sum;
        // Scaled magnitude of this candidate
        double tmp = scaling[i] * std::fabs(sum);
        if (tmp >= largest_entry)
        {
          largest_entry = tmp;
          imax = i;
        }
      }

      // Test to see if we need to interchange rows
      if (j != imax)
      {
        for (unsigned long k = 0; k < n; k++)
        {
          double tmp = LU_factors[n * imax + k];
          LU_factors[n * imax + k] = LU_factors[n * j + k];
          LU_factors[n * j + k] = tmp;
        }
        // Change the parity of signature
        signature = -signature;

        // Interchange scale factor: row imax now holds the former row j.
        // scaling[j] is not updated because the pivot search for later
        // columns starts below row j and never reads it again.
        scaling[imax] = scaling[j];
      }

      // Set the index
      Index[j] = imax;
      if (LU_factors[n * j + j] == 0.0)
      {
        LU_factors[n * j + j] = small_number;
      }
      // Divide by pivot element
      if (j != n - 1)
      {
        double tmp = 1.0 / LU_factors[n * j + j];
        for (unsigned long i = j + 1; i < n; i++)
        {
          LU_factors[n * i + j] *= tmp;
        }
      }

    } // End of loop over columns


    // Now multiply all the diagonal terms together to get the determinant
    // Note that we need to use the mantissa, exponent formulation to
    // avoid underflow errors
    double determinant_mantissa = 1.0;
    int determinant_exponent = 0;
    int iexp = 0;
    for (unsigned long i = 0; i < n; i++)
    {
      // Multiply by the next diagonal entry's mantissa
      // and return the exponent
      determinant_mantissa *= std::frexp(LU_factors[n * i + i], &iexp);

      // Add the new exponent to the current exponent
      determinant_exponent += iexp;

      // normalise
      determinant_mantissa = std::frexp(determinant_mantissa, &iexp);
      determinant_exponent += iexp;
    }

    // If paranoid issue a warning that the matrix is near singular
    // #ifdef PARANOID
    //  int tiny_exponent = -60;
    //  if(determinant_exponent < tiny_exponent)
    //   {
    //    std::ostringstream warning_stream;
    //    warning_stream << "The determinant of the matrix is very close to
    //    zero.\n"
    //                   << "It is " << determinant_mantissa << " x 2^"
    //                   << determinant_exponent << "\n";
    //    warning_stream << "The results will depend on the exact details of
    //    the\n"
    //                   << "floating point implementation ... just to let you
    //                   know\n";
    //    OomphLibWarning(warning_stream.str(),
    //                    "DenseLU::factorise()",
    //                    OOMPH_EXCEPTION_LOCATION);
    //   }
    // #endif

    // Integer to store the sign of the determinant
    int sign = 0;

    // Find the sign of the determinant
    if (determinant_mantissa > 0.0)
    {
      sign = 1;
    }
    if (determinant_mantissa < 0.0)
    {
      sign = -1;
    }

    // Multiply the sign by the signature (parity of the row interchanges)
    sign *= signature;

    // Store the sign of the determinant
    Sign_of_determinant_of_matrix = sign;
  }
 
  //=============================================================================
  /// Do the backsubstitution for the DenseLU solver.
  /// WARNING: this function does not perform any PARANOID checks on the
  /// vectors - these are all performed in the solve(...) method.
  //=============================================================================
  void DenseLU::backsub(const DoubleVector& rhs, DoubleVector& result)
  {
    // Solve with the stored factors: result is first overwritten with a
    // copy of rhs and then transformed in place by a forward sweep (which
    // also applies the row interchanges recorded in Index) followed by a
    // backward sweep. LU_factors is addressed row-major with the row
    // length taken from rhs.nrow().

    // Raw access to the vector entries
    const double* rhs_pt = rhs.values_pt();
    double* result_pt = result.values_pt();

    // Start from a copy of the right-hand side
    const unsigned long n_row = rhs.nrow();
    for (unsigned long row = 0; row < n_row; ++row)
    {
      result_pt[row] = rhs_pt[row];
    }

    // Forward sweep. Leading zeros are skipped: the inner products only
    // start once a non-vanishing entry has been encountered.
    bool seen_nonzero = false;
    unsigned long first_nonzero = 0;
    for (unsigned long row = 0; row < n_row; row++)
    {
      // Undo the row interchange performed for this row
      const unsigned long swapped_row = Index[row];
      double acc = result_pt[swapped_row];
      result_pt[swapped_row] = result_pt[row];

      if (seen_nonzero)
      {
        for (unsigned long col = first_nonzero; col < row; col++)
        {
          acc -= LU_factors[n_row * row + col] * result_pt[col];
        }
      }
      else if (acc != 0.0)
      {
        // From the next row onwards the inner products start here
        seen_nonzero = true;
        first_nonzero = row;
      }
      result_pt[row] = acc;
    }

    // Backward sweep, last row first
    for (unsigned long row = n_row; row-- > 0;)
    {
      double acc = result_pt[row];
      for (unsigned long col = row + 1; col < n_row; col++)
      {
        acc -= LU_factors[n_row * row + col] * result_pt[col];
      }
      result_pt[row] = acc / LU_factors[n_row * row + row];
    }
  }
 
  //=============================================================================
  /// Do the backsubstitution for the DenseLU solver.
  /// WARNING: this function does not perform any PARANOID checks on the
  /// vectors - these are all performed in the solve(...) method. So, if you
  /// call backsub directly, you have been warned...
  //=============================================================================
  void DenseLU::backsub(const Vector<double>& rhs, Vector<double>& result)
  {
    // Same algorithm as the DoubleVector overload, operating on plain
    // Vector<double> objects: copy rhs into result, then apply a forward
    // sweep (including the row interchanges recorded in Index) and a
    // backward sweep in place. result is indexed up to rhs.size() without
    // being resized here.

    // Start from a copy of the right-hand side
    const unsigned long n_row = rhs.size();
    for (unsigned long row = 0; row < n_row; ++row)
    {
      result[row] = rhs[row];
    }

    // Forward sweep. Leading zeros are skipped: the inner products only
    // start once a non-vanishing entry has been encountered.
    bool seen_nonzero = false;
    unsigned long first_nonzero = 0;
    for (unsigned long row = 0; row < n_row; row++)
    {
      // Undo the row interchange performed for this row
      const unsigned long swapped_row = Index[row];
      double acc = result[swapped_row];
      result[swapped_row] = result[row];

      if (seen_nonzero)
      {
        for (unsigned long col = first_nonzero; col < row; col++)
        {
          acc -= LU_factors[n_row * row + col] * result[col];
        }
      }
      else if (acc != 0.0)
      {
        // From the next row onwards the inner products start here
        seen_nonzero = true;
        first_nonzero = row;
      }
      result[row] = acc;
    }

    // Backward sweep, last row first
    for (unsigned long row = n_row; row-- > 0;)
    {
      double acc = result[row];
      for (unsigned long col = row + 1; col < n_row; col++)
      {
        acc -= LU_factors[n_row * row + col] * result[col];
      }
      result[row] = acc / LU_factors[n_row * row + row];
    }
  }
 
 
  //=============================================================================
  /// Linear-algebra-type solver: Takes pointer to a matrix and rhs
  /// vector and returns the solution of the linear system.
  //============================================================================
  void DenseLU::solve(DoubleMatrixBase* const& matrix_pt,
                      const DoubleVector& rhs,
                      DoubleVector& result)
  {
    // Factorise the matrix and back-substitute to obtain result.
    //  - matrix_pt: matrix to be factorised (must be square; only checked
    //               when PARANOID is defined).
    //  - rhs:       right-hand side (must not be distributed; only checked
    //               when PARANOID is defined).
    //  - result:    solution; built with the distribution of rhs if its
    //               distribution has not been built yet.
    // Sets Solution_time; the factors are discarded at the end unless
    // Enable_resolve is set.
    // Throws OomphLibError from the PARANOID checks and from factorise().
#ifdef PARANOID
    // check that the rhs vector is not distributed
    if (rhs.distribution_pt()->distributed())
    {
      throw OomphLibError("The vectors rhs and result must not be distributed",
                          OOMPH_CURRENT_FUNCTION,
                          OOMPH_EXCEPTION_LOCATION);
    }

    // check that the matrix is square
    if (matrix_pt->nrow() != matrix_pt->ncol())
    {
      throw OomphLibError("The matrix at matrix_pt must be square.",
                          OOMPH_CURRENT_FUNCTION,
                          OOMPH_EXCEPTION_LOCATION);
    }

    // check that the matrix and the rhs vector have the same nrow()
    if (matrix_pt->nrow() != rhs.nrow())
    {
      throw OomphLibError(
        "The matrix and the rhs vector must have the same number of rows.",
        OOMPH_CURRENT_FUNCTION,
        OOMPH_EXCEPTION_LOCATION);
    }

    // if the matrix is distributable then it too should have the same
    // communicator as the rhs vector and should not be distributed
    DistributableLinearAlgebraObject* dist_matrix_pt =
      dynamic_cast<DistributableLinearAlgebraObject*>(matrix_pt);
    if (dist_matrix_pt != 0)
    {
      if (dist_matrix_pt->distribution_pt()->communicator_pt()->nproc() > 1 &&
          dist_matrix_pt->distribution_pt()->distributed() == true)
      {
        throw OomphLibError(
          "The matrix must not be distributed (unless there is only one "
          "processor).",
          OOMPH_CURRENT_FUNCTION,
          OOMPH_EXCEPTION_LOCATION);
      }
      OomphCommunicator temp_comm(*rhs.distribution_pt()->communicator_pt());
      if (!(temp_comm == *dist_matrix_pt->distribution_pt()->communicator_pt()))
      {
        throw OomphLibError(
          "The matrix matrix_pt must have the same communicator as the "
          "vector rhs.",
          OOMPH_CURRENT_FUNCTION,
          OOMPH_EXCEPTION_LOCATION);
      }
    }

    // if the result vector's distribution has been set up then it must be
    // identical to the distribution of the rhs vector
    if (result.distribution_built())
    {
      if (!(*result.distribution_pt() == *rhs.distribution_pt()))
      {
        throw OomphLibError(
          "The result vector distribution has been setup; it must have the "
          "same distribution as the rhs vector.",
          OOMPH_CURRENT_FUNCTION,
          OOMPH_EXCEPTION_LOCATION);
      }
    }
#endif

    // Give the result vector the distribution of the rhs if it has none
    if (!result.distribution_built())
    {
      result.build(rhs.distribution_pt(), 0.0);
    }

    // Time the solver
    double t_start = TimingHelpers::timer();

    // factorise
    factorise(matrix_pt);

    // Set the solver's distribution. This has to happen AFTER factorise():
    // factorise() calls clean_up_memory(), which clears the distribution,
    // so building it beforehand would have no lasting effect.
    this->build_distribution(rhs.distribution_pt());

    // backsubstitute
    backsub(rhs, result);

    // Doc time for solver
    double t_end = TimingHelpers::timer();

    Solution_time = t_end - t_start;
    if (Doc_time)
    {
      oomph_info << std::endl
                 << "CPU for solve with DenseLU   : "
                 << TimingHelpers::convert_secs_to_formatted_string(
                      Solution_time)
                 << std::endl
                 << std::endl;
    }

    // If we are not resolving then delete storage (this also clears the
    // solver's distribution again)
    if (!Enable_resolve)
    {
      clean_up_memory();
    }
  }
 
  //=============================================================================
  /// Linear-algebra-type solver: Takes pointer to a matrix and rhs
  /// vector and returns the solution of the linear system.
  //=============================================================================
  void DenseLU::solve(DoubleMatrixBase* const& matrix_pt,
                      const Vector<double>& rhs,
                      Vector<double>& result)
  {
    // Factorise the matrix and back-substitute using plain Vector<double>
    // objects. No checks are made on the arguments here. Sets
    // Solution_time (measured with clock()) and discards the factors at
    // the end unless Enable_resolve is set.

    // Clock reading before the factorisation
    const clock_t clock_begin = clock();

    // LU decomposition followed by forward/backward substitution
    factorise(matrix_pt);
    backsub(rhs, result);

    // Clock reading after the substitution; convert the ticks to seconds
    const clock_t clock_end = clock();
    Solution_time = double(clock_end - clock_begin) / CLOCKS_PER_SEC;

    // Document the solve time on request
    if (Doc_time)
    {
      oomph_info << "CPU for solve with DenseLU   : "
                 << TimingHelpers::convert_secs_to_formatted_string(
                      Solution_time)
                 << std::endl;
    }

    // Keep the factors only if a later resolve has been requested
    if (!Enable_resolve)
    {
      clean_up_memory();
    }
  }
 
  //==================================================================
  /// Solver: Takes pointer to problem and returns the results Vector
  /// which contains the solution of the linear system defined by
  /// the problem's residual Vector. (Jacobian assembled by FD).
  //==================================================================
  void FD_LU::solve(Problem* const& problem_pt, DoubleVector& result)
  {
    // Assemble the dense Jacobian by finite differencing, solve the
    // resulting linear system with the matrix-based solve(...) and store
    // the sign of the Jacobian's determinant in the problem. Timings are
    // measured with clock() and written to oomph_info when Doc_time is set.

    // Clock reading for the whole call
    const clock_t overall_begin = clock();

#ifdef PARANOID
    // A result vector that has already been built must not be distributed
    if (result.built() && result.distributed())
    {
      throw OomphLibError("The result vector must not be distributed",
                          OOMPH_CURRENT_FUNCTION,
                          OOMPH_EXCEPTION_LOCATION);
    }
#endif

    // Size of the linear system
    unsigned long n_dof = problem_pt->ndof();

    // Containers to be filled by the problem: residuals and dense Jacobian
    DoubleVector residuals;
    DenseDoubleMatrix jacobian(n_dof);

    // Finite-difference assembly of the Jacobian (very inefficient); only
    // the assembly itself contributes to Jacobian_setup_time
    const clock_t assembly_begin = clock();
    problem_pt->get_fd_jacobian(residuals, jacobian);
    const clock_t assembly_end = clock();
    Jacobian_setup_time =
      double(assembly_end - assembly_begin) / CLOCKS_PER_SEC;

    // Document the assembly time on request
    if (Doc_time)
    {
      oomph_info << std::endl
                 << "CPU for setup of Dense Jacobian: "
                 << TimingHelpers::convert_secs_to_formatted_string(
                      Jacobian_setup_time)
                 << std::endl
                 << std::endl;
    }

    // Solve by dense LU decomposition (not efficient)
    solve(&jacobian, residuals, result);

    // The factorisation has stored the sign of the determinant; pass it on
    problem_pt->sign_of_jacobian() = Sign_of_determinant_of_matrix;

    // Document the total time (assembly + solve) on request
    const clock_t overall_end = clock();
    const double total_time =
      double(overall_end - overall_begin) / CLOCKS_PER_SEC;
    if (Doc_time)
    {
      oomph_info << "CPU for FD DenseLU LinearSolver: "
                 << TimingHelpers::convert_secs_to_formatted_string(total_time)
                 << std::endl
                 << std::endl;
    }
  }
 
 
  //===================================================================
  // Interface to SuperLU wrapper
  //===================================================================
  extern "C"
  {
    // Declaration (C linkage) of the entry point of the serial SuperLU
    // wrapper. No definition is visible in this part of the file, so the
    // symbol is presumably resolved at link time -- TODO confirm.
    // The routine takes thirteen arguments, all passed by pointer, and
    // returns an int. The parameters are unnamed here, so their individual
    // meaning and order cannot be read off this declaration.
    // NOTE(review): consult the wrapper's own source before changing any
    // call; the declaration must match the wrapper's definition exactly.
    int superlu(int*,
                int*,
                int*,
                int*,
                double*,
                int*,
                int*,
                double*,
                int*,
                int*,
                int*,
                void*,
                int*);
  }
 
 
#ifdef OOMPH_HAS_MPI
  //===================================================================
  // Interface to SuperLU_DIST wrapper
  //===================================================================
  extern "C"
  {
    // Declarations (C linkage) of the SuperLU_DIST wrapper routines. They
    // are only available when OOMPH_HAS_MPI is defined. No definitions are
    // visible in this part of the file.
    // NOTE(review): the parameter names suggest compressed-column storage
    // (row_index/col_start) for the global-matrix routine and
    // compressed-row storage (col_index/row_start) for the distributed
    // routine -- presumably so, but confirm against the wrapper source.

    // Interface to distributed SuperLU solver where each processor
    // holds the entire matrix
    void superlu_dist_global_matrix(int opt_flag,
                                    int allow_permutations,
                                    int n,
                                    int nnz,
                                    double* values,
                                    int* row_index,
                                    int* col_start,
                                    double* b,
                                    int nprow,
                                    int npcol,
                                    int doc,
                                    void** data,
                                    int* info,
                                    MPI_Comm comm);

    // Interface to distributed SuperLU solver where each processor
    // holds part of the matrix
    void superlu_dist_distributed_matrix(int opt_flag,
                                         int allow_permutations,
                                         int n,
                                         int nnz_local,
                                         int nrow_local,
                                         int first_row,
                                         double* values,
                                         int* col_index,
                                         int* row_start,
                                         double* b,
                                         int nprow,
                                         int npcol,
                                         int doc,
                                         void** data,
                                         int* info,
                                         MPI_Comm comm);

    // helper method - just calls the superlu method dCompRow_to_CompCol to
    // convert the c-style vectors of a cr matrix to a cc matrix.
    // The converted arrays are handed back through the three
    // pointer-to-pointer arguments (cc_values, cc_index, cc_start).
    // NOTE(review): who owns/frees the returned arrays is not visible
    // here -- confirm before use.
    void superlu_cr_to_cc(int nrow,
                          int ncol,
                          int nnz,
                          double* cr_values,
                          int* cr_index,
                          int* cr_start,
                          double** cc_values,
                          int** cc_index,
                          int** cc_start);
  }
#endif
 
 
  //===================================================================
  // Interface to SuperLU wrapper extras
  //===================================================================
  extern "C"
  {
    // Declarations (C linkage) of the memory-statistics helpers of the
    // serial SuperLU wrapper. Both take no arguments and return the byte
    // count as a double. They are called from SuperLUSolver's
    // memory-usage query functions when the serial solver is in use.

    /// Function to calculate the number of bytes used to store the
    /// LU factors
    double get_lu_factor_memory_usage_in_bytes();

    /// Function to calculate the number of bytes used in calculating
    /// and storing the LU factors
    double get_total_memory_usage_in_bytes();
  }
 
#ifdef OOMPH_HAS_MPI
  //===================================================================
  // Interface to SuperLU_DIST wrapper extras
  //===================================================================
  extern "C"
  {
    // Declarations (C linkage) of the memory-statistics helpers of the
    // SuperLU_DIST wrapper; only available when OOMPH_HAS_MPI is defined.
    // Both take no arguments and return the byte count as a double. They
    // are called from SuperLUSolver's memory-usage query functions when
    // the distributed solver is in use.

    /// Function to calculate the number of bytes used to store the
    /// LU factors
    double get_lu_factor_memory_usage_in_bytes_dist();

    /// Function to calculate the number of bytes used in calculating
    /// and storing the LU factors
    double get_total_memory_usage_in_bytes_dist();
  }
#endif
 
  //=============================================================================
  /// How much memory do the LU factors take up? In bytes
  /// NOTE: This has been scraped from dQuerySpace(...) in dmemory.c in
  ///                 external_src/oomph_superlu_4.3
  //=============================================================================
  double SuperLUSolver::get_memory_usage_for_lu_factors()
  {
    // Returns the number of bytes occupied by the stored LU factors, as
    // reported by the SuperLU wrapper in use, or 0.0 if no factors are
    // available for the selected solver type.

    // Serial SuperLU with factors present?
    const bool serial_factors_available =
      (Solver_type != Distributed) && (Serial_f_factors != 0);
    if (serial_factors_available)
    {
      return get_lu_factor_memory_usage_in_bytes();
    }

#ifdef OOMPH_HAS_MPI
    // SuperLU_DIST with solver data present?
    const bool dist_factors_available =
      (Solver_type == Distributed) && (Dist_solver_data_pt != 0);
    if (dist_factors_available)
    {
      return get_lu_factor_memory_usage_in_bytes_dist();
    }
#endif

    // No factors have been computed, so there is nothing to report
    return 0.0;
  } // End of get_memory_usage_for_lu_factors
 
 
  //=============================================================================
  /// How much memory was used in total? In bytes
  /// NOTE: This has been scraped from dQuerySpace(...) in dmemory.c in
  ///                 external_src/oomph_superlu_4.3
  //=============================================================================
  double SuperLUSolver::get_total_needed_memory()
  {
    // Returns the total number of bytes used in calculating and storing
    // the LU factors, as reported by the SuperLU wrapper in use, or 0.0 if
    // no factors are available for the selected solver type.

    // Serial SuperLU with factors present?
    const bool serial_factors_available =
      (Solver_type != Distributed) && (Serial_f_factors != 0);
    if (serial_factors_available)
    {
      return get_total_memory_usage_in_bytes();
    }

#ifdef OOMPH_HAS_MPI
    // SuperLU_DIST with solver data present?
    const bool dist_factors_available =
      (Solver_type == Distributed) && (Dist_solver_data_pt != 0);
    if (dist_factors_available)
    {
      return get_total_memory_usage_in_bytes_dist();
    }
#endif

    // No factors have been computed, so there is nothing to report
    return 0.0;
  } // End of get_total_needed_memory
 
 
  //==========================================================================
  /// Solver: Takes pointer to problem and returns the results Vector
  /// which contains the solution of the linear system defined by
  /// the problem's fully assembled Jacobian and residual Vector.
  //==========================================================================
  void SuperLUSolver::solve(Problem* const& problem_pt, DoubleVector& result)
  {
    // wipe memory
    this->clean_up_memory();
 
#ifdef OOMPH_HAS_MPI
    // USING SUPERLU DIST
    /// //////////////////
    if (Solver_type == Distributed ||
        (Solver_type == Default && problem_pt->communicator_pt()->nproc() > 1))
    {
      // init the timers
      double t_start = TimingHelpers::timer();
 
      // number of dofs
      unsigned n_dof = problem_pt->ndof();
 
      // set the distribution
      LinearAlgebraDistribution dist(
        problem_pt->communicator_pt(), n_dof, !Dist_use_global_solver);
      this->build_distribution(dist);
 
      // Take a copy of Delete_matrix_data
      bool copy_of_Delete_matrix_data = Dist_delete_matrix_data;
 
      // Set Delete_matrix to true
      Dist_delete_matrix_data = true;
 
      // Use the distributed version of SuperLU_DIST?
      if (!Dist_use_global_solver)
      {
        // Initialise timer
        double t_start = TimingHelpers::timer();
 
        // Storage for the residuals vector
        DoubleVector residuals(this->distribution_pt(), 0.0);
 
        // Get the sparse jacobian and residuals of the problem
        CRDoubleMatrix jacobian(this->distribution_pt());
        problem_pt->get_jacobian(residuals, jacobian);
 
        // Doc time for setup
        double t_end = TimingHelpers::timer();
        Jacobian_setup_time = t_end - t_start;
        if (Doc_time)
        {
          oomph_info << "Time to set up CRDoubleMatrix Jacobian         : "
                     << TimingHelpers::convert_secs_to_formatted_string(
                          Jacobian_setup_time)
                     << std::endl;
        }
 
        // Now call the linear algebra solve, if desired
        if (!Suppress_solve)
        {
          // If the distribution of the result has been build and
          // does not match that of
          // the solver then redistribute before the solve and return
          // to the incoming distribution afterwards.
          if ((result.built()) &&
              (!(*result.distribution_pt() == *this->distribution_pt())))
          {
            LinearAlgebraDistribution temp_global_dist(
              result.distribution_pt());
            result.build(this->distribution_pt(), 0.0);
            solve(&jacobian, residuals, result);
            result.redistribute(&temp_global_dist);
          }
          else
          {
            solve(&jacobian, residuals, result);
          }
        }
      }
      // Otherwise its the global solve version
      else
      {
        // Storage for the residuals vector
        // A non-distriubted residuals vector
        LinearAlgebraDistribution dist(
          problem_pt->communicator_pt(), problem_pt->ndof(), false);
        DoubleVector residuals(&dist, 0.0);
        CRDoubleMatrix jacobian(&dist);
 
        // Get the sparse jacobian and residuals of the problem
        problem_pt->get_jacobian(residuals, jacobian);
 
        // Doc time for setup
        double t_end = TimingHelpers::timer();
        Jacobian_setup_time = t_end - t_start;
        if (Doc_time)
        {
          oomph_info << "Time to set up CR Jacobian    : "
                     << TimingHelpers::convert_secs_to_formatted_string(
                          Jacobian_setup_time)
                     << std::endl;
        }
 
        // Now call the linear algebra solve, if desired
        if (!Suppress_solve)
        {
          // If the result distribution has been built and
          // does not match the global distribution
          // the redistribute before the solve and then return to the
          // distributed version afterwards
          if ((result.built()) && (!(*result.distribution_pt() == dist)))
          {
            LinearAlgebraDistribution temp_global_dist(
              result.distribution_pt());
            result.build(&dist, 0.0);
            solve(&jacobian, residuals, result);
            result.redistribute(&temp_global_dist);
          }
          else
          {
            solve(&jacobian, residuals, result);
          }
        }
      }
      // Set Delete_matrix back to original value
      Dist_delete_matrix_data = copy_of_Delete_matrix_data;
    }
 
    // OTHERWISE WE ARE USING SUPERLU (SERIAL)
    /// ///////////////////////////////////////
    else
#endif
    {
      // set the solver distribution
      LinearAlgebraDistribution dist(
        problem_pt->communicator_pt(), problem_pt->ndof(), false);
      this->build_distribution(dist);
 
      // Allocate storage for the residuals vector
      DoubleVector residuals(dist, 0.0);
 
      // Use the compressed row version?
      if (Serial_compressed_row_flag)
      {
        // Initialise timer
        double t_start = TimingHelpers::timer();
 
        // Get the sparse jacobian and residuals of the problem
        CRDoubleMatrix CR_jacobian(this->distribution_pt());
        problem_pt->get_jacobian(residuals, CR_jacobian);
 
        // If we want to compute the gradient for the globally convergent
        // Newton method, then do it here
        if (Compute_gradient)
        {
          // Compute it
          CR_jacobian.multiply_transpose(residuals,
                                         Gradient_for_glob_conv_newton_solve);
          // Set the flag
          Gradient_has_been_computed = true;
        }
 
        // Doc time for setup
        double t_end = TimingHelpers::timer();
        Jacobian_setup_time = t_end - t_start;
        if (Doc_time)
        {
          oomph_info << std::endl
                     << "Time to set up CRDoubleMatrix Jacobian : "
                     << TimingHelpers::convert_secs_to_formatted_string(
                          Jacobian_setup_time)
                     << std::endl;
        }
 
        // Now call the linear algebra solve, if desired
        if (!Suppress_solve)
        {
          // If the result vector is built and distributed
          // then need to redistribute into the same form as the
          // RHS (non-distributed)
          if ((result.built()) &&
              (!(*result.distribution_pt() == *this->distribution_pt())))
          {
            LinearAlgebraDistribution temp_global_dist(
              result.distribution_pt());
            result.build(this->distribution_pt(), 0.0);
            solve(&CR_jacobian, residuals, result);
            result.redistribute(&temp_global_dist);
          }
          // Otherwise just solve
          else
          {
            solve(&CR_jacobian, residuals, result);
          }
        }
      }
      // Otherwise its the compressed column version
      else
      {
        // Initialise timer
        double t_start = TimingHelpers::timer();
 
        // Get the sparse jacobian and residuals of the problem
        CCDoubleMatrix CC_jacobian;
        problem_pt->get_jacobian(residuals, CC_jacobian);
 
        // If we want to compute the gradient for the globally convergent
        // Newton method, then do it here
        if (Compute_gradient)
        {
          // Compute it
          CC_jacobian.multiply_transpose(residuals,
                                         Gradient_for_glob_conv_newton_solve);
          // Set the flag
          Gradient_has_been_computed = true;
        }
 
        // Doc time for setup
        double t_end = TimingHelpers::timer();
        Jacobian_setup_time = t_end - t_start;
        if (Doc_time)
        {
          oomph_info << "\nTime to set up CCDoubleMatrix Jacobian: "
                     << TimingHelpers::convert_secs_to_formatted_string(
                          Jacobian_setup_time)
                     << std::endl;
        }
 
        // Now call the linear algebra solve, if desired
        if (!Suppress_solve)
        {
          // If the result vector is built and distributed
          // then need to redistribute into the same form as the
          // RHS
          if ((result.built()) &&
              (!(*result.distribution_pt() == *this->distribution_pt())))
          {
            LinearAlgebraDistribution temp_global_dist(
              result.distribution_pt());
            result.build(this->distribution_pt(), 0.0);
            solve(&CC_jacobian, residuals, result);
            result.redistribute(&temp_global_dist);
          }
          // Otherwise just solve
          else
          {
            solve(&CC_jacobian, residuals, result);
          }
        }
      }
 
      // Set the sign of the jacobian
      //(this is computed in the LU decomposition phase)
      problem_pt->sign_of_jacobian() = Serial_sign_of_determinant_of_matrix;
    }
  }
 
  //=========================================================================
  /// Linear-algebra-type solver: Takes pointer to a matrix and rhs
  /// vector and returns the solution of the linear system in the
  /// result vector. The matrix is LU-factorised and the factors are
  /// then used for the back-substitution; the factors are discarded
  /// afterwards unless resolves have been enabled.
  /// Note: if Delete_matrix_data is true the function
  /// matrix_pt->clean_up_memory() will be used to wipe the matrix data.
  //=========================================================================
  void SuperLUSolver::solve(DoubleMatrixBase* const& matrix_pt,
                            const DoubleVector& rhs,
                            DoubleVector& result)
  {
    // Start the clock for the complete solve (factorisation plus
    // back-substitution)
    const double solve_start_time = TimingHelpers::timer();

#ifdef PARANOID
    // The rhs vector has to be built
    if (!rhs.built())
    {
      const std::string message = "The vectors rhs must be setup";
      throw OomphLibError(
        message, OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }

    // Only square matrices can be handled
    if (matrix_pt->nrow() != matrix_pt->ncol())
    {
      const std::string message = "The matrix at matrix_pt must be square.";
      throw OomphLibError(
        message, OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }

    // Reject a CRDoubleMatrix that has no entries: as the error message
    // records, SuperLU would segfault on its values array. Matrices that
    // are not CRDoubleMatrix objects are not checked here.
    CRDoubleMatrix* const checked_cr_pt =
      dynamic_cast<CRDoubleMatrix*>(matrix_pt);
    if ((checked_cr_pt != 0) && (checked_cr_pt->nnz() == 0))
    {
      const std::string message =
        "Attempted to call SuperLu on a CRDoubleMatrix with no entries, "
        "SuperLU would segfault (because the values array pt is "
        "uninitialised or null).";
      throw OomphLibError(
        message, OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }

    // Matrix and rhs must agree on the number of rows
    if (matrix_pt->nrow() != rhs.nrow())
    {
      const std::string message =
        "The matrix and the rhs vector must have the same number of rows.";
      throw OomphLibError(
        message, OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }

    // Distribution checks: a non-distributable matrix requires a
    // non-distributed rhs; a distributable matrix must share the
    // distribution of the rhs
    DistributableLinearAlgebraObject* const checked_dist_pt =
      dynamic_cast<DistributableLinearAlgebraObject*>(matrix_pt);
    if (checked_dist_pt == 0)
    {
      if (rhs.distribution_pt()->distributed())
      {
        const std::string message =
          "The matrix (matrix_pt) is not distributable and therefore the rhs"
          " vector must not be distributed";
        throw OomphLibError(
          message, OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
      }
    }
    else if (!(*checked_dist_pt->distribution_pt() == *rhs.distribution_pt()))
    {
      const std::string message =
        "The matrix matrix_pt must have the same distribution as the "
        "rhs vector.";
      throw OomphLibError(
        message, OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }

    // A result vector that has already been built must share the
    // distribution of the rhs
    if ((result.built()) &&
        (!(*result.distribution_pt() == *rhs.distribution_pt())))
    {
      const std::string message =
        "The result vector distribution has been setup; it must have the "
        "same distribution as the rhs vector.";
      throw OomphLibError(
        message, OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }
#endif

    // The solver adopts the distribution of the matrix when the matrix is
    // distributable, and that of the rhs otherwise
    DistributableLinearAlgebraObject* const distributable_matrix_pt =
      dynamic_cast<DistributableLinearAlgebraObject*>(matrix_pt);
    if (distributable_matrix_pt != 0)
    {
      this->build_distribution(distributable_matrix_pt->distribution_pt());
    }
    else
    {
      this->build_distribution(rhs.distribution_pt());
    }

    // Factorise the matrix and record how long that took
    const double factorise_start_time = TimingHelpers::timer();
    factorise(matrix_pt);
    const double factorise_end_time = TimingHelpers::timer();
    const double time_for_factorisation =
      factorise_end_time - factorise_start_time;

    // Memory statistics are only gathered for a CRDoubleMatrix (downcast
    // succeeds) that actually has entries
    CRDoubleMatrix* const cr_matrix_pt =
      dynamic_cast<CRDoubleMatrix*>(matrix_pt);
    if ((cr_matrix_pt != 0) && (cr_matrix_pt->nnz() != 0))
    {
      // Number of rows and of non-zero entries in the matrix
      unsigned row_count = cr_matrix_pt->nrow();
      unsigned nonzero_count = cr_matrix_pt->nnz();

      // Estimate (in bytes) of the storage used by the matrix
      double jacobian_bytes =
        ((2 * ((row_count + 1) * sizeof(int))) +
         (nonzero_count * (sizeof(int) + sizeof(double))));

      // Storage (in bytes) reported for the LU factors
      double lu_bytes = get_total_needed_memory();

      // Sum of the two contributions above
      double total_bytes = jacobian_bytes + lu_bytes;

      // Report the figures in MB if requested
      if (Doc_stats)
      {
        const double bytes_per_mb = 1.0e+06;
        oomph_info << "\nMemory statistics:"
                   << "\n - Memory used to store the Jacobian (MB): "
                   << jacobian_bytes / bytes_per_mb
                   << "\n - Memory used to store the LU factors (MB): "
                   << lu_bytes / bytes_per_mb
                   << "\n - Total memory used for matrix storage (MB): "
                   << total_bytes / bytes_per_mb << "\n"
                   << std::endl;
      }
    }

    // Back-substitute and record how long that took
    const double backsub_start_time = TimingHelpers::timer();
    backsub(rhs, result);
    const double backsub_end_time = TimingHelpers::timer();
    const double time_for_backsub = backsub_end_time - backsub_start_time;

    // Store (and optionally document) the total solve time
    const double solve_end_time = TimingHelpers::timer();
    Solution_time = solve_end_time - solve_start_time;
    if (Doc_time)
    {
      oomph_info
        << "Time for LU factorisation : "
        << TimingHelpers::convert_secs_to_formatted_string(
             time_for_factorisation)
        << "\nTime for back-substitution: "
        << TimingHelpers::convert_secs_to_formatted_string(time_for_backsub)
        << "\nTime for SuperLUSolver solve (ndof=" << matrix_pt->nrow() << "): "
        << TimingHelpers::convert_secs_to_formatted_string(Solution_time)
        << std::endl;
    }

    // Without resolves enabled there is no reason to keep the solver data
    if (!Enable_resolve)
    {
      clean_up_memory();
    }
  }
 
 
  //=============================================================================
  /// Solver: Takes pointer to problem and returns the results Vector
  /// which contains the solution of the transposed linear system defined
  /// by the problem's fully assembled Jacobian and residual Vector.
  //=============================================================================
  void SuperLUSolver::solve_transpose(Problem* const& problem_pt,
                                      DoubleVector& result)
  {
    // Assembles the problem's Jacobian and residuals and hands them to the
    // linear-algebra-type solve_transpose(...). If result has already been
    // built with a distribution that differs from the one used for the
    // solve, it is rebuilt for the solve and redistributed to its incoming
    // distribution afterwards. The nested solve is skipped entirely when
    // Suppress_solve is set.

    // Discard any data left over from a previous solve
    this->clean_up_memory();

#ifdef OOMPH_HAS_MPI
    // USING SUPERLU DIST
    /// //////////////////
    if (Solver_type == Distributed ||
        (Solver_type == Default && problem_pt->communicator_pt()->nproc() > 1))
    {
      // Scope guard: puts a bool flag back to the value it held when the
      // guard was created. Because this happens in the destructor the flag
      // is restored on every exit path from the enclosing scope, including
      // stack unwinding after an exception.
      struct FlagRestorer
      {
        explicit FlagRestorer(bool& flag) : Flag(flag), Saved_value(flag) {}
        ~FlagRestorer()
        {
          Flag = Saved_value;
        }
        bool& Flag;
        const bool Saved_value;
      };

      // Start the clock (used for the setup time of the global-solver
      // variant below)
      double t_start = TimingHelpers::timer();

      // number of dofs
      unsigned n_dof = problem_pt->ndof();

      // set the distribution
      LinearAlgebraDistribution dist(
        problem_pt->communicator_pt(), n_dof, !Dist_use_global_solver);
      this->build_distribution(dist);

      // Remember the current value of Dist_delete_matrix_data and force it
      // to true for the duration of this solve; the guard resets it when
      // this scope is left (normally or via an exception)
      FlagRestorer restore_delete_matrix_data(Dist_delete_matrix_data);
      Dist_delete_matrix_data = true;

      // Use the distributed version of SuperLU_DIST?
      if (!Dist_use_global_solver)
      {
        // Separate timer for the Jacobian setup of this variant
        double t_setup_start = TimingHelpers::timer();

        // Storage for the residuals vector
        DoubleVector residuals(this->distribution_pt(), 0.0);

        // Get the sparse jacobian and residuals of the problem
        CRDoubleMatrix jacobian(this->distribution_pt());
        problem_pt->get_jacobian(residuals, jacobian);

        // Doc time for setup
        double t_setup_end = TimingHelpers::timer();
        Jacobian_setup_time = t_setup_end - t_setup_start;
        if (Doc_time)
        {
          oomph_info << "Time to set up CRDoubleMatrix Jacobian         : "
                     << TimingHelpers::convert_secs_to_formatted_string(
                          Jacobian_setup_time)
                     << std::endl;
        }

        // Now call the linear algebra solve, if desired
        if (!Suppress_solve)
        {
          // Has result been built with a distribution other than the
          // solver's?
          const bool redistribute_result =
            (result.built()) &&
            (!(*result.distribution_pt() == *this->distribution_pt()));

          if (redistribute_result)
          {
            // Solve in the solver's distribution, then return the result
            // to its incoming distribution
            LinearAlgebraDistribution incoming_dist(result.distribution_pt());
            result.build(this->distribution_pt(), 0.0);
            solve_transpose(&jacobian, residuals, result);
            result.redistribute(&incoming_dist);
          }
          else
          {
            solve_transpose(&jacobian, residuals, result);
          }
        }
      }
      // Otherwise it's the global solve version
      else
      {
        // Non-distributed storage for the residuals vector and the
        // Jacobian
        LinearAlgebraDistribution global_dist(
          problem_pt->communicator_pt(), problem_pt->ndof(), false);
        DoubleVector residuals(&global_dist, 0.0);
        CRDoubleMatrix jacobian(&global_dist);

        // Get the sparse jacobian and residuals of the problem
        problem_pt->get_jacobian(residuals, jacobian);

        // Doc time for setup
        double t_end = TimingHelpers::timer();
        Jacobian_setup_time = t_end - t_start;
        if (Doc_time)
        {
          oomph_info << "Time to set up CR Jacobian    : "
                     << TimingHelpers::convert_secs_to_formatted_string(
                          Jacobian_setup_time)
                     << std::endl;
        }

        // Now call the linear algebra solve, if desired
        if (!Suppress_solve)
        {
          // Has result been built with a distribution other than the
          // global one?
          const bool redistribute_result =
            (result.built()) && (!(*result.distribution_pt() == global_dist));

          if (redistribute_result)
          {
            // Solve in the global distribution, then return the result
            // to its incoming distribution
            LinearAlgebraDistribution incoming_dist(result.distribution_pt());
            result.build(&global_dist, 0.0);
            solve_transpose(&jacobian, residuals, result);
            result.redistribute(&incoming_dist);
          }
          else
          {
            solve_transpose(&jacobian, residuals, result);
          }
        }
      }
      // Dist_delete_matrix_data is reset to its original value here by
      // the destructor of restore_delete_matrix_data
    }

    // OTHERWISE WE ARE USING SUPERLU (SERIAL)
    /// ///////////////////////////////////////
    else
#endif
    {
      // set the solver distribution
      LinearAlgebraDistribution dist(
        problem_pt->communicator_pt(), problem_pt->ndof(), false);
      this->build_distribution(dist);

      // Allocate storage for the residuals vector
      DoubleVector residuals(dist, 0.0);

      // Use the compressed row version?
      if (Serial_compressed_row_flag)
      {
        // Initialise timer
        double t_start = TimingHelpers::timer();

        // Get the sparse jacobian and residuals of the problem
        CRDoubleMatrix CR_jacobian(this->distribution_pt());
        problem_pt->get_jacobian(residuals, CR_jacobian);

        // If we want to compute the gradient for the globally convergent
        // Newton method, then do it here
        if (Compute_gradient)
        {
          // Compute it
          CR_jacobian.multiply_transpose(residuals,
                                         Gradient_for_glob_conv_newton_solve);
          // Set the flag
          Gradient_has_been_computed = true;
        }

        // Doc time for setup
        double t_end = TimingHelpers::timer();
        Jacobian_setup_time = t_end - t_start;
        if (Doc_time)
        {
          oomph_info << std::endl
                     << "Time to set up CRDoubleMatrix Jacobian: "
                     << TimingHelpers::convert_secs_to_formatted_string(
                          Jacobian_setup_time)
                     << std::endl;
        }

        // Now call the linear algebra solve, if desired
        if (!Suppress_solve)
        {
          // Has result been built with a distribution other than the
          // solver's (non-distributed) one?
          const bool redistribute_result =
            (result.built()) &&
            (!(*result.distribution_pt() == *this->distribution_pt()));

          if (redistribute_result)
          {
            LinearAlgebraDistribution incoming_dist(result.distribution_pt());
            result.build(this->distribution_pt(), 0.0);
            solve_transpose(&CR_jacobian, residuals, result);
            result.redistribute(&incoming_dist);
          }
          // Otherwise just solve
          else
          {
            solve_transpose(&CR_jacobian, residuals, result);
          }
        }
      }
      // Otherwise it's the compressed column version
      else
      {
        // Initialise timer
        double t_start = TimingHelpers::timer();

        // Get the sparse jacobian and residuals of the problem
        CCDoubleMatrix CC_jacobian;
        problem_pt->get_jacobian(residuals, CC_jacobian);

        // If we want to compute the gradient for the globally convergent
        // Newton method, then do it here
        if (Compute_gradient)
        {
          // Compute it
          CC_jacobian.multiply_transpose(residuals,
                                         Gradient_for_glob_conv_newton_solve);
          // Set the flag
          Gradient_has_been_computed = true;
        }

        // Doc time for setup
        double t_end = TimingHelpers::timer();
        Jacobian_setup_time = t_end - t_start;
        if (Doc_time)
        {
          oomph_info << "\nTime to set up CCDoubleMatrix Jacobian: "
                     << TimingHelpers::convert_secs_to_formatted_string(
                          Jacobian_setup_time)
                     << std::endl;
        }

        // Now call the linear algebra solve, if desired
        if (!Suppress_solve)
        {
          // Has result been built with a distribution other than the
          // solver's one?
          const bool redistribute_result =
            (result.built()) &&
            (!(*result.distribution_pt() == *this->distribution_pt()));

          if (redistribute_result)
          {
            LinearAlgebraDistribution incoming_dist(result.distribution_pt());
            result.build(this->distribution_pt(), 0.0);
            solve_transpose(&CC_jacobian, residuals, result);
            result.redistribute(&incoming_dist);
          }
          // Otherwise just solve
          else
          {
            solve_transpose(&CC_jacobian, residuals, result);
          }
        }
      }

      // Set the sign of the jacobian
      //(this is computed in the LU decomposition phase)
      problem_pt->sign_of_jacobian() = Serial_sign_of_determinant_of_matrix;
    }
  }
 
  //=========================================================================
  /// Linear-algebra-type solver: Takes pointer to a matrix and rhs
  /// vector and returns the solution of the transposed linear system in
  /// the result vector. The matrix is LU-factorised and the factors are
  /// then used for the transposed back-substitution; the factors are
  /// discarded afterwards unless resolves have been enabled.
  /// Note: if Delete_matrix_data is true the function
  /// matrix_pt->clean_up_memory() will be used to wipe the matrix data.
  //=========================================================================
  void SuperLUSolver::solve_transpose(DoubleMatrixBase* const& matrix_pt,
                                      const DoubleVector& rhs,
                                      DoubleVector& result)
  {
    // Start the clock for the complete solve (factorisation plus
    // transposed back-substitution)
    const double solve_start_time = TimingHelpers::timer();

#ifdef PARANOID
    // The rhs vector has to be built
    if (!rhs.built())
    {
      const std::string message = "The vectors rhs must be setup";
      throw OomphLibError(
        message, OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }

    // Only square matrices can be handled
    if (matrix_pt->nrow() != matrix_pt->ncol())
    {
      const std::string message = "The matrix at matrix_pt must be square.";
      throw OomphLibError(
        message, OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }

    // Reject a CRDoubleMatrix that has no entries: as the error message
    // records, SuperLU would segfault on its values array. Matrices that
    // are not CRDoubleMatrix objects are not checked here.
    CRDoubleMatrix* const checked_cr_pt =
      dynamic_cast<CRDoubleMatrix*>(matrix_pt);
    if ((checked_cr_pt != 0) && (checked_cr_pt->nnz() == 0))
    {
      const std::string message =
        "Attempted to call SuperLu on a CRDoubleMatrix with no entries, "
        "SuperLU would segfault (because the values array pt is "
        "uninitialised or null).";
      throw OomphLibError(
        message, OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }

    // Matrix and rhs must agree on the number of rows
    if (matrix_pt->nrow() != rhs.nrow())
    {
      const std::string message =
        "The matrix and the rhs vector must have the same number of rows.";
      throw OomphLibError(
        message, OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }

    // Distribution checks: a non-distributable matrix requires a
    // non-distributed rhs; a distributable matrix must share the
    // distribution of the rhs
    DistributableLinearAlgebraObject* const checked_dist_pt =
      dynamic_cast<DistributableLinearAlgebraObject*>(matrix_pt);
    if (checked_dist_pt == 0)
    {
      if (rhs.distribution_pt()->distributed())
      {
        const std::string message =
          "The matrix (matrix_pt) is not distributable and therefore the rhs"
          " vector must not be distributed";
        throw OomphLibError(
          message, OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
      }
    }
    else if (!(*checked_dist_pt->distribution_pt() == *rhs.distribution_pt()))
    {
      const std::string message =
        "The matrix matrix_pt must have the same distribution as the "
        "rhs vector.";
      throw OomphLibError(
        message, OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }

    // A result vector that has already been built must share the
    // distribution of the rhs
    if ((result.built()) &&
        (!(*result.distribution_pt() == *rhs.distribution_pt())))
    {
      const std::string message =
        "The result vector distribution has been setup; it must have the "
        "same distribution as the rhs vector.";
      throw OomphLibError(
        message, OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }
#endif

    // The solver adopts the distribution of the matrix when the matrix is
    // distributable, and that of the rhs otherwise
    DistributableLinearAlgebraObject* const distributable_matrix_pt =
      dynamic_cast<DistributableLinearAlgebraObject*>(matrix_pt);
    if (distributable_matrix_pt != 0)
    {
      this->build_distribution(distributable_matrix_pt->distribution_pt());
    }
    else
    {
      this->build_distribution(rhs.distribution_pt());
    }

    // Factorise the matrix and record how long that took
    const double factorise_start_time = TimingHelpers::timer();
    factorise(matrix_pt);
    const double factorise_end_time = TimingHelpers::timer();
    const double time_for_factorisation =
      factorise_end_time - factorise_start_time;

    // Memory statistics are only gathered for a CRDoubleMatrix (downcast
    // succeeds) that actually has entries
    CRDoubleMatrix* const cr_matrix_pt =
      dynamic_cast<CRDoubleMatrix*>(matrix_pt);
    if ((cr_matrix_pt != 0) && (cr_matrix_pt->nnz() != 0))
    {
      // Number of rows and of non-zero entries in the matrix
      unsigned row_count = cr_matrix_pt->nrow();
      unsigned nonzero_count = cr_matrix_pt->nnz();

      // Estimate (in bytes) of the storage used by the matrix
      double jacobian_bytes =
        ((2 * ((row_count + 1) * sizeof(int))) +
         (nonzero_count * (sizeof(int) + sizeof(double))));

      // Storage (in bytes) reported for the LU factors
      double lu_bytes = get_total_needed_memory();

      // Sum of the two contributions above
      double total_bytes = jacobian_bytes + lu_bytes;

      // Report the figures in MB if requested
      if (Doc_stats)
      {
        const double bytes_per_mb = 1.0e+06;
        oomph_info << "\nMemory statistics:"
                   << "\n - Memory used to store the Jacobian (MB): "
                   << jacobian_bytes / bytes_per_mb
                   << "\n - Memory used to store the LU factors (MB): "
                   << lu_bytes / bytes_per_mb
                   << "\n - Total memory used for matrix storage (MB): "
                   << total_bytes / bytes_per_mb << "\n"
                   << std::endl;
      }
    }

    // Back-substitute for the transposed system and record how long that
    // took
    const double backsub_start_time = TimingHelpers::timer();
    backsub_transpose(rhs, result);
    const double backsub_end_time = TimingHelpers::timer();
    const double time_for_backsub = backsub_end_time - backsub_start_time;

    // Store (and optionally document) the total solve time
    const double solve_end_time = TimingHelpers::timer();
    Solution_time = solve_end_time - solve_start_time;
    if (Doc_time)
    {
      oomph_info
        << "Time for LU factorisation : "
        << TimingHelpers::convert_secs_to_formatted_string(
             time_for_factorisation)
        << "\nTime for back-substitution: "
        << TimingHelpers::convert_secs_to_formatted_string(time_for_backsub)
        << "\nTime for SuperLUSolver solve (ndof=" << matrix_pt->nrow() << "): "
        << TimingHelpers::convert_secs_to_formatted_string(Solution_time)
        << std::endl;
    }

    // Without resolves enabled there is no reason to keep the solver data
    if (!Enable_resolve)
    {
      clean_up_memory();
    }
  } // End of solve_transpose
 
  //===============================================================
  /// Resolve the system for a given RHS
  //===============================================================
  void SuperLUSolver::resolve(const DoubleVector& rhs, DoubleVector& result)
  {
    // Time the back-substitution, which is all a resolve consists of
    const double resolve_start_time = TimingHelpers::timer();
    backsub(rhs, result);
    const double resolve_end_time = TimingHelpers::timer();

    // Record the time taken
    Solution_time = resolve_end_time - resolve_start_time;

    // Nothing more to do unless timings are to be documented
    if (!Doc_time)
    {
      return;
    }

    oomph_info << "Time for SuperLUSolver solve (ndof=" << rhs.nrow() << "): "
               << TimingHelpers::convert_secs_to_formatted_string(
                    resolve_end_time - resolve_start_time)
               << std::endl;
  }
 
 
  //===============================================================
  /// Resolve the (transposed) system for a given RHS
  //===============================================================
  void SuperLUSolver::resolve_transpose(const DoubleVector& rhs,
                                        DoubleVector& result)
  {
    // Time the back-substitution for the transposed system, which is all
    // a transposed resolve consists of
    const double resolve_start_time = TimingHelpers::timer();
    backsub_transpose(rhs, result);
    const double resolve_end_time = TimingHelpers::timer();

    // Record the time taken
    Solution_time = resolve_end_time - resolve_start_time;

    // Nothing more to do unless timings are to be documented
    if (!Doc_time)
    {
      return;
    }

    oomph_info << "Time for SuperLUSolver solve (ndof=" << rhs.nrow() << "): "
               << TimingHelpers::convert_secs_to_formatted_string(
                    resolve_end_time - resolve_start_time)
               << std::endl;
  }
 
 
  //===================================================================
  /// LU decompose the matrix addressed by matrix_pt by using
  /// the SuperLU solver. The resulting matrix factors are stored
  /// internally.
  //===================================================================
  void SuperLUSolver::factorise(DoubleMatrixBase* const& matrix_pt)
  {
    // Discard any data left over from a previous factorisation
    this->clean_up_memory();

#ifdef OOMPH_HAS_MPI
    // Is the matrix a distributable linear algebra object?
    DistributableLinearAlgebraObject* const distributable_pt =
      dynamic_cast<DistributableLinearAlgebraObject*>(matrix_pt);

    // Number of processors in the matrix's communicator; taken to be one
    // if the matrix is not distributable
    unsigned n_proc = 1;
    if (distributable_pt != 0)
    {
      n_proc = distributable_pt->distribution_pt()->communicator_pt()->nproc();
    }

    // The distributed solver is wanted if it was requested explicitly, or
    // if the choice is left to us, more than one processor is involved
    // and MPI has been initialised
    const bool distributed_solver_wanted =
      (Solver_type == Distributed) ||
      (Solver_type == Default && n_proc > 1 &&
       MPI_Helpers::mpi_has_been_initialised());

    // SuperLU_DIST can only be used for distributable matrices; every
    // other case falls through to the serial factorisation below
    if (distributed_solver_wanted && (distributable_pt != 0))
    {
      factorise_distributed(matrix_pt);
      Using_dist = true;
      return;
    }
#endif

    // Serial SuperLU
    factorise_serial(matrix_pt);
    Using_dist = false;
  }
 
#ifdef OOMPH_HAS_MPI
  //=============================================================================
  /// LU decompose the matrix addressed by matrix_pt using
  /// the SuperLU_DIST solver. The resulting matrix factors are stored
  /// internally.
  ///
  /// Handles three cases: a distributed CRDoubleMatrix, a non-distributed
  /// CRDoubleMatrix (converted to compressed column storage first) and a
  /// CCDoubleMatrix. Any other matrix type results in an OomphLibError.
  /// The matrix data is copied into internal storage (Dist_value_pt,
  /// Dist_index_pt, Dist_start_pt) before the factorisation; if
  /// Dist_delete_matrix_data is true the matrix's own storage is wiped once
  /// the copy has been taken. An OomphLibError is also thrown if the solver
  /// reports a non-zero status in Dist_info.
  //=============================================================================
  void SuperLUSolver::factorise_distributed(DoubleMatrixBase* const& matrix_pt)
  {
    // Check that we have a square matrix
#ifdef PARANOID
    int m = matrix_pt->ncol();
    int n = matrix_pt->nrow();
    if (n != m)
    {
      std::ostringstream error_message_stream;
      error_message_stream << "Can only solve for square matrices\n"
                           << "N, M " << n << " " << m << std::endl;

      throw OomphLibError(error_message_stream.str(),
                          OOMPH_CURRENT_FUNCTION,
                          OOMPH_EXCEPTION_LOCATION);
    }
#endif

    // Number of processors: start from the global communicator and, if the
    // matrix is a distributable object, use its own communicator instead
    unsigned nproc = MPI_Helpers::communicator_pt()->nproc();
    if (dynamic_cast<DistributableLinearAlgebraObject*>(matrix_pt) != 0)
    {
      nproc = dynamic_cast<DistributableLinearAlgebraObject*>(matrix_pt)
                ->distribution_pt()
                ->communicator_pt()
                ->nproc();
    }

    // Find number of rows and columns for the process grid
    // First guess at number of rows: the integer part of sqrt(nproc)
    int nprow = int(sqrt(double(nproc)));

    // Does this evenly divide the number of processors? If not, decrease it
    // until it does (terminates at 1 at the latest, which always divides)
    while (nprow > 1)
    {
      if (nproc % nprow == 0) break;
      nprow -= 1;
    }

    // Store number of rows/columns for process grid
    Dist_nprow = nprow;
    Dist_npcol = nproc / Dist_nprow;

    // Make sure any existing factors are deleted
    clean_up_memory();

    // Doc flag passed to the solver, using the inverted convention
    // (0/1) = (true/false), hence the negation of Doc_stats
    int doc = !Doc_stats;

    // Reset Info
    Dist_info = 0;

    // Flag for row and column permutations
    int allow_permutations = Dist_allow_row_and_col_permutations;

    // Is it a CRDoubleMatrix (distributed or not)?
    if (dynamic_cast<CRDoubleMatrix*>(matrix_pt) != 0)
    {
      // Get a cast pointer to the matrix
      CRDoubleMatrix* cr_matrix_pt = dynamic_cast<CRDoubleMatrix*>(matrix_pt);

      // Get the distribution from the matrix; the solver adopts it as its own
      this->build_distribution(cr_matrix_pt->distribution_pt());

#ifdef PARANOID
      // paranoid check that the matrix has been setup
      if (!cr_matrix_pt->built())
      {
        throw OomphLibError(
          "To apply SuperLUSolver to a CRDoubleMatrix - it must be built",
          OOMPH_CURRENT_FUNCTION,
          OOMPH_EXCEPTION_LOCATION);
      }
#endif

      // if the matrix is distributed then set up SuperLU_DIST in its
      // distributed-matrix mode
      if (cr_matrix_pt->distributed())
      {
        // Find the number of non-zero entries in the (local part of the)
        // matrix
        const int nnz_local = int(cr_matrix_pt->nnz());

        // Set up the pointers to the matrix.
        // NOTE: these arrays (accessed via value_pt, index_pt and
        // start_pt) may be modified by the SuperLU_DIST routines, and so
        // a copy must be taken if the matrix is to be preserved.

        // Copy values
        Dist_value_pt = new double[nnz_local];
        double* matrix_value_pt = cr_matrix_pt->value();
        for (int i = 0; i < nnz_local; i++)
        {
          Dist_value_pt[i] = matrix_value_pt[i];
        }

        // Copy column indices
        Dist_index_pt = new int[nnz_local];
        int* matrix_index_pt = cr_matrix_pt->column_index();
        for (int i = 0; i < nnz_local; i++)
        {
          Dist_index_pt[i] = matrix_index_pt[i];
        }

        // Copy row starts (one more entry than there are local rows)
        int nrow_local = cr_matrix_pt->nrow_local();
        Dist_start_pt = new int[nrow_local + 1];
        int* matrix_start_pt = cr_matrix_pt->row_start();
        for (int i = 0; i <= nrow_local; i++)
        {
          Dist_start_pt[i] = matrix_start_pt[i];
        }

        // Cache the global number of rows and the first local row now,
        // because the matrix may be cleared below
        int ndof = cr_matrix_pt->distribution_pt()->nrow();
        int first_row = cr_matrix_pt->first_row();

        // Now delete the matrix if we are allowed
        if (Dist_delete_matrix_data == true)
        {
          cr_matrix_pt->clear();
        }

        // Factorize (first argument 1 selects the factorisation phase;
        // 2 and 3 are used elsewhere in this file for back-substitution
        // and clean-up, respectively)
        superlu_dist_distributed_matrix(
          1,
          allow_permutations,
          ndof,
          nnz_local,
          nrow_local,
          first_row,
          Dist_value_pt,
          Dist_index_pt,
          Dist_start_pt,
          0,
          Dist_nprow,
          Dist_npcol,
          doc,
          &Dist_solver_data_pt,
          &Dist_info,
          this->distribution_pt()->communicator_pt()->mpi_comm());

        // Record that data is stored
        Dist_distributed_solve_data_allocated = true;
      }
      // else the CRDoubleMatrix is not distributed
      else
      {
        // Find the number of non-zero entries in the matrix
        const int nnz = int(cr_matrix_pt->nnz());

        // cache the number of rows
        int nrow = cr_matrix_pt->nrow();

        // Set up the pointers to the matrix.
        // NOTE: these arrays (accessed via value_pt, index_pt and
        // start_pt) may be modified by the SuperLU_DIST routines, and so
        // a copy must be taken if the matrix is to be preserved.

        // create the corresponding cc matrix; the converted data is
        // returned via Dist_value_pt, Dist_index_pt and Dist_start_pt
        // NOTE(review): these arrays are later released with delete[] in
        // clean_up_memory() -- confirm that superlu_cr_to_cc allocates them
        // in a compatible way.
        superlu_cr_to_cc(nrow,
                         nrow,
                         nnz,
                         cr_matrix_pt->value(),
                         cr_matrix_pt->column_index(),
                         cr_matrix_pt->row_start(),
                         &Dist_value_pt,
                         &Dist_index_pt,
                         &Dist_start_pt);

        // Delete the matrix if we are allowed
        if (Dist_delete_matrix_data == true)
        {
          cr_matrix_pt->clear();
        }

        // do the factorization (first argument 1 selects the
        // factorisation phase)
        superlu_dist_global_matrix(
          1,
          allow_permutations,
          nrow,
          nnz,
          Dist_value_pt,
          Dist_index_pt,
          Dist_start_pt,
          0,
          Dist_nprow,
          Dist_npcol,
          doc,
          &Dist_solver_data_pt,
          &Dist_info,
          this->distribution_pt()->communicator_pt()->mpi_comm());

        // Record that data is stored
        Dist_global_solve_data_allocated = true;
      }
    }

    // Or is it a CCDoubleMatrix?
    // NOTE(review): unlike the CRDoubleMatrix branch, this branch does not
    // call build_distribution() before using this->distribution_pt() in the
    // solver call below -- confirm the solver's distribution is valid here.
    else if (dynamic_cast<CCDoubleMatrix*>(matrix_pt))
    {
      // Get a cast pointer to the matrix
      CCDoubleMatrix* serial_matrix_pt =
        dynamic_cast<CCDoubleMatrix*>(matrix_pt);

      // Find the number of non-zero entries in the matrix
      const int nnz = int(serial_matrix_pt->nnz());

      // Find # of degrees of freedom (variables)
      int ndof = int(serial_matrix_pt->nrow());

      // Find the local number of degrees of freedom in the linear system
      // (the matrix is not distributed, so this is the global number)
      int ndof_local = ndof;

      // Set up the pointers to the matrix.
      // NOTE: these arrays (accessed via value_pt, index_pt and
      // start_pt) may be modified by the SuperLU_DIST routines, and so
      // a copy must be taken if the matrix is to be preserved.

      // Copy values
      Dist_value_pt = new double[nnz];
      double* matrix_value_pt = serial_matrix_pt->value();
      for (int i = 0; i < nnz; i++)
      {
        Dist_value_pt[i] = matrix_value_pt[i];
      }

      // copy row indices
      Dist_index_pt = new int[nnz];
      int* matrix_index_pt = serial_matrix_pt->row_index();
      for (int i = 0; i < nnz; i++)
      {
        Dist_index_pt[i] = matrix_index_pt[i];
      }

      // copy column starts (one more entry than there are columns)
      Dist_start_pt = new int[ndof_local + 1];
      int* matrix_start_pt = serial_matrix_pt->column_start();
      for (int i = 0; i <= ndof_local; i++)
      {
        Dist_start_pt[i] = matrix_start_pt[i];
      }

      // Delete the matrix if we are allowed
      if (Dist_delete_matrix_data == true)
      {
        serial_matrix_pt->clean_up_memory();
      }

      // do the factorization (first argument 1 selects the
      // factorisation phase)
      superlu_dist_global_matrix(
        1,
        allow_permutations,
        ndof,
        nnz,
        Dist_value_pt,
        Dist_index_pt,
        Dist_start_pt,
        0,
        Dist_nprow,
        Dist_npcol,
        doc,
        &Dist_solver_data_pt,
        &Dist_info,
        this->distribution_pt()->communicator_pt()->mpi_comm());

      // Record that data is stored
      Dist_global_solve_data_allocated = true;
    }
    // Otherwise throw an error
    else
    {
      std::ostringstream error_message_stream;
      error_message_stream << "SuperLUSolver implemented only for "
                           << " CCDoubleMatrix, CRDoubleMatrix\n"
                           << "and DistributedCRDoubleMatrix matrices\n";
      throw OomphLibError(error_message_stream.str(),
                          OOMPH_CURRENT_FUNCTION,
                          OOMPH_EXCEPTION_LOCATION);
    }

    // Throw an error if superLU returned an error status in info.
    if (Dist_info != 0)
    {
      std::ostringstream error_msg;
      error_msg << "SuperLU returned the error status code " << Dist_info
                << " . See the SuperLU documentation for what this means.";
      throw OomphLibError(
        error_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }
  }
#endif
 
  //===================================================================
  /// LU decompose the matrix addressed by matrix_pt by using
  /// the SuperLU solver. The resulting matrix factors are stored
  /// internally.
  //===================================================================
  void SuperLUSolver::factorise_serial(DoubleMatrixBase* const& matrix_pt)
  {
#ifdef PARANOID
    // A distributable matrix is only acceptable here if it is not actually
    // distributed
    DistributableLinearAlgebraObject* const distributable_pt =
      dynamic_cast<DistributableLinearAlgebraObject*>(matrix_pt);
    if ((distributable_pt != 0) && distributable_pt->distributed())
    {
      std::ostringstream error_message_stream;
      error_message_stream << "The matrix must not be distributed.";
      throw OomphLibError(error_message_stream.str(),
                          OOMPH_CURRENT_FUNCTION,
                          OOMPH_EXCEPTION_LOCATION);
    }
#endif

    // Number of rows, i.e. the number of degrees of freedom (variables)
    int n_row = matrix_pt->nrow();

#ifdef PARANOID
    // Only square matrices can be factorised
    const int n_col = matrix_pt->ncol();
    if (n_row != n_col)
    {
      std::ostringstream error_message_stream;
      error_message_stream << "Can only solve for square matrices\n"
                           << "N, M " << n_row << " " << n_col << std::endl;

      throw OomphLibError(error_message_stream.str(),
                          OOMPH_CURRENT_FUNCTION,
                          OOMPH_EXCEPTION_LOCATION);
    }
#endif

    // Pointers to the matrix's own storage (values, row/column indices and
    // column/row starts); these are handed to SuperLU without copying
    double* value_pt = 0;
    int* index_pt = 0;
    int* start_pt = 0;

    // Storage format flag handed to SuperLU: 1 for compressed row,
    // 0 for compressed column
    int transpose_flag = 0;

    // Number of non-zero entries in the matrix
    int n_nonzero = 0;

    // Doc flag (converted to int for SuperLU)
    int doc_flag = Doc_stats;

    // Compressed row storage?
    if (CRDoubleMatrix* const cr_pt = dynamic_cast<CRDoubleMatrix*>(matrix_pt))
    {
      // Remember the storage format for later back-substitutions
      Serial_compressed_row_flag = true;
      transpose_flag = 1;

      // Point at the internally stored values and indices
      n_nonzero = cr_pt->nnz();
      value_pt = cr_pt->value();
      index_pt = cr_pt->column_index();
      start_pt = cr_pt->row_start();
    }
    // Compressed column storage?
    else if (CCDoubleMatrix* const cc_pt =
               dynamic_cast<CCDoubleMatrix*>(matrix_pt))
    {
      // Remember the storage format for later back-substitutions
      Serial_compressed_row_flag = false;

      // Point at the internally stored values and indices
      n_nonzero = cc_pt->nnz();
      value_pt = cc_pt->value();
      index_pt = cc_pt->row_index();
      start_pt = cc_pt->column_start();
    }
    // Nothing else is supported
    else
    {
      throw OomphLibError("SuperLU only works with CR or CC Double matrices",
                          OOMPH_CURRENT_FUNCTION,
                          OOMPH_EXCEPTION_LOCATION);
    }

    // Release any previously stored factors so that factorising twice does
    // not leak memory
    clean_up_memory();

    // LU decomposition phase (operation flag 1); the return value is stored
    // as the sign of the matrix's determinant
    int op_flag = 1;
    Serial_sign_of_determinant_of_matrix = superlu(&op_flag,
                                                   &n_row,
                                                   &n_nonzero,
                                                   0,
                                                   value_pt,
                                                   index_pt,
                                                   start_pt,
                                                   0,
                                                   &n_row,
                                                   &transpose_flag,
                                                   &doc_flag,
                                                   &Serial_f_factors,
                                                   &Serial_info);

    // A non-zero status from SuperLU is treated as an error
    if (Serial_info != 0)
    {
      std::ostringstream error_msg;
      error_msg << "SuperLU returned the error status code " << Serial_info
                << " . See the SuperLU documentation for what this means.";
      throw OomphLibError(
        error_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }

    // Remember the size of the factorised system
    Serial_n_dof = n_row;
  }
 
  //=============================================================================
  /// Do the backsubstitution for SuperLUSolver.
  /// Note - this method performs no paranoid checks - these are all performed
  /// in solve(...) and resolve(...)
  //=============================================================================
  void SuperLUSolver::backsub(const DoubleVector& rhs, DoubleVector& result)
  {
#ifdef OOMPH_HAS_MPI
    // Factors produced by SuperLU_DIST need the distributed
    // back-substitution
    if (Using_dist)
    {
      backsub_distributed(rhs, result);
      return;
    }
#endif

    // Otherwise the factors came from the serial solver
    backsub_serial(rhs, result);
  }
 
 
  //=============================================================================
  /// Do the backsubstitution of the transposed system for SuperLUSolver.
  /// Note - this method performs no paranoid checks - these are all performed
  /// in solve(...) and resolve(...)
  //=============================================================================
  void SuperLUSolver::backsub_transpose(const DoubleVector& rhs,
                                        DoubleVector& result)
  {
#ifdef OOMPH_HAS_MPI
    // Factors produced by SuperLU_DIST need the distributed version of the
    // transposed back-substitution
    if (Using_dist)
    {
      backsub_transpose_distributed(rhs, result);
      return;
    }
#endif

    // Otherwise use the serial version of the transposed back-substitution
    backsub_transpose_serial(rhs, result);
  }
 
#ifdef OOMPH_HAS_MPI
  //=========================================================================
  /// Static flag that suppresses the warning issued when the distribution
  /// of the RHS vector does not match that of the solver.
  /// Default is false, i.e. the warning is issued.
  //=========================================================================
  bool SuperLUSolver::Suppress_incorrect_rhs_distribution_warning_in_resolve =
    false;
 
  //=============================================================================
  /// Do the backsubstitution for the SuperLU_DIST solver, using the factors
  /// stored by a previous call to factorise_distributed(...).
  ///
  /// If the distribution of rhs differs from that of the solver, rhs is
  /// temporarily redistributed to match (a warning is issued unless
  /// Suppress_incorrect_rhs_distribution_warning_in_resolve is true) and is
  /// returned to its original distribution before the function returns
  /// normally. The solution is returned in result, which is overwritten by
  /// a copy of the (possibly redistributed) rhs before the solve.
  ///
  /// Throws an OomphLibError if no factors are stored or if the solver
  /// reports a non-zero status in Dist_info. If PARANOID is defined, also
  /// throws if the rhs distribution has not been built or if an already
  /// set up result vector has a different distribution from the rhs.
  //=============================================================================
  void SuperLUSolver::backsub_distributed(const DoubleVector& rhs,
                                          DoubleVector& result)
  {
#ifdef PARANOID
    // check that the rhs vector is setup
    if (!rhs.distribution_pt()->built())
    {
      std::ostringstream error_message_stream;
      error_message_stream << "The vectors rhs must be setup";
      throw OomphLibError(error_message_stream.str(),
                          OOMPH_CURRENT_FUNCTION,
                          OOMPH_EXCEPTION_LOCATION);
    }
#endif
    // Take a copy of the distribution the rhs arrived with so that it can
    // be restored at the end
    LinearAlgebraDistribution rhs_distribution(rhs.distribution_pt());

    // check that the rhs distribution is the same as the distribution of
    // this solver. If not, redistribute and issue a warning
    if (!(*rhs.distribution_pt() == *this->distribution_pt()))
    {
      if (!Suppress_incorrect_rhs_distribution_warning_in_resolve)
      {
        std::ostringstream warning_stream;
        warning_stream << "The distribution of rhs vector does not match that "
                          "of the solver.\n";
        warning_stream << "The rhs will be redistributed, which is likely to "
                          "be inefficient\n";
        warning_stream
          << "To remove this warning you can either:\n"
          << "    i) Ensure that the rhs vector has the correct distribution\n"
          << "       before calling the resolve() function\n"
          << "or ii) Set the flag \n"
          << " SuperLUSolver::Suppress_incorrect_rhs_distribution_warning_in_"
             "resolve\n"
          << "       to be true\n\n";

        OomphLibWarning(warning_stream.str(),
                        "SuperLUSolver::resolve()",
                        OOMPH_EXCEPTION_LOCATION);
      }

      // Have to cast away const-ness (which tells us that we shouldn't really
      // be doing this!)
      const_cast<DoubleVector&>(rhs).redistribute(this->distribution_pt());
    }

#ifdef PARANOID
    // if the result vector is setup then check it has the same distribution
    // as the rhs (which, by now, has the distribution of the solver)
    if (result.distribution_built())
    {
      if (!(*result.distribution_pt() == *rhs.distribution_pt()))
      {
        std::ostringstream error_message_stream;
        error_message_stream
          << "The result vector distribution has been setup; it must have the "
          << "same distribution as the rhs vector.";
        throw OomphLibError(error_message_stream.str(),
                            OOMPH_CURRENT_FUNCTION,
                            OOMPH_EXCEPTION_LOCATION);
      }
    }
#endif
    // Doc flag passed to the solver, using the inverted convention
    // (0/1) = (true/false), hence the negation of Doc_stats
    int doc = !Doc_stats;

    // Reset Info
    Dist_info = 0;

    // number of DOFs
    int ndof = this->distribution_pt()->nrow();

    // Copy the rhs values to result; the solver is handed result's storage
    result = rhs;

    // Do the backsubstitution phase (first argument 2), using whichever
    // kind of factors was stored by the factorisation
    if (Dist_distributed_solve_data_allocated)
    {
      // Call distributed solver
      superlu_dist_distributed_matrix(
        2,
        -1,
        ndof,
        0,
        0,
        0,
        0,
        0,
        0,
        result.values_pt(),
        Dist_nprow,
        Dist_npcol,
        doc,
        &Dist_solver_data_pt,
        &Dist_info,
        this->distribution_pt()->communicator_pt()->mpi_comm());
    }
    else if (Dist_global_solve_data_allocated)
    {
      // Call global solver
      superlu_dist_global_matrix(
        2,
        -1,
        ndof,
        0,
        0,
        0,
        0,
        result.values_pt(),
        Dist_nprow,
        Dist_npcol,
        doc,
        &Dist_solver_data_pt,
        &Dist_info,
        this->distribution_pt()->communicator_pt()->mpi_comm());
    }
    else
    {
      throw OomphLibError("The matrix factors have not been stored",
                          OOMPH_CURRENT_FUNCTION,
                          OOMPH_EXCEPTION_LOCATION);
    }

    // Throw an error if superLU returned an error status in info.
    if (Dist_info != 0)
    {
      std::ostringstream error_msg;
      error_msg << "SuperLU returned the error status code " << Dist_info
                << " . See the SuperLU documentation for what this means.";
      throw OomphLibError(
        error_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }

    // Redistribute to original distribution
    // Have to cast away const-ness (which tells us that we shouldn't really
    // be doing this!)
    const_cast<DoubleVector&>(rhs).redistribute(&rhs_distribution);
  }
 
  //=============================================================================
  /// Backsubstitution for the transposed system with the distributed
  /// (SuperLU_DIST) solver. This has not been implemented: calling this
  /// function always throws an OomphLibError.
  //=============================================================================
  void SuperLUSolver::backsub_transpose_distributed(const DoubleVector& rhs,
                                                    DoubleVector& result)
  {
    // Assemble the complaint...
    std::ostringstream error_message_stream;
    error_message_stream
      << "This function hasn't been implemented yet. If you need it, "
         "implement it!"
      << std::endl;

    // ...and throw it
    throw OomphLibError(error_message_stream.str(),
                        OOMPH_CURRENT_FUNCTION,
                        OOMPH_EXCEPTION_LOCATION);
  }
#endif
 
  //================================================================
  /// Do the backsubstitution for (serial) SuperLU, using the factors
  /// stored by a previous factorisation. The solution is returned
  /// in result. Throws an OomphLibError if SuperLU reports a
  /// non-zero status.
  //================================================================
  void SuperLUSolver::backsub_serial(const DoubleVector& rhs,
                                     DoubleVector& result)
  {
    // Number of unknowns = global number of rows in the rhs
    int n_unknown = rhs.nrow();

#ifdef PARANOID
    // The rhs must have been set up
    if (!rhs.built())
    {
      throw OomphLibError("The rhs vector distribution must be setup.",
                          OOMPH_CURRENT_FUNCTION,
                          OOMPH_EXCEPTION_LOCATION);
    }

    // The rhs must be as long as the factorised system
    if (static_cast<int>(Serial_n_dof) != n_unknown)
    {
      throw OomphLibError(
        "RHS does not have the same dimension as the linear system",
        OOMPH_CURRENT_FUNCTION,
        OOMPH_EXCEPTION_LOCATION);
    }

    // The serial solver cannot handle a distributed rhs
    if (rhs.distribution_pt()->distributed())
    {
      throw OomphLibError("The rhs vector must not be distributed.",
                          OOMPH_CURRENT_FUNCTION,
                          OOMPH_EXCEPTION_LOCATION);
    }

    // A result vector that has already been set up must have the same
    // distribution as the rhs
    if (result.built() &&
        !(*rhs.distribution_pt() == *result.distribution_pt()))
    {
      throw OomphLibError("If the result distribution is setup then it must "
                          "be the same as the rhs distribution",
                          OOMPH_CURRENT_FUNCTION,
                          OOMPH_EXCEPTION_LOCATION);
    }
#endif

    // Copy the rhs into the result; SuperLU is handed the result's storage
    result = rhs;

    // We solve for a single right-hand side
    int n_rhs = 1;

    // Integer versions of the boolean flags, as required by SuperLU
    int transpose_flag = Serial_compressed_row_flag;
    int doc_flag = Doc_stats;

    // Back-substitution phase (operation flag 2)
    int op_flag = 2;
    superlu(&op_flag,
            &n_unknown,
            0,
            &n_rhs,
            0,
            0,
            0,
            result.values_pt(),
            &n_unknown,
            &transpose_flag,
            &doc_flag,
            &Serial_f_factors,
            &Serial_info);

    // A non-zero status from SuperLU is treated as an error
    if (Serial_info != 0)
    {
      std::ostringstream error_msg;
      error_msg << "SuperLU returned the error status code " << Serial_info
                << " . See the SuperLU documentation for what this means.";
      throw OomphLibError(
        error_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }
  }
 
  //================================================================
  /// Do the backsubstitution of the transposed system for (serial)
  /// SuperLU, using the factors stored by a previous factorisation.
  /// The solution is returned in result. Throws an OomphLibError if
  /// SuperLU reports a non-zero status.
  //================================================================
  void SuperLUSolver::backsub_transpose_serial(const DoubleVector& rhs,
                                               DoubleVector& result)
  {
    // Global number of rows in the rhs, i.e. the number of unknowns
    int n_dof = rhs.nrow();

#ifdef PARANOID
    // Has the rhs been set up?
    if (!rhs.built())
    {
      const std::string message = "The rhs vector distribution must be setup.";
      throw OomphLibError(
        message, OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }

    // Does the rhs have as many rows as the factorised system?
    if (static_cast<int>(Serial_n_dof) != n_dof)
    {
      const std::string message =
        "RHS does not have the same dimension as the linear system";
      throw OomphLibError(
        message, OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }

    // Is the rhs distributed? The serial solver cannot deal with that
    if (rhs.distribution_pt()->distributed())
    {
      const std::string message = "The rhs vector must not be distributed.";
      throw OomphLibError(
        message, OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }

    // If the result has been set up, does its distribution match the rhs?
    if (result.built() &&
        !(*rhs.distribution_pt() == *result.distribution_pt()))
    {
      const std::string message =
        "If the result distribution is setup then it must be the same as "
        "the rhs distribution";
      throw OomphLibError(
        message, OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }
#endif

    // Start from a copy of the rhs; SuperLU is handed the result's storage
    result = rhs;

    // Only one right-hand side is solved for
    int n_rhs = 1;

    // Integer flags for SuperLU. The transpose flag is the opposite of the
    // one used in the non-transposed back-substitution
    int transpose_flag = (!Serial_compressed_row_flag);
    int doc_flag = Doc_stats;

    // Back-substitution phase (operation flag 2)
    int op_flag = 2;
    superlu(&op_flag,
            &n_dof,
            0,
            &n_rhs,
            0,
            0,
            0,
            result.values_pt(),
            &n_dof,
            &transpose_flag,
            &doc_flag,
            &Serial_f_factors,
            &Serial_info);

    // Complain if SuperLU returned a non-zero status
    if (Serial_info != 0)
    {
      std::ostringstream error_msg;
      error_msg << "SuperLU returned the error status code " << Serial_info
                << " . See the SuperLU documentation for what this means.";
      throw OomphLibError(
        error_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }
  }
 
  //=============================================================================
  /// Clean up the memory
  //=============================================================================
  void SuperLUSolver::clean_up_memory()
  {
    // Serial solver: release the stored LU factors, if there are any
    if (Serial_f_factors != 0)
    {
      // Operation flag 3 requests the clean-up phase
      int op_flag = 3;
      int transpose_flag = Serial_compressed_row_flag;
      superlu(&op_flag,
              0, 0, 0, 0, 0, 0, 0, 0,
              &transpose_flag,
              0,
              &Serial_f_factors,
              &Serial_info);

      // Mark the factors as gone and forget the size of the system
      Serial_f_factors = 0;
      Serial_n_dof = 0;
    }

#ifdef OOMPH_HAS_MPI
    // Distributed solver: release any stored solver data
    if (Dist_solver_data_pt != 0)
    {
      // Doc flag passed to the solver, using the inverted convention
      // (0/1) = (true/false)
      int doc_flag = !Doc_stats;

      // Reset the info flag
      Dist_info = 0;

      // Global number of degrees of freedom
      int n_dof = this->distribution_pt()->nrow();

      // Factors of a distributed matrix (clean-up phase: first argument 3)
      if (Dist_distributed_solve_data_allocated)
      {
        superlu_dist_distributed_matrix(
          3, -1, n_dof,
          0, 0, 0, 0, 0, 0, 0,
          Dist_nprow, Dist_npcol, doc_flag,
          &Dist_solver_data_pt, &Dist_info,
          this->distribution_pt()->communicator_pt()->mpi_comm());
        Dist_distributed_solve_data_allocated = false;
      }

      // Factors of a global matrix (clean-up phase: first argument 3)
      if (Dist_global_solve_data_allocated)
      {
        superlu_dist_global_matrix(
          3, -1, n_dof,
          0, 0, 0, 0, 0,
          Dist_nprow, Dist_npcol, doc_flag,
          &Dist_solver_data_pt, &Dist_info,
          this->distribution_pt()->communicator_pt()->mpi_comm());
        Dist_global_solve_data_allocated = false;
      }

      // The solver data is no longer available
      Dist_solver_data_pt = 0;

      // Free the internal copy of the matrix, one array at a time
      delete[] Dist_value_pt;
      Dist_value_pt = 0;
      delete[] Dist_index_pt;
      Dist_index_pt = 0;
      delete[] Dist_start_pt;
      Dist_start_pt = 0;

      // Finally forget the distribution
      this->clear_distribution();
    }
#endif
  }
 
} // namespace oomph