partitioning.cc
// LIC// ====================================================================
// LIC// This file forms part of oomph-lib, the object-oriented,
// LIC// multi-physics finite-element library, available
// LIC// at http://www.oomph-lib.org.
// LIC//
// LIC// Copyright (C) 2006-2024 Matthias Heil and Andrew Hazel
// LIC//
// LIC// This library is free software; you can redistribute it and/or
// LIC// modify it under the terms of the GNU Lesser General Public
// LIC// License as published by the Free Software Foundation; either
// LIC// version 2.1 of the License, or (at your option) any later version.
// LIC//
// LIC// This library is distributed in the hope that it will be useful,
// LIC// but WITHOUT ANY WARRANTY; without even the implied warranty of
// LIC// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// LIC// Lesser General Public License for more details.
// LIC//
// LIC// You should have received a copy of the GNU Lesser General Public
// LIC// License along with this library; if not, write to the Free Software
// LIC// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
// LIC// 02110-1301 USA.
// LIC//
// LIC// The authors may be contacted at oomph-lib@maths.man.ac.uk.
// LIC//
// LIC//====================================================================
#include <float.h>

#include "partitioning.h"
#include "mesh.h"
#include "refineable_mesh.h"
// Include to fill in additional_setup_shared_node_scheme() function
#include "refineable_mesh.template.cc"

#ifdef OOMPH_TRANSITION_TO_VERSION_3

// For the new METIS API we need to use symbols defined in the standard header
// which aren't available in the current frozen (old) version of METIS.
// Version 3 will (presumably) have this header in the include path as standard
#include "metis.h"

#endif

namespace oomph
{
  //====================================================================
  /// Namespace for METIS graph partitioning routines
  //====================================================================
  namespace METIS
  {
    /// Default function that translates spatial
    /// error into weight for METIS partitioning (unit weight regardless
    /// of input).
    void default_error_to_weight_fct(const double& spatial_error,
                                     const double& max_error,
                                     const double& min_error,
                                     int& weight)
    {
      weight = 1;
    }

    /// Function pointer to function that translates spatial
    /// error into weight for METIS partitioning.
    ErrorToWeightFctPt Error_to_weight_fct_pt = &default_error_to_weight_fct;

  } // namespace METIS
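
  // For illustration: a driver code could bias the partitioning by
  // installing its own weight function before calling METIS::partition_mesh().
  // This is a hypothetical user-side sketch (my_error_to_weight is not part
  // of this file); it maps errors linearly onto weights between 1 and 10:
  //
  //   void my_error_to_weight(const double& spatial_error,
  //                           const double& max_error,
  //                           const double& min_error,
  //                           int& weight)
  //   {
  //     // Weight elements with larger errors more heavily (DBL_MIN guards
  //     // against division by zero if all errors are identical)
  //     weight = 1 + int(9.0 * (spatial_error - min_error) /
  //                      (max_error - min_error + DBL_MIN));
  //   }
  //
  //   METIS::Error_to_weight_fct_pt = &my_error_to_weight;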


  //==================================================================
  /// Partition mesh uniformly by dividing elements
  /// equally over the partitions, in the order
  /// in which they are returned by the problem.
  /// On return, element_domain[ielem] contains the number
  /// of the domain [0,1,...,ndomain-1] to which
  /// element ielem has been assigned.
  //==================================================================
  void METIS::uniform_partition_mesh(Problem* problem_pt,
                                     const unsigned& ndomain,
                                     Vector<unsigned>& element_domain)
  {
    // Number of elements
    unsigned nelem = problem_pt->mesh_pt()->nelement();

#ifdef PARANOID
    if (nelem != element_domain.size())
    {
      std::ostringstream error_stream;
      error_stream << "element_domain Vector has wrong length " << nelem << " "
                   << element_domain.size() << std::endl;

      throw OomphLibError(
        error_stream.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }
#endif


    // Uniform partitioning
    unsigned nel_per_domain = int(float(nelem) / float(ndomain));
    for (unsigned ielem = 0; ielem < nelem; ielem++)
    {
      unsigned idomain = unsigned(float(ielem) / float(nel_per_domain));
      element_domain[ielem] = idomain;
    }
  }
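
  // For illustration: with nelem=12 and ndomain=3 the above gives
  // nel_per_domain=4, so elements 0-3 are assigned to domain 0, elements
  // 4-7 to domain 1 and elements 8-11 to domain 2.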


  //==================================================================
  /// Use METIS to assign each element to a domain.
  /// On return, element_domain[ielem] contains the number
  /// of the domain [0,1,...,ndomain-1] to which
  /// element ielem has been assigned.
  /// - objective=0: minimise edgecut.
  /// - objective=1: minimise total communications volume.
  /// .
  /// Partitioning is based on the dual graph of the mesh.
  //==================================================================
  void METIS::partition_mesh(Problem* problem_pt,
                             const unsigned& ndomain,
                             const unsigned& objective,
                             Vector<unsigned>& element_domain)
  {
    // Global mesh
    Mesh* mesh_pt = problem_pt->mesh_pt();

    // Number of elements
    unsigned nelem = mesh_pt->nelement();

#ifdef PARANOID
    if (nelem != element_domain.size())
    {
      std::ostringstream error_stream;
      error_stream << "element_domain Vector has wrong length " << nelem << " "
                   << element_domain.size() << std::endl;

      throw OomphLibError(
        error_stream.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }
#endif

    // Setup dual graph
    //------------------

    // Start timer
    clock_t cpu_start = clock();

    // Container to collect all elements associated with given global eqn
    // number
    std::map<unsigned, std::set<unsigned>> elements_connected_with_global_eqn;

    // Container for all unique global eqn numbers
    std::set<unsigned> all_global_eqns;

    // Loop over all elements
    for (unsigned e = 0; e < nelem; e++)
    {
      GeneralisedElement* el_pt = mesh_pt->element_pt(e);

      // Add all global eqn numbers
      unsigned ndof = el_pt->ndof();
      for (unsigned j = 0; j < ndof; j++)
      {
        // Get global eqn number
        unsigned eqn_number = el_pt->eqn_number(j);
        elements_connected_with_global_eqn[eqn_number].insert(e);
        all_global_eqns.insert(eqn_number);
      }
    }

    // Now reverse the lookup scheme to find out all elements
    // that are connected because they share the same global eqn
    Vector<std::set<unsigned>> connected_elements(nelem);

    // Counter for total number of entries in connected_elements structure
    unsigned count = 0;

    // Loop over all global eqns
    for (std::set<unsigned>::iterator it = all_global_eqns.begin();
         it != all_global_eqns.end();
         it++)
    {
      // Get set of elements connected with this data item
      std::set<unsigned> elements = elements_connected_with_global_eqn[*it];

      // Double loop over connected elements: Everybody's connected to
      // everybody
      for (std::set<unsigned>::iterator it1 = elements.begin();
           it1 != elements.end();
           it1++)
      {
        for (std::set<unsigned>::iterator it2 = elements.begin();
             it2 != elements.end();
             it2++)
        {
          if ((*it1) != (*it2))
          {
            connected_elements[(*it1)].insert(*it2);
          }
        }
      }
    }


    // Now convert into C-style packed array for interface with METIS
    int* xadj = new int[nelem + 1];
    Vector<int> adjacency_vector;

    // Reserve (too much) space
    adjacency_vector.reserve(count);

    // Initialise counters
    unsigned ientry = 0;

    // Loop over all elements
    for (unsigned e = 0; e < nelem; e++)
    {
      // First entry for current element
      xadj[e] = ientry;

      // Loop over elements that are connected to current element
      typedef std::set<unsigned>::iterator IT;
      for (IT it = connected_elements[e].begin();
           it != connected_elements[e].end();
           it++)
      {
        // Copy into adjacency array
        adjacency_vector.push_back(*it);

        // We've just made another entry
        ientry++;
      }

      // Entry after last entry for current element:
      xadj[e + 1] = ientry;
    }
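
    // For illustration: this is the compressed-sparse-row layout that METIS
    // expects. For a three-element mesh in which elements 0 and 2 are each
    // connected to element 1 only:
    //
    //   xadj             = {0, 1, 3, 4}
    //   adjacency_vector = {1, 0, 2, 1}
    //
    // i.e. the neighbours of element e are stored in entries
    // xadj[e],...,xadj[e+1]-1 of adjacency_vector.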

    // End timer
    clock_t cpu_end = clock();

    // Doc
    double cpu0 = double(cpu_end - cpu_start) / CLOCKS_PER_SEC;
    oomph_info << "CPU time for setup of METIS data structures [nelem="
               << nelem << "]: " << cpu0 << " sec" << std::endl;


    // If the adjacency vector is empty then the elements are
    // actually unconnected (can happen in dummy problems where
    // each element only has internal data). In that case the
    // partition is irrelevant and we may as well distribute the
    // elements in round-robin fashion
    if (adjacency_vector.size() == 0)
    {
      unsigned n_proc = problem_pt->communicator_pt()->nproc();
      oomph_info
        << "Note: All elements in the Problem's Mesh appear to be\n"
        << "unconnected. This happens, e.g. if all elements only have\n"
        << "internal Data. Bypassing metis and distributing elements\n"
        << "in round-robin fashion amongst the " << n_proc << " processors."
        << std::endl;
      for (unsigned e = 0; e < nelem; e++)
      {
        element_domain[e] = e % n_proc;
      }
      return;
    }
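
    // For illustration: with 10 unconnected elements on n_proc=4 processors
    // the round-robin assignment e % n_proc places elements 0,1,...,9 on
    // domains 0,1,2,3,0,1,2,3,0,1.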


    // Call METIS graph partitioner
    //-----------------------------

    // Start timer
    cpu_start = clock();

    // Number of vertices in graph
    int nvertex = nelem;

    // No vertex weights
    int* vwgt = 0;

    // No edge weights
    int* adjwgt = 0;

    // Flag indicating that graph isn't weighted: 0; vertex weights only: 2
    // Note that wgtflag==2 requires nodal weights to be stored in vwgt.
    int wgtflag = 0;

    // Use C-style numbering (first array entry is zero)
    int numflag = 0;

    // Number of desired partitions
    int nparts = ndomain;

    // Use default options
    int* options = new int[10];
    options[0] = 0;

#ifdef OOMPH_TRANSITION_TO_VERSION_3
    switch (objective)
    {
      case 0:
        // Edge-cut minimisation
        options[0] = METIS_OBJTYPE_CUT;
        break;

      case 1:
        // Communication volume minimisation
        options[0] = METIS_OBJTYPE_VOL;
        break;

      default:
        std::ostringstream error_stream;
        error_stream << "Wrong objective for METIS. objective = " << objective
                     << std::endl;

        throw OomphLibError(
          error_stream.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }
#endif

    // Number of cut edges in graph
    int* edgecut = new int[nelem];

    // Array containing the partition information
    int* part = new int[nelem];

    // Can we get an error estimate?

    unsigned n_mesh = problem_pt->nsub_mesh();

    if (n_mesh == 0)
    {
      RefineableMeshBase* mmesh_pt = dynamic_cast<RefineableMeshBase*>(mesh_pt);
      if (mmesh_pt != 0)
      {
        // Bias distribution?
        if (Error_to_weight_fct_pt != &default_error_to_weight_fct)
        {
          oomph_info
            << "Biasing element distribution via spatial error estimate\n";

          // Adjust flag and provide storage for weights
          wgtflag = 2;
          vwgt = new int[nelem];

          // Get error for all elements
          Vector<double> elemental_error(nelem);
          mmesh_pt->spatial_error_estimator_pt()->get_element_errors(
            mesh_pt, elemental_error);

          double max_error = *(
            std::max_element(elemental_error.begin(), elemental_error.end()));
          double min_error = *(
            std::min_element(elemental_error.begin(), elemental_error.end()));

          // Bias weights
          int weight = 1;
          for (unsigned e = 0; e < nelem; e++)
          {
            // Translate error into weight
            Error_to_weight_fct_pt(
              elemental_error[e], max_error, min_error, weight);
            vwgt[e] = weight;
          }
        }
      }
    }
    else // There are submeshes
    {
      // Are any of the submeshes refineable?
      bool refineable_submesh_exists = false;
      // Vector to store "start and end point" for loops in submeshes
      Vector<unsigned> loop_helper(n_mesh + 1);
      loop_helper[0] = 0;

      // Loop over submeshes
      for (unsigned i_mesh = 0; i_mesh < n_mesh; i_mesh++)
      {
        // Store the end of the loop
        loop_helper[i_mesh + 1] =
          problem_pt->mesh_pt(i_mesh)->nelement() + loop_helper[i_mesh];

        RefineableMeshBase* mmesh_pt =
          dynamic_cast<RefineableMeshBase*>(problem_pt->mesh_pt(i_mesh));
        if (mmesh_pt != 0)
        {
          refineable_submesh_exists = true;
        }
      }

      // If a refineable submesh exists
      if (refineable_submesh_exists)
      {
        // Bias distribution?
        if (Error_to_weight_fct_pt != &default_error_to_weight_fct)
        {
          oomph_info
            << "Biasing element distribution via spatial error estimate\n";

          // Adjust flag and provide storage for weights
          wgtflag = 2;
          vwgt = new int[nelem];

          // Loop over submeshes
          for (unsigned i_mesh = 0; i_mesh < n_mesh; i_mesh++)
          {
            RefineableMeshBase* mmesh_pt =
              dynamic_cast<RefineableMeshBase*>(problem_pt->mesh_pt(i_mesh));
            if (mmesh_pt != 0)
            {
              // Get error for all elements
              unsigned nsub_elem = problem_pt->mesh_pt(i_mesh)->nelement();
              Vector<double> elemental_error(nsub_elem);
              mmesh_pt->spatial_error_estimator_pt()->get_element_errors(
                problem_pt->mesh_pt(i_mesh), elemental_error);

              double max_error = *(std::max_element(elemental_error.begin(),
                                                    elemental_error.end()));
              double min_error = *(std::min_element(elemental_error.begin(),
                                                    elemental_error.end()));

              // Bias weights
              int weight = 1;
              unsigned start = loop_helper[i_mesh];
              unsigned end = loop_helper[i_mesh + 1];
              for (unsigned e = start; e < end; e++)
              {
                unsigned error_index = e - start;
                // Translate error into weight
                Error_to_weight_fct_pt(
                  elemental_error[error_index], max_error, min_error, weight);
                vwgt[e] = weight;
              }
            }
            else // This mesh is not refineable
            {
              // There's no error estimator, so use the default weight
              int weight = 1;
              unsigned start = loop_helper[i_mesh];
              unsigned end = loop_helper[i_mesh + 1];
              for (unsigned e = start; e < end; e++)
              {
                vwgt[e] = weight;
              }
            }
          }
        }
      }
    }

#ifdef OOMPH_TRANSITION_TO_VERSION_3

    // Call partitioner
    METIS_PartGraphKway(&nvertex,
                        xadj,
                        &adjacency_vector[0],
                        vwgt,
                        adjwgt,
                        &wgtflag,
                        &numflag,
                        &nparts,
                        options,
                        edgecut,
                        part);
#else
    // original code to delete in version 3

    // Call partitioner
    if (objective == 0)
    {
      // Partition with the objective of minimising the edge cut
      METIS_PartGraphKway(&nvertex,
                          xadj,
                          &adjacency_vector[0],
                          vwgt,
                          adjwgt,
                          &wgtflag,
                          &numflag,
                          &nparts,
                          options,
                          edgecut,
                          part);
    }
    else if (objective == 1)
    {
      // Partition with the objective of minimising the total communication
      // volume
      METIS_PartGraphVKway(&nvertex,
                           xadj,
                           &adjacency_vector[0],
                           vwgt,
                           adjwgt,
                           &wgtflag,
                           &numflag,
                           &nparts,
                           options,
                           edgecut,
                           part);
    }
    else
    {
      std::ostringstream error_stream;
      error_stream << "Wrong objective for METIS. objective = " << objective
                   << std::endl;

      throw OomphLibError(
        error_stream.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }
#endif
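
    // For illustration: METIS returns one entry of part per graph vertex,
    // i.e. per element. With nparts=2 a possible outcome is
    // part = {0, 0, 1, 1, ...}: elements 0 and 1 go to domain 0, elements
    // 2 and 3 to domain 1, with the chosen objective minimised subject to
    // (roughly) balanced total vertex weight per domain.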

#ifdef PARANOID
    std::vector<bool> done(nparts, false);
#endif

    // Copy across
    for (unsigned e = 0; e < nelem; e++)
    {
      element_domain[e] = part[e];
#ifdef PARANOID
      done[part[e]] = true;
#endif
    }


#ifdef PARANOID
    // Check
    std::ostringstream error_stream;
    bool shout = false;
    for (int p = 0; p < nparts; p++)
    {
      if (!done[p])
      {
        shout = true;
        error_stream << "No elements on processor " << p
                     << " when trying to partition " << nelem
                     << " elements over " << nparts << " processors!\n";
      }
    }
    if (shout)
    {
      throw OomphLibError(
        error_stream.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }
#endif


    // End timer
    cpu_end = clock();

    // Doc
    double cpu1 = double(cpu_end - cpu_start) / CLOCKS_PER_SEC;
    oomph_info << "CPU time for METIS mesh partitioning [nelem="
               << nelem << "]: " << cpu1 << " sec" << std::endl;


    // Cleanup
    delete[] xadj;
    delete[] part;
    delete[] edgecut;
    delete[] options;
  }
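
  // For illustration: a typical driver-level call (hypothetical set-up code;
  // in practice this function is usually invoked indirectly when a Problem
  // is distributed). It partitions the Problem's global mesh into four
  // domains, minimising the edgecut:
  //
  //   unsigned ndomain = 4;
  //   unsigned objective = 0; // 0: minimise edgecut, 1: minimise comms volume
  //   Vector<unsigned> element_domain(problem.mesh_pt()->nelement());
  //   METIS::partition_mesh(&problem, ndomain, objective, element_domain);
  //
  //   // element_domain[e] now holds the domain [0,...,ndomain-1] of element e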


#ifdef OOMPH_HAS_MPI


  //==================================================================
  /// Use METIS to assign each element in an already-distributed mesh
  /// to a domain. On return, element_domain_on_this_proc[e] contains the
  /// number of the domain [0,1,...,ndomain-1] to which non-halo element e on
  /// THE CURRENT PROCESSOR ONLY has been assigned. The order of the non-halo
  /// elements is the same as in the Problem's mesh, with the halo
  /// elements being skipped.
  /// Objective:
  /// - objective=0: minimise edgecut.
  /// - objective=1: minimise total communications volume.
  /// .
  /// The partitioning is based on the dof graph of the complete mesh, i.e.
  /// it takes into account which global equation numbers are affected by
  /// each element and connects elements which affect the same global
  /// equation number. Partitioning is done such that all elements associated
  /// with the same tree root move together. Non-refineable elements are
  /// treated as their own root elements. If the optional boolean
  /// flag is set to true (it defaults to false) each processor
  /// assigns a dumb-but-repeatable equidistribution of its non-halo
  /// elements over the domains and outputs the input that would have
  /// gone into METIS in the file metis_input_for_validation.dat
  //==================================================================
  void METIS::partition_distributed_mesh(
    Problem* problem_pt,
    const unsigned& objective,
    Vector<unsigned>& element_domain_on_this_proc,
    const bool& bypass_metis)
  {
    // Start timer
    clock_t cpu_start = clock();

    // Communicator
    OomphCommunicator* comm_pt = problem_pt->communicator_pt();

    // Number of processors / domains
    unsigned n_proc = comm_pt->nproc();
    unsigned my_rank = comm_pt->my_rank();

    // Global mesh
    Mesh* mesh_pt = problem_pt->mesh_pt();

    // Total number of elements (halo and nonhalo) on this proc
    unsigned n_elem = mesh_pt->nelement();

    // Get elemental assembly times
    Vector<double> elemental_assembly_time =
      problem_pt->elemental_assembly_time();

#ifdef PARANOID
    unsigned n = elemental_assembly_time.size();
    if ((n != 0) && (n != n_elem))
    {
      std::ostringstream error_stream;
      error_stream << "Number of elements doesn't match the \n"
                   << "number of elemental assembly times: " << n_elem << " "
                   << n << std::endl;
      throw OomphLibError(
        error_stream.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }
#endif

    // Can we base load balancing on assembly times?
    bool can_load_balance_on_assembly_times = false;
    if (elemental_assembly_time.size() != 0)
    {
      can_load_balance_on_assembly_times = true;
    }

    // Storage for global eqn numbers on current processor
    std::set<unsigned> global_eqns_on_this_proc;

    // Storage for pointers to root elements that are connected with given
    // eqn number -- assembled on local processor
    std::map<unsigned, std::set<GeneralisedElement*>>
      root_elements_connected_with_global_eqn_on_this_proc;

    // Storage for long sequence of equation numbers as encountered
    // by the root elements on this processor
    Vector<unsigned> eqn_numbers_with_root_elements_on_this_proc;

    // Reserve number of elements x average/estimate (?) for number of dofs
    // per element
    eqn_numbers_with_root_elements_on_this_proc.reserve(n_elem * 9);

    // Storage for the number of eqn numbers associated with each
    // root element on this processor -- once this and the previous
    // container have been collected from all processors we're
    // able to reconstruct which root element (in the nominal "global" mesh)
    // is connected with which global equations
    Vector<unsigned> number_of_dofs_for_root_element;
    number_of_dofs_for_root_element.reserve(n_elem);

    // Ditto for number of "leaf" elements connected with each root
    Vector<unsigned> number_of_non_halo_elements_for_root_element;
    number_of_non_halo_elements_for_root_element.reserve(n_elem);

    // Ditto for total assembly time of "leaf" elements connected with each
    // root
    Vector<double> total_assembly_time_for_root_element;
    total_assembly_time_for_root_element.reserve(n_elem);

    // Map storing the number of the root elements on this processor
    // (offset by one to bypass the zero default).
    std::map<GeneralisedElement*, unsigned> root_el_number_plus_one;

    // Loop over non-halo elements on this processor
    unsigned number_of_root_elements = 0;
    unsigned number_of_non_halo_elements = 0;
    for (unsigned e = 0; e < n_elem; e++)
    {
      double el_assembly_time = 0.0;
      GeneralisedElement* el_pt = mesh_pt->element_pt(e);
      if (!el_pt->is_halo())
      {
        if (can_load_balance_on_assembly_times)
        {
          el_assembly_time = elemental_assembly_time[e];
        }

        // Get the associated root element which is either...
        GeneralisedElement* root_el_pt = 0;
        RefineableElement* ref_el_pt = dynamic_cast<RefineableElement*>(el_pt);
        if (ref_el_pt != 0)
        {
          //...the actual root element
          root_el_pt = ref_el_pt->root_element_pt();
        }
        // ...or the element itself
        else
        {
          root_el_pt = el_pt;
        }

        // Have we already encountered this root element?
        // (offset of one to bypass the default return of zero)
        bool already_encountered = false;
        unsigned root_el_number = root_el_number_plus_one[root_el_pt];
        if (root_el_number == 0)
        {
          // This is a new one
          already_encountered = false;

          // Give it a number
          number_of_root_elements++;
          root_el_number_plus_one[root_el_pt] = number_of_root_elements;

          // Remove offset
          root_el_number = number_of_root_elements - 1;
        }
        else
        {
          // We've already visited this one before...
          already_encountered = true;

          // Remove offset
          root_el_number -= 1;
        }


        // Get global equation numbers of actual element
        unsigned n_dof = el_pt->ndof();
        for (unsigned i = 0; i < n_dof; i++)
        {
          unsigned eqn_no = el_pt->eqn_number(i);

          // Record which root elements are connected with this eqn number
          root_elements_connected_with_global_eqn_on_this_proc[eqn_no].insert(
            root_el_pt);

          // Record all global eqn numbers on this processor
          global_eqns_on_this_proc.insert(eqn_no);

          // Add eqn number of the current element to the long sequence
          // of eqn numbers
          eqn_numbers_with_root_elements_on_this_proc.push_back(eqn_no);
        }

        // Now record how many equations are associated with the current
        // non-halo element
        if (already_encountered)
        {
          number_of_dofs_for_root_element[root_el_number] += n_dof;
          number_of_non_halo_elements_for_root_element[root_el_number]++;
          total_assembly_time_for_root_element[root_el_number] +=
            el_assembly_time;
        }
        else
        {
          number_of_dofs_for_root_element.push_back(n_dof);
          number_of_non_halo_elements_for_root_element.push_back(1);
          total_assembly_time_for_root_element.push_back(el_assembly_time);
        }

        // Bump up number of non-halos
        number_of_non_halo_elements++;
      }
    }

    // Tell everybody how many root elements
    // are on each processor
    unsigned root_processor = 0;
    Vector<int> number_of_root_elements_on_each_proc(n_proc, 0);
    MPI_Allgather(&number_of_root_elements,
                  1,
                  MPI_INT,
                  &number_of_root_elements_on_each_proc[0],
                  1,
                  MPI_INT,
                  comm_pt->mpi_comm());


    // In the big sequence of concatenated root elements (enumerated
    // individually on the various processors) where do the root elements from
    // a given processor start? Also figure out how many root elements there
    // are in total by summing up their numbers
    Vector<int> start_index(n_proc, 0);
    unsigned total_number_of_root_elements = 0;
    for (unsigned i_proc = 0; i_proc < n_proc; i_proc++)
    {
      total_number_of_root_elements +=
        number_of_root_elements_on_each_proc[i_proc];
      if (i_proc != 0)
      {
        start_index[i_proc] = total_number_of_root_elements -
                              number_of_root_elements_on_each_proc[i_proc];
      }
      else
      {
        start_index[0] = 0;
      }
    }
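
    // For illustration: if processors 0, 1 and 2 hold 3, 5 and 2 root
    // elements respectively, then total_number_of_root_elements = 10 and
    // start_index = {0, 3, 8}, i.e. processor p's root elements occupy
    // slots start_index[p],...,
    // start_index[p]+number_of_root_elements_on_each_proc[p]-1 of the
    // concatenated (processor-by-processor) ordering.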


    // How many global equations are held on this processor?
    unsigned n_eqns_on_this_proc =
      eqn_numbers_with_root_elements_on_this_proc.size();

    // Gather this information for all processors:
    // n_eqns_on_each_proc[iproc] now contains the number of global
    // equations held on processor iproc.
    Vector<int> n_eqns_on_each_proc(n_proc, 0);
    MPI_Allgather(&n_eqns_on_this_proc,
                  1,
                  MPI_INT,
                  &n_eqns_on_each_proc[0],
                  1,
                  MPI_INT,
                  comm_pt->mpi_comm());


    // In the big sequence of equation numbers from the root elements
    // (enumerated individually on the various processors) where do the
    // equation numbers associated with the root elements from a given
    // processor start? Also figure out how long the sequence of equation
    // numbers is
    Vector<int> start_eqns_index(n_proc, 0);
    unsigned total_n_eqn = 0;
    for (unsigned i_proc = 0; i_proc < n_proc; i_proc++)
    {
      total_n_eqn += n_eqns_on_each_proc[i_proc];
      if (i_proc != 0)
      {
        start_eqns_index[i_proc] = total_n_eqn - n_eqns_on_each_proc[i_proc];
      }
      else
      {
        start_eqns_index[0] = 0;
      }
    }


    // Big vector that contains the number of dofs for each root element
    // (concatenated in processor-by-processor order)
    Vector<unsigned> number_of_dofs_for_global_root_element(
      total_number_of_root_elements);
    // Create at least one entry so we don't get a seg fault below
    if (number_of_dofs_for_root_element.size() == 0)
    {
      number_of_dofs_for_root_element.resize(1);
    }
    MPI_Gatherv(
      &number_of_dofs_for_root_element[0], // pointer to first entry in
                                           // vector to be gathered on root
      number_of_root_elements, // Number of entries to be sent
                               // from current processor
      MPI_UNSIGNED,
      &number_of_dofs_for_global_root_element[0], // Target -- this will
                                                  // store the concatenated
                                                  // vectors sent from
                                                  // everywhere
      &number_of_root_elements_on_each_proc[0], // Pointer to
                                                // vector containing
                                                // the length of the
                                                // vectors received
                                                // from elsewhere
      &start_index[0], // "offset" for storage of vector received
                       // from various processors in the global
                       // concatenated vector stored on root
      MPI_UNSIGNED,
      root_processor,
      comm_pt->mpi_comm());


    // ditto for number of non-halo elements associated with root element
    Vector<unsigned> number_of_non_halo_elements_for_global_root_element(
      total_number_of_root_elements);

    // Create at least one entry so we don't get a seg fault below
    if (number_of_non_halo_elements_for_root_element.size() == 0)
    {
      number_of_non_halo_elements_for_root_element.resize(1);
    }
    MPI_Gatherv(&number_of_non_halo_elements_for_root_element[0],
                // pointer to first entry in
                // vector to be gathered on root
                number_of_root_elements, // Number of entries to be sent
                                         // from current processor
                MPI_UNSIGNED,
                &number_of_non_halo_elements_for_global_root_element[0],
                // Target -- this will
                // store the concatenated
                // vectors sent from
                // everywhere
                &number_of_root_elements_on_each_proc[0], // Pointer to
                                                          // vector containing
                                                          // the length of the
                                                          // vectors received
                                                          // from elsewhere
                &start_index[0], // "offset" for storage of vector received
                                 // from various processors in the global
                                 // concatenated vector stored on root
                MPI_UNSIGNED,
                root_processor,
                comm_pt->mpi_comm());


    // ditto for assembly times of elements associated with root element
    Vector<double> total_assembly_time_for_global_root_element(
      total_number_of_root_elements);

    // Create at least one entry so we don't get a seg fault below
    if (total_assembly_time_for_root_element.size() == 0)
    {
      total_assembly_time_for_root_element.resize(1);
    }
    MPI_Gatherv(&total_assembly_time_for_root_element[0],
                // pointer to first entry in
                // vector to be gathered on root
                number_of_root_elements, // Number of entries to be sent
                                         // from current processor
                MPI_DOUBLE,
                &total_assembly_time_for_global_root_element[0],
                // Target -- this will
                // store the concatenated
                // vectors sent from
                // everywhere
                &number_of_root_elements_on_each_proc[0], // Pointer to
                                                          // vector containing
                                                          // the length of the
                                                          // vectors received
                                                          // from elsewhere
                &start_index[0], // "offset" for storage of vector received
                                 // from various processors in the global
                                 // concatenated vector stored on root
                MPI_DOUBLE,
                root_processor,
                comm_pt->mpi_comm());


    // Big vector to store the long sequence of global equation numbers
    // associated with the long sequence of root elements
    Vector<unsigned> eqn_numbers_with_root_elements(total_n_eqn);

    // Create at least one entry so we don't get a seg fault below
    if (eqn_numbers_with_root_elements_on_this_proc.size() == 0)
    {
      eqn_numbers_with_root_elements_on_this_proc.resize(1);
    }
    MPI_Gatherv(&eqn_numbers_with_root_elements_on_this_proc[0],
                n_eqns_on_this_proc,
                MPI_UNSIGNED,
                &eqn_numbers_with_root_elements[0],
                &n_eqns_on_each_proc[0],
                &start_eqns_index[0],
                MPI_UNSIGNED,
                root_processor,
                comm_pt->mpi_comm());

    // Doc
    clock_t cpu_end = clock();

    double cpu0 = double(cpu_end - cpu_start) / CLOCKS_PER_SEC;
    oomph_info
      << "CPU time for global setup of METIS data structures [nroot_elem="
      << total_number_of_root_elements << "]: " << cpu0 << " sec" << std::endl;


    // Now the root processor has gathered all the data needed to establish
    // the root element connectivity (as in the serial case) so use METIS
    // to determine "partitioning" for non-uniformly refined mesh
    //----------------------------------------------------------------------

    // Vector to store target domain for each of the root elements
    // (concatenated in processor-by-processor order)
    Vector<unsigned> root_element_domain(total_number_of_root_elements, 0);
    if (my_rank == root_processor) //--
    {
      // Start timer
      clock_t cpu_start = clock();

      // Repeat the steps used in the serial code: Storage for
      // the global equations (on root processor)
      std::set<unsigned> all_global_eqns_root_processor;

      // Set of root elements (as enumerated in the processor-by-processor
      // order) associated with given global equation number
      std::map<unsigned, std::set<unsigned>>
        root_elements_connected_with_global_eqn_on_root_processor;

      // Retrace the steps of the serial code: Who's connected with who
      unsigned count_all = 0;
      for (unsigned e = 0; e < total_number_of_root_elements; e++)
      {
        unsigned n_eqn_no = number_of_dofs_for_global_root_element[e];
        for (unsigned n = 0; n < n_eqn_no; n++)
        {
          unsigned eqn_no = eqn_numbers_with_root_elements[count_all];
          count_all++;
          root_elements_connected_with_global_eqn_on_root_processor[eqn_no]
            .insert(e);
          all_global_eqns_root_processor.insert(eqn_no);
        }
      }

      // Number of domains
      unsigned ndomain = n_proc;

      // Now reverse the lookup scheme to find out all root elements
      // that are connected because they share the same global eqn
      Vector<std::set<unsigned>> connected_root_elements(
        total_number_of_root_elements);

      // Counter for total number of entries in connected_root_elements
      // structure
      unsigned count = 0;

      // Loop over all global eqns
      for (std::set<unsigned>::iterator it =
             all_global_eqns_root_processor.begin();
           it != all_global_eqns_root_processor.end();
           it++)
      {
        // Get set of root elements connected with this data item
        std::set<unsigned> root_elements =
          root_elements_connected_with_global_eqn_on_root_processor[*it];

        // Double loop over connected root elements: Everybody's connected to
        // everybody
        for (std::set<unsigned>::iterator it1 = root_elements.begin();
             it1 != root_elements.end();
             it1++)
        {
          for (std::set<unsigned>::iterator it2 = root_elements.begin();
               it2 != root_elements.end();
               it2++)
          {
            if ((*it1) != (*it2))
            {
              connected_root_elements[(*it1)].insert(*it2);
            }
          }
        }
      }

      // End timer
      clock_t cpu_end = clock();

      // Doc
      double cpu0b = double(cpu_end - cpu_start) / CLOCKS_PER_SEC;
      oomph_info << "CPU time for setup of connected elements (load balance) "
                    "[nroot_elem="
                 << total_number_of_root_elements << "]: " << cpu0b << " sec"
                 << std::endl;

      // Now convert into C-style packed array for interface with METIS
      cpu_start = clock();
      int* xadj = new int[total_number_of_root_elements + 1];
      Vector<int> adjacency_vector;

      // Reserve (too much) space
      adjacency_vector.reserve(count);

      // Initialise counters
      unsigned ientry = 0;

      // Loop over all elements
      for (unsigned e = 0; e < total_number_of_root_elements; e++)
      {
        // First entry for current element
        xadj[e] = ientry;

        // Loop over elements that are connected to current element
        typedef std::set<unsigned>::iterator IT;
        for (IT it = connected_root_elements[e].begin();
             it != connected_root_elements[e].end();
             it++)
        {
          // Copy into adjacency array
          adjacency_vector.push_back(*it);

          // We've just made another entry
          ientry++;
        }

        // Entry after last entry for current element:
        xadj[e + 1] = ientry;
      }

      // End timer
      cpu_end = clock();

      // Doc
      double cpu0 = double(cpu_end - cpu_start) / CLOCKS_PER_SEC;
      oomph_info << "CPU time for setup of METIS data structures (load "
                    "balance) [nroot_elem="
                 << total_number_of_root_elements << "]: " << cpu0 << " sec"
                 << std::endl;


      // Call METIS graph partitioner
      //-----------------------------

      // Start timer
      cpu_start = clock();

      // Number of vertices in graph
      int nvertex = total_number_of_root_elements;

      // No vertex weights
      int* vwgt = 0;

      // No edge weights
      int* adjwgt = 0;

      // Flag indicating that graph isn't weighted: 0; vertex weights only: 2
      // Note that wgtflag==2 requires nodal weights to be stored in vwgt.
      int wgtflag = 0;

      // Use C-style numbering (first array entry is zero)
      int numflag = 0;

      // Number of desired partitions
      int nparts = ndomain;

      // Use default options
      int* options = new int[10];
      options[0] = 0;

#ifdef OOMPH_TRANSITION_TO_VERSION_3
      switch (objective)
      {
        case 0:
          // Edge-cut minimisation
          options[0] = METIS_OBJTYPE_CUT;
          break;

        case 1:
          // Communication volume minimisation
          options[0] = METIS_OBJTYPE_VOL;
          break;

        default:
          std::ostringstream error_stream;
          error_stream << "Wrong objective for METIS. objective = " << objective
                       << std::endl;

          throw OomphLibError(error_stream.str(),
                              OOMPH_CURRENT_FUNCTION,
                              OOMPH_EXCEPTION_LOCATION);
      }
#endif

      // Number of cut edges in graph
      int* edgecut = new int[total_number_of_root_elements];

      // Array containing the partition information
      int* part = new int[total_number_of_root_elements];

      // Now bias distribution by giving each root element
      // a weight equal to the number of elements associated with it

      // Adjust flag and provide storage for weights
      wgtflag = 2;
      vwgt = new int[total_number_of_root_elements];


      // Load balance based on assembly times of all leaf
      // elements associated with root
      if (can_load_balance_on_assembly_times)
      {
        oomph_info << "Basing distribution on assembly times of elements\n";

        // Normalise
        double min_time = *(
          std::min_element(total_assembly_time_for_global_root_element.begin(),
                           total_assembly_time_for_global_root_element.end()));
#ifdef PARANOID
        if (min_time == 0.0)
        {
          std::ostringstream error_stream;
          error_stream << "Minimum assemble time for element is zero!\n";
          throw OomphLibError(error_stream.str(),
                              OOMPH_CURRENT_FUNCTION,
                              OOMPH_EXCEPTION_LOCATION);
        }
#endif

        // Bypass METIS (usually for validation) and use made-up but
        // repeatable timings
        if (bypass_metis)
        {
          for (unsigned e = 0; e < total_number_of_root_elements; e++)
          {
            vwgt[e] = e;
          }
        }
        else
        {
          for (unsigned e = 0; e < total_number_of_root_elements; e++)
          {
            // Use assembly times (relative to minimum) as weight
            vwgt[e] =
              int(total_assembly_time_for_global_root_element[e] / min_time);
          }
        }
      }
      // Load balance based on number of leaf elements associated with
      // root
      else
      {
        oomph_info << "Basing distribution on number of elements\n";
        for (unsigned e = 0; e < total_number_of_root_elements; e++)
        {
          vwgt[e] = number_of_non_halo_elements_for_global_root_element[e];
        }
      }

      // Bypass METIS (usually for validation)
      if (bypass_metis)
      {
        // Simple repeatable partition: Equidistribute root elements
        for (unsigned e = 0; e < total_number_of_root_elements; e++)
        {
          // Simple repeatable partition: Equidistribute elements on each
          // processor
          part[e] = (n_proc - 1) -
                    unsigned(double(e) / double(total_number_of_root_elements) *
                             double(n_proc));
        }
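
        // For illustration: with n_proc=2 and 8 root elements the formula
        // above yields part = {1,1,1,1,0,0,0,0} -- the first half of the
        // root elements goes to the last processor, and so on; dumb but
        // repeatable.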

        oomph_info
          << "Bypassing METIS for validation purposes.\n"
          << "Appending input for metis in metis_input_for_validation.dat\n";
        std::ofstream outfile;
        outfile.open("metis_input_for_validation.dat", std::ios_base::app);

        // Dump out relevant input to metis
        for (unsigned e = 0; e < total_number_of_root_elements + 1; e++)
        {
          outfile << xadj[e] << std::endl;
        }
        unsigned n = adjacency_vector.size();
        for (unsigned i = 0; i < n; i++)
        {
          outfile << adjacency_vector[i] << std::endl;
        }
        for (unsigned e = 0; e < total_number_of_root_elements; e++)
        {
          outfile << vwgt[e] << std::endl;
        }
        outfile.close();
      }
      // Actually use METIS (good but not always repeatable!)
      else
      {
#ifdef OOMPH_TRANSITION_TO_VERSION_3

        METIS_PartGraphKway(&nvertex,
                            xadj,
                            &adjacency_vector[0],
                            vwgt,
                            adjwgt,
                            &wgtflag,
                            &numflag,
                            &nparts,
                            options,
                            edgecut,
                            part);
#else
        // for old version of METIS; these two functions have been merged
        // in the new METIS API

        if (objective == 0)
        {
          // Partition with the objective of minimising the edge cut
          METIS_PartGraphKway(&nvertex,
                              xadj,
                              &adjacency_vector[0],
                              vwgt,
                              adjwgt,
                              &wgtflag,
                              &numflag,
                              &nparts,
                              options,
                              edgecut,
                              part);
        }
        else if (objective == 1)
        {
          // Partition with the objective of minimising the total communication
          // volume
          METIS_PartGraphVKway(&nvertex,
                               xadj,
                               &adjacency_vector[0],
                               vwgt,
                               adjwgt,
                               &wgtflag,
                               &numflag,
                               &nparts,
                               options,
                               edgecut,
                               part);
        }
#endif
      }

      // Copy across
      Vector<double> total_weight_on_proc(n_proc, 0.0);
      for (unsigned e = 0; e < total_number_of_root_elements; e++)
      {
        root_element_domain[e] = part[e];
        total_weight_on_proc[part[e]] += vwgt[e];
      }

      // Document success of partitioning
      for (unsigned j = 0; j < n_proc; j++)
      {
        oomph_info << "Total weight on proc " << j << " is "
                   << total_weight_on_proc[j] << std::endl;
      }

      // Doc
      double cpu1 = double(clock() - cpu_start) / CLOCKS_PER_SEC;
      oomph_info << "CPU time for METIS mesh partitioning [nroot_elem="
                 << total_number_of_root_elements << "]: " << cpu1 << " sec"
                 << std::endl;

      // Cleanup
      delete[] xadj;
      delete[] part;
      delete[] vwgt;
      delete[] edgecut;
      delete[] options;
    }

    // Now scatter things back to processors: root_element_domain[] contains
    // the target domain for all elements (concatenated in processor-by-
    // processor order on the root processor). Distribute this back
    // to the processors so that root_element_domain_on_this_proc[e] contains
    // the target domain for root element e (in whatever order the processor
    // decided to line up its root elements).
    cpu_start = clock();
    Vector<unsigned> root_element_domain_on_this_proc(number_of_root_elements);

    // Create at least one entry so we don't get a seg fault below
    if (root_element_domain_on_this_proc.size() == 0)
    {
      root_element_domain_on_this_proc.resize(1);
    }
    MPI_Scatterv(&root_element_domain[0],
                 &number_of_root_elements_on_each_proc[0],
                 &start_index[0],
                 MPI_UNSIGNED,
                 &root_element_domain_on_this_proc[0],
                 number_of_root_elements,
                 MPI_UNSIGNED,
                 root_processor,
                 comm_pt->mpi_comm());


    // Now translate back into target domain for the actual (non-root)
    // elements
    element_domain_on_this_proc.resize(number_of_non_halo_elements);
    unsigned count_non_halo = 0;
    for (unsigned e = 0; e < n_elem; e++)
    {
      GeneralisedElement* el_pt = mesh_pt->element_pt(e);
      if (!el_pt->is_halo())
      {
        // Get the associated root element which is either...
        GeneralisedElement* root_el_pt = 0;
        RefineableElement* ref_el_pt = dynamic_cast<RefineableElement*>(el_pt);
        if (ref_el_pt != 0)
        {
          //...the actual root element
          root_el_pt = ref_el_pt->root_element_pt();
        }
        // ...or the element itself
        else
        {
          root_el_pt = el_pt;
        }

        // Recover the root element number (offset by one)
        unsigned root_el_number = root_el_number_plus_one[root_el_pt] - 1;

        // Copy target domain across from root element
        element_domain_on_this_proc[count_non_halo] =
          root_element_domain_on_this_proc[root_el_number];

        // Bump up counter for non-halo elements
        count_non_halo++;
      }
    }


#ifdef PARANOID
    if (count_non_halo != number_of_non_halo_elements)
    {
      std::ostringstream error_stream;
      error_stream << "Non-halo counts don't match: " << count_non_halo << " "
                   << number_of_non_halo_elements << std::endl;

      throw OomphLibError(
        error_stream.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
    }
#endif

    // End timer
    cpu_end = clock();

    // Doc
    double cpu2 = double(cpu_end - cpu_start) / CLOCKS_PER_SEC;
    oomph_info << "CPU time for communication of partition to all processors "
                  "[nroot_elem="
               << total_number_of_root_elements << "]: " << cpu2 << " sec"
               << std::endl;
  }


#endif

} // namespace oomph