
Negative element count for max. refinement level 12 on a fractal Menger sponge setup #324

Closed

jmark opened this issue Nov 8, 2024 · 4 comments

jmark commented Nov 8, 2024

Description

As a stress test, I created a timing setup to measure ghost layer creation times for a fractally refined Menger sponge (3D, i.e. hexahedral).

When the maximum refinement level is set to 12, the output reports a negative global element count (both in serial and in parallel with a small number of MPI tasks). Up to maximum refinement level 11, everything runs fine.

Update:
With 32 MPI tasks on the same machine (36 cores, 128 GB RAM), the example runs without problems.

To Reproduce

A minimal working example, a modified version of the example/timings/bricks{2,3}.c program, looks as follows.

#ifndef P4_TO_P8
#include <p4est_bits.h>
#include <p4est_extended.h>
#include <p4est_vtk.h>
#include <p4est_ghost.h>
#else
#include <p8est_bits.h>
#include <p8est_extended.h>
#include <p8est_vtk.h>
#include <p8est_ghost.h>
#endif
#include <sc_options.h>

static int          refine_level;

/* This refine callback creates a Menger sponge up to the given
 * `refine_level`. */
static int
refine_fractal (p4est_t * p4est, p4est_topidx_t which_tree,
                p4est_quadrant_t * q)
{
  int child_id = p4est_quadrant_child_id (q);
  int level_element = (int) q->level;
  /* The forest starts uniformly at level 1, so level_element >= 1 and
   * querying the parent's position via the ancestor id is safe. */
  int ancestor_id = p4est_quadrant_ancestor_id (q, level_element - 1);

  /* On even levels, leave the octants that form the sponge's holes
   * unrefined; which octants these are depends on the parent's
   * position within its own parent. */
  if (0 == level_element % 2) {
    if (ancestor_id < 4) {
      if (child_id > 3) {
        if (4 != child_id - ancestor_id) {
          return 0;
        }
      }
      else {
        if (3 == child_id + ancestor_id) {
          return 0;
        }
      }
    }
    else {
      if (child_id > 3) {
        if (11 == child_id + ancestor_id) {
          return 0;
        }
      }
      else {
        if (4 != ancestor_id - child_id) {
          return 0;
        }
      }
    }
  }
  if (level_element < refine_level) {
    return 1;
  }
  return 0;
}

static void
run_menger (sc_MPI_Comm mpicomm, int rlevel)
{
  int                 mpiret;
  int                 tcount;
  int                 per;
  double              elapsed_create, elapsed_partition, elapsed_ghost;
  /* Balancing is disabled below; initialize its timer so the final
   * report does not read an indeterminate value. */
  double              elapsed_balance = 0.;
#ifdef BRICKS_VTK
  char                filename[BUFSIZ];
#endif
  p4est_connectivity_t *conn;
  p4est_t            *p4est;
  p4est_ghost_t      *ghost;

  P4EST_GLOBAL_PRODUCTIONF ("Run Menger till level %d\n", rlevel);

  /* create and refine the forest */

  mpiret = sc_MPI_Barrier (mpicomm);
  SC_CHECK_MPI (mpiret);
  elapsed_create = -sc_MPI_Wtime ();

  tcount = 1; /* tree count */
  per = 0; /* periodic */
#ifndef P4_TO_P8
  conn = p4est_connectivity_new_brick (tcount, tcount, per, per);
#else
  conn = p8est_connectivity_new_brick (tcount, tcount, tcount, per, per, per);
#endif
  /* new forest: no minimum quadrant count, uniform at level 1, no payload */
  p4est = p4est_new_ext (mpicomm, conn, 0, 1, 1, 0, NULL, NULL);

  refine_level = rlevel;
  p4est_refine (p4est, 1, refine_fractal, NULL);

  elapsed_create += sc_MPI_Wtime ();

  /* partition the forest */

  mpiret = sc_MPI_Barrier (mpicomm);
  SC_CHECK_MPI (mpiret);
  elapsed_partition = -sc_MPI_Wtime ();

  p4est_partition (p4est, 0, NULL);

  elapsed_partition += sc_MPI_Wtime ();

  /* Balancing is deactivated to preserve the proper Menger sponge. */
#if 0
  /* balance the forest */

  mpiret = sc_MPI_Barrier (mpicomm);
  SC_CHECK_MPI (mpiret);
  elapsed_balance = -sc_MPI_Wtime ();

  p4est_balance (p4est, P4EST_CONNECT_FULL, NULL);

  elapsed_balance += sc_MPI_Wtime ();
#endif

  /* create ghost layer */

  mpiret = sc_MPI_Barrier (mpicomm);
  SC_CHECK_MPI (mpiret);
  elapsed_ghost = -sc_MPI_Wtime ();

  ghost = p4est_ghost_new (p4est, P4EST_CONNECT_FACE);

  elapsed_ghost += sc_MPI_Wtime ();

  /* postprocessing */

  P4EST_GLOBAL_PRODUCTIONF ("Timings %g %g %g %g\n", elapsed_create,
                            elapsed_partition, elapsed_balance, elapsed_ghost);

#ifdef BRICKS_VTK
  snprintf (filename, BUFSIZ, "brick_%02d_B", rlevel);
  p4est_vtk_write_file (p4est, NULL, filename);
#endif

  p4est_ghost_destroy (ghost);
  p4est_destroy (p4est);
  p4est_connectivity_destroy (conn);
}

int
main (int argc, char **argv)
{
  sc_MPI_Comm         mpicomm;
  int                 mpiret, retval;
  int                 rlevel;
  sc_options_t       *opt;

  mpiret = sc_MPI_Init (&argc, &argv);
  SC_CHECK_MPI (mpiret);
  mpicomm = sc_MPI_COMM_WORLD;

  sc_init (mpicomm, 1, 1, NULL, SC_LP_DEFAULT);
  p4est_init (NULL, SC_LP_DEFAULT);

  opt = sc_options_new (argv[0]);
  sc_options_add_int (opt, 'l', "level", &rlevel, 0,
                      "Maximum refinement level");
  retval = sc_options_parse (p4est_package_id, SC_LP_ERROR, opt, argc, argv);
  if (retval == -1 || retval < argc) {
    sc_options_print_usage (p4est_package_id, SC_LP_PRODUCTION, opt, NULL);
    sc_abort_collective ("Usage error");
  }

  run_menger (mpicomm, rlevel);

  sc_options_destroy (opt);

  sc_finalize ();

  mpiret = sc_MPI_Finalize ();
  SC_CHECK_MPI (mpiret);

  return 0;
}
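
For completeness: the 3D (hex) version presumably follows p4est's usual two-file convention, in which a thin wrapper defines P4_TO_P8 before re-including the 2D source. This wrapper is an assumption modeled on example/timings/bricks3.c and is not part of the report:

/* hypothetical bricks3.c-style wrapper, assuming the 2D source above is
 * saved as bricks2.c: p4est_to_p8est.h defines P4_TO_P8 and maps all
 * p4est_* identifiers to their p8est_* counterparts */
#include <p4est_to_p8est.h>
#include "bricks2.c"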

Calling it with option -l 12 for the hex mesh, the output reads:

[libsc] This is libsc 2.8.6
[libsc] CPP                      mpicc -E
[libsc] CPPFLAGS                 
[libsc] CC                       mpicc
[libsc] CFLAGS                   -g -O2
[libsc] LDFLAGS                  
[libsc] LIBS                     -lz -lm 
[p4est] This is p4est 2.8.6.5-9d68
[p4est] CPP                      mpicc -E
[p4est] CPPFLAGS                 
[p4est] CC                       mpicc
[p4est] CFLAGS                   -g -O2
[p4est] LDFLAGS                  
[p4est] LIBS                     -lz -lm 
[p4est] Run Menger till level 12
[p4est] Into p8est_new with min quadrants 0 level 1 uniform 1
[p4est]  New p8est with 1 trees on 1 processors
[p4est]  Initial level 1 potential global quadrants 8 per tree 8
[p4est] Done p8est_new with 8 total quadrants
[p4est] Into p8est_refine with 8 total quadrants, allowed level 29
[p4est] Done p8est_refine with -2112846816 total quadrants
[p4est] Into p8est_partition with -2112846816 total quadrants
[p4est] Done p8est_partition no shipping
[p4est] Into p8est_ghost_new FACE
[p4est] Done p8est_ghost_new
[p4est] Timings 88.2357 7.48e-06 0 9.86389

Additional information

Configure: ../p4est/configure --prefix="$(pwd)/local" --enable-mpi
Compiler: gcc 12.1.0
OS: Linux 6.8.0-47-generic

I also tested this on a cluster with a different environment and compiler, with the same result.

I get the same wrong result with an analogous setup in t8code; see this PR: DLR-AMR/t8code#1287

jmark commented Nov 8, 2024

With debug flags enabled, the output reads:

[libsc] This is libsc 2.8.6
[libsc] CPP                      mpicc -E
[libsc] CPPFLAGS                 
[libsc] CC                       mpicc
[libsc] CFLAGS                   -g -O2
[libsc] LDFLAGS                  
[libsc] LIBS                     -lz -lm 
[p4est] This is p4est 2.8.6.5-9d68
[p4est] CPP                      mpicc -E
[p4est] CPPFLAGS                 
[p4est] CC                       mpicc
[p4est] CFLAGS                   -g -O2
[p4est] LDFLAGS                  
[p4est] LIBS                     -lz -lm 
[p4est] Run Menger till level 12
[p4est] Into p8est_new with min quadrants 0 level 1 uniform 1
[p4est]  New p8est with 1 trees on 1 processors
[p4est]  Initial level 1 potential global quadrants 8 per tree 8
[p4est 0]  first tree 0 first quadrant 0 global quadrant 0
[p4est 0]  last tree 0 last quadrant 7 global quadrant 7
[p4est 0]  total local quadrants 8
[p4est] Done p8est_new with 8 total quadrants
[p4est] Into p8est_refine with 8 total quadrants, allowed level 29
[p4est 0]  Into refine tree 0 with 8
[p4est 0]  Done refine tree 0 now 2182120480
[libsc 0] Abort: Assertion 'p4est->global_num_quadrants >= old_gnq'
[libsc 0] Abort: /home/mark_jo/codes/p4est/master/src/p4est.c:922
[libsc 0] Abort: Obtained 8 stack frames
[libsc 0] Stack 0: libsc.so.3(+0xd3a6) [0x78a8102033a6]
[libsc 0] Stack 1: libsc.so.3(sc_abort+0xa) [0x78a8102026ba]
[libsc 0] Stack 2: libsc.so.3(+0xcbb6) [0x78a810202bb6]
[libsc 0] Stack 3: libp4est.so.3(p8est_refine_ext+0x116d) [0x78a8102b6c1d]
[libsc 0] Stack 4: p8est_bricks() [0x401322]
[libsc 0] Stack 5: libc.so.6(+0x29d90) [0x78a80f629d90]
[libsc 0] Stack 6: libc.so.6(__libc_start_main+0x80) [0x78a80f629e40]
[libsc 0] Stack 7: p8est_bricks() [0x4014e5]
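
The printed numbers are consistent with a 32-bit wraparound: the debug run counts 2182120480 quadrants after refining tree 0, which exceeds INT32_MAX = 2147483647, and 2182120480 - 2^32 = -2112846816 is exactly the negative total reported by the non-debug run. A minimal stand-alone check of this arithmetic (illustration only, not code from the thread):

#include <inttypes.h>
#include <stdio.h>

int
main (void)
{
  /* the true quadrant count from the debug log above */
  int64_t             true_count = INT64_C (2182120480);
  /* truncating to a signed 32-bit counter wraps modulo 2^32
   * (implementation-defined before C23, two's complement in practice) */
  int32_t             wrapped = (int32_t) true_count;

  printf ("%" PRId32 "\n", wrapped);    /* prints -2112846816 */
  return 0;
}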

cburstedde (Owner) commented

Thanks for the report. We use 32-bit integers for local (per-process) counts and 64-bit integers for global (summed over all processes) counts. We do not check whether we overrun the 32 bits locally, which is what the example provokes. If you want to play it safe, you may refine non-recursively and partition between the calls to refine for a more even load balance.
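
A minimal sketch of that suggestion (an untested assumption about how run_menger could be restructured, not code from this thread): refine one level per pass without recursion and repartition between passes, so that no single process accumulates close to INT32_MAX local quadrants.

  int                 l;

  /* replaces the single recursive p4est_refine call in run_menger */
  for (l = 0; l < rlevel; ++l) {
    p4est_refine (p4est, 0 /* non-recursive */, refine_fractal, NULL);
    p4est_partition (p4est, 0, NULL);
  }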

cburstedde commented Nov 18, 2024

Our counts are signed, on purpose, so you have INT32_MAX available, separately on every process.

If you really want to go over this bound locally, redefine p4est_locidx_t to 64 bits in p4est_base.h, along with the other constants there.
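
For reference, the definitions in question look roughly as follows, and a 64-bit variant would swap the type and its companion constants accordingly. This is an abridged illustration; the exact set of macros in p4est_base.h differs between versions:

/* abridged 32-bit definitions as found in p4est_base.h */
typedef int32_t     p4est_locidx_t;
#define P4EST_LOCIDX_MAX INT32_MAX
#define P4EST_MPI_LOCIDX sc_MPI_INT

/* hypothetical 64-bit replacements */
typedef int64_t     p4est_locidx_t;
#define P4EST_LOCIDX_MAX INT64_MAX
#define P4EST_MPI_LOCIDX sc_MPI_LONG_LONG_INT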

jmark commented Nov 19, 2024

I see. Thank you very much for the explanation!

jmark closed this as completed Nov 19, 2024