diff --git a/fftw3_2d_mpi_example/fftw3_2d_mpi_example.c b/fftw3_2d_mpi_example/fftw3_2d_mpi_example.c
index 4282c770694699c3e5279e5d3947abaaf363c916..c97a28b23a89b265a01e7b5905e998fba2da4c16 100644
--- a/fftw3_2d_mpi_example/fftw3_2d_mpi_example.c
+++ b/fftw3_2d_mpi_example/fftw3_2d_mpi_example.c
@@ -9,7 +9,9 @@ int main(int argc, char **argv)
   fftw_plan plan;
   fftw_complex *data;
   ptrdiff_t alloc_local, local_n0, local_0_start, i, j;
-
-  if(argc < 10) {
+  int rank, nranks;  /* filled in by MPI_Comm_rank / MPI_Comm_size */
+  double t0, t1;     /* MPI_Wtime() stamps taken around each timed phase */
+
+  if(argc < 2) {     /* the usage line asks for a single argument, <N> */
     fprintf(stderr, "USAGE: %s <N>\n", argv[0]);
     exit(-1);
@@ -20,15 +22,26 @@ int main(int argc, char **argv)
   MPI_Init(&argc, &argv);
   fftw_mpi_init();
 
+  MPI_Comm comm = MPI_COMM_WORLD;  /* one handle passed to the MPI and FFTW calls that follow */
+  MPI_Comm_rank(comm, &rank);      /* rank gates the timing output to process 0 */
+  MPI_Comm_size(comm, &nranks);
+
+  t0 = MPI_Wtime();  /* start of the "alloc" phase */
   /* get local data size and allocate */
-  alloc_local = fftw_mpi_local_size_2d(N0, N1, MPI_COMM_WORLD,
+  alloc_local = fftw_mpi_local_size_2d(N0, N1, comm,
 				       &local_n0, &local_0_start);
   data = fftw_alloc_complex(alloc_local);
+  t1 = MPI_Wtime();
+  if(rank==0)printf("t_%s = %f seconds\n","alloc",t1-t0);  /* elapsed time as measured on rank 0 */
 
+  t0 = MPI_Wtime();  /* start of the "plan" phase */
   /* create plan for in-place forward DFT */
-  plan = fftw_mpi_plan_dft_2d(N0, N1, data, data, MPI_COMM_WORLD,
+  plan = fftw_mpi_plan_dft_2d(N0, N1, data, data, comm,
 			      FFTW_FORWARD, FFTW_ESTIMATE);    
+  t1 = MPI_Wtime();
+  if(rank==0)printf("t_%s = %f seconds\n","plan",t1-t0);
 
+  t0 = MPI_Wtime();  /* start of the "init" phase */
   /* initialize data to some function my_function(x,y) */
   for (i = 0; i < local_n0; ++i)
     for (j = 0; j < N1; ++j) {
@@ -36,10 +49,16 @@ int main(int argc, char **argv)
       if( (local_0_start + i) == 0 && j == 0)
 	data[i*N1 + j][0] = 1.0;
     }
+  t1 = MPI_Wtime();
+  if(rank==0)printf("t_%s = %f seconds\n","init",t1-t0);
 
+  t0 = MPI_Wtime();  /* start of the "fftf" phase: executing the plan */
   /* compute transforms, in-place, as many times as desired */
   fftw_execute(plan);
-
+  t1 = MPI_Wtime();
+  if(rank==0)printf("t_%s = %f seconds\n","fftf",t1-t0);
+
   fftw_destroy_plan(plan);
+  fftw_free(data);  /* release the buffer obtained from fftw_alloc_complex() */
 
   MPI_Finalize();
diff --git a/fftw3_2d_mpi_example/fftw3_2d_mpi_example_omp.c b/fftw3_2d_mpi_example/fftw3_2d_mpi_example_omp.c
new file mode 100644
index 0000000000000000000000000000000000000000..9d3d7bb266544c958e9320f6cb7a0494ed02ba91
--- /dev/null
+++ b/fftw3_2d_mpi_example/fftw3_2d_mpi_example_omp.c
@@ -0,0 +1,97 @@
+/*
+ * 2D in-place forward complex DFT with FFTW3, parallelized with MPI and
+ * OpenMP threads.  Rank 0 prints the wall-clock time of each phase
+ * (allocation, planning, initialization, transform).
+ */
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <fftw3-mpi.h>
+#include <omp.h>
+
+/* Nonzero once MPI grants MPI_THREAD_FUNNELED and fftw_init_threads() succeeds. */
+int threads_ok;
+
+int main(int argc, char **argv)
+{
+  ptrdiff_t N0, N1;
+  fftw_plan plan;
+  fftw_complex *data;
+  ptrdiff_t alloc_local, local_n0, local_0_start;
+  int provided;
+  int rank, nranks;
+  double t0, t1;
+
+  /* Exactly one argument is needed: the edge length N of the N x N grid. */
+  if(argc < 2) {
+    fprintf(stderr, "USAGE: %s <N>\n", argv[0]);
+    exit(-1);
+  }
+  N0 = atoi(argv[1]);
+  if(N0 < 1) {
+    /* atoi() yields 0 for non-numeric input; a transform needs N >= 1. */
+    fprintf(stderr, "%s: N must be a positive integer\n", argv[0]);
+    exit(-1);
+  }
+  N1 = N0;
+
+  MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
+  threads_ok = provided >= MPI_THREAD_FUNNELED;
+  /* fftw_init_threads() is called ahead of fftw_mpi_init() and any planning. */
+  if (threads_ok) threads_ok = fftw_init_threads();
+  fftw_mpi_init();
+  int nthreads = omp_get_max_threads();
+  if (threads_ok) fftw_plan_with_nthreads(nthreads);
+
+  MPI_Comm comm = MPI_COMM_WORLD;
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &nranks);
+
+  if(rank==0) {
+    printf("\nomp_max_threads = %d\n",nthreads);
+    printf("fftw_planner_nthreads = %d\n\n",fftw_planner_nthreads());
+  }
+
+  t0 = MPI_Wtime();
+  /* get local data size and allocate */
+  alloc_local = fftw_mpi_local_size_2d(N0, N1, comm,
+                                       &local_n0, &local_0_start);
+  data = fftw_alloc_complex(alloc_local);
+  t1 = MPI_Wtime();
+  if(rank==0)printf("t_%s = %f seconds\n","alloc",t1-t0);
+  if(data == NULL) {
+    fprintf(stderr, "rank %d: fftw_alloc_complex failed\n", rank);
+    MPI_Abort(comm, 1);
+  }
+
+  t0 = MPI_Wtime();
+  /* create plan for in-place forward DFT */
+  plan = fftw_mpi_plan_dft_2d(N0, N1, data, data, comm,
+                              FFTW_FORWARD, FFTW_ESTIMATE);
+  t1 = MPI_Wtime();
+  if(rank==0)printf("t_%s = %f seconds\n","plan",t1-t0);
+
+  t0 = MPI_Wtime();
+  /* zero the local rows and set the real part to 1 where local_0_start + i == 0
+     and j == 0; both indices are loop-scoped so every thread has its own copy */
+#pragma omp parallel for
+  for (ptrdiff_t i = 0; i < local_n0; ++i)
+    for (ptrdiff_t j = 0; j < N1; ++j) {
+      data[i*N1 + j][0] = data[i*N1 + j][1] = 0.0;
+      if( (local_0_start + i) == 0 && j == 0)
+        data[i*N1 + j][0] = 1.0;
+    }
+  t1 = MPI_Wtime();
+  if(rank==0)printf("t_%s = %f seconds\n","init",t1-t0);
+
+  t0 = MPI_Wtime();
+  /* compute transforms, in-place, as many times as desired */
+  fftw_execute(plan);
+  t1 = MPI_Wtime();
+  if(rank==0)printf("t_%s = %f seconds\n","fftf",t1-t0);
+
+  fftw_destroy_plan(plan);
+  fftw_free(data);
+
+  MPI_Finalize();
+}
diff --git a/fftw3_2d_mpi_example/makefile_perlmutter_cpu b/fftw3_2d_mpi_example/makefile_perlmutter_cpu
index 215341e884c53a4bb1c462a4ee0e2acb5d5fc0ff..083e53d3eb5b5ba852133fa5f5f7e70677f8b0bf 100644
--- a/fftw3_2d_mpi_example/makefile_perlmutter_cpu
+++ b/fftw3_2d_mpi_example/makefile_perlmutter_cpu
@@ -1,11 +1,25 @@
 FFTW_HOME = ../fftw-gcc-cc/3.3.10
 
 MPICC = cc
+
+# Flags for the MPI-only build.
 MPI_C_FLAGS = -g -O3 -I${FFTW_HOME}/include
 MPI_LD_FLAGS = -L${FFTW_HOME}/lib -lfftw3_mpi -lfftw3 -lm
 
+# Flags for the MPI + OpenMP build: adds -fopenmp and links libfftw3_omp.
+MPI_OMP_C_FLAGS = -g -O3 -fopenmp -I${FFTW_HOME}/include
+MPI_OMP_LD_FLAGS = -L${FFTW_HOME}/lib -lfftw3_mpi -lfftw3_omp -lfftw3 -lm
+
+# all and clean name actions, not files, so make must never treat them as up to date.
+.PHONY: all clean
+
+all: fftw3_2d_mpi_example fftw3_2d_mpi_example_omp
+
 fftw3_2d_mpi_example: fftw3_2d_mpi_example.c
-	${MPICC} fftw3_2d_mpi_example.c ${MPI_C_FLAGS} ${MPI_LD_FLAGS} -o fftw3_2d_mpi_example
+	${MPICC} $^ ${MPI_C_FLAGS} ${MPI_LD_FLAGS} -o $@
+
+fftw3_2d_mpi_example_omp: fftw3_2d_mpi_example_omp.c
+	${MPICC} $^ ${MPI_OMP_C_FLAGS} ${MPI_OMP_LD_FLAGS} -o $@
 
 clean:
-	rm -f fftw3_2d_mpi_example
+	rm -f fftw3_2d_mpi_example fftw3_2d_mpi_example_omp