HDF5

The HDF Group [1] provides technologies and services for the management of large and complex data collections. Hierarchical Data Format version 5 (HDF5) is an open-source file format that supports large, complex, heterogeneous data. HDF5 uses a "file directory"-like structure that lets you organize data within the file in many different ways, much as you organize files on your computer. The format also allows metadata to be embedded, making files self-describing. For more details, see the NEON Science Learning Hub [2].
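As an illustration of this directory-like layout and embedded metadata, the following minimal sketch uses the HDF5 C API to create a group, a dataset inside it, and a string attribute. The file and object names (demo.h5, /results, temperature, units) are illustrative, not part of this guide's examples.

/* Sketch: hierarchical layout plus self-describing metadata in HDF5 */
#include "hdf5.h"

int main (void)
{
    double  data[4] = {21.5, 22.0, 22.4, 21.9};
    hsize_t dims[1] = {4};

    hid_t file  = H5Fcreate ("demo.h5", H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);

    /* A group behaves like a directory inside the file */
    hid_t group = H5Gcreate (file, "/results", H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);

    /* Store a dataset under /results, i.e. at path /results/temperature */
    hid_t space = H5Screate_simple (1, dims, NULL);
    hid_t dset  = H5Dcreate (group, "temperature", H5T_NATIVE_DOUBLE, space,
                             H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
    H5Dwrite (dset, H5T_NATIVE_DOUBLE, H5S_ALL, H5S_ALL, H5P_DEFAULT, data);

    /* Attach a string attribute: the metadata travels with the data */
    hid_t atype = H5Tcopy (H5T_C_S1);
    H5Tset_size (atype, 8);                       /* "celsius" + NUL */
    hid_t aspace = H5Screate (H5S_SCALAR);
    hid_t attr   = H5Acreate (dset, "units", atype, aspace,
                              H5P_DEFAULT, H5P_DEFAULT);
    H5Awrite (attr, atype, "celsius");

    H5Aclose (attr);  H5Sclose (aspace); H5Tclose (atype);
    H5Dclose (dset);  H5Sclose (space);  H5Gclose (group);
    H5Fclose (file);
    return 0;
}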

Installed Versions

All available versions of HDF5 can be listed with the following command (the same approach works for other applications):

module avail hdf5

Output:

---------------------- /usr/local/share/modulefiles -------------------------

hdf5/1.10.1

When multiple versions are installed, the default is identified by "(D)" after the module name and can be loaded with:

module load hdf5

Other versions of HDF5 can be loaded with:

module load hdf5/<version>

Running HDF5 on the HPC Cluster

Example: h5_copy18.c [3]

Copy the file h5_copy18.c to your home directory:

/************************************************************************
 *  PROGRAM:     h5_copy18.c
 *  PURPOSE:     Shows how to use the H5Scopy function.
 *  DESCRIPTION: This program creates two files, copy1.h5 and copy2.h5.
 *               In copy1.h5, it creates a 3x4 dataset called 'Copy1'
 *               and writes 0's to it. In copy2.h5, it creates a 3x4
 *               dataset called 'Copy2' and writes 1's to it.
 *               It closes both files, reopens them, selects two points
 *               in copy1.h5, and writes values to them. Then it does an
 *               H5Scopy from the first file to the second and writes
 *               the values to copy2.h5. It then closes the files,
 *               reopens them, and prints the contents of the two
 *               datasets.
 ************************************************************************/

#include <stdio.h>
#include "hdf5.h"

#define FILE1 "copy1.h5"
#define FILE2 "copy2.h5"
#define RANK  2
#define DIM1  3
#define DIM2  4
#define NUMP  2

int main (void)
{
    hid_t    file1, file2, dataset1, dataset2;
    hid_t    mid1, fid1, fid2;
    hsize_t  fdim[] = {DIM1, DIM2};
    int      buf1[DIM1][DIM2];
    int      buf2[DIM1][DIM2];
    int      bufnew[DIM1][DIM2];
    int      val[] = {53, 59};
    hsize_t  marray[] = {2};
    hsize_t  coord[NUMP][RANK];
    herr_t   ret;
    unsigned int i, j;

    /* Create two files containing identical datasets. Write 0's to one
     * and 1's to the other. */
    for (i = 0; i < DIM1; i++)
        for (j = 0; j < DIM2; j++)
            buf1[i][j] = 0;

    for (i = 0; i < DIM1; i++)
        for (j = 0; j < DIM2; j++)
            buf2[i][j] = 1;

    file1 = H5Fcreate (FILE1, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
    file2 = H5Fcreate (FILE2, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);

    fid1 = H5Screate_simple (RANK, fdim, NULL);
    fid2 = H5Screate_simple (RANK, fdim, NULL);

    dataset1 = H5Dcreate (file1, "Copy1", H5T_NATIVE_INT, fid1, H5P_DEFAULT,
                          H5P_DEFAULT, H5P_DEFAULT);
    dataset2 = H5Dcreate (file2, "Copy2", H5T_NATIVE_INT, fid2, H5P_DEFAULT,
                          H5P_DEFAULT, H5P_DEFAULT);

    ret = H5Dwrite (dataset1, H5T_NATIVE_INT, H5S_ALL, H5S_ALL, H5P_DEFAULT, buf1);
    ret = H5Dwrite (dataset2, H5T_NATIVE_INT, H5S_ALL, H5S_ALL, H5P_DEFAULT, buf2);

    ret = H5Dclose (dataset1);
    ret = H5Dclose (dataset2);
    ret = H5Sclose (fid1);
    ret = H5Sclose (fid2);
    ret = H5Fclose (file1);
    ret = H5Fclose (file2);

    /* Open the two files. Select two points in one file, write values to
     * those point locations, then do H5Scopy and write the values to the
     * other file. Close files. */
    file1 = H5Fopen (FILE1, H5F_ACC_RDWR, H5P_DEFAULT);
    file2 = H5Fopen (FILE2, H5F_ACC_RDWR, H5P_DEFAULT);

    dataset1 = H5Dopen (file1, "Copy1", H5P_DEFAULT);
    dataset2 = H5Dopen (file2, "Copy2", H5P_DEFAULT);

    fid1 = H5Dget_space (dataset1);
    mid1 = H5Screate_simple (1, marray, NULL);

    coord[0][0] = 0; coord[0][1] = 3;
    coord[1][0] = 0; coord[1][1] = 1;

    ret = H5Sselect_elements (fid1, H5S_SELECT_SET, NUMP, (const hsize_t *)&coord);
    ret = H5Dwrite (dataset1, H5T_NATIVE_INT, mid1, fid1, H5P_DEFAULT, val);

    fid2 = H5Scopy (fid1);
    ret  = H5Dwrite (dataset2, H5T_NATIVE_INT, mid1, fid2, H5P_DEFAULT, val);

    ret = H5Dclose (dataset1);
    ret = H5Dclose (dataset2);
    ret = H5Sclose (fid1);
    ret = H5Sclose (fid2);
    ret = H5Fclose (file1);
    ret = H5Fclose (file2);
    ret = H5Sclose (mid1);

    /* Open both files and print the contents of the datasets. */
    file1 = H5Fopen (FILE1, H5F_ACC_RDWR, H5P_DEFAULT);
    file2 = H5Fopen (FILE2, H5F_ACC_RDWR, H5P_DEFAULT);
    dataset1 = H5Dopen (file1, "Copy1", H5P_DEFAULT);
    dataset2 = H5Dopen (file2, "Copy2", H5P_DEFAULT);

    ret = H5Dread (dataset1, H5T_NATIVE_INT, H5S_ALL, H5S_ALL,
                   H5P_DEFAULT, bufnew);
    printf ("\nDataset 'Copy1' in file 'copy1.h5' contains: \n");
    for (i = 0; i < DIM1; i++) {
        for (j = 0; j < DIM2; j++)
            printf ("%3d ", bufnew[i][j]);
        printf ("\n");
    }

    printf ("\nDataset 'Copy2' in file 'copy2.h5' contains: \n");
    ret = H5Dread (dataset2, H5T_NATIVE_INT, H5S_ALL, H5S_ALL,
                   H5P_DEFAULT, bufnew);
    for (i = 0; i < DIM1; i++) {
        for (j = 0; j < DIM2; j++)
            printf ("%3d ", bufnew[i][j]);
        printf ("\n");
    }

    ret = H5Dclose (dataset1);
    ret = H5Dclose (dataset2);
    ret = H5Fclose (file1);
    ret = H5Fclose (file2);

    return 0;
}

Interactive

Request a compute node

srun -t 1:00:00 --mem-per-cpu=4gb -c 4 --pty bash

Load the HDF5 module

module load hdf5/1.10.1

Compile the code. (For parallel code, load the hdf5 module and compile with h5pcc instead.)

h5cc -o example18 h5_copy18.c
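h5cc is a wrapper around the serial C compiler that adds the HDF5 include and library flags for you. To inspect the underlying compiler invocation, you can ask the wrapper to print it:

h5cc -show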

Run

./example18

Batch

Serial

Copy the following Slurm script, job.slurm, into the directory where you created the executable 'example18'.

#!/bin/bash

#SBATCH --time=10:00     # 10 minutes for this example

#SBATCH --mem-per-cpu=4gb

#SBATCH -c 4

#SBATCH --job-name=hdf5_Test_Ex18

#SBATCH --output=hdf5_ex18_%j.out

module load hdf5/1.10.1

cd $SLURM_SUBMIT_DIR
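# Run from the job's scratch space ($PFSDIR) and copy results back afterwards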

cp example18 $PFSDIR

cd $PFSDIR

echo $PFSDIR

./example18

cp -r * $SLURM_SUBMIT_DIR

Submit the job:

sbatch job.slurm
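You can monitor the job while it runs with:

squeue -u <username>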

You will get the output file hdf5_ex18_<jobid>.out with the following contents, along with two .h5 files, copy1.h5 and copy2.h5:

Output:

Dataset 'Copy1' in file 'copy1.h5' contains:

  0  59   0  53
  0   0   0   0
  0   0   0   0

Dataset 'Copy2' in file 'copy2.h5' contains:

  1  59   1  53
  1   1   1   1
  1   1   1   1
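In both datasets, the two values written through the point selection, 59 and 53, land at coordinates (0,1) and (0,3); every other element keeps the 0 or 1 it was originally written with.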


Parallel Job

There is a parallel C compiler, h5pcc, and an example script, /usr/local/doc/HDF5/parallel.slurm, that compiles and runs a parallel job through the batch system. The script requests 4 tasks to support multiple MPI tasks.

Copy the files parallel.slurm and hdp-file-create.c to your own directory structure and run the job with 'sbatch parallel.slurm'.
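For orientation, here is a minimal parallel HDF5 sketch (our own illustration, not the contents of hdp-file-create.c, which is not reproduced here). Each MPI rank writes its rank number into one element of a shared 1-D dataset; the output file name parallel.h5 is arbitrary. Compile it with h5pcc and run it under mpirun/srun.

/* Sketch: collective parallel I/O with the MPI-IO file driver */
#include <mpi.h>
#include "hdf5.h"

int main (int argc, char **argv)
{
    int rank, nprocs;
    MPI_Init (&argc, &argv);
    MPI_Comm_rank (MPI_COMM_WORLD, &rank);
    MPI_Comm_size (MPI_COMM_WORLD, &nprocs);

    /* All ranks open the file collectively via the MPI-IO driver */
    hid_t fapl = H5Pcreate (H5P_FILE_ACCESS);
    H5Pset_fapl_mpio (fapl, MPI_COMM_WORLD, MPI_INFO_NULL);
    hid_t file = H5Fcreate ("parallel.h5", H5F_ACC_TRUNC, H5P_DEFAULT, fapl);

    /* One dataset element per rank */
    hsize_t dims[1] = {(hsize_t)nprocs};
    hid_t filespace = H5Screate_simple (1, dims, NULL);
    hid_t dset = H5Dcreate (file, "ranks", H5T_NATIVE_INT, filespace,
                            H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);

    /* Each rank selects its own element and writes its rank number */
    hsize_t offset[1] = {(hsize_t)rank};
    hsize_t count[1]  = {1};
    H5Sselect_hyperslab (filespace, H5S_SELECT_SET, offset, NULL, count, NULL);
    hid_t memspace = H5Screate_simple (1, count, NULL);

    hid_t dxpl = H5Pcreate (H5P_DATASET_XFER);
    H5Pset_dxpl_mpio (dxpl, H5FD_MPIO_COLLECTIVE);
    H5Dwrite (dset, H5T_NATIVE_INT, memspace, filespace, dxpl, &rank);

    H5Pclose (dxpl);  H5Sclose (memspace); H5Sclose (filespace);
    H5Dclose (dset);  H5Pclose (fapl);     H5Fclose (file);
    MPI_Finalize ();
    return 0;
}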


References:

[1] The HDF Group website

[2] NEON Science Learning Hub tutorial

[3] HDF Tutorial