<< Previous question   Table of contents   MOSIX home   Next question >>

Question:

Example how to perform a checkpoint from within a program

Answer:

The following program performs 100 units of work and triggers a checkpoint right after the unit given by the <unit> argument. The <checkpoint-file> argument names the file in which the checkpointed copies of the program are saved.

#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>

// Set the checkpoint file from within the process.
// This can also be done via the -C argument to mosrun.
//
// The request is made through the open() call below; the file name is
// handed over in the third argument of that call.
//
// file: name of the checkpoint file; must not be NULL.
// Returns 1 on success, 0 when file is NULL or the open() call fails.
int setCheckpointFile(const char *file) {
     int fd;

     // Refuse a missing name instead of passing a null pointer along.
     if (file == NULL) {
        return 0;
     }

     fd = open("/proc/self/checkpointfile", 1|O_CREAT, file);
     if (fd == -1) {
        return 0;
     }
     return 1;

}

// Request a checkpoint of the calling process from inside that process.
// On success a confirmation is printed on stdout and 1 is returned;
// when the open() call fails an error is printed on stderr and 0 is returned.
int triggerCheckpoint() {
     const int rc = open("/proc/self/checkpoint", O_CREAT | 1, 1);

     if (rc != -1) {
        printf("Checkpoint was done successfully\n");
        return 1;
     }

     fprintf(stderr, "Error doing self checkpoint \n");
     return 0;
}

// Demo driver: runs 100 units of simulated work and triggers a checkpoint
// right after the unit selected on the command line.
//
// argv[1]: checkpoint file name.
// argv[2]: unit after which the checkpoint is triggered (1..99).
//
// Returns EXIT_SUCCESS when all units complete, EXIT_FAILURE on a usage
// error, on an invalid unit, or when setting the checkpoint file or
// triggering the checkpoint fails.
int main(int argc, char **argv) {
     // Change SPIN_ITERATIONS to make each unit consume more (or less) time,
     // and TOTAL_UNITS to run more (or fewer) units.
     enum { TOTAL_UNITS = 100, SPIN_ITERATIONS = 1000000 * 500 };

     int j, unit;
     volatile int t;   // volatile so the busy loop is not optimised away
     char *checkpointFileName;
     char *end;
     long parsedUnit;
     int checkpointUnit = 0;

     if(argc < 3) {
        fprintf(stderr, "Usage: %s <checkpoint-file> <unit>\n", argv[0]);
        return EXIT_FAILURE;
     }

     // strtol (unlike atoi) lets us reject empty input and trailing garbage.
     // The upper bound excludes TOTAL_UNITS itself: the loop below only
     // reaches units 0..TOTAL_UNITS-1, so a larger value would never trigger.
     parsedUnit = strtol(argv[2], &end, 10);
     if(end == argv[2] || *end != '\0' ||
        parsedUnit < 1 || parsedUnit >= TOTAL_UNITS) {
        fprintf(stderr, "Checkpoint unit should be > 0 and < 100\n");
        return EXIT_FAILURE;
     }
     checkpointUnit = (int)parsedUnit;

     // NOTE(review): the copy is deliberately never freed; it is unclear
     // whether the name must stay valid after setCheckpointFile() - confirm.
     checkpointFileName = strdup(argv[1]);
     if(checkpointFileName == NULL) {
        fprintf(stderr, "Out of memory copying the checkpoint file name\n");
        return EXIT_FAILURE;
     }

     printf("Checkpoint file: %s\n", checkpointFileName);
     printf("Checkpoint unit: %d\n", checkpointUnit);

     // Setting the checkpoint file from within the process (can also be done
     // using the -C argument of mosrun)
     if(!setCheckpointFile(checkpointFileName)) {
        fprintf(stderr, "Error setting the checkpoint filename from within the process\n");
        fprintf(stderr, "Make sure you are running this program via mosrun\n");
        return EXIT_FAILURE;
     }

     // Main loop: one iteration per unit of work
     for( unit = 0; unit < TOTAL_UNITS; unit++ ) {
        // Consuming some cpu time (simulating the run of the application)
        for( t = 0, j = 0; j < SPIN_ITERATIONS; j++ ) {
           t = j + unit*2;
        }
        printf("Unit %d done\n", unit);

        // Triggering a checkpoint request from within the process
        if(unit == checkpointUnit) {
           if(!triggerCheckpoint()) {
              return EXIT_FAILURE;
           }
        }
     }
     return EXIT_SUCCESS;
}

To compile: gcc -o checkpoint_demo checkpoint_demo.c
To run: mosrun ./checkpoint_demo <checkpoint-file> <unit>

A typical run:
> mosrun ./checkpoint_demo ccc 5
Checkpoint file: ccc
Checkpoint unit: 5
Unit 0 done
Unit 1 done
Unit 2 done
Unit 3 done
Unit 4 done
Unit 5 done
Checkpoint was done successfully
Unit 6 done
Unit 7 done
Unit 8 done
^C

The program triggered a checkpoint after unit 5. The checkpointed file was saved in ccc.1.
After unit 8 the program was killed.

To restart:
> mosrun -R ccc.1
Checkpoint was done successfully
Unit 6 done
Unit 7 done
Unit 8 done
Unit 9 done
Unit 10 done
...

The program was restarted from the point right after it was checkpointed.