KMR
Functions
kmrckpt.c File Reference

Checkpoint/Restart Support. More...

#include <mpi.h>
#include <assert.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <dirent.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <string.h>
#include <fcntl.h>
#include <errno.h>
#include <limits.h>
#include "../config.h"
#include "kmr.h"
#include "kmrimpl.h"
#include "kmrckpt.h"

Go to the source code of this file.

Functions

static int kmr_ckpt_check_restart (KMR *, int **, int *, int *)
 
static int kmr_ckpt_cmp_long (const void *v1, const void *v2)
 
void kmr_ckpt_create_context (KMR *mr)
 Initialize checkpoint context. More...
 
static void kmr_ckpt_delete_ckpt_data (KMR *, long)
 
static void kmr_ckpt_delete_ckpt_files (KMR *, const char *, int)
 
int kmr_ckpt_disable_ckpt (KMR *mr)
 It temporarily disables checkpoint/restart. More...
 
int kmr_ckpt_enable_ckpt (KMR *mr, int lock_id)
 It temporarily enables checkpoint/restart which has been disabled by calling kmr_ckpt_disable_ckpt(). More...
 
int kmr_ckpt_enabled (KMR *mr)
 Check if checkpoint/restart is enabled. More...
 
static void kmr_ckpt_fin_log (KMR *)
 
static struct kmr_ckpt_data_file * kmr_ckpt_find_data_file (long kvs_id, struct kmr_ckpt_data_file *dataflst, int nfiles)
 
long kmr_ckpt_first_unprocessed_kv (KMR *mr)
 It returns the index of the first unprocessed key-value in the input KVS. More...
 
static void kmr_ckpt_flush (KMR *, FILE *)
 
void kmr_ckpt_free_context (KMR *mr)
 Free checkpoint context. More...
 
static void kmr_ckpt_get_data_flist (KMR *, const char *, struct kmr_ckpt_data_file **, int *, _Bool)
 
static void kmr_ckpt_init_data_file (KMR *mr, const char *dname, const char *fname, _Bool setall, struct kmr_ckpt_data_file *file)
 
static void kmr_ckpt_init_environment (KMR *)
 
static void kmr_ckpt_init_log (KMR *, const char *)
 
static void kmr_ckpt_int_list_add (struct kmr_ckpt_list *, long)
 
static void * kmr_ckpt_int_list_alocfn (void *val)
 
static int kmr_ckpt_int_list_compfn (void *v1, void *v2)
 
static long kmr_ckpt_int_list_del (struct kmr_ckpt_list *, long)
 
static void kmr_ckpt_int_list_free (struct kmr_ckpt_list *)
 
static void kmr_ckpt_int_list_freefn (void *val)
 
static void kmr_ckpt_int_list_init (struct kmr_ckpt_list *)
 
static long kmr_ckpt_int_list_rsearch (struct kmr_ckpt_list *, long)
 
static long kmr_ckpt_int_list_search (struct kmr_ckpt_list *, long)
 
static long kmr_ckpt_kv_record_add (KMR_KVS *)
 
static void kmr_ckpt_kv_record_fin (KMR *)
 
static void kmr_ckpt_kv_record_init (KMR *, KMR_KVS *)
 
static void kmr_ckpt_kv_record_init_data (KMR *mr, KMR_KVS *kvs)
 
static void kmr_ckpt_kvs_chains_connect (struct kmr_ckpt_kvs_chains *, struct kmr_ckpt_operation)
 
static struct kmr_ckpt_list * kmr_ckpt_kvs_chains_find (struct kmr_ckpt_kvs_chains *, long)
 
static void kmr_ckpt_kvs_chains_free (struct kmr_ckpt_kvs_chains *)
 
static void kmr_ckpt_kvs_chains_init (struct kmr_ckpt_kvs_chains *)
 
static void kmr_ckpt_kvs_chains_new_chain (struct kmr_ckpt_kvs_chains *, struct kmr_ckpt_operation)
 
static void kmr_ckpt_list_add (struct kmr_ckpt_list *, void *)
 
static void * kmr_ckpt_list_del (struct kmr_ckpt_list *, void *)
 
static void kmr_ckpt_list_free (struct kmr_ckpt_list *)
 
static void kmr_ckpt_list_init (struct kmr_ckpt_list *, kmr_ckpt_list_alocfn_t, kmr_ckpt_list_freefn_t, kmr_ckpt_list_compfn_t)
 
static void * kmr_ckpt_list_rsearch (struct kmr_ckpt_list *, void *)
 
static void * kmr_ckpt_list_search (struct kmr_ckpt_list *, void *)
 
void kmr_ckpt_lock_finish (KMR *mr)
 Defines the end position of the code region that is referred to on restart. More...
 
void kmr_ckpt_lock_start (KMR *mr)
 Defines the start position of the code region that is referred to on restart. More...
 
static void kmr_ckpt_log_block_add (KMR *, long, long)
 
static void kmr_ckpt_log_block_finish (KMR *)
 
static void kmr_ckpt_log_block_start (KMR *, KMR_KVS *)
 
static void kmr_ckpt_log_deletable (KMR *, long)
 
static void kmr_ckpt_log_delete_finish (KMR *, long)
 
static void kmr_ckpt_log_delete_start (KMR *, long)
 
static void kmr_ckpt_log_index_add (KMR *, long, long)
 
static void kmr_ckpt_log_index_finish (KMR *)
 
static void kmr_ckpt_log_index_start (KMR *, KMR_KVS *)
 
static void kmr_ckpt_log_progress (KMR *)
 
static void kmr_ckpt_log_skipped (KMR *)
 
static void kmr_ckpt_log_whole_finish (KMR *)
 
static void kmr_ckpt_log_whole_start (KMR *)
 
static void kmr_ckpt_make_fname (const char *, const char *, enum kmr_ckpt_type, int, long, char *, size_t)
 
static int kmr_ckpt_merge_check_ignorable (struct kmr_ckpt_kvs_chains *, long)
 
static void kmr_ckpt_merge_ignore_ckpt_data (long, struct kmr_ckpt_prev_state *, struct kmr_ckpt_merge_ctx *)
 
static void kmr_ckpt_merge_sort_data (KMR *, const char *, long, struct kmr_ckpt_merge_source *)
 
static void kmr_ckpt_merge_store_ckpt_data (long, int, long, struct kmr_ckpt_prev_state *, struct kmr_ckpt_merge_ctx *)
 
static void kmr_ckpt_merge_update_ckpt_data (long, int, long, long, struct kmr_ckpt_list *, struct kmr_ckpt_prev_state *, struct kmr_ckpt_merge_ctx *)
 
static void kmr_ckpt_merge_write_file (KMR *, const char *, struct kmr_ckpt_merge *)
 
static FILE * kmr_ckpt_open (KMR_KVS *, const char *)
 
static FILE * kmr_ckpt_open_log (KMR *, const char *, struct kmr_ckpt_log *, unsigned long *)
 
static FILE * kmr_ckpt_open_path (KMR *, const char *, const char *)
 
static void kmr_ckpt_opr_list_add (struct kmr_ckpt_list *, struct kmr_ckpt_operation)
 
static void * kmr_ckpt_opr_list_alocfn (void *val)
 
static int kmr_ckpt_opr_list_compfn (void *v1, void *v2)
 
static void kmr_ckpt_opr_list_free (struct kmr_ckpt_list *)
 
static void kmr_ckpt_opr_list_freefn (void *val)
 
static void kmr_ckpt_opr_list_init (struct kmr_ckpt_list *)
 
void kmr_ckpt_progress_fin (KMR *mr)
 It finalizes the progress of MapReduce checkpointing. More...
 
int kmr_ckpt_progress_init (KMR_KVS *kvi, KMR_KVS *kvo, struct kmr_option opt)
 It initializes the progress of MapReduce checkpointing. More...
 
void kmr_ckpt_remove_ckpt (KMR_KVS *kvs)
 It removes the checkpoint data file. More...
 
void kmr_ckpt_restore_ckpt (KMR_KVS *kvs)
 It restores checkpoint data to kvs. More...
 
static void kmr_ckpt_restore_prev_progress (KMR *, int *, int)
 
static void kmr_ckpt_restore_prev_progress_all (KMR *mr, int *target_ranks, int target_rank_count)
 
static void kmr_ckpt_restore_prev_progress_selective (KMR *mr, int *target_ranks, int target_rank_count)
 
static void kmr_ckpt_restore_prev_state (KMR *, const char *, int *, int, int)
 
static void kmr_ckpt_restore_prev_state_each_rank (KMR *, struct kmr_ckpt_prev_state *, struct kmr_ckpt_merge_ctx *)
 
static void kmr_ckpt_restore_prev_state_each_rank_all (KMR *mr, struct kmr_ckpt_prev_state *prev_state, struct kmr_ckpt_merge_ctx *merge_ctx)
 
static void kmr_ckpt_restore_prev_state_each_rank_selective (KMR *mr, struct kmr_ckpt_prev_state *prev_state, struct kmr_ckpt_merge_ctx *merge_ctx)
 
static void kmr_ckpt_save_ckpt (KMR_KVS *)
 
void kmr_ckpt_save_kvo_block_add (KMR *mr, KMR_KVS *kvo, long nkvi)
 It adds a new block of key-value pairs of the output KVS to the checkpoint data file. More...
 
void kmr_ckpt_save_kvo_block_fin (KMR *mr, KMR_KVS *kvo)
 It finalizes saving blocks of key-value pairs of the output KVS to the checkpoint data file. More...
 
void kmr_ckpt_save_kvo_block_init (KMR *mr, KMR_KVS *kvo)
 It initializes saving blocks of key-value pairs of the output KVS to a checkpoint data file. More...
 
void kmr_ckpt_save_kvo_each_add (KMR *mr, KMR_KVS *kvo, long ikv_index)
 It adds new key-value pairs of the output KVS to the checkpoint data file. More...
 
void kmr_ckpt_save_kvo_each_fin (KMR *mr, KMR_KVS *kvo)
 It finalizes saving indexed key-value pairs of the output KVS to the checkpoint data file. More...
 
void kmr_ckpt_save_kvo_each_init (KMR *mr, KMR_KVS *kvo)
 It initializes saving indexed key-value pairs of the output KVS to a checkpoint data file. More...
 
void kmr_ckpt_save_kvo_whole (KMR *mr, KMR_KVS *kvo)
 It saves all key-value pairs in the output KVS to a checkpoint data file. More...
 
static void kmr_ckpt_save_log2 (KMR *mr, int state)
 
static void kmr_ckpt_save_log4 (KMR *mr, int state, long nkvi, long nkvo)
 
static void kmr_ckpt_save_log_del (KMR *mr, int state, long kvs_id)
 
static void kmr_ckpt_save_log_lock (KMR *mr, int state)
 
static void kmr_ckpt_save_log_raw (KMR *mr, struct kmr_ckpt_log_entry *ckptle)
 
static void kmr_ckpt_save_nprocs (KMR *, const char *)
 
static _Bool kmr_ckpt_write_file_p (KMR *mr)
 

Detailed Description

Checkpoint/Restart Support.

Definition in file kmrckpt.c.

Function Documentation

◆ kmr_ckpt_create_context()

void kmr_ckpt_create_context ( KMR * mr)

Initialize checkpoint context.

This function should be called only once when the MapReduce data type is initialized.

Parameters
[in] mr MapReduce data type

Definition at line 119 of file kmrckpt.c.

◆ kmr_ckpt_free_context()

void kmr_ckpt_free_context ( KMR * mr)

Free checkpoint context.

This function should be called only once when the MapReduce data type is freed.

Parameters
[in] mr MapReduce data type

Definition at line 162 of file kmrckpt.c.

◆ kmr_ckpt_lock_start()

void kmr_ckpt_lock_start ( KMR * mr)

Defines the start position of the code region that is referred to on restart.

If an execution is stopped due to an error in this region, restarting with a different number of processes is not allowed.

Parameters
[in] mr MapReduce data type

Definition at line 1934 of file kmrckpt.c.

◆ kmr_ckpt_lock_finish()

void kmr_ckpt_lock_finish ( KMR * mr)

Defines the end position of the code region that is referred to on restart.

If an execution is stopped due to an error in this region, restarting with a different number of processes is not allowed.

Parameters
[in] mr MapReduce data type

Definition at line 1945 of file kmrckpt.c.

◆ kmr_ckpt_enabled()

int kmr_ckpt_enabled ( KMR * mr)

Check if checkpoint/restart is enabled.

Parameters
[in] mr MapReduce data type
Returns
It returns 1 if checkpoint/restart is enabled. Otherwise it returns 0.

Definition at line 2479 of file kmrckpt.c.

◆ kmr_ckpt_disable_ckpt()

int kmr_ckpt_disable_ckpt ( KMR * mr)

It temporarily disables checkpoint/restart.

Parameters
[in] mr MapReduce data type
Returns
If disabling succeeds, it returns a lock ID. Otherwise it returns 0.

Definition at line 2495 of file kmrckpt.c.

◆ kmr_ckpt_enable_ckpt()

int kmr_ckpt_enable_ckpt ( KMR * mr,
int  lock_id 
)

It temporarily enables checkpoint/restart which has been disabled by calling kmr_ckpt_disable_ckpt().

Parameters
[in] mr MapReduce data type
[in] lock_id ID of the lock returned by kmr_ckpt_disable_ckpt()
Returns
If enabling succeeds, it returns 1. Otherwise it returns 0.

Definition at line 2516 of file kmrckpt.c.

◆ kmr_ckpt_first_unprocessed_kv()

long kmr_ckpt_first_unprocessed_kv ( KMR * mr)

It returns the index of the first unprocessed key-value in the input KVS.

Parameters
[in] mr MapReduce data type
Returns
It returns the index of the first unprocessed key-value in the input KVS.

Definition at line 2536 of file kmrckpt.c.

◆ kmr_ckpt_restore_ckpt()

void kmr_ckpt_restore_ckpt ( KMR_KVS * kvs)

It restores checkpoint data to kvs.

Parameters
[out] kvs a KVS where the checkpoint data will be restored

Definition at line 2558 of file kmrckpt.c.

◆ kmr_ckpt_remove_ckpt()

void kmr_ckpt_remove_ckpt ( KMR_KVS * kvs)

It removes the checkpoint data file.

Parameters
[in] kvs KVS whose checkpoint data is removed

Definition at line 2613 of file kmrckpt.c.

◆ kmr_ckpt_save_kvo_whole()

void kmr_ckpt_save_kvo_whole ( KMR * mr,
KMR_KVS * kvo 
)

It saves all key-value pairs in the output KVS to a checkpoint data file.

Parameters
[in] mr MapReduce data type
[in] kvo output KVS

Definition at line 2639 of file kmrckpt.c.

◆ kmr_ckpt_save_kvo_block_init()

void kmr_ckpt_save_kvo_block_init ( KMR * mr,
KMR_KVS * kvo 
)

It initializes saving blocks of key-value pairs of the output KVS to a checkpoint data file.

Parameters
[in] mr MapReduce data type
[in] kvo output KVS

Definition at line 2655 of file kmrckpt.c.

◆ kmr_ckpt_save_kvo_block_add()

void kmr_ckpt_save_kvo_block_add ( KMR * mr,
KMR_KVS * kvo,
long  nkvi 
)

It adds a new block of key-value pairs of the output KVS to the checkpoint data file.

Parameters
[in] mr MapReduce data type
[in] kvo output KVS
[in] nkvi number of processed key-value pairs in the input KVS

Definition at line 2671 of file kmrckpt.c.

◆ kmr_ckpt_save_kvo_block_fin()

void kmr_ckpt_save_kvo_block_fin ( KMR * mr,
KMR_KVS * kvo 
)

It finalizes saving blocks of key-value pairs of the output KVS to the checkpoint data file.

Parameters
[in] mr MapReduce data type
[in] kvo output KVS

Definition at line 2686 of file kmrckpt.c.

◆ kmr_ckpt_save_kvo_each_init()

void kmr_ckpt_save_kvo_each_init ( KMR * mr,
KMR_KVS * kvo 
)

It initializes saving indexed key-value pairs of the output KVS to a checkpoint data file.

Parameters
[in] mr MapReduce data type
[in] kvo output KVS

Definition at line 2704 of file kmrckpt.c.

◆ kmr_ckpt_save_kvo_each_add()

void kmr_ckpt_save_kvo_each_add ( KMR * mr,
KMR_KVS * kvo,
long  ikv_index 
)

It adds new key-value pairs of the output KVS to the checkpoint data file.

Parameters
[in] mr MapReduce data type
[in] kvo output KVS
[in] ikv_index index of the processed key-value pair in the input KVS

Definition at line 2719 of file kmrckpt.c.

◆ kmr_ckpt_save_kvo_each_fin()

void kmr_ckpt_save_kvo_each_fin ( KMR * mr,
KMR_KVS * kvo 
)

It finalizes saving indexed key-value pairs of the output KVS to the checkpoint data file.

Parameters
[in] mr MapReduce data type
[in] kvo output KVS

Definition at line 2734 of file kmrckpt.c.

◆ kmr_ckpt_progress_init()

int kmr_ckpt_progress_init ( KMR_KVS * kvi,
KMR_KVS * kvo,
struct kmr_option  opt 
)

It initializes the progress of MapReduce checkpointing.

Parameters
[in] kvi input KVS to a MapReduce operation
[in] kvo output KVS to the MapReduce operation
[in] opt struct kmr_option
Returns
It returns 1 if the operation can be skipped. Otherwise it returns 0.

Definition at line 2754 of file kmrckpt.c.

◆ kmr_ckpt_progress_fin()

void kmr_ckpt_progress_fin ( KMR * mr)

It finalizes the progress of MapReduce checkpointing.

Parameters
[in] mr MapReduce data type

Definition at line 2846 of file kmrckpt.c.