From 1dc9a8c8d71d14cf373df6e12d5655bc81e7e5e3 Mon Sep 17 00:00:00 2001 From: Saptarshi Bhowmik Date: Tue, 26 May 2020 13:21:07 -0700 Subject: [PATCH 1/5] Initial changes --- tracer/p2p-events.C | 18 ++++++++++++++++++ tracer/tracer-driver.C | 15 +++++++++++++++ tracer/tracer-driver.h | 4 ++++ 3 files changed, 37 insertions(+) diff --git a/tracer/p2p-events.C b/tracer/p2p-events.C index 5497107..cdf6746 100644 --- a/tracer/p2p-events.C +++ b/tracer/p2p-events.C @@ -470,6 +470,7 @@ tw_stime exec_task( m->saved_task = ns->my_pe->currentTask; ns->my_pe->currentTask = task_id.taskid; + #if TRACER_BIGSIM_TRACES //For each entry of the task, create a recv event and send them out to //whereever it belongs @@ -717,6 +718,23 @@ tw_stime exec_task( if(PE_isEndEvent(ns->my_pe, task_id.taskid)) { ns->end_ts = tw_now(lp); } + + if(t->event_id == TRACER_LOOP_EVT && ns->region_start == 0){ + ns->computation_t = 0; + ns->regionruntime_t = 0; + ns->region_start = 1; + } + else if (t->event_id == TRACER_LOOP_EVT && ns->region_start == 1){ + ns->region_end = 1; + } + + /*Computation time add*/ + if(t->event_id == TRACER_USER_EVT && ns->region_end == 0){ + ns->computation_t += time; + printf("[%d:%d] COMP: ns->computation_t %f finish_time %f codes_local_latency(lp) %f sendFinishTime %f recvFinishTime %f time %f tw_now(lp) %f\n", ns->my_job, + ns->my_pe_num, ns->computation_t, finish_time, codes_local_latency(lp), sendFinishTime, recvFinishTime, time, tw_now(lp)); + + } //Return the execution time of the task return time; } diff --git a/tracer/tracer-driver.C b/tracer/tracer-driver.C index 560fa6a..8212988 100644 --- a/tracer/tracer-driver.C +++ b/tracer/tracer-driver.C @@ -460,6 +460,16 @@ void proc_init( /* skew each kickoff event slightly to help avoid event ties later on */ kickoff_time = startTime + g_tw_lookahead + tw_rand_unif(lp->rng); ns->end_ts = 0; + + /* Initialize net time spend in computation by a processes to 0*/ + ns->computation_t = 0; + /* Initialize net runtime time spend in a region by a processes to 0*/ + ns->regionruntime_t = 0; + /* Flag to indicate the start of a region*/ + ns->region_start = 0; + /* Flag to indicate the end of a region*/ + ns->region_end = 0; + /* maintain message sequencing for MPI */ ns->my_pe->sendSeq = new int64_t[jobs[ns->my_job].numRanks]; ns->my_pe->recvSeq = new int64_t[jobs[ns->my_job].numRanks]; @@ -678,6 +688,11 @@ void proc_finalize( if(lpid_to_pe(lp->gid) == 0) printf("Job[%d]PE[%d]: FINALIZE in %f seconds.\n", ns->my_job, ns->my_pe_num, ns_to_s(tw_now(lp)-ns->start_ts)); + + tw_stime commTime = ns->regionruntime_t - ns->computation_t; + if(lpid_to_pe(lp->gid) == 0) + printf("Job[%d]PE[%d]: Computation time in %f seconds.\n", ns->my_job, + ns->my_pe_num, ns_to_s(ns->computation_t)); #if TRACER_OTF_TRACES PE_printStat(ns->my_pe, -1); diff --git a/tracer/tracer-driver.h b/tracer/tracer-driver.h index e624177..4181f8f 100644 --- a/tracer/tracer-driver.h +++ b/tracer/tracer-driver.h @@ -45,6 +45,10 @@ struct proc_state { tw_stime start_ts; /* time when first event is processed */ tw_stime end_ts; /* time when last event is processed */ + tw_stime computation_t; /* store time spend in computation*/ + tw_stime regionruntime_t; /* store runtime time spend in a region*/ + int region_start; + int region_end; PE* my_pe; /* stores all core information */ #if TRACER_BIGSIM_TRACES TraceReader* trace_reader; /* for reading the bigsim traces */ From 4102a6502c9112e12803874c712e6e63b50350ef Mon Sep 17 00:00:00 2001 From: Saptarshi Bhowmik Date: Sun, 31 May 2020 16:45:32 -0700 Subject: [PATCH 2/5] Adding region runtime and print output --- tracer/p2p-events.C | 10 +++++---- tracer/tracer-driver.C | 51 +++++++++++++++++++++++++++++++++++++----- tracer/tracer-driver.h | 15 ++++++++++--- 3 files changed, 63 insertions(+), 13 deletions(-) diff --git a/tracer/p2p-events.C b/tracer/p2p-events.C index cdf6746..f3b121a 100644 --- a/tracer/p2p-events.C +++ b/tracer/p2p-events.C @@ -718,14 +718,16 @@ tw_stime exec_task( if(PE_isEndEvent(ns->my_pe, task_id.taskid)) { ns->end_ts = tw_now(lp); } - + /* Marks beginning of region */ if(t->event_id == TRACER_LOOP_EVT && ns->region_start == 0){ - ns->computation_t = 0; - ns->regionruntime_t = 0; + ns->computation_t = 0; + ns->region_start_sim_time = tw_now(lp); ns->region_start = 1; } + /* Marks End of Region */ else if (t->event_id == TRACER_LOOP_EVT && ns->region_start == 1){ - ns->region_end = 1; + ns->region_end_sim_time = tw_now(lp); + ns->region_end = 1; } /*Computation time add*/ diff --git a/tracer/tracer-driver.C b/tracer/tracer-driver.C index 8212988..c62a921 100644 --- a/tracer/tracer-driver.C +++ b/tracer/tracer-driver.C @@ -41,10 +41,13 @@ extern "C" { char tracer_input[256]; /* filename for tracer input file */ CoreInf *global_rank; /* core to job ID and process ID */ +TimeInfo *time_rank; /* core to comm time and comp time*/ JobInf *jobs; int default_mapping; int total_ranks; tw_stime *jobTimes; +tw_stime *commTimes; +tw_stime *compTimes; int num_jobs = 0; tw_stime soft_delay_mpi = 100; tw_stime nic_delay = 400; @@ -206,6 +209,16 @@ int main(int argc, char **argv) global_rank[i].jobID = -1; } + time_rank = (TimeInfo*) malloc(num_servers * sizeof(TimeInfo)); + + for(int i = 0; i < num_servers; i++) { + time_rank[i].jobID = -1; + time_rank[i].rank = -1; + time_rank[i].comm_time = 0; + time_rank[i].comp_time = 0; + } + + /* read in the mapping file and populating global_rank */ if(dump_topo_only || strcmp("NA", globalIn) == 0) { if(!rank) printf("Using default linear mapping of jobs\n"); @@ -238,6 +251,8 @@ int main(int argc, char **argv) fscanf(jobIn, "%d", &num_jobs); /* number of jobs */ jobs = (JobInf*) malloc(num_jobs * sizeof(JobInf)); jobTimes = (tw_stime*) malloc(num_jobs * sizeof(tw_stime)); + compTimes = (tw_stime*) malloc(num_jobs * sizeof(tw_stime)); + commTimes = (tw_stime*) malloc(num_jobs * sizeof(tw_stime)); total_ranks = 0; /* read per job information */ @@ -256,6 +271,8 @@ int main(int argc, char **argv) jobs[i].rankMap = (int*) malloc(jobs[i].numRanks * sizeof(int)); jobs[i].skipMsgId = -1; jobTimes[i] = 0; + compTimes[i] = 0; + commTimes[i] = 0; if(!rank) { printf("Job %d - ranks %d, trace folder %s, rank file %s, iters %d\n", i, jobs[i].numRanks, jobs[i].traceDir, jobs[i].map_file, jobs[i].numIters); @@ -403,6 +420,25 @@ int main(int argc, char **argv) for(int i = 0; i < num_jobs; i++) { printf("Job %d Time %f s\n", i, ns_to_s(jobTimesMax[i])); } +#if DEBUG_PRINT + FILE *fptr = fopen("mpi_rank_times.txt","w"); + for (int i = 0; i < num_servers; i++){ + fprintf(fptr,"Job %d Rank %d Comp Time %f Comm Time %f\n", time_rank[i].jobID, time_rank[i].rank, time_rank[i].comp_time, time_rank[i].comm_time); + } + fclose(fptr); +#endif + + for (int i = 0; i < num_servers; i++){ + if (time_rank[i].jobID >= 0){ + commTimes[time_rank[i].jobID] += time_rank[i].comm_time; + compTimes[time_rank[i].jobID] += time_rank[i].comp_time; + } + } + for (int i = 0; i < num_jobs; i++){ + printf("Job[%d] : Communication Time %f, Computation Time %f\n", i, commTimes[i], compTimes[i]); + } + + } model_net_report_stats(net_id); @@ -463,13 +499,14 @@ void proc_init( /* Initialize net time spend in computation by a processes to 0*/ ns->computation_t = 0; - /* Initialize net runtime time spend in a region by a processes to 0*/ - ns->regionruntime_t = 0; /* Flag to indicate the start of a region*/ ns->region_start = 0; /* Flag to indicate the end of a region*/ ns->region_end = 0; - + /* Initialize simulation time during start of a region to 0*/ + ns->region_start_sim_time = 0; + /* Initialize simulation time during end of a region to 0*/ + ns->region_end_sim_time = 0; /* maintain message sequencing for MPI */ ns->my_pe->sendSeq = new int64_t[jobs[ns->my_job].numRanks]; ns->my_pe->recvSeq = new int64_t[jobs[ns->my_job].numRanks]; @@ -683,13 +720,15 @@ void proc_finalize( if(dump_topo_only) return; tw_stime jobTime = ns->end_ts - ns->start_ts; - tw_stime finalTime = tw_now(lp); - + tw_stime commTime = ((ns->region_end_sim_time - ns->region_start_sim_time) - ns->computation_t); + time_rank[ns->my_pe_num * (ns->my_job + 1)].jobID = ns->my_job; + time_rank[ns->my_pe_num * (ns->my_job + 1)].rank = ns->my_pe_num; + time_rank[ns->my_pe_num * (ns->my_job + 1)].comp_time = ns->computation_t; + time_rank[ns->my_pe_num * (ns->my_job + 1)].comm_time = commTime; if(lpid_to_pe(lp->gid) == 0) printf("Job[%d]PE[%d]: FINALIZE in %f seconds.\n", ns->my_job, ns->my_pe_num, ns_to_s(tw_now(lp)-ns->start_ts)); - tw_stime commTime = ns->regionruntime_t - ns->computation_t; if(lpid_to_pe(lp->gid) == 0) printf("Job[%d]PE[%d]: Computation time in %f seconds.\n", ns->my_job, ns->my_pe_num, ns_to_s(ns->computation_t)); diff --git a/tracer/tracer-driver.h b/tracer/tracer-driver.h index 4181f8f..12780c1 100644 --- a/tracer/tracer-driver.h +++ b/tracer/tracer-driver.h @@ -40,15 +40,24 @@ typedef struct CoreInf { int mapsTo, jobID; } CoreInf; +/* stores mapping of core to Communication Time and Computation Time */ +typedef struct TimeInfo{ + int jobID; + int rank; + tw_stime comm_time; + tw_stime comp_time; +} TimeInfo; + /* ROSS level state information for each core */ struct proc_state { tw_stime start_ts; /* time when first event is processed */ tw_stime end_ts; /* time when last event is processed */ tw_stime computation_t; /* store time spend in computation*/ - tw_stime regionruntime_t; /* store runtime time spend in a region*/ - int region_start; - int region_end; + int region_start;/* flag to mark the start of a region*/ + int region_end;/* flag to mark end of a region*/ + int region_start_sim_time;/* store current simulation time when the region starts*/ + int region_end_sim_time;/* store current simulation time when region ends*/ PE* my_pe; /* stores all core information */ #if TRACER_BIGSIM_TRACES TraceReader* trace_reader; /* for reading the bigsim traces */ From 341a9a5896529d23c4fc9c99bad43dcf5ad5780a Mon Sep 17 00:00:00 2001 From: Saptarshi Bhowmik Date: Sun, 31 May 2020 20:41:06 -0700 Subject: [PATCH 3/5] Debug print statements --- tracer/p2p-events.C | 11 ++++++++--- tracer/tracer-driver.C | 4 ---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/tracer/p2p-events.C b/tracer/p2p-events.C index f3b121a..bde257f 100644 --- a/tracer/p2p-events.C +++ b/tracer/p2p-events.C @@ -721,20 +721,25 @@ tw_stime exec_task( /* Marks beginning of region */ if(t->event_id == TRACER_LOOP_EVT && ns->region_start == 0){ ns->computation_t = 0; - ns->region_start_sim_time = tw_now(lp); + ns->region_start_sim_time = finish_time; ns->region_start = 1; } /* Marks End of Region */ else if (t->event_id == TRACER_LOOP_EVT && ns->region_start == 1){ ns->region_end_sim_time = tw_now(lp); - ns->region_end = 1; +#if DEBUG_PRINT + printf("[%d:%d] COMP: ns->computation_t %f start_time %f finish_time %f tw_now(lp) %f region_end_sim_time %f\n", ns->my_job,ns->my_pe_num, ns->computation_t, ns->region_start_sim_time, finish_time, tw_now(lp), ns->region_end_sim_time); +#endif + ns->region_end = 1; } /*Computation time add*/ if(t->event_id == TRACER_USER_EVT && ns->region_end == 0){ ns->computation_t += time; - printf("[%d:%d] COMP: ns->computation_t %f finish_time %f codes_local_latency(lp) %f sendFinishTime %f recvFinishTime %f time %f tw_now(lp) %f\n", ns->my_job, +#if DEBUG_PRINT + printf("[%d:%d] COMP: ns->computation_t %f finish_time %f codes_local_latency(lp) %f sendFinishTime %f recvFinishTime %f time %f tw_now(lp) %f\n", ns->my_job, ns->my_pe_num, ns->computation_t, finish_time, codes_local_latency(lp), sendFinishTime, recvFinishTime, time, tw_now(lp)); +#endif } //Return the execution time of the task diff --git a/tracer/tracer-driver.C b/tracer/tracer-driver.C index c62a921..8ea1156 100644 --- a/tracer/tracer-driver.C +++ b/tracer/tracer-driver.C @@ -728,10 +728,6 @@ void proc_finalize( if(lpid_to_pe(lp->gid) == 0) printf("Job[%d]PE[%d]: FINALIZE in %f seconds.\n", ns->my_job, ns->my_pe_num, ns_to_s(tw_now(lp)-ns->start_ts)); - - if(lpid_to_pe(lp->gid) == 0) - printf("Job[%d]PE[%d]: Computation time in %f seconds.\n", ns->my_job, - ns->my_pe_num, ns_to_s(ns->computation_t)); #if TRACER_OTF_TRACES PE_printStat(ns->my_pe, -1); From eef46de4ac731a1949f20f54f2a006bc257fc671 Mon Sep 17 00:00:00 2001 From: Saptarshi Bhowmik Date: Sun, 31 May 2020 21:29:59 -0700 Subject: [PATCH 4/5] Seperating the macro from printing application times to a file --- tracer/tracer-driver.C | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tracer/tracer-driver.C b/tracer/tracer-driver.C index 8ea1156..e147a3d 100644 --- a/tracer/tracer-driver.C +++ b/tracer/tracer-driver.C @@ -420,7 +420,7 @@ int main(int argc, char **argv) for(int i = 0; i < num_jobs; i++) { printf("Job %d Time %f s\n", i, ns_to_s(jobTimesMax[i])); } -#if DEBUG_PRINT +#if WRITE_APP_TIME FILE *fptr = fopen("mpi_rank_times.txt","w"); for (int i = 0; i < num_servers; i++){ fprintf(fptr,"Job %d Rank %d Comp Time %f Comm Time %f\n", time_rank[i].jobID, time_rank[i].rank, time_rank[i].comp_time, time_rank[i].comm_time); From 05f3849c2ea30bee77c2c739e3cbb68f0ef0069c Mon Sep 17 00:00:00 2001 From: Saptarshi Bhowmik Date: Tue, 23 Jun 2020 14:27:25 -0700 Subject: [PATCH 5/5] Bug Fix global server number --- tracer/tracer-driver.C | 33 ++++++++++++++++++++++++--------- tracer/tracer-driver.h | 5 +++-- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/tracer/tracer-driver.C b/tracer/tracer-driver.C index e147a3d..0f5ff46 100644 --- a/tracer/tracer-driver.C +++ b/tracer/tracer-driver.C @@ -48,6 +48,8 @@ int total_ranks; tw_stime *jobTimes; tw_stime *commTimes; tw_stime *compTimes; +tw_stime *maxCompTime; +tw_stime *maxCommTime; int num_jobs = 0; tw_stime soft_delay_mpi = 100; tw_stime nic_delay = 400; @@ -254,7 +256,8 @@ int main(int argc, char **argv) compTimes = (tw_stime*) malloc(num_jobs * sizeof(tw_stime)); commTimes = (tw_stime*) malloc(num_jobs * sizeof(tw_stime)); total_ranks = 0; - + maxCommTime = (tw_stime*) malloc(num_jobs * sizeof(tw_stime)); + maxCompTime = (tw_stime*) malloc(num_jobs * sizeof(tw_stime)); /* read per job information */ for(int i = 0; i < num_jobs; i++) { #if TRACER_BIGSIM_TRACES @@ -273,6 +276,8 @@ int main(int argc, char **argv) jobTimes[i] = 0; compTimes[i] = 0; commTimes[i] = 0; + maxCompTime[i] = 0; + maxCommTime[i] = 0; if(!rank) { printf("Job %d - ranks %d, trace folder %s, rank file %s, iters %d\n", i, jobs[i].numRanks, jobs[i].traceDir, jobs[i].map_file, jobs[i].numIters); @@ -420,22 +425,28 @@ int main(int argc, char **argv) for(int i = 0; i < num_jobs; i++) { printf("Job %d Time %f s\n", i, ns_to_s(jobTimesMax[i])); } -#if WRITE_APP_TIME +#ifdef WRITE_MPI_TIMES FILE *fptr = fopen("mpi_rank_times.txt","w"); for (int i = 0; i < num_servers; i++){ fprintf(fptr,"Job %d Rank %d Comp Time %f Comm Time %f\n", time_rank[i].jobID, time_rank[i].rank, time_rank[i].comp_time, time_rank[i].comm_time); } fclose(fptr); #endif - for (int i = 0; i < num_servers; i++){ if (time_rank[i].jobID >= 0){ commTimes[time_rank[i].jobID] += time_rank[i].comm_time; - compTimes[time_rank[i].jobID] += time_rank[i].comp_time; + compTimes[time_rank[i].jobID] += time_rank[i].comp_time; + if((time_rank[i].comm_time + time_rank[i].comp_time) > (maxCommTime[time_rank[i].jobID] + maxCompTime[time_rank[i].jobID])){ + maxCommTime[time_rank[i].jobID] = time_rank[i].comm_time; + maxCompTime[time_rank[i].jobID] = time_rank[i].comp_time; + } } } for (int i = 0; i < num_jobs; i++){ - printf("Job[%d] : Communication Time %f, Computation Time %f\n", i, commTimes[i], compTimes[i]); + printf("Job[%d] : Total Communication Time %f, Total Computation Time %f\n", i, ns_to_s(commTimes[i]), ns_to_s(compTimes[i])); + } + for (int i = 0; i < num_jobs; i++){ + printf("Job[%d] : Max Communication Time %f, Max Computation Time %f\n", i, ns_to_s(maxCommTime[i]), ns_to_s(maxCompTime[i])); } @@ -721,10 +732,10 @@ void proc_finalize( tw_stime jobTime = ns->end_ts - ns->start_ts; tw_stime commTime = ((ns->region_end_sim_time - ns->region_start_sim_time) - ns->computation_t); - time_rank[ns->my_pe_num * (ns->my_job + 1)].jobID = ns->my_job; - time_rank[ns->my_pe_num * (ns->my_job + 1)].rank = ns->my_pe_num; - time_rank[ns->my_pe_num * (ns->my_job + 1)].comp_time = ns->computation_t; - time_rank[ns->my_pe_num * (ns->my_job + 1)].comm_time = commTime; + time_rank[lpid_to_global_rank(lp->gid)].jobID = ns->my_job; + time_rank[lpid_to_global_rank(lp->gid)].rank = ns->my_pe_num; + time_rank[lpid_to_global_rank(lp->gid)].comp_time = ns->computation_t; + time_rank[lpid_to_global_rank(lp->gid)].comm_time = commTime; if(lpid_to_pe(lp->gid) == 0) printf("Job[%d]PE[%d]: FINALIZE in %f seconds.\n", ns->my_job, ns->my_pe_num, ns_to_s(tw_now(lp)-ns->start_ts)); @@ -990,6 +1001,10 @@ inline int lpid_to_job(int lp_gid){ inline int pe_to_job(int pe){ return global_rank[pe].jobID;; } +inline int lpid_to_global_rank(int lp_gid){ + int server_num = codes_mapping_get_lp_relative_id(lp_gid, 0, NULL); + return server_num; +} bool isPEonThisRank(int jobID, int i) { int lpid = pe_to_lpid(i, jobID); diff --git a/tracer/tracer-driver.h b/tracer/tracer-driver.h index 12780c1..a920349 100644 --- a/tracer/tracer-driver.h +++ b/tracer/tracer-driver.h @@ -56,8 +56,8 @@ struct proc_state tw_stime computation_t; /* store time spend in computation*/ int region_start;/* flag to mark the start of a region*/ int region_end;/* flag to mark end of a region*/ - int region_start_sim_time;/* store current simulation time when the region starts*/ - int region_end_sim_time;/* store current simulation time when region ends*/ + tw_stime region_start_sim_time;/* store current simulation time when the region starts*/ + tw_stime region_end_sim_time;/* store current simulation time when region ends*/ PE* my_pe; /* stores all core information */ #if TRACER_BIGSIM_TRACES TraceReader* trace_reader; /* for reading the bigsim traces */ @@ -144,6 +144,7 @@ int pe_to_lpid(int pe, int job); int pe_to_job(int pe); int lpid_to_pe(int lp_gid); int lpid_to_job(int lp_gid); +int lpid_to_global_rank(int lp_gid); /* change of units for time */ tw_stime ns_to_s(tw_stime ns);