diff --git a/pipit/readers/otf2_reader.py b/pipit/readers/otf2_reader.py index b685c3a7..0777d95a 100644 --- a/pipit/readers/otf2_reader.py +++ b/pipit/readers/otf2_reader.py @@ -8,14 +8,14 @@ import pandas as pd import multiprocessing as mp import pipit.trace - +import glob class OTF2Reader: """Reader for OTF2 trace files""" def __init__(self, dir_name, num_processes=None, create_cct=False): self.dir_name = dir_name # directory of otf2 file being read - self.file_name = self.dir_name + "/traces.otf2" + self.file_name = glob.glob(self.dir_name + "/*.otf2")[0] self.create_cct = create_cct num_cpus = mp.cpu_count() @@ -56,12 +56,10 @@ def field_to_val(self, field): if "otf2.definitions" in field_type: """ Example: An event can have an attribute called region which corresponds - to a definition. We strip the string and extract only the relevant - information, which is the type of definition such as Region and also - append its id (like Region 6) so that this definition can be accessed - in the Definitions DataFrame + to a definition. This region has an ID, and can be retrieved in the + Definitions DataFrame. 
 """ - return field_type[25:-2] + " " + str(getattr(field, "_ref")) + return int(getattr(field, "_ref")) elif "_otf2" in field_type or "otf2" in field_type: """ Example: A measurement event has an attribute called measurement mode @@ -305,26 +303,25 @@ def events_reader(self, rank_size): # only add attributes for non-leave rows so that # there aren't duplicate attributes for a single event - if event_type != "Leave": - attributes_dict = {} - - # iterates through the event's attributes - # (ex: region, bytes sent, etc) - for key, value in vars(event).items(): - # only adds non-empty attributes - # and ignores time so there isn't a duplicate time - if value is not None and key != "time": - # uses field_to_val to convert all data types - # and ensure that there are no pickling errors - attributes_dict[self.field_to_val(key)] = ( - self.handle_data(value) - ) - event_attributes.append(attributes_dict) - else: - # nan attributes for leave rows - # attributes column is of object dtype - event_attributes.append(None) - + # NOTE: attributes were previously skipped for Leave events, + # leaving their attributes column empty. Leave rows need this + # information as well (e.g. the region that was just exited), + # so attributes are now collected uniformly + # for every event type. 
 + # The "if event_type != "Leave"" guard has been removed accordingly. + attributes_dict = {} + # iterates through the event's attributes + # (ex: region, bytes sent, etc) + for key, value in vars(event).items(): + # only adds non-empty attributes + # and ignores time so there isn't a duplicate time + if value is not None and key != "time": + # uses field_to_val to convert all data types + # and ensure that there are no pickling errors + attributes_dict[self.field_to_val(key)] = ( + self.handle_data(value) + ) + event_attributes.append(attributes_dict) trace.close() # close event files # returns dataframe with all events and their fields @@ -373,7 +370,7 @@ def read_definitions(self, trace): # only definition type that is not a registry if key == "clock_properties": # clock properties doesn't have an ID - def_id.append(float("NaN")) + def_id.append(-1) def_name.append(str(type(def_attribute))[25:-2]) attributes.append(self.fields_to_dict(def_attribute)) @@ -410,8 +407,7 @@ def read_definitions(self, trace): # only add ids for those definitions that have it def_id.append(def_object._ref) else: - # ID column is of float64 dtype - def_id.append(float("NaN")) + def_id.append(-1) # name of the definition def_name.append(str(type(def_object))[25:-2]) @@ -428,7 +424,7 @@ def read_definitions(self, trace): # Definition column is of categorical dtype definitions_dataframe = definitions_dataframe.astype( - {"Definition Type": "category"} + {"Definition Type": "category", "ID": "int"} ) return definitions_dataframe diff --git a/pipit/trace.py b/pipit/trace.py index 48735379..9e29df23 100644 --- a/pipit/trace.py +++ b/pipit/trace.py @@ -2,6 +2,7 @@ # Maryland. See the top-level LICENSE file for details.
# # SPDX-License-Identifier: MIT +import sys import numpy as np import pandas as pd @@ -325,6 +326,8 @@ def calc_inc_metrics(self, columns=None): def calc_exc_metrics(self, columns=None): # calculate exc metrics for all numeric columns if not specified + # Fixme This function doesn't work properly. + print("Warning: using calc_exc_metrics but the function doesn't work properly.", file=sys.stderr) columns = self.numeric_cols if columns is None else columns # match caller and callee rows @@ -616,25 +619,31 @@ def load_imbalance(self, metric="time.exc", num_processes=1): return imbalance_df - def idle_time(self, idle_functions=["Idle"], mpi_events=False): + def idle_time(self, idle_functions=None, mpi_events=False, include_blank_spaces=False): # calculate inclusive metrics + if idle_functions is None: + idle_functions = ["Idle"] if "time.inc" not in self.events.columns: self.calc_inc_metrics() + if "time.exc" not in self.events.columns and include_blank_spaces: + self.calc_exc_metrics() + if mpi_events: idle_functions += ["MPI_Wait", "MPI_Waitall", "MPI_Recv"] def calc_idle_time(events): # assumes events is sorted by time - # Calculate idle time due to gaps in between events - # This is the total time minus exclusive time spent in functions - total_time = events["Timestamp (ns)"].max() - events["Timestamp (ns)"].min() + # Calculate idle time due to idle_functions + idle_time = events[events["Name"].isin(idle_functions)]["time.inc"].sum() + if include_blank_spaces: + # Calculate idle time due to gaps in between events + # This is the total time minus exclusive time spent in functions + total_time = events["Timestamp (ns)"].max() - events["Timestamp (ns)"].min() - idle_time = total_time - events["time.exc"].sum() + idle_time = total_time - events["time.exc"].sum() - # Calculate idle time due to idle_functions - idle_time += events[events["Name"].isin(idle_functions)]["time.inc"].sum() return idle_time return (