From d2387441ea8702ed16c02dec3f11a49293d08c08 Mon Sep 17 00:00:00 2001 From: Douglas Blank Date: Thu, 26 Mar 2026 14:14:50 -0400 Subject: [PATCH 1/2] Assign per-instance TensorBoard ports from session_state registry Each panel instance now reads COMET_PANEL_INSTANCE_ID and uses a shared session_state dict to claim a unique port in the range 6000-6009, rather than all instances hardcoding port 6007. Raises RuntimeError if all 10 ports are exhausted. Co-Authored-By: Claude Sonnet 4.6 --- panels/TensorboardGroupViewer.py | 32 ++++++++++++++- .../TensorboardGroupViewer.py | 38 ++++++++++++++--- panels/TensorboardProfileViewer.py | 34 +++++++++++++-- .../TensorboardProfileViewer.py | 41 +++++++++++++++---- .../TensorboardTorchProfilerViewer.py | 38 ++++++++++++++--- 5 files changed, 160 insertions(+), 23 deletions(-) diff --git a/panels/TensorboardGroupViewer.py b/panels/TensorboardGroupViewer.py index b452deb..f1a4747 100644 --- a/panels/TensorboardGroupViewer.py +++ b/panels/TensorboardGroupViewer.py @@ -19,6 +19,34 @@ import glob import shutil +# --- Per-instance port assignment (6000-6009) --- +# All Streamlit panels share the same session_state, so this dict persists +# across instances and can be reused by other panels that start servers. + +PORT_RANGE_START = 6000 +PORT_RANGE_END = 6010 # exclusive + + +def get_instance_port(instance_id, registry_key="instance_port_map"): + """Return the port assigned to instance_id, assigning the next available + port if this instance hasn't been seen before. Raises RuntimeError when + the port range is exhausted.""" + if registry_key not in st.session_state: + st.session_state[registry_key] = {} + registry = st.session_state[registry_key] + if instance_id not in registry: + next_port = PORT_RANGE_START + len(registry) + if next_port >= PORT_RANGE_END: + raise RuntimeError( + f"No available ports: all ports {PORT_RANGE_START}-{PORT_RANGE_END - 1} are in use." + ) + registry[instance_id] = next_port + return registry[instance_id] + + +instance_id = os.environ["COMET_PANEL_INSTANCE_ID"] +port = get_instance_port(instance_id) + st.set_page_config(layout="wide") from streamlit_js_eval import get_page_location @@ -78,7 +106,7 @@ except: pass if not running: - command = f"/home/stuser/.local/bin/tensorboard --logdir ./logs --port 6007".split() + command = f"/home/stuser/.local/bin/tensorboard --logdir ./logs --port {port}".split() env = {} # {"PYTHONPATH": "/home/st_user/.local/lib/python3.9/site-packages"} process = subprocess.Popen(command, preexec_fn=os.setsid, env=env) needs_refresh = True @@ -93,6 +121,6 @@ bar.empty() path, _ = page_location["pathname"].split("/component") - url = page_location["origin"] + path + f"/port/6007/server?x={random.random()}" + url = page_location["origin"] + path + f"/port/{port}/server?x={random.random()}" st.markdown('⛶ Open in tab' % url, unsafe_allow_html=True) components.iframe(src=url, height=700) diff --git a/panels/TensorboardGroupViewer/TensorboardGroupViewer.py b/panels/TensorboardGroupViewer/TensorboardGroupViewer.py index 65bfdd1..fd9f557 100644 --- a/panels/TensorboardGroupViewer/TensorboardGroupViewer.py +++ b/panels/TensorboardGroupViewer/TensorboardGroupViewer.py @@ -22,6 +22,34 @@ import socket import signal +# --- Per-instance port assignment (6000-6009) --- +# All Streamlit panels share the same session_state, so this dict persists +# across instances and can be reused by other panels that start servers. + +PORT_RANGE_START = 6000 +PORT_RANGE_END = 6010 # exclusive + + +def get_instance_port(instance_id, registry_key="instance_port_map"): + """Return the port assigned to instance_id, assigning the next available + port if this instance hasn't been seen before. Raises RuntimeError when + the port range is exhausted.""" + if registry_key not in st.session_state: + st.session_state[registry_key] = {} + registry = st.session_state[registry_key] + if instance_id not in registry: + next_port = PORT_RANGE_START + len(registry) + if next_port >= PORT_RANGE_END: + raise RuntimeError( + f"No available ports: all ports {PORT_RANGE_START}-{PORT_RANGE_END - 1} are in use." + ) + registry[instance_id] = next_port + return registry[instance_id] + + +instance_id = os.environ["COMET_PANEL_INSTANCE_ID"] +port = get_instance_port(instance_id) + st.set_page_config(layout="wide") from streamlit_js_eval import get_page_location @@ -154,11 +182,11 @@ def wait_for_server(port=6007, max_wait=30): print("Can't kill the server; continuing ...") # Wait for server to stop before starting new one - if not wait_for_server_stop(port=6007, max_wait=10): + if not wait_for_server_stop(port=port, max_wait=10): st.warning("Previous Tensorboard server may still be running") # Start new server - command = f"/home/stuser/.local/bin/tensorboard --logdir ./logs --port 6007".split() + command = f"/home/stuser/.local/bin/tensorboard --logdir ./logs --port {port}".split() env = ( {} ) # {"PYTHONPATH": "/home/st_user/.local/lib/python3.9/site-packages"} @@ -166,12 +194,12 @@ def wait_for_server(port=6007, max_wait=30): st.session_state["tensorboard_state"] = "group_viewer" # Wait for server to be ready - if wait_for_server(port=6007, max_wait=30): + if wait_for_server(port=port, max_wait=30): path, _ = page_location["pathname"].split("/component") url = ( page_location["origin"] + path - + f"/port/6007/server?x={random.randint(1,1_000_000)}" + + f"/port/{port}/server?x={random.randint(1,1_000_000)}" ) st.markdown( '⛶ Open in tab' @@ -189,7 +217,7 @@ def wait_for_server(port=6007, max_wait=30): url = ( page_location["origin"] + path - + f"/port/6007/server?x={random.randint(1,1_000_000)}" + + f"/port/{port}/server?x={random.randint(1,1_000_000)}" ) st.markdown( '⛶ Open in tab' % url, diff --git a/panels/TensorboardProfileViewer.py b/panels/TensorboardProfileViewer.py index 9a263e0..ccf837e 100644 --- a/panels/TensorboardProfileViewer.py +++ b/panels/TensorboardProfileViewer.py @@ -17,7 +17,35 @@ import random import signal -st.set_page_config(layout="wide") +# --- Per-instance port assignment (6000-6009) --- +# All Streamlit panels share the same session_state, so this dict persists +# across instances and can be reused by other panels that start servers. + +PORT_RANGE_START = 6000 +PORT_RANGE_END = 6010 # exclusive + + +def get_instance_port(instance_id, registry_key="instance_port_map"): + """Return the port assigned to instance_id, assigning the next available + port if this instance hasn't been seen before. Raises RuntimeError when + the port range is exhausted.""" + if registry_key not in st.session_state: + st.session_state[registry_key] = {} + registry = st.session_state[registry_key] + if instance_id not in registry: + next_port = PORT_RANGE_START + len(registry) + if next_port >= PORT_RANGE_END: + raise RuntimeError( + f"No available ports: all ports {PORT_RANGE_START}-{PORT_RANGE_END - 1} are in use." + ) + registry[instance_id] = next_port + return registry[instance_id] + + +instance_id = os.environ["COMET_PANEL_INSTANCE_ID"] +port = get_instance_port(instance_id) + +st.set_page_config(layout="wide") if "tensorboard_state" not in st.session_state: st.session_state["tensorboard_state"] = None @@ -62,7 +90,7 @@ class EmptyExperiment: [""] + sorted(os.listdir("./%s/logs/" % selected_experiment.id)) ) if selected_log: - command = f"/home/stuser/.local/bin/tensorboard --logdir ./{selected_experiment.id}/logs/{selected_log} --port 6007".split() + command = f"/home/stuser/.local/bin/tensorboard --logdir ./{selected_experiment.id}/logs/{selected_log} --port {port}".split() env = {} # {"PYTHONPATH": "/.local/lib/python3.9/site-packages"} if st.session_state["tensorboard_state"] != (selected_experiment.id, selected_log): #print("Killing the hard way...") @@ -86,6 +114,6 @@ class EmptyExperiment: bar.empty() path, _ = page_location["pathname"].split("/component") - url = page_location["origin"] + path + f"/port/6007/server?x={random.randint(1,1_000_000)}#profile" + url = page_location["origin"] + path + f"/port/{port}/server?x={random.randint(1,1_000_000)}#profile" st.markdown('⛶ Open in tab' % url, unsafe_allow_html=True) components.iframe(src=url, height=700) diff --git a/panels/TensorboardProfileViewer/TensorboardProfileViewer.py b/panels/TensorboardProfileViewer/TensorboardProfileViewer.py index 086dfd5..0720344 100644 --- a/panels/TensorboardProfileViewer/TensorboardProfileViewer.py +++ b/panels/TensorboardProfileViewer/TensorboardProfileViewer.py @@ -2,9 +2,6 @@ # Log the tensorboard profile (and other data) with # >>> experiment.log_tensorflow_folder("./logs") -# NOTE: there is only one Tensorboard Server for your -# Python Panels; logs are shared across them - from comet_ml import API import streamlit as st import streamlit.components.v1 as components @@ -19,6 +16,34 @@ import requests import socket +# --- Per-instance port assignment (6000-6009) --- +# All Streamlit panels share the same session_state, so this dict persists +# across instances and can be reused by other panels that start servers. + +PORT_RANGE_START = 6000 +PORT_RANGE_END = 6010 # exclusive + + +def get_instance_port(instance_id, registry_key="instance_port_map"): + """Return the port assigned to instance_id, assigning the next available + port if this instance hasn't been seen before. Raises RuntimeError when + the port range is exhausted.""" + if registry_key not in st.session_state: + st.session_state[registry_key] = {} + registry = st.session_state[registry_key] + if instance_id not in registry: + next_port = PORT_RANGE_START + len(registry) + if next_port >= PORT_RANGE_END: + raise RuntimeError( + f"No available ports: all ports {PORT_RANGE_START}-{PORT_RANGE_END - 1} are in use." + ) + registry[instance_id] = next_port + return registry[instance_id] + + +instance_id = os.environ["COMET_PANEL_INSTANCE_ID"] +port = get_instance_port(instance_id) + if "tensorboard_state" not in st.session_state: st.session_state["tensorboard_state"] = None @@ -152,11 +177,11 @@ def wait_for_server(port=6007, max_wait=30): print("Can't kill the server; continuing ...") # Wait for server to stop before starting new one - if not wait_for_server_stop(port=6007, max_wait=10): + if not wait_for_server_stop(port=port, max_wait=10): st.warning("Previous Tensorboard server may still be running") # Start new server - command = f"/home/stuser/.local/bin/tensorboard --logdir ./{selected_experiment.id}/logs/{selected_log} --port 6007".split() + command = f"/home/stuser/.local/bin/tensorboard --logdir ./{selected_experiment.id}/logs/{selected_log} --port {port}".split() env = {} # {"PYTHONPATH": "/.local/lib/python3.9/site-packages"} process = subprocess.Popen(command, preexec_fn=os.setsid, env=env) st.session_state["tensorboard_state"] = ( @@ -165,12 +190,12 @@ def wait_for_server(port=6007, max_wait=30): ) # Wait for server to be ready - if wait_for_server(port=6007, max_wait=30): + if wait_for_server(port=port, max_wait=30): path, _ = page_location["pathname"].split("/component") url = ( page_location["origin"] + path - + f"/port/6007/server?x={random.randint(1,1_000_000)}#profile" + + f"/port/{port}/server?x={random.randint(1,1_000_000)}#profile" ) st.markdown( '⛶ Open in tab' @@ -188,7 +213,7 @@ def wait_for_server(port=6007, max_wait=30): url = ( page_location["origin"] + path - + f"/port/6007/server?x={random.randint(1,1_000_000)}#profile" + + f"/port/{port}/server?x={random.randint(1,1_000_000)}#profile" ) st.markdown( '⛶ Open in tab' diff --git a/panels/TensorboardTorchProfilerViewer/TensorboardTorchProfilerViewer.py b/panels/TensorboardTorchProfilerViewer/TensorboardTorchProfilerViewer.py index a8243c1..4d1c302 100644 --- a/panels/TensorboardTorchProfilerViewer/TensorboardTorchProfilerViewer.py +++ b/panels/TensorboardTorchProfilerViewer/TensorboardTorchProfilerViewer.py @@ -28,6 +28,34 @@ import requests import socket +# --- Per-instance port assignment (6000-6009) --- +# All Streamlit panels share the same session_state, so this dict persists +# across instances and can be reused by other panels that start servers. + +PORT_RANGE_START = 6000 +PORT_RANGE_END = 6010 # exclusive + + +def get_instance_port(instance_id, registry_key="instance_port_map"): + """Return the port assigned to instance_id, assigning the next available + port if this instance hasn't been seen before. Raises RuntimeError when + the port range is exhausted.""" + if registry_key not in st.session_state: + st.session_state[registry_key] = {} + registry = st.session_state[registry_key] + if instance_id not in registry: + next_port = PORT_RANGE_START + len(registry) + if next_port >= PORT_RANGE_END: + raise RuntimeError( + f"No available ports: all ports {PORT_RANGE_START}-{PORT_RANGE_END - 1} are in use." + ) + registry[instance_id] = next_port + return registry[instance_id] + + +instance_id = os.environ["COMET_PANEL_INSTANCE_ID"] +port = get_instance_port(instance_id) + if "tensorboard_state" not in st.session_state: st.session_state["tensorboard_state"] = None @@ -162,11 +190,11 @@ def wait_for_server(port=6007, max_wait=30): kill_status.warning("Can't kill the server; continuing ...") kill_status.empty() # Wait for server to stop before starting new one - if not wait_for_server_stop(port=6007, max_wait=10): + if not wait_for_server_stop(port=port, max_wait=10): st.warning("Previous Tensorboard server may still be running") # Start new server - command = f"/home/stuser/.local/bin/tensorboard --logdir ./{selected_experiment.id}/logs/{selected_log} --port 6007".split() + command = f"/home/stuser/.local/bin/tensorboard --logdir ./{selected_experiment.id}/logs/{selected_log} --port {port}".split() env = {} # {"PYTHONPATH": "/.local/lib/python3.9/site-packages"} process = subprocess.Popen(command, preexec_fn=os.setsid, env=env) st.session_state["tensorboard_state"] = ( @@ -175,12 +203,12 @@ def wait_for_server(port=6007, max_wait=30): ) # Wait for server to be ready - if wait_for_server(port=6007, max_wait=30): + if wait_for_server(port=port, max_wait=30): path, _ = page_location["pathname"].split("/component") url = ( page_location["origin"] + path - + f"/port/6007/server?x={random.randint(1,1_000_000)}#pytorch_profiler" + + f"/port/{port}/server?x={random.randint(1,1_000_000)}#pytorch_profiler" ) st.markdown( '⛶ Open in tab' @@ -198,7 +226,7 @@ def wait_for_server(port=6007, max_wait=30): url = ( page_location["origin"] + path - + f"/port/6007/server?x={random.randint(1,1_000_000)}#pytorch_profiler" + + f"/port/{port}/server?x={random.randint(1,1_000_000)}#pytorch_profiler" ) st.markdown( '⛶ Open in tab' From f77168a602f6bed873819cdffcecf91607628d88 Mon Sep 17 00:00:00 2001 From: Douglas Blank Date: Thu, 26 Mar 2026 14:32:25 -0400 Subject: [PATCH 2/2] Use index-based select_experiment() to avoid pickling non-serializable experiment objects Streamlit deepcopies session state when registering widgets, which fails when experiment objects contain thread locks. Replace direct selectbox usage with a select_experiment() helper that stores only a picklable integer index and looks up the experiment by position. Co-Authored-By: Claude Sonnet 4.6 --- panels/TensorboardProfileViewer.py | 19 +++++++++++++------ .../TensorboardProfileViewer.py | 19 +++++++++++-------- .../TensorboardTorchProfilerViewer.py | 19 +++++++++++-------- 3 files changed, 35 insertions(+), 22 deletions(-) diff --git a/panels/TensorboardProfileViewer.py b/panels/TensorboardProfileViewer.py index ccf837e..2d44107 100644 --- a/panels/TensorboardProfileViewer.py +++ b/panels/TensorboardProfileViewer.py @@ -59,6 +59,17 @@ class EmptyExperiment: id = None name = "" + +def select_experiment(experiment_list): + names = [exp.name for exp in experiment_list] + selected_idx = st.selectbox( + "Select Experiment with log:", + range(len(names)), + format_func=lambda i: names[i], + ) + return experiment_list[selected_idx] + + experiments_with_log = [EmptyExperiment()] for experiment in experiments: asset_list = experiment.get_asset_list("tensorflow-file") @@ -69,13 +80,9 @@ class EmptyExperiment: st.write("No experiments with log") st.stop() elif len(experiments_with_log) == 2: - selected_experiment = experiments_with_log[1] + selected_experiment = experiments_with_log[1] else: - selected_experiment = st.selectbox( - "Select Experiment with log:", - experiments_with_log, - format_func=lambda aexp: aexp.name - ) + selected_experiment = select_experiment(experiments_with_log) if selected_experiment.id: page_location = get_page_location() diff --git a/panels/TensorboardProfileViewer/TensorboardProfileViewer.py b/panels/TensorboardProfileViewer/TensorboardProfileViewer.py index 0720344..0088ee5 100644 --- a/panels/TensorboardProfileViewer/TensorboardProfileViewer.py +++ b/panels/TensorboardProfileViewer/TensorboardProfileViewer.py @@ -60,6 +60,16 @@ class EmptyExperiment: name = "" +def select_experiment(experiment_list): + names = [exp.name for exp in experiment_list] + selected_idx = st.selectbox( + "Select Experiment with log:", + range(len(names)), + format_func=lambda i: names[i], + ) + return experiment_list[selected_idx] + + experiments_with_log = [EmptyExperiment()] for experiment in experiments: asset_list = experiment.get_asset_list("tensorflow-file") @@ -72,14 +82,7 @@ class EmptyExperiment: elif len(experiments_with_log) == 2: selected_experiment = experiments_with_log[1] else: - names = [exp.name for exp in experiments_with_log] - selected_experiment_name = st.selectbox( - "Select Experiment with log:", - names, - ) - selected_experiment = [ - exp for exp in experiments_with_log if exp.name == selected_experiment_name - ][0] + selected_experiment = select_experiment(experiments_with_log) def wait_to_load(seconds): diff --git a/panels/TensorboardTorchProfilerViewer/TensorboardTorchProfilerViewer.py b/panels/TensorboardTorchProfilerViewer/TensorboardTorchProfilerViewer.py index 4d1c302..7fa4d8c 100644 --- a/panels/TensorboardTorchProfilerViewer/TensorboardTorchProfilerViewer.py +++ b/panels/TensorboardTorchProfilerViewer/TensorboardTorchProfilerViewer.py @@ -72,6 +72,16 @@ class EmptyExperiment: name = "" +def select_experiment(experiment_list): + names = [exp.name for exp in experiment_list] + selected_idx = st.selectbox( + "Select Experiment with log:", + range(len(names)), + format_func=lambda i: names[i], + ) + return experiment_list[selected_idx] + + experiments_with_log = [EmptyExperiment()] for experiment in experiments: asset_list = experiment.get_asset_list("tensorflow-file") @@ -84,14 +94,7 @@ class EmptyExperiment: elif len(experiments_with_log) == 2: selected_experiment = experiments_with_log[1] else: - names = [exp.name for exp in experiments_with_log] - selected_experiment_name = st.selectbox( - "Select Experiment with log:", - names, - ) - selected_experiment = [ - exp for exp in experiments_with_log if exp.name == selected_experiment_name - ][0] + selected_experiment = select_experiment(experiments_with_log) def wait_to_load(seconds):