diff --git a/.github/workflows/nightly-stress-test.yml b/.github/workflows/nightly-stress-test.yml new file mode 100644 index 000000000000..21098a54872f --- /dev/null +++ b/.github/workflows/nightly-stress-test.yml @@ -0,0 +1,90 @@ +name: Nightly Stress Test + +on: + schedule: + - cron: '0 2 * * *' # 2 AM UTC + workflow_dispatch: + inputs: + duration: + description: 'Duration of the stress test (e.g., 30m, 1h)' + required: true + default: '30m' + +jobs: + stress-test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Cache Docker layers + uses: actions/cache@v4 + with: + path: /tmp/.buildx-cache + key: ${{ runner.os }}-buildx-${{ github.sha }} + restore-keys: | + ${{ runner.os }}-buildx- + + - name: Build and Start Environment + run: | + cd tests/monitoring + docker compose build + docker compose up -d + sleep 30 # Wait for initialization + + - name: Verify Connections + run: | + docker exec salt-master salt '*' test.ping + + - name: Run Aggressive Stress Test + run: | + cd tests/monitoring + chmod +x stress_test.sh stress_api.sh + # Run in background and wait for defined duration + ./stress_test.sh & + STRESS_PID=$! + + # Default to 30m if not workflow_dispatch + DURATION="${{ github.event.inputs.duration || '30m' }}" + echo "Running stress test for $DURATION..." + + # Use sleep with suffix support (m, h) + sleep $DURATION + + echo "Stopping stress test..." + pkill -P $STRESS_PID || true + kill $STRESS_PID || true + + - name: Analyze Results + run: | + cd tests/monitoring + # Give Prometheus a moment to finish scraping the final points + sleep 30 + python3 analyze_stats.py + + - name: Snapshot Metrics + if: always() + run: | + # Stop containers to ensure data is flushed to disk + cd tests/monitoring + docker compose stop prometheus + sudo tar -czf ../../prometheus-data.tar.gz ./prometheus_data + + - name: Collect Logs on Failure + if: failure() + run: | + mkdir -p artifacts + docker logs salt-master > artifacts/salt-master.log + docker logs salt-minion-1 > artifacts/salt-minion-1.log + cp monitoring/event_log.txt artifacts/ || true + + - name: Upload Artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: stress-test-results + path: | + artifacts/ + prometheus-data.tar.gz diff --git a/requirements/static/ci/py3.10/darwin-crypto.in b/requirements/static/ci/py3.10/darwin-crypto.in new file mode 100644 index 000000000000..62f61a5e2fb3 --- /dev/null +++ b/requirements/static/ci/py3.10/darwin-crypto.in @@ -0,0 +1,8 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements/static/ci/crypto.in --python-platform=macos --python-version=3.10 --constraint requirements/constraints.txt --no-emit-index-url -o=requirements/static/ci/py3.10/darwin-crypto.in +m2crypto==0.48.0 + # via -r requirements/static/ci/crypto.in +packaging==26.2 + # via m2crypto +pycryptodome==3.23.0 + # via -r requirements/static/ci/crypto.in diff --git a/requirements/static/ci/py3.10/freebsd-crypto.in b/requirements/static/ci/py3.10/freebsd-crypto.in new file mode 100644 index 000000000000..4837d5b1afe3 --- /dev/null +++ b/requirements/static/ci/py3.10/freebsd-crypto.in @@ -0,0 +1,8 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements/static/ci/crypto.in --universal --python-version=3.10 --constraint requirements/constraints.txt --no-emit-index-url -o=requirements/static/ci/py3.10/freebsd-crypto.in +m2crypto==0.48.0 + # via -r requirements/static/ci/crypto.in +packaging==26.2 + # via m2crypto +pycryptodome==3.23.0 + # via -r requirements/static/ci/crypto.in diff --git a/requirements/static/ci/py3.10/linux-crypto.in b/requirements/static/ci/py3.10/linux-crypto.in new file mode 100644 index 000000000000..2a53f92829e5 --- /dev/null +++ b/requirements/static/ci/py3.10/linux-crypto.in @@ -0,0 +1,8 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements/static/ci/crypto.in --python-platform=linux --python-version=3.10 --constraint requirements/constraints.txt --no-emit-index-url -o=requirements/static/ci/py3.10/linux-crypto.in +m2crypto==0.48.0 + # via -r requirements/static/ci/crypto.in +packaging==26.2 + # via m2crypto +pycryptodome==3.23.0 + # via -r requirements/static/ci/crypto.in diff --git a/requirements/static/ci/py3.10/windows-crypto.in b/requirements/static/ci/py3.10/windows-crypto.in new file mode 100644 index 000000000000..2f2e7c78e5ac --- /dev/null +++ b/requirements/static/ci/py3.10/windows-crypto.in @@ -0,0 +1,8 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements/static/ci/crypto.in --python-platform=windows --python-version=3.10 --constraint requirements/constraints.txt --no-emit-index-url -o=requirements/static/ci/py3.10/windows-crypto.in +m2crypto==0.48.0 + # via -r requirements/static/ci/crypto.in +packaging==26.2 + # via m2crypto +pycryptodome==3.23.0 + # via -r requirements/static/ci/crypto.in diff --git a/requirements/static/ci/py3.11/darwin-crypto.in b/requirements/static/ci/py3.11/darwin-crypto.in new file mode 100644 index 000000000000..2d46746767e1 --- /dev/null +++ b/requirements/static/ci/py3.11/darwin-crypto.in @@ -0,0 +1,8 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements/static/ci/crypto.in --python-platform=macos --python-version=3.11 --constraint requirements/constraints.txt --no-emit-index-url -o=requirements/static/ci/py3.11/darwin-crypto.in +m2crypto==0.48.0 + # via -r requirements/static/ci/crypto.in +packaging==26.2 + # via m2crypto +pycryptodome==3.23.0 + # via -r requirements/static/ci/crypto.in diff --git a/requirements/static/ci/py3.11/freebsd-crypto.in b/requirements/static/ci/py3.11/freebsd-crypto.in new file mode 100644 index 000000000000..9312a2878712 --- /dev/null +++ b/requirements/static/ci/py3.11/freebsd-crypto.in @@ -0,0 +1,8 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements/static/ci/crypto.in --universal --python-version=3.11 --constraint requirements/constraints.txt --no-emit-index-url -o=requirements/static/ci/py3.11/freebsd-crypto.in +m2crypto==0.48.0 + # via -r requirements/static/ci/crypto.in +packaging==26.2 + # via m2crypto +pycryptodome==3.23.0 + # via -r requirements/static/ci/crypto.in diff --git a/requirements/static/ci/py3.11/linux-crypto.in b/requirements/static/ci/py3.11/linux-crypto.in new file mode 100644 index 000000000000..8f13b4f7e1d3 --- /dev/null +++ b/requirements/static/ci/py3.11/linux-crypto.in @@ -0,0 +1,8 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements/static/ci/crypto.in --python-platform=linux --python-version=3.11 --constraint requirements/constraints.txt --no-emit-index-url -o=requirements/static/ci/py3.11/linux-crypto.in +m2crypto==0.48.0 + # via -r requirements/static/ci/crypto.in +packaging==26.2 + # via m2crypto +pycryptodome==3.23.0 + # via -r requirements/static/ci/crypto.in diff --git a/requirements/static/ci/py3.11/windows-crypto.in b/requirements/static/ci/py3.11/windows-crypto.in new file mode 100644 index 000000000000..fb0c8d21093f --- /dev/null +++ b/requirements/static/ci/py3.11/windows-crypto.in @@ -0,0 +1,8 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements/static/ci/crypto.in --python-platform=windows --python-version=3.11 --constraint requirements/constraints.txt --no-emit-index-url -o=requirements/static/ci/py3.11/windows-crypto.in +m2crypto==0.48.0 + # via -r requirements/static/ci/crypto.in +packaging==26.2 + # via m2crypto +pycryptodome==3.23.0 + # via -r requirements/static/ci/crypto.in diff --git a/requirements/static/ci/py3.12/darwin-crypto.in b/requirements/static/ci/py3.12/darwin-crypto.in new file mode 100644 index 000000000000..36052747205f --- /dev/null +++ b/requirements/static/ci/py3.12/darwin-crypto.in @@ -0,0 +1,8 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements/static/ci/crypto.in --python-platform=macos --python-version=3.12 --constraint requirements/constraints.txt --no-emit-index-url -o=requirements/static/ci/py3.12/darwin-crypto.in +m2crypto==0.48.0 + # via -r requirements/static/ci/crypto.in +packaging==26.2 + # via m2crypto +pycryptodome==3.23.0 + # via -r requirements/static/ci/crypto.in diff --git a/requirements/static/ci/py3.12/freebsd-crypto.in b/requirements/static/ci/py3.12/freebsd-crypto.in new file mode 100644 index 000000000000..5041924f4ab5 --- /dev/null +++ b/requirements/static/ci/py3.12/freebsd-crypto.in @@ -0,0 +1,8 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements/static/ci/crypto.in --universal --python-version=3.12 --constraint requirements/constraints.txt --no-emit-index-url -o=requirements/static/ci/py3.12/freebsd-crypto.in +m2crypto==0.48.0 + # via -r requirements/static/ci/crypto.in +packaging==26.2 + # via m2crypto +pycryptodome==3.23.0 + # via -r requirements/static/ci/crypto.in diff --git a/requirements/static/ci/py3.12/linux-crypto.in b/requirements/static/ci/py3.12/linux-crypto.in new file mode 100644 index 000000000000..fda4b4f39a2e --- /dev/null +++ b/requirements/static/ci/py3.12/linux-crypto.in @@ -0,0 +1,8 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements/static/ci/crypto.in --python-platform=linux --python-version=3.12 --constraint requirements/constraints.txt --no-emit-index-url -o=requirements/static/ci/py3.12/linux-crypto.in +m2crypto==0.48.0 + # via -r requirements/static/ci/crypto.in +packaging==26.2 + # via m2crypto +pycryptodome==3.23.0 + # via -r requirements/static/ci/crypto.in diff --git a/requirements/static/ci/py3.12/windows-crypto.in b/requirements/static/ci/py3.12/windows-crypto.in new file mode 100644 index 000000000000..4f80e914c088 --- /dev/null +++ b/requirements/static/ci/py3.12/windows-crypto.in @@ -0,0 +1,8 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements/static/ci/crypto.in --python-platform=windows --python-version=3.12 --constraint requirements/constraints.txt --no-emit-index-url -o=requirements/static/ci/py3.12/windows-crypto.in +m2crypto==0.48.0 + # via -r requirements/static/ci/crypto.in +packaging==26.2 + # via m2crypto +pycryptodome==3.23.0 + # via -r requirements/static/ci/crypto.in diff --git a/requirements/static/ci/py3.13/darwin-crypto.in b/requirements/static/ci/py3.13/darwin-crypto.in new file mode 100644 index 000000000000..6fb97c487657 --- /dev/null +++ b/requirements/static/ci/py3.13/darwin-crypto.in @@ -0,0 +1,8 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements/static/ci/crypto.in --python-platform=macos --python-version=3.13 --constraint requirements/constraints.txt --no-emit-index-url -o=requirements/static/ci/py3.13/darwin-crypto.in +m2crypto==0.48.0 + # via -r requirements/static/ci/crypto.in +packaging==26.2 + # via m2crypto +pycryptodome==3.23.0 + # via -r requirements/static/ci/crypto.in diff --git a/requirements/static/ci/py3.13/freebsd-crypto.in b/requirements/static/ci/py3.13/freebsd-crypto.in new file mode 100644 index 000000000000..e231abfda076 --- /dev/null +++ b/requirements/static/ci/py3.13/freebsd-crypto.in @@ -0,0 +1,8 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements/static/ci/crypto.in --universal --python-version=3.13 --constraint requirements/constraints.txt --no-emit-index-url -o=requirements/static/ci/py3.13/freebsd-crypto.in +m2crypto==0.48.0 + # via -r requirements/static/ci/crypto.in +packaging==26.2 + # via m2crypto +pycryptodome==3.23.0 + # via -r requirements/static/ci/crypto.in diff --git a/requirements/static/ci/py3.13/linux-crypto.in b/requirements/static/ci/py3.13/linux-crypto.in new file mode 100644 index 000000000000..564b53d254f7 --- /dev/null +++ b/requirements/static/ci/py3.13/linux-crypto.in @@ -0,0 +1,8 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements/static/ci/crypto.in --python-platform=linux --python-version=3.13 --constraint requirements/constraints.txt --no-emit-index-url -o=requirements/static/ci/py3.13/linux-crypto.in +m2crypto==0.48.0 + # via -r requirements/static/ci/crypto.in +packaging==26.2 + # via m2crypto +pycryptodome==3.23.0 + # via -r requirements/static/ci/crypto.in diff --git a/requirements/static/ci/py3.13/windows-crypto.in b/requirements/static/ci/py3.13/windows-crypto.in new file mode 100644 index 000000000000..97b39b95d980 --- /dev/null +++ b/requirements/static/ci/py3.13/windows-crypto.in @@ -0,0 +1,8 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements/static/ci/crypto.in --python-platform=windows --python-version=3.13 --constraint requirements/constraints.txt --no-emit-index-url -o=requirements/static/ci/py3.13/windows-crypto.in +m2crypto==0.48.0 + # via -r requirements/static/ci/crypto.in +packaging==26.2 + # via m2crypto +pycryptodome==3.23.0 + # via -r requirements/static/ci/crypto.in diff --git a/requirements/static/ci/py3.9/darwin-crypto.in b/requirements/static/ci/py3.9/darwin-crypto.in new file mode 100644 index 000000000000..0b3dd41437ce --- /dev/null +++ b/requirements/static/ci/py3.9/darwin-crypto.in @@ -0,0 +1,8 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements/static/ci/crypto.in --python-platform=macos --python-version=3.9 --constraint requirements/constraints.txt --no-emit-index-url -o=requirements/static/ci/py3.9/darwin-crypto.in +m2crypto==0.48.0 + # via -r requirements/static/ci/crypto.in +packaging==26.2 + # via m2crypto +pycryptodome==3.23.0 + # via -r requirements/static/ci/crypto.in diff --git a/requirements/static/ci/py3.9/freebsd-crypto.in b/requirements/static/ci/py3.9/freebsd-crypto.in new file mode 100644 index 000000000000..0df5190541ba --- /dev/null +++ b/requirements/static/ci/py3.9/freebsd-crypto.in @@ -0,0 +1,8 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements/static/ci/crypto.in --universal --python-version=3.9 --constraint requirements/constraints.txt --no-emit-index-url -o=requirements/static/ci/py3.9/freebsd-crypto.in +m2crypto==0.48.0 + # via -r requirements/static/ci/crypto.in +packaging==26.2 + # via m2crypto +pycryptodome==3.23.0 + # via -r requirements/static/ci/crypto.in diff --git a/requirements/static/ci/py3.9/linux-crypto.in b/requirements/static/ci/py3.9/linux-crypto.in new file mode 100644 index 000000000000..26da8966844d --- /dev/null +++ b/requirements/static/ci/py3.9/linux-crypto.in @@ -0,0 +1,8 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements/static/ci/crypto.in --python-platform=linux --python-version=3.9 --constraint requirements/constraints.txt --no-emit-index-url -o=requirements/static/ci/py3.9/linux-crypto.in +m2crypto==0.48.0 + # via -r requirements/static/ci/crypto.in +packaging==26.2 + # via m2crypto +pycryptodome==3.23.0 + # via -r requirements/static/ci/crypto.in diff --git a/requirements/static/ci/py3.9/windows-crypto.in b/requirements/static/ci/py3.9/windows-crypto.in new file mode 100644 index 000000000000..8c55225f2f69 --- /dev/null +++ b/requirements/static/ci/py3.9/windows-crypto.in @@ -0,0 +1,8 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements/static/ci/crypto.in --python-platform=windows --python-version=3.9 --constraint requirements/constraints.txt --no-emit-index-url -o=requirements/static/ci/py3.9/windows-crypto.in +m2crypto==0.48.0 + # via -r requirements/static/ci/crypto.in +packaging==26.2 + # via m2crypto +pycryptodome==3.23.0 + # via -r requirements/static/ci/crypto.in diff --git a/salt/auth/__init__.py b/salt/auth/__init__.py index 2b51fb94aeac..4d88cd383bb8 100644 --- a/salt/auth/__init__.py +++ b/salt/auth/__init__.py @@ -62,6 +62,31 @@ def __init__(self, opts, ckminions=None): self.tokens = salt.loader.eauth_tokens(opts) self.ckminions = ckminions or salt.utils.minions.CkMinions(opts) + def destroy(self): + """ + Clean up resources + """ + if hasattr(self, "auth") and self.auth is not None: + if hasattr(self.auth, "destroy"): + self.auth.destroy() + self.auth = {} + if hasattr(self, "tokens") and self.tokens is not None: + if hasattr(self.tokens, "destroy"): + self.tokens.destroy() + self.tokens = {} + if hasattr(self, "ckminions") and self.ckminions is not None: + if hasattr(self.ckminions, "cache") and self.ckminions.cache is not None: + if hasattr(self.ckminions.cache, "destroy"): + self.ckminions.cache.destroy() + self.ckminions.cache = None + self.ckminions = None + + def __enter__(self): + return self + + def __exit__(self, *args): + self.destroy() + def load_name(self, load): """ Return the primary name associate with the load, if an empty string diff --git a/salt/cache/__init__.py b/salt/cache/__init__.py index a094f8727b47..e80b42b3268d 100644 --- a/salt/cache/__init__.py +++ b/salt/cache/__init__.py @@ -81,6 +81,12 @@ def modules(self): self.__lazy_init() return self._modules + def destroy(self): + if hasattr(self, "_modules") and self._modules is not None: + if hasattr(self._modules, "destroy"): + self._modules.destroy() + self._modules = None + def cache(self, bank, key, fun, loop_fun=None, **kwargs): """ Check cache for the data. If it is there, check to see if it needs to diff --git a/salt/channel/server.py b/salt/channel/server.py index 7cdd4a46c153..8cdd8ac203b4 100644 --- a/salt/channel/server.py +++ b/salt/channel/server.py @@ -871,6 +871,12 @@ def close(self): self.transport.close() if self.event is not None: self.event.destroy() + if hasattr(self, "ckminions") and self.ckminions is not None: + if hasattr(self.ckminions, "cache") and self.ckminions.cache is not None: + if hasattr(self.ckminions.cache, "destroy"): + self.ckminions.cache.destroy() + self.ckminions.cache = None + self.ckminions = None class PubServerChannel: @@ -928,6 +934,12 @@ def close(self): if self.aes_funcs is not None: self.aes_funcs.destroy() self.aes_funcs = None + if hasattr(self, "ckminions") and self.ckminions is not None: + if hasattr(self.ckminions, "cache") and self.ckminions.cache is not None: + if hasattr(self.ckminions.cache, "destroy"): + self.ckminions.cache.destroy() + self.ckminions.cache = None + self.ckminions = None def pre_fork(self, process_manager, kwargs=None): """ diff --git a/salt/client/__init__.py b/salt/client/__init__.py index da27e29ed3a2..682259f73dd6 100644 --- a/salt/client/__init__.py +++ b/salt/client/__init__.py @@ -2077,6 +2077,18 @@ def destroy(self): if self.event is not None: self.event.destroy() self.event = None + if hasattr(self, "returners") and self.returners is not None: + if hasattr(self.returners, "destroy"): + self.returners.destroy() + self.returners = {} + if hasattr(self, "functions") and self.functions is not None: + if hasattr(self.functions, "destroy"): + self.functions.destroy() + self.functions = {} + if hasattr(self, "utils") and self.utils is not None: + if hasattr(self.utils, "destroy"): + self.utils.destroy() + self.utils = {} def __enter__(self): return self diff --git a/salt/config/__init__.py b/salt/config/__init__.py index 86788d5384a8..98397bb181f8 100644 --- a/salt/config/__init__.py +++ b/salt/config/__init__.py @@ -2379,6 +2379,8 @@ def mminion_config(path, overrides, ignore_config_errors=True): apply_sdb(opts) _validate_opts(opts) + if "grains" in opts and hasattr(opts["grains"], "destroy"): + opts["grains"].destroy() opts["grains"] = salt.loader.grains(opts) opts["pillar"] = {} salt.features.setup_features(opts) diff --git a/salt/daemons/masterapi.py b/salt/daemons/masterapi.py index 159d4306ceb5..7f88d4e76853 100644 --- a/salt/daemons/masterapi.py +++ b/salt/daemons/masterapi.py @@ -136,15 +136,33 @@ def clean_fsbackend(opts): ) -def clean_expired_tokens(opts): +def clean_expired_tokens(opts, loadauth=None): """ - Clean expired tokens from the master + Clean expired tokens from the master. + + If ``loadauth`` is provided, reuse the caller's LoadAuth instance + rather than constructing a fresh one. Useful in long-running loops + (e.g. Maintenance) to avoid recreating the auth/eauth_tokens + LazyLoaders on every iteration. """ - loadauth = salt.auth.LoadAuth(opts) - for tok in loadauth.list_tokens(): - token_data = loadauth.get_tok(tok) - if "expire" not in token_data or token_data.get("expire", 0) < time.time(): - loadauth.rm_token(tok) + if loadauth is not None: + _loadauth = loadauth + _owned = False + else: + _loadauth = salt.auth.LoadAuth(opts) + _owned = True + try: + for tok in _loadauth.list_tokens(): + token_data = _loadauth.get_tok(tok) + if ( + not token_data + or "expire" not in token_data + or token_data.get("expire", 0) < time.time() + ): + _loadauth.rm_token(tok) + finally: + if _owned: + _loadauth.destroy() def clean_pub_auth(opts): @@ -166,25 +184,34 @@ def clean_pub_auth(opts): log.error("Unable to delete pub auth file") -def clean_old_jobs(opts): +def clean_old_jobs(opts, mminion=None): """ - Clean out the old jobs from the job cache + Clean out the old jobs from the job cache. + + If ``mminion`` is provided, reuse the caller's MasterMinion rather + than constructing a fresh one. See ``clean_expired_tokens`` for the + same rationale. """ - # TODO: better way to not require creating the masterminion every time? - mminion = salt.minion.MasterMinion( - opts, - states=False, - rend=False, - ) # If the master job cache has a clean_old_jobs, call it fstr = "{}.clean_old_jobs".format(opts["master_job_cache"]) - if fstr in mminion.returners: - mminion.returners[fstr]() + if mminion is not None: + _mminion = mminion + _owned = False + else: + _mminion = salt.minion.MasterMinion(opts, states=False, rend=False) + _owned = True + try: + if fstr in _mminion.returners: + _mminion.returners[fstr]() + finally: + if _owned: + if hasattr(_mminion, "destroy"): + _mminion.destroy() def mk_key(opts, user): + uid = None if HAS_PWD: - uid = None try: uid = pwd.getpwnam(user).pw_uid except KeyError: @@ -444,6 +471,13 @@ class RemoteFuncs: def __init__(self, opts): self.opts = opts + self.event = None + self.ckminions = None + self.tops = None + self.local = None + self.mminion = None + self.cache = None + self.wheel_ = None self.event = salt.utils.event.get_event( "master", self.opts["sock_dir"], @@ -464,15 +498,15 @@ def __setup_fileserver(self): """ Set the local file objects from the file server interface """ - fs_ = salt.fileserver.Fileserver(self.opts) - self._serve_file = fs_.serve_file - self._file_find = fs_._find_file - self._file_hash = fs_.file_hash - self._file_list = fs_.file_list - self._file_list_emptydirs = fs_.file_list_emptydirs - self._dir_list = fs_.dir_list - self._symlink_list = fs_.symlink_list - self._file_envs = fs_.envs + self.fs_ = salt.fileserver.Fileserver(self.opts) + self._serve_file = self.fs_.serve_file + self._file_find = self.fs_._find_file + self._file_hash = self.fs_.file_hash + self._file_list = self.fs_.file_list + self._file_list_emptydirs = self.fs_.file_list_emptydirs + self._dir_list = self.fs_.dir_list + self._symlink_list = self.fs_.symlink_list + self._file_envs = self.fs_.envs def __verify_minion_publish(self, load): """ @@ -1101,6 +1135,37 @@ def destroy(self): if self.local is not None: self.local.destroy() self.local = None + if self.mminion is not None: + self.mminion.destroy() + self.mminion = None + if self.tops is not None: + if hasattr(self.tops, "destroy"): + self.tops.destroy() + self.tops = None + if self.cache is not None: + if hasattr(self.cache, "destroy"): + self.cache.destroy() + self.cache = None + if self.ckminions is not None: + if hasattr(self.ckminions, "cache") and self.ckminions.cache is not None: + if hasattr(self.ckminions.cache, "destroy"): + self.ckminions.cache.destroy() + self.ckminions.cache = None + self.ckminions = None + self.wheel_ = None + # Clear bound methods from fileserver to allow GC + if hasattr(self, "fs_") and self.fs_ is not None: + if hasattr(self.fs_, "destroy"): + self.fs_.destroy() + self.fs_ = None + self._serve_file = None + self._file_find = None + self._file_hash = None + self._file_list = None + self._file_list_emptydirs = None + self._dir_list = None + self._symlink_list = None + self._file_envs = None class LocalFuncs: @@ -1115,6 +1180,11 @@ class LocalFuncs: def __init__(self, opts, key): self.opts = opts self.key = key + self.event = None + self.local = None + self.ckminions = None + self.loadauth = None + self.mminion = None # Create the event manager self.event = salt.utils.event.get_event( "master", @@ -1130,8 +1200,6 @@ def __init__(self, opts, key): self.loadauth = salt.auth.LoadAuth(opts) # Stand up the master Minion to access returner data self.mminion = salt.minion.MasterMinion(self.opts, states=False, rend=False) - # Make a wheel object - self.wheel_ = salt.wheel.Wheel(opts) def runner(self, load): """ @@ -1170,10 +1238,10 @@ def runner(self, load): # Authorized. Do the job! try: fun = load.pop("fun") - runner_client = salt.runner.RunnerClient(self.opts) - return runner_client.asynchronous(fun, load.get("kwarg", {}), username) + with salt.runner.RunnerClient(self.opts) as runner_client: + return runner_client.asynchronous(fun, load.get("kwarg", {}), username) except Exception as exc: # pylint: disable=broad-except - log.exception("Exception occurred while introspecting %s") + log.exception("Exception occurred while introspecting %s", fun) return { "error": { "name": exc.__class__.__name__, @@ -1231,7 +1299,8 @@ def wheel(self, load): } try: self.event.fire_event(data, salt.utils.event.tagify([jid, "new"], "wheel")) - ret = self.wheel_.call_func(fun, **load) + with salt.wheel.WheelClient(self.opts) as wheel_client: + ret = wheel_client.call_func(fun, **load) data["return"] = ret data["success"] = True self.event.fire_event(data, salt.utils.event.tagify([jid, "ret"], "wheel")) @@ -1484,3 +1553,15 @@ def destroy(self): if self.local is not None: self.local.destroy() self.local = None + if self.mminion is not None: + self.mminion.destroy() + self.mminion = None + if self.loadauth is not None: + self.loadauth.destroy() + self.loadauth = None + if self.ckminions is not None: + if hasattr(self.ckminions, "cache") and self.ckminions.cache is not None: + if hasattr(self.ckminions.cache, "destroy"): + self.ckminions.cache.destroy() + self.ckminions.cache = None + self.ckminions = None diff --git a/salt/fileserver/__init__.py b/salt/fileserver/__init__.py index ee7b7b23a79c..fd51d1dec3aa 100644 --- a/salt/fileserver/__init__.py +++ b/salt/fileserver/__init__.py @@ -383,6 +383,12 @@ def master_opts(self, load): """ return self.opts + def destroy(self): + if hasattr(self, "servers") and self.servers is not None: + if hasattr(self.servers, "destroy"): + self.servers.destroy() + self.servers = {} + def update_opts(self): # This fix func monkey patching by pillar for name, func in self.servers.items(): @@ -879,4 +885,7 @@ def send( return getattr(self.fs, cmd)(load) def close(self): - pass + if hasattr(self, "fs") and self.fs is not None: + if hasattr(self.fs, "destroy"): + self.fs.destroy() + self.fs = None diff --git a/salt/loader/lazy.py b/salt/loader/lazy.py index 193a6f9a579b..43f22b5745db 100644 --- a/salt/loader/lazy.py +++ b/salt/loader/lazy.py @@ -348,6 +348,29 @@ def __init__( _generate_module(f"{self.loaded_base_name}.ext") _generate_module(f"{self.loaded_base_name}.ext.{tag}") + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.destroy() + + def destroy(self): + """ + Destroy the loader and clean up modules + """ + self.clean_modules() + if hasattr(self, "context_dict") and self.context_dict is not None: + if hasattr(self.context_dict, "destroy"): + self.context_dict.destroy() + if hasattr(self, "pack") and isinstance(self.pack, dict): + self.pack.clear() + if hasattr(self, "_dict"): + self._dict.clear() + if hasattr(self, "loaded_modules"): + self.loaded_modules.clear() + if hasattr(self, "missing_modules"): + self.missing_modules.clear() + def clean_modules(self): """ Clean modules and free memory for this loader's tag only. diff --git a/salt/master.py b/salt/master.py index 7d2dfe84064d..4ae60564b0ee 100644 --- a/salt/master.py +++ b/salt/master.py @@ -215,6 +215,15 @@ def _post_fork_init(self): runner_client = salt.runner.RunnerClient(ropts) # Load Returners self.returners = salt.loader.returners(self.opts, {}) + # Cache long-lived helpers so the maintenance loop reuses them across + # iterations rather than constructing fresh ones. Each construction + # triggers a fresh LazyLoader + __virtual__ cascade + module-load chain + # that allocates bytecode/dicts/strings retained in sys.modules — the + # primary driver of the Maintenance-process slow drift. + self._cached_loadauth = salt.auth.LoadAuth(self.opts) + self._cached_mminion = salt.minion.MasterMinion( + self.opts, states=False, rend=False + ) # Init Scheduler self.schedule = salt.utils.schedule.Schedule( @@ -285,8 +294,12 @@ def run(self): while time.time() - start < self.restart_interval: log.trace("Running maintenance routines") if not last or (now - last) >= self.loop_interval: - salt.daemons.masterapi.clean_old_jobs(self.opts) - salt.daemons.masterapi.clean_expired_tokens(self.opts) + salt.daemons.masterapi.clean_old_jobs( + self.opts, mminion=self._cached_mminion + ) + salt.daemons.masterapi.clean_expired_tokens( + self.opts, loadauth=self._cached_loadauth + ) salt.daemons.masterapi.clean_pub_auth(self.opts) if not last or (now - last_git_pillar_update) >= git_pillar_update_interval: last_git_pillar_update = now @@ -300,6 +313,31 @@ def run(self): now = int(time.time()) time.sleep(self.loop_interval) + def destroy(self): + """ + Clean up resources + """ + if hasattr(self, "event") and self.event is not None: + self.event.destroy() + self.event = None + if hasattr(self, "ckminions") and self.ckminions is not None: + if hasattr(self.ckminions, "cache") and self.ckminions.cache is not None: + self.ckminions.cache = None + self.ckminions = None + if hasattr(self, "schedule") and self.schedule is not None: + self.schedule = None + if getattr(self, "_cached_loadauth", None) is not None: + self._cached_loadauth.destroy() + self._cached_loadauth = None + if getattr(self, "_cached_mminion", None) is not None: + if hasattr(self._cached_mminion, "destroy"): + self._cached_mminion.destroy() + self._cached_mminion = None + + def _handle_signals(self, signum, sigframe): + self.destroy() + super()._handle_signals(signum, sigframe) + def handle_key_cache(self): """ Evaluate accepted keys and create a msgpack file @@ -1016,6 +1054,12 @@ def _handle_signals(self, signum, sigframe): except Exception: # pylint: disable=broad-except # Don't stop signal handling because an exception occurred. pass + aes_funcs = getattr(self, "aes_funcs", None) + if aes_funcs is not None: + try: + aes_funcs.destroy() + except Exception: # pylint: disable=broad-except + pass super()._handle_signals(signum, sigframe) def __bind(self): @@ -1039,22 +1083,6 @@ def _handle_payload(self, payload): """ The _handle_payload method is the key method used to figure out what needs to be done with communication to the server - - Example cleartext payload generated for 'salt myminion test.ping': - - {'enc': 'clear', - 'load': {'arg': [], - 'cmd': 'publish', - 'fun': 'test.ping', - 'jid': '', - 'key': 'alsdkjfa.,maljf-==adflkjadflkjalkjadfadflkajdflkj', - 'kwargs': {'show_jid': False, 'show_timeout': False}, - 'ret': '', - 'tgt': 'myminion', - 'tgt_type': 'glob', - 'user': 'root'}} - - :param dict payload: The payload route to the appropriate handler """ key = payload["enc"] load = payload["load"] @@ -1062,6 +1090,7 @@ def _handle_payload(self, payload): ret = self._handle_aes(load) else: ret = self._handle_clear(load) + raise salt.ext.tornado.gen.Return(ret) def _post_stats(self, start, cmd): @@ -1251,6 +1280,13 @@ def __init__(self, opts): :returns: Instance for handling AES operations """ self.opts = opts + self.event = None + self.ckminions = None + self.local = None + self.mminion = None + self.fs_ = None + self.masterapi = None + self.cache = None self.event = salt.utils.event.get_master_event( self.opts, self.opts["sock_dir"], listen=False ) @@ -1938,10 +1974,42 @@ def run_func(self, func, load): return ret, {"fun": "send"} def destroy(self): - self.masterapi.destroy() + if self.masterapi is not None: + self.masterapi.destroy() + self.masterapi = None if self.local is not None: self.local.destroy() self.local = None + if self.mminion is not None: + self.mminion.destroy() + self.mminion = None + if self.event is not None: + self.event.destroy() + self.event = None + if self.ckminions is not None: + if self.ckminions.cache is not None: + if hasattr(self.ckminions.cache, "destroy"): + self.ckminions.cache.destroy() + self.ckminions.cache = None + self.ckminions = None + if self.cache is not None: + if hasattr(self.cache, "destroy"): + self.cache.destroy() + self.cache = None + # Clear bound methods from fileserver + if self.fs_ is not None: + if hasattr(self.fs_, "destroy"): + self.fs_.destroy() + self.fs_ = None + self._serve_file = None + self._file_find = None + self._file_hash = None + self._file_hash_and_stat = None + self._file_list = None + self._file_list_emptydirs = None + self._dir_list = None + self._symlink_list = None + self._file_envs = None class ClearFuncs(TransportMethods): @@ -1968,6 +2036,12 @@ class ClearFuncs(TransportMethods): def __init__(self, opts, key): self.opts = opts self.key = key + self.event = None + self.local = None + self.ckminions = None + self.loadauth = None + self.mminion = None + self.masterapi = None # Create the event manager self.event = salt.utils.event.get_master_event( self.opts, self.opts["sock_dir"], listen=False @@ -2531,6 +2605,25 @@ def destroy(self): if self.local is not None: self.local.destroy() self.local = None + if self.mminion is not None: + self.mminion.destroy() + self.mminion = None + if self.event is not None: + self.event.destroy() + self.event = None + if self.ckminions is not None: + if self.ckminions.cache is not None: + if hasattr(self.ckminions.cache, "destroy"): + self.ckminions.cache.destroy() + self.ckminions.cache = None + self.ckminions = None + if self.loadauth is not None: + self.loadauth.destroy() + self.loadauth = None + if self.wheel_ is not None: + if hasattr(self.wheel_, "destroy"): + self.wheel_.destroy() + self.wheel_ = None while self.channels: chan = self.channels.pop() chan.close() diff --git a/salt/minion.py b/salt/minion.py index 670bae0fa7c1..e75871ae2618 100644 --- a/salt/minion.py +++ b/salt/minion.py @@ -8,6 +8,7 @@ import copy import errno import functools +import gc import logging import multiprocessing import os @@ -1001,6 +1002,8 @@ def __init__( whitelist=None, ignore_config_errors=True, ): + self.executors = None + self.matchers = None self.opts = salt.config.mminion_config( opts["conf_file"], opts, ignore_config_errors=ignore_config_errors ) @@ -1010,8 +1013,69 @@ def __init__( self.mk_rend = rend self.mk_matcher = matcher + self.returners = None + self.functions = None + self.utils = None + self.proxy = None self.gen_modules(initial_load=True) + def destroy(self): + """ + Destroy the MasterMinion object + """ + if self.returners is not None: + # Some returners have a destroy method + for returner in self.returners: + try: + func = self.returners[returner] + if hasattr(func, "destroy"): + func.destroy() + except Exception: # pylint: disable=broad-except + pass + if hasattr(self.returners, "destroy"): + self.returners.destroy() + self.returners = {} + if self.functions is not None and hasattr(self.functions, "destroy"): + self.functions.destroy() + self.functions = {} + if self.utils is not None and hasattr(self.utils, "destroy"): + self.utils.destroy() + self.utils = {} + if hasattr(self, "states") and self.states is not None: + if hasattr(self.states, "destroy"): + self.states.destroy() + self.states = {} + if hasattr(self, "rend") and self.rend is not None: + if hasattr(self.rend, "destroy"): + self.rend.destroy() + self.rend = {} + if hasattr(self, "matchers") and self.matchers is not None: + if hasattr(self.matchers, "destroy"): + self.matchers.destroy() + self.matchers = {} + if hasattr(self, "executors") and self.executors is not None: + if hasattr(self.executors, "destroy"): + self.executors.destroy() + self.executors = {} + if hasattr(self, "proxy") and self.proxy is not None: + if hasattr(self.proxy, "destroy"): + self.proxy.destroy() + self.proxy = {} + if hasattr(self, "serializers") and self.serializers is not None: + if hasattr(self.serializers, "destroy"): + self.serializers.destroy() + self.serializers = {} + if self.opts and "grains" in self.opts: + if hasattr(self.opts["grains"], "destroy"): + self.opts["grains"].destroy() + self.opts["grains"] = {} + + def __enter__(self): + return self + + def __exit__(self, *args): + self.destroy() + def gen_modules(self, initial_load=False): """ Tell the minion to reload the execution modules @@ -1090,6 +1154,19 @@ def handle_event(self, package): except Exception as exc: # pylint: disable=broad-except log.error("Error dispatching event. %s", exc) + def destroy(self): + """ + Tear down the MinionManager + """ + if hasattr(self, "process_manager") and self.process_manager is not None: + self.process_manager.stop_restarting() + self.process_manager.kill_children() + if hasattr(self, "minions"): + for minion in self.minions: + if hasattr(minion, "destroy"): + minion.destroy() + self.minions = [] + def _create_minion_object( self, opts, @@ -1271,16 +1348,6 @@ def stop_async(self, signum, parent_sig_handler): # Call the parent signal handler parent_sig_handler(signum, None) - def destroy(self): - for minion in self.minions: - minion.destroy() - if self.event_publisher is not None: - self.event_publisher.close() - self.event_publisher = None - if self.event is not None: - self.event.destroy() - self.event = None - class Minion(MinionBase): """ @@ -1657,6 +1724,7 @@ def _load_modules( # a memory limit on module imports # this feature ONLY works on *nix like OSs (resource module doesn't work on windows) modules_max_memory = False + old_mem_limit = None if opts.get("modules_max_memory", -1) > 0 and HAS_PSUTIL and HAS_RESOURCE: log.debug( "modules_max_memory set, enforcing a maximum of %s", @@ -4164,6 +4232,15 @@ def ping_timeout_handler(*_): elif self.opts.get("master_type") != "disable": log.error("No connection to master found. Scheduled jobs will not run.") + # Periodic full-generation gc.collect() to reap reference cycles + # created by Tornado coroutine timeouts (FutureWithTimeout, + # Runner.handle_yield closures, traceback objects, etc.). Python's + # default GC thresholds (700, 10, 10) run generation-2 too rarely + # for the rate these cycles accumulate in a busy minion (~50 MB/hr + # of cyclic garbage measured under stress). Reaping every 60 s + # keeps the working set steady. + self.add_periodic_callback("gc_collect", gc.collect, interval=60) + if start: try: self.io_loop.start() @@ -4269,6 +4346,36 @@ def destroy(self): for cb in self.periodic_callbacks.values(): cb.stop() + # Clean up loaders + if hasattr(self, "functions") and self.functions is not None: + if hasattr(self.functions, "destroy"): + self.functions.destroy() + self.functions = {} + if hasattr(self, "returners") and self.returners is not None: + if hasattr(self.returners, "destroy"): + self.returners.destroy() + self.returners = {} + if hasattr(self, "states") and self.states is not None: + if hasattr(self.states, "destroy"): + self.states.destroy() + self.states = {} + if hasattr(self, "rend") and self.rend is not None: + if hasattr(self.rend, "destroy"): + self.rend.destroy() + self.rend = {} + if hasattr(self, "matchers") and self.matchers is not None: + if hasattr(self.matchers, "destroy"): + self.matchers.destroy() + self.matchers = {} + if hasattr(self, "executors") and self.executors is not None: + if hasattr(self.executors, "destroy"): + self.executors.destroy() + self.executors = {} + if hasattr(self, "utils") and self.utils is not None: + if hasattr(self.utils, "destroy"): + self.utils.destroy() + self.utils = {} + # pylint: disable=W1701 def __del__(self): self.destroy() @@ -4437,6 +4544,9 @@ def destroy(self): if self.local is not None: self.local.destroy() self.local = None + if hasattr(self, "mminion") and self.mminion is not None: + self.mminion.destroy() + self.mminion = None if self.forward_events is not None: self.forward_events.stop() @@ -4812,6 +4922,10 @@ def destroy(self): self._closing = True if self.local is not None: self.local.destroy() + self.local = None + if hasattr(self, "mminion") and self.mminion is not None: + self.mminion.destroy() + self.mminion = None class ProxyMinionManager(MinionManager): diff --git a/salt/netapi/__init__.py b/salt/netapi/__init__.py index a6c4ef064280..523d359d7161 100644 --- a/salt/netapi/__init__.py +++ b/salt/netapi/__init__.py @@ -69,6 +69,9 @@ class NetapiClient: def __init__(self, opts): self.opts = opts + self.resolver = None + self.loadauth = None + self.ckminions = None apiopts = copy.deepcopy(self.opts) apiopts["enable_ssh_minions"] = True apiopts["cachedir"] = os.path.join(opts["cachedir"], "saltapi") @@ -79,6 +82,33 @@ def __init__(self, opts): self.key = salt.daemons.masterapi.access_keys(apiopts) self.ckminions = salt.utils.minions.CkMinions(apiopts) + def destroy(self): + """ + Clean up resources + """ + if self.resolver is not None: + if hasattr(self.resolver, "auth"): + if hasattr(self.resolver.auth, "destroy"): + self.resolver.auth.destroy() + self.resolver.auth = {} + self.resolver = None + if self.loadauth is not None: + if hasattr(self.loadauth, "destroy"): + self.loadauth.destroy() + self.loadauth = None + if self.ckminions is not None: + if hasattr(self.ckminions, "cache") and self.ckminions.cache is not None: + if hasattr(self.ckminions.cache, "destroy"): + self.ckminions.cache.destroy() + self.ckminions.cache = None + self.ckminions = None + + def __enter__(self): + return self + + def __exit__(self, *args): + self.destroy() + def _is_master_running(self): """ Perform a lightweight check to see if the master daemon is running @@ -262,8 +292,8 @@ def runner(self, fun, timeout=None, full_return=False, **kwargs): if timeout is not None: timeout = float(timeout) - runner = salt.runner.RunnerClient(self.opts) - return runner.cmd_sync(kwargs, timeout=timeout, full_return=full_return) + with salt.runner.RunnerClient(self.opts) as runner: + return runner.cmd_sync(kwargs, timeout=timeout, full_return=full_return) def runner_async(self, fun, **kwargs): """ @@ -277,8 +307,8 @@ def runner_async(self, fun, **kwargs): :return: event data and a job ID for the executed function. """ kwargs["fun"] = fun - runner = salt.runner.RunnerClient(self.opts) - return runner.cmd_async(kwargs) + with salt.runner.RunnerClient(self.opts) as runner: + return runner.cmd_async(kwargs) def wheel(self, fun, **kwargs): """ @@ -292,8 +322,8 @@ def wheel(self, fun, **kwargs): :return: Returns the result from the wheel module """ kwargs["fun"] = fun - wheel = salt.wheel.WheelClient(self.opts) - return wheel.cmd_sync(kwargs) + with salt.wheel.WheelClient(self.opts) as wheel: + return wheel.cmd_sync(kwargs) def wheel_async(self, fun, **kwargs): """ @@ -307,8 +337,8 @@ def wheel_async(self, fun, **kwargs): :return: Returns the result from the wheel module """ kwargs["fun"] = fun - wheel = salt.wheel.WheelClient(self.opts) - return wheel.cmd_async(kwargs) + with salt.wheel.WheelClient(self.opts) as wheel: + return wheel.cmd_async(kwargs) CLIENTS = [ diff --git a/salt/netapi/rest_cherrypy/app.py b/salt/netapi/rest_cherrypy/app.py index 310f25b0d160..ca3d85cd2fc0 100644 --- a/salt/netapi/rest_cherrypy/app.py +++ b/salt/netapi/rest_cherrypy/app.py @@ -1176,7 +1176,6 @@ class LowDataAdapter: def __init__(self): self.opts = cherrypy.config["saltopts"] self.apiopts = cherrypy.config["apiopts"] - self.api = salt.netapi.NetapiClient(self.opts) def exec_lowstate(self, client=None, token=None): """ @@ -1198,39 +1197,40 @@ def exec_lowstate(self, client=None, token=None): # Make any requested additions or modifications to each lowstate, then # execute each one and yield the result. - for chunk in lowstate: - if token: - chunk["token"] = token - - if "token" in chunk: - # Make sure that auth token is hex - try: - int(chunk["token"], 16) - except (TypeError, ValueError): - raise cherrypy.HTTPError(401, "Invalid token") - - if "token" in chunk: - # Make sure that auth token is hex - try: - int(chunk["token"], 16) - except (TypeError, ValueError): - raise cherrypy.HTTPError(401, "Invalid token") - - if client: - chunk["client"] = client - - # Make any 'arg' params a list if not already. - # This is largely to fix a deficiency in the urlencoded format. - if "arg" in chunk and not isinstance(chunk["arg"], list): - chunk["arg"] = [chunk["arg"]] - - ret = self.api.run(chunk) - - # Sometimes Salt gives us a return and sometimes an iterator - if isinstance(ret, Iterator): - yield from ret - else: - yield ret + with salt.netapi.NetapiClient(self.opts) as api: + for chunk in lowstate: + if token: + chunk["token"] = token + + if "token" in chunk: + # Make sure that auth token is hex + try: + int(chunk["token"], 16) + except (TypeError, ValueError): + raise cherrypy.HTTPError(401, "Invalid token") + + if "token" in chunk: + # Make sure that auth token is hex + try: + int(chunk["token"], 16) + except (TypeError, ValueError): + raise cherrypy.HTTPError(401, "Invalid token") + + if client: + chunk["client"] = client + + # Make any 'arg' params a list if not already. + # This is largely to fix a deficiency in the urlencoded format. + if "arg" in chunk and not isinstance(chunk["arg"], list): + chunk["arg"] = [chunk["arg"]] + + ret = api.run(chunk) + + # Sometimes Salt gives us a return and sometimes an iterator + if isinstance(ret, Iterator): + yield from ret + else: + yield ret @cherrypy.config(**{"tools.sessions.on": False}) def GET(self): @@ -1877,8 +1877,11 @@ def POST(self, **kwargs): ] }} """ - if not self.api._is_master_running(): - raise salt.exceptions.SaltDaemonNotRunning("Salt Master is not available.") + with salt.netapi.NetapiClient(self.opts) as api: + if not api._is_master_running(): + raise salt.exceptions.SaltDaemonNotRunning( + "Salt Master is not available." + ) # the urlencoded_processor will wrap this in a list if isinstance(cherrypy.serving.request.lowstate, list): diff --git a/salt/roster/__init__.py b/salt/roster/__init__.py index a6b8bb2475de..3b695ddcaadb 100644 --- a/salt/roster/__init__.py +++ b/salt/roster/__init__.py @@ -69,9 +69,25 @@ def __init__(self, opts, backends="flat"): self.backends = backends if not backends: self.backends = ["flat"] - utils = salt.loader.utils(self.opts) - runner = salt.loader.runner(self.opts, utils=utils) - self.rosters = salt.loader.roster(self.opts, runner=runner, utils=utils) + self.utils = salt.loader.utils(self.opts) + self.runner = salt.loader.runner(self.opts, utils=self.utils) + self.rosters = salt.loader.roster( + self.opts, runner=self.runner, utils=self.utils + ) + + def destroy(self): + if hasattr(self, "rosters") and self.rosters is not None: + if hasattr(self.rosters, "destroy"): + self.rosters.destroy() + self.rosters = {} + if hasattr(self, "runner") and self.runner is not None: + if hasattr(self.runner, "destroy"): + self.runner.destroy() + self.runner = {} + if hasattr(self, "utils") and self.utils is not None: + if hasattr(self.utils, "destroy"): + self.utils.destroy() + self.utils = {} def _gen_back(self): """ diff --git a/salt/runner.py b/salt/runner.py index d3501b8f9190..d3a685c431a1 100644 --- a/salt/runner.py +++ b/salt/runner.py @@ -38,6 +38,36 @@ class RunnerClient(mixins.SyncClientMixin, mixins.AsyncClientMixin): client = "runner" tag_prefix = "run" + def __init__(self, opts, context=None): + mixins.SyncClientMixin.__init__(self, opts, context=context) + mixins.AsyncClientMixin.__init__(self, opts, context=context) + self.opts = opts + self.context = context or {} + self.event = None + self.salt_user = salt.utils.user.get_specific_user() + self.event = salt.utils.event.get_event( + "master", self.opts["sock_dir"], opts=self.opts, listen=False + ) + + def destroy(self): + if self.event is not None: + self.event.destroy() + self.event = None + if hasattr(self, "_functions") and self._functions is not None: + if hasattr(self._functions, "destroy"): + self._functions.destroy() + self._functions = {} + if hasattr(self, "utils") and self.utils is not None: + if hasattr(self.utils, "destroy"): + self.utils.destroy() + self.utils = {} + + def __enter__(self): + return self + + def __exit__(self, *args): + self.destroy() + @property def functions(self): if not hasattr(self, "_functions"): @@ -196,6 +226,17 @@ def __init__(self, opts, context=None): self.returners = salt.loader.returners(opts, self.functions, context=context) self.outputters = salt.loader.outputters(opts) + def destroy(self): + if hasattr(self, "returners") and self.returners is not None: + if hasattr(self.returners, "destroy"): + self.returners.destroy() + self.returners = {} + if hasattr(self, "outputters") and self.outputters is not None: + if hasattr(self.outputters, "destroy"): + self.outputters.destroy() + self.outputters = {} + super().destroy() + def print_docs(self): """ Print out the documentation! diff --git a/salt/tokens/localfs.py b/salt/tokens/localfs.py index 93cfffa934f4..4f0dc55cb07e 100644 --- a/salt/tokens/localfs.py +++ b/salt/tokens/localfs.py @@ -89,10 +89,10 @@ def list_tokens(opts): List all tokens in the store. :param opts: Salt master config options - :returns: List of dicts (tokens) + :returns: Generator of tokens """ - ret = [] - for dirpath, dirnames, filenames in salt.utils.path.os_walk(opts["token_dir"]): - for token in filenames: - ret.append(token) - return ret + if not os.path.exists(opts["token_dir"]): + return + for entry in os.scandir(opts["token_dir"]): + if entry.is_file(): + yield entry.name diff --git a/salt/transport/frame.py b/salt/transport/frame.py index aa6961f5ad91..f3d3cd53494b 100644 --- a/salt/transport/frame.py +++ b/salt/transport/frame.py @@ -2,6 +2,8 @@ Helper functions for transport components to handle message framing """ +import struct + import salt.utils.msgpack @@ -20,10 +22,14 @@ def frame_msg(body, header=None, raw_body=False): # pylint: disable=unused-argu def frame_msg_ipc(body, header=None, raw_body=False): # pylint: disable=unused-argument """ - Frame the given message with our wire protocol for IPC + Frame the given message with our wire protocol for IPC. - For IPC, we don't need to be backwards compatible, so - use the more efficient "use_bin_type=True" on Python 3. + Prefixes the msgpack payload with a 4-byte big-endian length so the + receiver can read exactly the right number of bytes per message. This + prevents msgpack stream corruption when concurrent large writes exceed + the Unix socket PIPE_BUF atomic-write boundary (~65 536 bytes on Linux), + which caused interleaved bytes and UnicodeDecodeError / ExtraData crashes + in subscribers such as EventReturn under high event-bus load. """ framed_msg = {} if header is None: @@ -31,7 +37,8 @@ def frame_msg_ipc(body, header=None, raw_body=False): # pylint: disable=unused- framed_msg["head"] = header framed_msg["body"] = body - return salt.utils.msgpack.dumps(framed_msg, use_bin_type=True) + payload = salt.utils.msgpack.dumps(framed_msg, use_bin_type=True) + return struct.pack(">I", len(payload)) + payload def _decode_embedded_list(src): diff --git a/salt/transport/ipc.py b/salt/transport/ipc.py index 2b55cc0e7dfd..f0134300de76 100644 --- a/salt/transport/ipc.py +++ b/salt/transport/ipc.py @@ -5,6 +5,7 @@ import errno import logging import socket +import struct import time import warnings @@ -171,18 +172,18 @@ def return_message(msg): else: return _null - unpacker = salt.utils.msgpack.Unpacker(raw=False) while not self._closing and not stream.closed(): try: - wire_bytes = yield stream.read_bytes(4096, partial=True) - unpacker.feed(wire_bytes) - for framed_msg in unpacker: - body = framed_msg["body"] - self.io_loop.spawn_callback( - self.payload_handler, - body, - write_callback(stream, framed_msg["head"]), - ) + length_bytes = yield stream.read_bytes(4) + length = struct.unpack(">I", length_bytes)[0] + payload = yield stream.read_bytes(length) + framed_msg = salt.utils.msgpack.unpackb(payload, raw=False) + body = framed_msg["body"] + self.io_loop.spawn_callback( + self.payload_handler, + body, + write_callback(stream, framed_msg.get("head", {})), + ) except _StreamClosedError: log.trace("Client disconnected from IPC %s", self.socket_path) break @@ -274,7 +275,6 @@ def __init__(self, socket_path, io_loop=None): self.socket_path = socket_path self._closing = False self.stream = None - self.unpacker = salt.utils.msgpack.Unpacker(raw=False) self._connecting_future = None def connected(self): @@ -534,18 +534,43 @@ def start(self): ) self._started = True - @salt.ext.tornado.gen.coroutine def _write(self, stream, pack): + """ + Queue a write to ``stream`` and attach a completion callback to + handle exceptions. + + Note: this is intentionally NOT a Tornado @gen.coroutine. When it + was a coroutine, every published message produced a long-lived + gen.Runner per subscriber stream that waited inside ``yield + stream.write(...)`` until the OS drained the bytes. Under high + event rates (beacons, command returns, flood_events), Runners + piled up faster than the OS could flush, and the + Runner/generator/frame/Future quadruple was the dominant minion + leak. Returning a non-Awaitable lets stream.write enqueue the + bytes in Tornado's own write buffer (which Tornado already + manages efficiently) and the done-callback handles the disconnect + path without spawning a coroutine. + """ + + def _on_done(future, _stream=stream): + try: + future.result() + except StreamClosedError: + log.trace("Client disconnected from IPC %s", self.socket_path) + self.streams.discard(_stream) + except Exception as exc: # pylint: disable=broad-except + log.error("Exception occurred while handling stream: %s", exc) + if not _stream.closed(): + _stream.close() + self.streams.discard(_stream) + try: - yield stream.write(pack) + future = stream.write(pack) except StreamClosedError: - log.trace("Client disconnected from IPC %s", self.socket_path) - self.streams.discard(stream) - except Exception as exc: # pylint: disable=broad-except - log.error("Exception occurred while handling stream: %s", exc) - if not stream.closed(): - stream.close() self.streams.discard(stream) + return + if future is not None: + future.add_done_callback(_on_done) def publish(self, msg): """ @@ -555,8 +580,14 @@ def publish(self, msg): return pack = salt.transport.frame.frame_msg_ipc(msg, raw_body=True) - for stream in self.streams: - self.io_loop.spawn_callback(self._write, stream, pack) + # Iterate a snapshot: ``_write`` may call ``self.streams.discard`` + # synchronously when a stream is already closed at write time, + # which would otherwise raise "Set changed size during iteration". + for stream in tuple(self.streams): + # _write is now a regular function that returns immediately + # after queuing the write into Tornado's IOStream buffer. + # No spawn_callback (and therefore no gen.Runner) is needed. + self._write(stream, pack) def handle_connection(self, connection, address): log.trace("IPCServer: Handling connection to address: %s", address) @@ -646,73 +677,87 @@ class IPCMessageSubscriber(IPCClient): def __init__(self, socket_path, io_loop=None): super().__init__(socket_path, io_loop=io_loop) self._read_stream_future = None - self._saved_data = [] + self._saved_data = [] # retained for API compatibility; no longer populated self._read_in_progress = Lock() self._closing = False @salt.ext.tornado.gen.coroutine def _read(self, timeout, callback=None): + """ + Read framed IPC messages. + + Each message on the wire is: [4-byte big-endian length][msgpack payload]. + We read the length prefix first (applying the caller's timeout there), + then read exactly that many bytes for the payload — eliminating the + streaming-Unpacker approach that was vulnerable to byte interleaving + when large messages exceeded PIPE_BUF on the Unix domain socket. + + When a ``callback`` is provided, this coroutine loops indefinitely, + invoking the callback for every received message until the stream + is closed. Without a callback, it returns the body of the first + message (or None on timeout / closed stream). + """ try: try: yield self._read_in_progress.acquire(timeout=0.00000001) except salt.ext.tornado.gen.TimeoutError: raise salt.ext.tornado.gen.Return(None) - exc_to_raise = None ret = None try: while True: + # Step 1: read the 4-byte length prefix, honouring the timeout. if self._read_stream_future is None: - self._read_stream_future = self.stream.read_bytes( - 4096, partial=True - ) + self._read_stream_future = self.stream.read_bytes(4) if timeout is None: - wire_bytes = yield self._read_stream_future + length_bytes = yield self._read_stream_future else: - wire_bytes = yield FutureWithTimeout( + length_bytes = yield FutureWithTimeout( self.io_loop, self._read_stream_future, timeout ) self._read_stream_future = None - # Remove the timeout once we get some data or an exception - # occurs. We will assume that the rest of the data is already - # there or is coming soon if an exception doesn't occur. + # Remove the timeout once we've received the length prefix + # so the payload read isn't artificially constrained. timeout = None - self.unpacker.feed(wire_bytes) - first_sync_msg = True - for framed_msg in self.unpacker: + # Step 2: read exactly `length` bytes for the msgpack payload. + length = struct.unpack(">I", length_bytes)[0] + payload = yield self.stream.read_bytes(length) + framed_msg = salt.utils.msgpack.unpackb(payload, raw=False) + + if isinstance(framed_msg, dict) and "body" in framed_msg: + body = framed_msg["body"] + else: + log.debug( + "IPC subscriber: malformed frame (type=%s), skipping", + type(framed_msg).__name__, + ) if callback: - self.io_loop.spawn_callback(callback, framed_msg["body"]) - elif first_sync_msg: - ret = framed_msg["body"] - first_sync_msg = False - else: - self._saved_data.append(framed_msg["body"]) - if not first_sync_msg: - # We read at least one piece of data and we're on sync run + continue break + + if callback: + self.io_loop.spawn_callback(callback, body) + continue + ret = body + break except TornadoTimeoutError: - # In the timeout case, just return None. - # Keep 'self._read_stream_future' alive. + # Timed out waiting for the length prefix; keep the pending + # future so the next call can reuse it. ret = None - except StreamClosedError as exc: + except StreamClosedError: log.trace("Subscriber disconnected from IPC %s", self.socket_path) self._read_stream_future = None except Exception as exc: # pylint: disable=broad-except - log.error( + log.debug( "Exception occurred in Subscriber while handling stream: %s", exc ) self._read_stream_future = None - exc_to_raise = exc self._read_in_progress.release() - - if exc_to_raise is not None: - raise exc_to_raise # pylint: disable=E0702 raise salt.ext.tornado.gen.Return(ret) - # Handle ctrl+c gracefully except TypeError: pass diff --git a/salt/transport/zeromq.py b/salt/transport/zeromq.py index 4428526b7750..9cb1873e8d02 100644 --- a/salt/transport/zeromq.py +++ b/salt/transport/zeromq.py @@ -8,6 +8,7 @@ import logging import os import signal +import socket import sys import threading from random import randint @@ -289,10 +290,33 @@ def on_recv(self, callback): :param func callback: A function which should be called when data is received """ + if callback is None: + # Caller wants to clear the callback — pass through directly. + try: + return self.stream.on_recv(None) + except OSError as exc: + if str(exc) == "Stream is closed": + return + raise + + # Wrap the callback so PyZMQ never sees an Awaitable return value. + # Without this, when callback is a @gen.coroutine (e.g. the minion's + # _handle_payload), PyZMQ's _run_callback does + # `asyncio.ensure_future(callback_result)`, creating asyncio.Tasks on + # the asyncio loop which is never driven by Tornado's IOLoop. Those + # Tasks (plus their gen.Runner / Future / WeakRef tracking) accumulate + # indefinitely. Routing through spawn_callback lets Tornado's own + # _run_callback convert the coroutine into a Tornado Future and drive + # it to completion natively, returning None to PyZMQ. + io_loop = self.io_loop + + def _dispatch(*args, **kwargs): + io_loop.spawn_callback(callback, *args, **kwargs) + try: - return self.stream.on_recv(callback) + return self.stream.on_recv(_dispatch) except OSError as exc: - if callback is None and str(exc) == "Stream is closed": + if str(exc) == "Stream is closed": return raise @@ -313,18 +337,44 @@ def zmq_device(self): Multiprocessing target for the zmq queue device """ self.__setup_signals() - context = zmq.Context(self.opts["worker_threads"]) + # The first argument to zmq.Context is ``io_threads`` -- the + # number of background I/O threads libzmq spawns -- not the + # number of MWorker processes. Each libzmq I/O thread keeps + # its own message-buffer pool that grows under sustained + # traffic and is never released, so passing in + # ``opts["worker_threads"]`` (typically 5-10) caused the + # MWorkerQueue process RSS to climb ~7-8 MB/min indefinitely. + # The QUEUE device only proxies two sockets; one I/O thread is + # plenty. + context = zmq.Context(1) # Prepare the zeromq sockets self.uri = "tcp://{interface}:{ret_port}".format(**self.opts) self.clients = context.socket(zmq.ROUTER) - self.clients.setsockopt(zmq.LINGER, -1) + # LINGER=-1 ("never discard") combined with the salt CLI's pattern + # of one-shot connections (connect, send, recv, disconnect) caused + # libzmq to retain undelivered queue slots for every disconnected + # peer indefinitely under sustained CLI churn. A small finite + # LINGER lets libzmq reap those slots. ROUTER_HANDOVER=1 makes + # the router swap a stale peer (same routing-id, new connection) + # instead of blocking on the old one -- relevant for minions that + # reconnect after a brief network blip. TCP_KEEPALIVE forces + # libzmq to notice peers that disappear without sending FIN, so + # their queues are reaped instead of leaking until the OS default + # 2-hour idle timer fires. + self.clients.setsockopt(zmq.LINGER, 1000) + if hasattr(zmq, "ROUTER_HANDOVER"): + self.clients.setsockopt(zmq.ROUTER_HANDOVER, 1) + self.clients.setsockopt(zmq.TCP_KEEPALIVE, 1) + self.clients.setsockopt(zmq.TCP_KEEPALIVE_IDLE, 60) + self.clients.setsockopt(zmq.TCP_KEEPALIVE_INTVL, 15) + self.clients.setsockopt(zmq.TCP_KEEPALIVE_CNT, 3) if self.opts["ipv6"] is True and hasattr(zmq, "IPV4ONLY"): # IPv6 sockets work for both IPv6 and IPv4 addresses self.clients.setsockopt(zmq.IPV4ONLY, 0) self.clients.setsockopt(zmq.BACKLOG, self.opts.get("zmq_backlog", 1000)) self._start_zmq_monitor() self.workers = context.socket(zmq.DEALER) - self.workers.setsockopt(zmq.LINGER, -1) + self.workers.setsockopt(zmq.LINGER, 1000) if self.opts["mworker_queue_niceness"] and not salt.utils.platform.is_windows(): log.info( @@ -441,7 +491,18 @@ def post_fork(self, message_handler, io_loop): os.chmod(os.path.join(self.opts["sock_dir"], "workers.ipc"), 0o600) self.stream = zmq.eventloop.zmqstream.ZMQStream(self._socket, io_loop=io_loop) self.message_handler = message_handler - self.stream.on_recv_stream(self.handle_message) + + def _dispatch_handle_message(stream, payload): + # Drive the coroutine via Tornado's IOLoop rather than returning + # it to PyZMQ's _run_callback. PyZMQ wraps any Awaitable return + # value with asyncio.ensure_future, creating Tasks on the asyncio + # event loop which is never driven in MWorkers — causing permanent + # Task accumulation. Routing through spawn_callback lets Tornado's + # own _run_callback convert it to a Tornado Future and drive it to + # completion without touching asyncio. + io_loop.spawn_callback(self.handle_message, stream, payload) + + self.stream.on_recv_stream(_dispatch_handle_message) @salt.ext.tornado.gen.coroutine def handle_message(self, stream, payload): @@ -550,6 +611,40 @@ def _init_socket(self): if hasattr(zmq, "RECONNECT_IVL_MAX"): self.socket.setsockopt(zmq.RECONNECT_IVL_MAX, 5000) + # Set a stable ZMQ routing identity so the master's ROUTER socket + # reuses an existing slot for this caller (combined with + # ROUTER_HANDOVER=1 on the master) rather than allocating a new + # entry in its per-peer table for every CLI invocation. Without + # this, the master's libzmq peer-id hashtable grows unbounded + # under sustained CLI churn (about 6 MB/min in stress). + # + # Only do this for salt CLI tools (which do NOT set ``__role`` in + # opts). All long-lived daemons -- minion, syndic, master -- + # open multiple AsyncReqMessageClient instances concurrently from + # a single process: the minion at startup for auth + pillar + + # file requests, the syndic when relaying multiple downstream + # minions' returns upstream, and a master when forwarding to + # peer masters. Giving them all the same stable identity would + # cause ROUTER_HANDOVER on the upstream ROUTER to silently drop + # any reply still in flight to the previous REQ as each new one + # arrived, hanging startup and breaking syndic relays. Their + # own REQ churn is bounded anyway (one peer per daemon), so they + # can keep using libzmq's default per-connection random + # routing-ids. + if not self.opts.get("__role"): + role = self.opts.get("id") or "clir" + try: + uid = os.getuid() + except AttributeError: # Windows + uid = 0 + identity = "salt-req/{role}/{host}/{uid}/{slot}".format( + role=role, + host=socket.gethostname(), + uid=uid, + slot=os.getpid() % 256, + ) + self.socket.setsockopt(zmq.IDENTITY, identity.encode("utf-8")) + _set_tcp_keepalive(self.socket, self.opts) if self.addr.startswith("tcp://["): # Hint PF type if bracket enclosed IPv6 address @@ -987,10 +1082,12 @@ def stop(self): except zmq.Error: pass self._socket = None - self._monitor_socket = None if self._monitor_stream is not None: self._monitor_stream.close() self._monitor_stream = None + if self._monitor_socket is not None: + self._monitor_socket.close() + self._monitor_socket = None log.trace("Event monitor done!") @@ -1066,7 +1163,13 @@ def on_recv(packages): exc_info_on_loglevel=logging.DEBUG, ) - pull_sock.on_recv(on_recv) + def _dispatch_on_recv(packages): + # Same fix as in RequestServer: route through Tornado's IOLoop + # instead of returning the coroutine to PyZMQ's _run_callback, + # which would wrap it with asyncio.ensure_future. + ioloop.spawn_callback(on_recv, packages) + + pull_sock.on_recv(_dispatch_on_recv) try: ioloop.start() except (KeyboardInterrupt, SystemExit): diff --git a/salt/utils/args.py b/salt/utils/args.py index f8d4957f5446..7b74c5048757 100644 --- a/salt/utils/args.py +++ b/salt/utils/args.py @@ -223,6 +223,9 @@ def yamlify_arg(arg): return original_arg +_ArgSpec = namedtuple("ArgSpec", "args varargs keywords defaults") + + def get_function_argspec(func, is_class_method=None): """ A small wrapper around inspect.signature that also supports callable objects and wrapped functions @@ -249,7 +252,6 @@ def get_function_argspec(func, is_class_method=None): raise TypeError(f"Cannot inspect argument list for '{func}'") # Build a namedtuple which looks like the result of a Python 2 argspec - _ArgSpec = namedtuple("ArgSpec", "args varargs keywords defaults") args = [] defaults = [] varargs = keywords = None diff --git a/salt/utils/context.py b/salt/utils/context.py index 45776ab4c717..c46e03e7272e 100644 --- a/salt/utils/context.py +++ b/salt/utils/context.py @@ -83,6 +83,19 @@ def active(self): except AttributeError: return False + def destroy(self): + """ + Destroy the ContextDict and clear internal state + """ + if hasattr(self, "_state"): + self._state.data = None + try: + del self._state.data + except AttributeError: + pass + if hasattr(self, "global_data"): + self.global_data.clear() + # TODO: rename? def clone(self, **kwargs): """ diff --git a/salt/utils/ctx.py b/salt/utils/ctx.py index a9c0931bd815..66a54aed8d4e 100644 --- a/salt/utils/ctx.py +++ b/salt/utils/ctx.py @@ -43,6 +43,12 @@ def __enter__(self): def __exit__(self, *exc): self.__class__._state.current_request = self._prev_request del self._prev_request + if self.__class__._state.current_request == {}: + # If we're back to an empty dict, explicitly clear to help GC + try: + del self.__class__._state.current_request + except AttributeError: + pass return False def __call__(self): diff --git a/salt/utils/event.py b/salt/utils/event.py index bf8f5a1f5e93..6cfecd9e4e84 100644 --- a/salt/utils/event.py +++ b/salt/utils/event.py @@ -245,6 +245,12 @@ def _after_fork_in_child(cls): except Exception: # pylint: disable=broad-except pass + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.destroy() + def __init__( self, node, @@ -618,6 +624,15 @@ def _get_event(self, wait, tag, match_func=None, no_block=False): return None except RuntimeError: return None + except salt.exceptions.SaltDeserializationError: + # Malformed msgpack frame — can occur under extreme event bus + # load when multiple events are concatenated in the IPC buffer + # and msgpack reports ExtraData or a UTF-8 decode failure. + # Skip this frame rather than crashing the subscriber. + log.debug( + "Event subscriber: skipping malformed event (deserialization error)" + ) + continue if not match_func(ret["tag"], tag) or not self._subproxy_match(ret["data"]): # tag not match @@ -989,23 +1004,6 @@ def set_event_handler(self, event_handler): # This will handle reconnects return self.subscriber.read_async(event_handler) - # pylint: disable=W1701 - def __del__(self): - # skip exceptions in destroy-- since destroy() doesn't cover interpreter - # shutdown-- where globals start going missing - try: - self.destroy() - except Exception: # pylint: disable=broad-except - pass - - # pylint: enable=W1701 - - def __enter__(self): - return self - - def __exit__(self, *args): - self.destroy() - class MasterEvent(SaltEvent): """ @@ -1336,6 +1334,28 @@ def __init__(self, opts, **kwargs): local_minion_opts = self.opts.copy() local_minion_opts["file_client"] = "local" self.minion = salt.minion.MasterMinion(local_minion_opts) + # Validate all configured returners exist at startup so operators get + # a clear error immediately rather than thousands of per-event errors. + configured = self.opts["event_return"] + if not isinstance(configured, list): + configured = [configured] + missing = [ + r for r in configured if f"{r}.event_return" not in self.minion.returners + ] + if missing: + log.error( + "EventReturn: the following configured event_return returner(s) " + "were not found and events will NOT be stored: %s. " + "Check that the returner modules are installed and the " + "returner_dirs configuration is correct.", + missing, + ) + self._missing_returners = set(missing) + # Track last warning time per returner to rate-limit log spam. + # With event_return_queue=0 every event flushes independently, so + # a per-flush-cycle set would still log once per event. Use wall + # time instead: only warn once every 60 seconds per returner. + self._warned_returners = {} # returner_name -> last_warn_time self.event_queue = [] self.stop = False @@ -1380,10 +1400,16 @@ def _flush_event_single(self, event_return): "Event data that caused an exception: %s", self.event_queue ) else: - log.error( - "Could not store return for event(s) - returner '%s' not found.", - event_return, - ) + # Rate-limit to one error per returner per 60 s to prevent log + # spam at high event rates (e.g. event_return_queue=0 flushes + # on every single event). + now = time.time() + if now - self._warned_returners.get(event_return, 0) >= 60: + log.error( + "Could not store return for event(s) - returner '%s' not found.", + event_return, + ) + self._warned_returners[event_return] = now def run(self): """ diff --git a/salt/utils/job.py b/salt/utils/job.py index 66b0568887b6..786803a9280f 100644 --- a/salt/utils/job.py +++ b/salt/utils/job.py @@ -25,8 +25,13 @@ def store_job(opts, load, event=None, mminion=None): if not salt.utils.verify.valid_id(opts, load["id"]): return False if mminion is None: - mminion = salt.minion.MasterMinion(opts, states=False, rend=False) + with salt.minion.MasterMinion(opts, states=False, rend=False) as mminion: + return _store_job(opts, load, event, mminion, endtime=endtime) + else: + return _store_job(opts, load, event, mminion, endtime=endtime) + +def _store_job(opts, load, event, mminion, endtime=None): job_cache = opts["master_job_cache"] if load["jid"] == "req": # The minion is returning a standalone job, request a jobid @@ -158,7 +163,13 @@ def store_minions(opts, jid, minions, mminion=None, syndic_id=None): master_job_cache """ if mminion is None: - mminion = salt.minion.MasterMinion(opts, states=False, rend=False) + with salt.minion.MasterMinion(opts, states=False, rend=False) as mminion: + return _store_minions(opts, jid, minions, mminion, syndic_id) + else: + return _store_minions(opts, jid, minions, mminion, syndic_id) + + +def _store_minions(opts, jid, minions, mminion, syndic_id=None): job_cache = opts["master_job_cache"] minions_fstr = f"{job_cache}.save_minions" diff --git a/salt/utils/minions.py b/salt/utils/minions.py index d11eabb391a7..17de7441f4c4 100644 --- a/salt/utils/minions.py +++ b/salt/utils/minions.py @@ -741,6 +741,7 @@ def check_minions( if ssh_minions: _res["minions"].extend(ssh_minions) _res["ssh_minions"] = True + roster.destroy() except Exception: # pylint: disable=broad-except log.exception( "Failed matching available minions with %s pattern: %s", tgt_type, expr diff --git a/salt/utils/process.py b/salt/utils/process.py index 371d8d2c8c83..72821052c5c7 100644 --- a/salt/utils/process.py +++ b/salt/utils/process.py @@ -531,12 +531,14 @@ def add_process(self, tgt, args=None, kwargs=None, name=None): kwargs = {} if inspect.isclass(tgt) and issubclass(tgt, multiprocessing.Process): - kwargs["name"] = name or tgt.__qualname__ + if name is None: + name = getattr(tgt, "__qualname__", str(tgt)) + kwargs["name"] = name process = tgt(*args, **kwargs) else: - process = Process( - target=tgt, args=args, kwargs=kwargs, name=name or tgt.__qualname__ - ) + if name is None: + name = getattr(tgt, "__qualname__", str(tgt)) + process = Process(target=tgt, args=args, kwargs=kwargs, name=name) process.register_finalize_method(cleanup_finalize_process, args, kwargs) diff --git a/salt/utils/timed_subprocess.py b/salt/utils/timed_subprocess.py index 13e7d67c2304..a2c2c617c297 100644 --- a/salt/utils/timed_subprocess.py +++ b/salt/utils/timed_subprocess.py @@ -101,12 +101,7 @@ def receive(): if rt.is_alive(): # Subprocess cleanup (best effort) self.process.kill() - - def terminate(): - if rt.is_alive(): - self.process.terminate() - - threading.Timer(10, terminate).start() + self.process.wait() raise salt.exceptions.TimedProcTimeoutError( "{} : Timed out after {} seconds".format( self.command, diff --git a/salt/wheel/__init__.py b/salt/wheel/__init__.py index 15a679439aa3..b861ec871df8 100644 --- a/salt/wheel/__init__.py +++ b/salt/wheel/__init__.py @@ -40,9 +40,32 @@ class WheelClient( tag_prefix = "wheel" def __init__(self, opts, context=None): - super().__init__(opts, context=context) + salt.client.mixins.SyncClientMixin.__init__(self, opts, context=context) + salt.client.mixins.AsyncClientMixin.__init__(self, opts, context=context) + self.opts = opts + self.context = context or {} + self.event = None + self.salt_user = salt.utils.user.get_specific_user() + self.event = salt.utils.event.get_event( + "master", self.opts["sock_dir"], opts=self.opts, listen=False + ) self.functions = salt.loader.wheels(opts, context=self.context) + def destroy(self): + if self.event is not None: + self.event.destroy() + self.event = None + if hasattr(self, "functions") and self.functions is not None: + if hasattr(self.functions, "destroy"): + self.functions.destroy() + self.functions = {} + + def __enter__(self): + return self + + def __exit__(self, *args): + self.destroy() + # TODO: remove/deprecate def call_func(self, fun, **kwargs): """ diff --git a/tests/monitoring/.gitignore b/tests/monitoring/.gitignore new file mode 100644 index 000000000000..fe865cdbfdbc --- /dev/null +++ b/tests/monitoring/.gitignore @@ -0,0 +1,3 @@ +pki/ +ids/ +event_log.txt diff --git a/tests/monitoring/Dockerfile.salt b/tests/monitoring/Dockerfile.salt new file mode 100644 index 000000000000..f2e08b3389a8 --- /dev/null +++ b/tests/monitoring/Dockerfile.salt @@ -0,0 +1,65 @@ +FROM python:3.10-slim + +RUN apt-get update && apt-get install -y \ + build-essential \ + libssl-dev \ + libffi-dev \ + python3-dev \ + procps \ + curl \ + wget \ + libzmq3-dev \ + tini \ + gdb \ + zlib1g-dev \ + libbz2-dev \ + liblzma-dev \ + libsqlite3-dev \ + libreadline-dev \ + libncurses-dev \ + && rm -rf /var/lib/apt/lists/* + +ARG PYTHON_VERSION=3.10.20 +RUN cd /tmp && \ + wget -q https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tar.xz && \ + tar xf Python-${PYTHON_VERSION}.tar.xz && \ + cd Python-${PYTHON_VERSION} && \ + ./configure \ + --enable-shared \ + --prefix=/usr/local \ + --with-ensurepip=install \ + CFLAGS="-g -O2 -fno-omit-frame-pointer" && \ + make -j"$(nproc)" && \ + make install && \ + ldconfig && \ + cd / && rm -rf /tmp/Python-${PYTHON_VERSION}* + +RUN pip install --no-cache-dir memray + +WORKDIR /app + +# Install Salt dependencies +# We copy everything needed for pip install -e . +COPY requirements/ /app/requirements/ +COPY setup.py /app/ +COPY pyproject.toml /app/ +COPY MANIFEST.in /app/ +COPY README.rst /app/ +COPY AUTHORS /app/ +COPY LICENSE /app/ +COPY NOTICE /app/ +COPY salt/ /app/salt/ +COPY tools/ /app/tools/ +COPY scripts/ /app/scripts/ + +RUN pip install --no-cache-dir -r requirements/base.txt -r requirements/zeromq.txt +RUN pip install --no-cache-dir -e . + +# Extra tools for monitoring and salt-api +RUN pip install --no-cache-dir psutil CherryPy + +# Create salt user for API testing +RUN useradd -m -s /bin/bash salt && echo "salt:salt" | chpasswd +RUN usermod -aG shadow salt + +ENTRYPOINT ["/usr/bin/tini", "--", "/usr/local/bin/salt-master"] diff --git a/tests/monitoring/README.md b/tests/monitoring/README.md new file mode 100644 index 000000000000..9a66438554da --- /dev/null +++ b/tests/monitoring/README.md @@ -0,0 +1,56 @@ +# Salt Monitoring Environment + +This environment sets up a Salt Master, two Minions, Prometheus, and cAdvisor for monitoring. + +## Prerequisite + +- Docker and Docker Compose (or Podman and podman-compose) + +> **Note for Podman users:** If running in rootless mode, cAdvisor might require additional configuration to access host metrics. You may need to run Podman as root for full cAdvisor functionality, or use `podman stats` as an alternative. + +## Usage + +1. Start the environment: + ```bash + docker-compose up -d + ``` + +2. Access the Salt Master: + ```bash + docker exec -it salt-master bash + ``` + +3. Run a test command: + ```bash + salt '*' test.ping + ``` + +4. Access Prometheus: + Go to `http://localhost:9090` + +5. Access cAdvisor: + Go to `http://localhost:18081` + +6. Access Grafana: + Go to `http://localhost:13000` + The "Salt Monitoring" dashboard is pre-provisioned. + +## Monitoring for Memory Leaks + +In Prometheus, you can use the following query to monitor memory usage of the salt-master container: + +```promql +container_memory_usage_bytes{container_label_com_docker_compose_service="salt-master"} +``` + +Or more specifically for RSS: +```promql +container_memory_rss{container_label_com_docker_compose_service="salt-master"} +``` + +## Configuration + +- `master.conf`: Salt Master configuration +- `minion.conf`: Salt Minion configuration (shared by both minions) +- `prometheus.yml`: Prometheus configuration +- `Dockerfile.salt`: Dockerfile for Salt components diff --git a/tests/monitoring/analyze_stats.py b/tests/monitoring/analyze_stats.py new file mode 100644 index 000000000000..09a6e6608b96 --- /dev/null +++ b/tests/monitoring/analyze_stats.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +import json +import sys +import urllib.parse +import urllib.request + +PROM_URL = "http://localhost:19090" + + +def query_prom(query): + params = urllib.parse.urlencode({"query": query}) + url = f"{PROM_URL}/api/v1/query?{params}" + with urllib.request.urlopen(url) as response: + return json.loads(response.read().decode()) + + +def get_linear_slope(metric_name, duration="30m"): + # Returns the slope (rate of change per second) over the duration + query = f"deriv({metric_name}[{duration}])" + data = query_prom(query) + try: + return float(data["data"]["result"][0]["value"][1]) + except (IndexError, KeyError, ValueError): + return 0.0 + + +def main(): + print("--- Salt Stress Test Analysis ---") + + # 1. Check for zombie processes (process count growth) + master_procs_slope = get_linear_slope("salt_master_process_count") + api_procs_slope = get_linear_slope("salt_api_process_count") + + # 2. Check for memory leaks + master_rss_slope = get_linear_slope("salt_master_rss_bytes") + api_rss_slope = get_linear_slope("salt_api_rss_bytes") + + # 3. Check for FD leaks + master_fds_slope = get_linear_slope("salt_master_open_fds") + api_fds_slope = get_linear_slope("salt_api_open_fds") + + failed = False + + print(f"Master RSS Slope: {master_rss_slope:.2f} bytes/sec") + print(f"API RSS Slope: {api_rss_slope:.2f} bytes/sec") + print(f"Master FD Slope: {master_fds_slope:.6f} fds/sec") + print(f"Master Proc Slope: {master_procs_slope:.6f} procs/sec") + + # Thresholds + # Memory: > 10KB/sec sustained over 30m might indicate a real leak + if master_rss_slope > 10240: + print("FAIL: Master memory leak detected!") + failed = True + + if master_procs_slope > 0.001: # Sustained growth in process count + print("FAIL: Master process/zombie leak detected!") + failed = True + + if master_fds_slope > 0.01: # Sustained growth in FDs + print("FAIL: Master file descriptor leak detected!") + failed = True + + if failed: + sys.exit(1) + + print("SUCCESS: No significant resource leaks detected.") + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/tests/monitoring/docker-compose.yml b/tests/monitoring/docker-compose.yml new file mode 100644 index 000000000000..d610cf2317fb --- /dev/null +++ b/tests/monitoring/docker-compose.yml @@ -0,0 +1,120 @@ +services: + salt-master: + build: + context: ../.. + dockerfile: tests/monitoring/Dockerfile.salt + container_name: salt-master + entrypoint: ["/usr/bin/tini", "--"] + command: ["sh", "-c", "salt-master -d && salt-api"] + volumes: + - ../../salt:/app/salt + - ./master.conf:/etc/salt/master + - ./minion.conf:/etc/salt/minion + - ./srv/salt:/srv/salt + - /home/dan/src/mops/salt/saltstack-raas-master:/app/saltstack-raas-master + - ./raas.conf:/etc/salt/master.d/raas.conf + - /etc/localtime:/etc/localtime:ro + - /etc/timezone:/etc/timezone:ro + - ./pki/master:/etc/salt/pki + - ./ids/master_id:/etc/salt/minion_id + ports: + - "44505:44505" + - "44506:44506" + - "18000:8000" + networks: + salt-net: + aliases: + - salt + + salt-minion-1: + build: + context: ../.. + dockerfile: tests/monitoring/Dockerfile.salt + container_name: salt-minion-1 + hostname: salt-minion-1 + entrypoint: ["/usr/local/bin/salt-minion"] + volumes: + - ../../salt:/app/salt + - ./minion.conf:/etc/salt/minion + - ./pki/minion-1:/etc/salt/pki + - ./ids/minion-1_id:/etc/salt/minion_id + depends_on: + - salt-master + networks: + - salt-net + + salt-minion-2: + build: + context: ../.. + dockerfile: tests/monitoring/Dockerfile.salt + container_name: salt-minion-2 + hostname: salt-minion-2 + entrypoint: ["/usr/local/bin/salt-minion"] + volumes: + - ../../salt:/app/salt + - ./minion.conf:/etc/salt/minion + - ./pki/minion-2:/etc/salt/pki + - ./ids/minion-2_id:/etc/salt/minion_id + depends_on: + - salt-master + networks: + - salt-net + + salt-minion-3: + build: + context: ../.. + dockerfile: tests/monitoring/Dockerfile.salt + container_name: salt-minion-3 + hostname: salt-minion-3 + entrypoint: ["/usr/local/bin/salt-minion"] + volumes: + - ../../salt:/app/salt + - ./minion.conf:/etc/salt/minion + - ./pki/minion-3:/etc/salt/pki + - ./ids/minion-3_id:/etc/salt/minion_id + depends_on: + - salt-master + networks: + - salt-net + + prometheus: + image: prom/prometheus:latest + container_name: prometheus + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + - ./prometheus_data:/prometheus + ports: + - "19090:9090" + networks: + - salt-net + + cadvisor: + image: gcr.io/cadvisor/cadvisor:latest + container_name: cadvisor + privileged: true + ports: + - "18081:8080" + volumes: + - /:/rootfs:ro + - /var/run:/var/run:rw + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro + networks: + - salt-net + + grafana: + image: grafana/grafana:latest + container_name: grafana + ports: + - "13000:3000" + volumes: + - ./grafana/provisioning:/etc/grafana/provisioning + environment: + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin + networks: + - salt-net + +networks: + salt-net: + driver: bridge diff --git a/tests/monitoring/grafana/provisioning/dashboards/dashboard_provider.yml b/tests/monitoring/grafana/provisioning/dashboards/dashboard_provider.yml new file mode 100644 index 000000000000..cbc3acf7d644 --- /dev/null +++ b/tests/monitoring/grafana/provisioning/dashboards/dashboard_provider.yml @@ -0,0 +1,10 @@ +apiVersion: 1 +providers: + - name: 'Default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + editable: true + options: + path: /etc/grafana/provisioning/dashboards diff --git a/tests/monitoring/grafana/provisioning/dashboards/salt_monitoring.json b/tests/monitoring/grafana/provisioning/dashboards/salt_monitoring.json new file mode 100644 index 000000000000..929d844b8aec --- /dev/null +++ b/tests/monitoring/grafana/provisioning/dashboards/salt_monitoring.json @@ -0,0 +1,628 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 1, + "links": [], + "liveNow": false, + "panels": [ + { + "id": 1, + "gridPos": { + "h": 3, + "w": 4, + "x": 0, + "y": 0 + }, + "title": "Current Time", + "type": "stat", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "dateTimeAsLocal", + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value", + "wideLayout": true + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "time() * 1000", + "instant": true, + "legendFormat": "", + "refId": "A" + } + ] + }, + { + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 3 + }, + "id": 100, + "title": "Salt Master", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 4 + }, + "id": 10, + "targets": [ + { + "expr": "salt_master_rss_bytes", + "legendFormat": "Master Process RSS", + "refId": "A" + }, + { + "expr": "container_memory_rss{container_label_com_docker_compose_project=\"monitoring\",container_label_com_docker_compose_service=\"salt-master\"}", + "legendFormat": "Total Container RSS", + "refId": "B" + } + ], + "title": "Master Memory RSS (Process vs Container)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 4 + }, + "id": 11, + "targets": [ + { + "expr": "rate(container_cpu_usage_seconds_total{cpu=\"total\",container_label_com_docker_compose_project=\"monitoring\",container_label_com_docker_compose_service=\"salt-master\"}[1m])", + "legendFormat": "Master CPU", + "refId": "A" + } + ], + "title": "Master CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 4 + }, + "id": 12, + "targets": [ + { + "expr": "salt_master_open_fds", + "legendFormat": "Total Open FDs", + "refId": "A" + }, + { + "expr": "salt_master_process_count", + "legendFormat": "Process Count", + "refId": "B" + } + ], + "title": "Master Resource Usage (FDs & Processes)", + "type": "timeseries" + }, + { + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 11 + }, + "id": 101, + "title": "Minion 1", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 12 + }, + "id": 20, + "targets": [ + { + "expr": "container_memory_rss{container_label_com_docker_compose_project=\"monitoring\",container_label_com_docker_compose_service=\"salt-minion-1\"}", + "legendFormat": "Minion 1 RSS", + "refId": "A" + } + ], + "title": "Minion 1 Memory RSS", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 12 + }, + "id": 21, + "targets": [ + { + "expr": "rate(container_cpu_usage_seconds_total{cpu=\"total\",container_label_com_docker_compose_project=\"monitoring\",container_label_com_docker_compose_service=\"salt-minion-1\"}[1m])", + "legendFormat": "Minion 1 CPU", + "refId": "A" + } + ], + "title": "Minion 1 CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 12 + }, + "id": 22, + "targets": [ + { + "expr": "sum(container_fs_inodes_total{container_label_com_docker_compose_project=\"monitoring\",container_label_com_docker_compose_service=\"salt-minion-1\"}) by (name) - sum(container_fs_inodes_free{container_label_com_docker_compose_project=\"monitoring\",container_label_com_docker_compose_service=\"salt-minion-1\"}) by (name)", + "legendFormat": "Minion 1 Inodes", + "refId": "A" + } + ], + "title": "Minion Inodes (Disk Files)", + "type": "timeseries" + }, + { + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 19 + }, + "id": 102, + "title": "Minion 2", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 20 + }, + "id": 30, + "targets": [ + { + "expr": "container_memory_rss{container_label_com_docker_compose_project=\"monitoring\",container_label_com_docker_compose_service=\"salt-minion-2\"}", + "legendFormat": "Minion 2 RSS", + "refId": "A" + } + ], + "title": "Minion 2 Memory RSS", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 20 + }, + "id": 31, + "targets": [ + { + "expr": "rate(container_cpu_usage_seconds_total{cpu=\"total\",container_label_com_docker_compose_project=\"monitoring\",container_label_com_docker_compose_service=\"salt-minion-2\"}[1m])", + "legendFormat": "Minion 2 CPU", + "refId": "A" + } + ], + "title": "Minion 2 CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 20 + }, + "id": 32, + "targets": [ + { + "expr": "sum(container_fs_inodes_total{container_label_com_docker_compose_project=\"monitoring\",container_label_com_docker_compose_service=\"salt-minion-2\"}) by (name) - sum(container_fs_inodes_free{container_label_com_docker_compose_project=\"monitoring\",container_label_com_docker_compose_service=\"salt-minion-2\"}) by (name)", + "legendFormat": "Minion 2 Inodes", + "refId": "A" + } + ], + "title": "Minion Inodes (Disk Files)", + "type": "timeseries" + }, + { + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 27 + }, + "id": 103, + "title": "Minion 3", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 28 + }, + "id": 40, + "targets": [ + { + "expr": "container_memory_rss{container_label_com_docker_compose_project=\"monitoring\",container_label_com_docker_compose_service=\"salt-minion-3\"}", + "legendFormat": "Minion 3 RSS", + "refId": "A" + } + ], + "title": "Minion 3 Memory RSS", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 28 + }, + "id": 41, + "targets": [ + { + "expr": "rate(container_cpu_usage_seconds_total{cpu=\"total\",container_label_com_docker_compose_project=\"monitoring\",container_label_com_docker_compose_service=\"salt-minion-3\"}[1m])", + "legendFormat": "Minion 3 CPU", + "refId": "A" + } + ], + "title": "Minion 3 CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 28 + }, + "id": 42, + "targets": [ + { + "expr": "sum(container_fs_inodes_total{container_label_com_docker_compose_project=\"monitoring\",container_label_com_docker_compose_service=\"salt-minion-3\"}) by (name) - sum(container_fs_inodes_free{container_label_com_docker_compose_project=\"monitoring\",container_label_com_docker_compose_service=\"salt-minion-3\"}) by (name)", + "legendFormat": "Minion 3 Inodes", + "refId": "A" + } + ], + "title": "Minion 3 Inodes (Disk Files)", + "type": "timeseries" + }, + { + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 104, + "title": "Salt API", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 36 + }, + "id": 50, + "targets": [ + { + "expr": "salt_api_rss_bytes", + "legendFormat": "API Process RSS", + "refId": "A" + } + ], + "title": "API Process Memory RSS", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 36 + }, + "id": 51, + "targets": [ + { + "expr": "rate(container_cpu_usage_seconds_total{cpu=\"total\",container_label_com_docker_compose_project=\"monitoring\",container_label_com_docker_compose_service=\"salt-master\"}[1m])", + "legendFormat": "API CPU", + "refId": "A" + } + ], + "title": "API CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 36 + }, + "id": 52, + "targets": [ + { + "expr": "salt_api_open_fds", + "legendFormat": "Total Open FDs", + "refId": "A" + }, + { + "expr": "salt_api_process_count", + "legendFormat": "Process Count", + "refId": "B" + } + ], + "title": "API Resource Usage (FDs & Processes)", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 36, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Salt Monitoring", + "uid": "salt-mon", + "version": 3, + "weekStart": "" +} diff --git a/tests/monitoring/grafana/provisioning/datasources/prometheus.yml b/tests/monitoring/grafana/provisioning/datasources/prometheus.yml new file mode 100644 index 000000000000..0eddf26296da --- /dev/null +++ b/tests/monitoring/grafana/provisioning/datasources/prometheus.yml @@ -0,0 +1,7 @@ +apiVersion: 1 +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true diff --git a/tests/monitoring/master.conf b/tests/monitoring/master.conf new file mode 100644 index 000000000000..3a53bd7ceeb5 --- /dev/null +++ b/tests/monitoring/master.conf @@ -0,0 +1,31 @@ +interface: 0.0.0.0 +publish_port: 44505 +ret_port: 44506 +open_mode: True +auto_accept: True +log_level: debug +master: salt +master_port: 44506 +file_roots: + base: + - /srv/salt +worker_threads: 10 +worker_resource_backcount: 50 +ipc_write_buffer: 104857600 + +rest_cherrypy: + port: 8000 + disable_ssl: True + +netapi_enable_clients: + - local + - runner + - wheel + +external_auth: + pam: + salt: + - .* + - '@runner' + - '@wheel' +id: salt-master diff --git a/tests/monitoring/minion.conf b/tests/monitoring/minion.conf new file mode 100644 index 000000000000..ea32c796c1e4 --- /dev/null +++ b/tests/monitoring/minion.conf @@ -0,0 +1,5 @@ +master: salt-master +master_port: 44506 +log_level: warning +ipc_write_buffer: 104857600 +# id will be set via /etc/salt/minion_id or command line diff --git a/tests/monitoring/prometheus.yml b/tests/monitoring/prometheus.yml new file mode 100644 index 000000000000..c3861b42022f --- /dev/null +++ b/tests/monitoring/prometheus.yml @@ -0,0 +1,15 @@ +global: + scrape_interval: 15s + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + - job_name: 'cadvisor' + static_configs: + - targets: ['cadvisor:8080'] + + - job_name: 'salt-fds' + static_configs: + - targets: ['salt-master:8002'] diff --git a/tests/monitoring/raas.conf b/tests/monitoring/raas.conf new file mode 100644 index 000000000000..8408892e872d --- /dev/null +++ b/tests/monitoring/raas.conf @@ -0,0 +1,41 @@ +# RaaS Configuration +sseapi_server: http://192.168.80.1:18080 +sseapi_username: root +sseapi_password: salt + +# Plugin External Modules Path(s) +beacons_dirs: + - /app/saltstack-raas-master/sseape/beacons +engines_dirs: + - /app/saltstack-raas-master/sseape/engines +fileserver_dirs: + - /app/saltstack-raas-master/sseape/fileserver +pillar_dirs: + - /app/saltstack-raas-master/sseape/pillar +returner_dirs: + - /app/saltstack-raas-master/sseape/returners +roster_dirs: + - /app/saltstack-raas-master/sseape/roster +runner_dirs: + - /app/saltstack-raas-master/sseape/runners +module_dirs: + - /app/saltstack-raas-master/sseape/modules +states_dirs: + - /app/saltstack-raas-master/sseape/states + +# Enable minimal SSE engines +engines: + - sseapi: {} + +# Enable SSE master job cache and event returner +master_job_cache: sseapi +event_return: sseapi + +# Enable SSE external pillar +ext_pillar: + - sseapi: {} + +# Enable SSE fileserver backend +fileserver_backend: + - sseapi + - roots diff --git a/tests/monitoring/srv/salt/_grains/test_grain.py b/tests/monitoring/srv/salt/_grains/test_grain.py new file mode 100644 index 000000000000..77478477977d --- /dev/null +++ b/tests/monitoring/srv/salt/_grains/test_grain.py @@ -0,0 +1,5 @@ +import time + + +def my_time(): + return {"current_time": time.time()} diff --git a/tests/monitoring/srv/salt/fd_exporter.py b/tests/monitoring/srv/salt/fd_exporter.py new file mode 100644 index 000000000000..26ffd15cfcb6 --- /dev/null +++ b/tests/monitoring/srv/salt/fd_exporter.py @@ -0,0 +1,105 @@ +# pylint: disable=resource-leakage +import http.server +import os + + +class FDHandler(http.server.BaseHTTPRequestHandler): + def log_message(self, format, *args): + # Silence logs + return + + def do_GET(self): + if self.path == "/metrics": + self.send_response(200) + self.send_header("Content-Type", "text/plain") + self.end_headers() + + master_fds = 0 + master_procs = 0 + master_rss = 0 + api_fds = 0 + api_procs = 0 + api_rss = 0 + + try: + # Iterate over /proc directly once for efficiency + for pid_dir in os.listdir("/proc"): + if not pid_dir.isdigit(): + continue + + try: + pid = pid_dir + with open(f"/proc/{pid}/cmdline", "rb") as f: + cmdline = ( + f.read().replace(b"\0", b" ").decode(errors="ignore") + ) + + # Skip if it's the exporter itself + if "fd_exporter.py" in cmdline: + continue + + is_api = "salt-api" in cmdline + is_master = "salt-master" in cmdline and not is_api + + if is_master or is_api: + # FD count + try: + fd_count = len(os.listdir(f"/proc/{pid}/fd")) + except (OSError, PermissionError): + fd_count = 0 + + # RSS Memory (from /proc/[pid]/stat, field 24 is RSS in pages) + try: + with open(f"/proc/{pid}/stat", encoding="utf-8") as f: + stat = f.read().split() + rss_pages = int(stat[23]) + rss_bytes = rss_pages * 4096 # Assuming 4KB pages + except (OSError, ValueError, IndexError): + rss_bytes = 0 + + if is_master: + master_fds += fd_count + master_procs += 1 + master_rss += rss_bytes + if is_api: + api_fds += fd_count + api_procs += 1 + api_rss += rss_bytes + except (FileNotFoundError, ProcessLookupError, PermissionError): + # Process died while we were reading it + continue + except OSError: + continue + except OSError: + pass + + lines = [ + "# HELP salt_master_open_fds Number of open file descriptors for master", + "# TYPE salt_master_open_fds gauge", + f"salt_master_open_fds {master_fds}", + "# HELP salt_master_process_count Number of master processes", + "# TYPE salt_master_process_count gauge", + f"salt_master_process_count {master_procs}", + "# HELP salt_master_rss_bytes RSS memory usage for master in bytes", + "# TYPE salt_master_rss_bytes gauge", + f"salt_master_rss_bytes {master_rss}", + "# HELP salt_api_open_fds Number of open file descriptors for salt-api", + "# TYPE salt_api_open_fds gauge", + f"salt_api_open_fds {api_fds}", + "# HELP salt_api_process_count Number of salt-api processes", + "# TYPE salt_api_process_count gauge", + f"salt_api_process_count {api_procs}", + "# HELP salt_api_rss_bytes RSS memory usage for salt-api in bytes", + "# TYPE salt_api_rss_bytes gauge", + f"salt_api_rss_bytes {api_rss}", + ] + self.wfile.write(("\n".join(lines) + "\n").encode()) + else: + self.send_response(404) + self.end_headers() + + +if __name__ == "__main__": + port = 8002 + print(f"Starting FD and Memory Exporter on port {port}...") + http.server.HTTPServer(("0.0.0.0", port), FDHandler).serve_forever() diff --git a/tests/monitoring/srv/salt/flood_events.py b/tests/monitoring/srv/salt/flood_events.py new file mode 100644 index 000000000000..1dfba6f8f27e --- /dev/null +++ b/tests/monitoring/srv/salt/flood_events.py @@ -0,0 +1,24 @@ +import os +import time + +import salt.config +import salt.utils.event + +# Load master config +opts = salt.config.client_config("/etc/salt/master") +event = salt.utils.event.get_event("master", opts=opts, listen=False) + +print(f"Starting event flood from PID {os.getpid()}...") +try: + count = 0 + while True: + # Fire events with a 1KB payload + event.fire_event( + {"count": count, "payload": "f" * 1024, "timestamp": time.time()}, + "stress/test/flood", + ) + count += 1 + if count % 1000 == 0: + print(f"Fired {count} events...") +except KeyboardInterrupt: + print("Stopped.") diff --git a/tests/monitoring/srv/salt/haproxy.cfg.jinja b/tests/monitoring/srv/salt/haproxy.cfg.jinja new file mode 100644 index 000000000000..77e499df9687 --- /dev/null +++ b/tests/monitoring/srv/salt/haproxy.cfg.jinja @@ -0,0 +1,27 @@ +global + log /dev/log local0 + log /dev/log local1 notice + chroot /var/lib/haproxy + stats socket /run/haproxy/admin.sock mode 660 level admin expose-fd listeners + stats timeout 30s + user haproxy + group haproxy + daemon + +defaults + log global + mode http + option httplog + option dontlognull + timeout connect 5000 + timeout client 50000 + timeout server 50000 + +frontend localnodes + bind *:80 + default_backend web-backend + +backend web-backend + balance roundrobin + server web1 salt-minion-2:80 check + server web2 salt-minion-3:80 check diff --git a/tests/monitoring/srv/salt/heavy/cmd.sls b/tests/monitoring/srv/salt/heavy/cmd.sls new file mode 100644 index 000000000000..c31101841c56 --- /dev/null +++ b/tests/monitoring/srv/salt/heavy/cmd.sls @@ -0,0 +1,10 @@ +run_noisy_command: + cmd.run: + - shell: /bin/bash + - name: | + for i in {1..50}; do + echo "Batch $i output start" + ps aux + ls -R /etc + echo "Batch $i output end" + done diff --git a/tests/monitoring/srv/salt/heavy/heavy_template.jinja b/tests/monitoring/srv/salt/heavy/heavy_template.jinja new file mode 100644 index 000000000000..81d1ac10a95b --- /dev/null +++ b/tests/monitoring/srv/salt/heavy/heavy_template.jinja @@ -0,0 +1,7 @@ +# Heavy Jinja Template +{% for i in range(iterations) %} +## Block {{ i }} +{% for j in range(sub_iterations) %} +Item {{ i }}.{{ j }}: {{ "abcdefghijklmnopqrstuvwxyz" | reverse }} - {{ (i * j) | string | md5 }} +{% endfor %} +{% endfor %} diff --git a/tests/monitoring/srv/salt/heavy/jinja.sls b/tests/monitoring/srv/salt/heavy/jinja.sls new file mode 100644 index 000000000000..ce27d66f30f2 --- /dev/null +++ b/tests/monitoring/srv/salt/heavy/jinja.sls @@ -0,0 +1,8 @@ +generate_heavy_file: + file.managed: + - name: /tmp/heavy_jinja_output + - source: salt://heavy/heavy_template.jinja + - template: jinja + - context: + iterations: 500 + sub_iterations: 100 diff --git a/tests/monitoring/srv/salt/heavy/many_files.sls b/tests/monitoring/srv/salt/heavy/many_files.sls new file mode 100644 index 000000000000..6484c21a8fcf --- /dev/null +++ b/tests/monitoring/srv/salt/heavy/many_files.sls @@ -0,0 +1,6 @@ +{% for i in range(100) %} +/tmp/stress_file_{{ i }}: + file.managed: + - contents: "Stress test file content for index {{ i }}. This is repeated many times to increase state size. {{ 'A' * 100 }}" + - makedirs: True +{% endfor %} diff --git a/tests/monitoring/srv/salt/heavy/software_install.sls b/tests/monitoring/srv/salt/heavy/software_install.sls new file mode 100644 index 000000000000..1d0860cb1a10 --- /dev/null +++ b/tests/monitoring/srv/salt/heavy/software_install.sls @@ -0,0 +1,5 @@ +{% set pkgs = ['ed', 'bc', 'jq', 'tree', 'zip', 'unzip', 'less'] %} + +install_pkgs: + pkg.installed: + - pkgs: {{ pkgs }} diff --git a/tests/monitoring/srv/salt/heavy/software_remove.sls b/tests/monitoring/srv/salt/heavy/software_remove.sls new file mode 100644 index 000000000000..f98703648702 --- /dev/null +++ b/tests/monitoring/srv/salt/heavy/software_remove.sls @@ -0,0 +1,5 @@ +{% set pkgs = ['ed', 'bc', 'jq', 'tree', 'zip', 'unzip', 'less'] %} + +remove_pkgs: + pkg.removed: + - pkgs: {{ pkgs }} diff --git a/tests/monitoring/srv/salt/listen_events.py b/tests/monitoring/srv/salt/listen_events.py new file mode 100644 index 000000000000..218b8528b2fd --- /dev/null +++ b/tests/monitoring/srv/salt/listen_events.py @@ -0,0 +1,20 @@ +import time + +import salt.config +import salt.utils.event + +opts = salt.config.client_config("/etc/salt/master") +event = salt.utils.event.get_event("master", opts=opts, listen=True) + +print("Listening for events (30 seconds)...") +start = time.time() +while time.time() - start < 30: + ev = event.get_event(wait=1, full=True) + if ev: + print(f"Tag: {ev.get('tag')}") + # print(f"Data: {ev.get('data')}") + if ( + "grains" in str(ev.get("tag")).lower() + or "minion" in str(ev.get("tag")).lower() + ): + print(f"DATA: {ev.get('data')}") diff --git a/tests/monitoring/srv/salt/loadbalancer.sls b/tests/monitoring/srv/salt/loadbalancer.sls new file mode 100644 index 000000000000..fb66a92641c9 --- /dev/null +++ b/tests/monitoring/srv/salt/loadbalancer.sls @@ -0,0 +1,18 @@ +install_haproxy: + pkg.installed: + - name: haproxy + +haproxy_cfg: + file.managed: + - name: /etc/haproxy/haproxy.cfg + - source: salt://haproxy.cfg.jinja + - template: jinja + - require: + - pkg: install_haproxy + +haproxy_service: + service.running: + - name: haproxy + - enable: True + - watch: + - file: haproxy_cfg diff --git a/tests/monitoring/srv/salt/top.sls b/tests/monitoring/srv/salt/top.sls new file mode 100644 index 000000000000..435be57c3678 --- /dev/null +++ b/tests/monitoring/srv/salt/top.sls @@ -0,0 +1,11 @@ +base: + '*': + - heavy.jinja + - heavy.many_files + - heavy.cmd + 'salt-minion-1': + - loadbalancer + 'salt-minion-2': + - webserver + 'salt-minion-3': + - webserver diff --git a/tests/monitoring/srv/salt/webserver.sls b/tests/monitoring/srv/salt/webserver.sls new file mode 100644 index 000000000000..60b30a644069 --- /dev/null +++ b/tests/monitoring/srv/salt/webserver.sls @@ -0,0 +1,17 @@ +install_apache: + pkg.installed: + - name: apache2 + +apache_service: + service.running: + - name: apache2 + - enable: True + - require: + - pkg: install_apache + +welcome_page: + file.managed: + - name: /var/www/html/index.html + - contents: "Hello from {{ grains['id'] }}" + - require: + - pkg: install_apache diff --git a/tests/monitoring/stress_api.sh b/tests/monitoring/stress_api.sh new file mode 100755 index 000000000000..f3631023e9ef --- /dev/null +++ b/tests/monitoring/stress_api.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Stress test the Salt API + +API_URL="http://localhost:18000" +USER="salt" +PASS="salt" + +echo "Starting Salt API stress test..." + +# Function to get a token +get_token() { + curl -s -c /tmp/cookies.txt -H "Accept: application/json" \ + -d username=$USER -d password=$PASS -d eauth=pam \ + $API_URL/login | python3 -c "import sys, json; print(sys.stdin.read())" | grep -oP '"token": "\K[^"]+' +} + +TOKEN=$(get_token) +echo "Got token: $TOKEN" + +while true; do + # Run a command via API + curl -s -H "Accept: application/json" -H "X-Auth-Token: $TOKEN" \ + -d client=local -d tgt='*' -d fun=test.ping \ + $API_URL > /dev/null + + # Run a runner via API + curl -s -H "Accept: application/json" -H "X-Auth-Token: $TOKEN" \ + -d client=runner -d fun=manage.status \ + $API_URL > /dev/null + + sleep 0.1 +done diff --git a/tests/monitoring/stress_test.sh b/tests/monitoring/stress_test.sh new file mode 100755 index 000000000000..3742d6dee8f9 --- /dev/null +++ b/tests/monitoring/stress_test.sh @@ -0,0 +1,70 @@ +#!/bin/bash +# Aggressive Salt Master Stress Test + +echo "Starting aggressive stress test..." + +# 1. Start Event Flooder in the background on the master +echo "Launching event flooder..." +docker exec -d salt-master python3 /srv/salt/flood_events.py + +# 2. Loop Highstates on all minions +echo "Starting Highstate loop..." +( + while true; do + echo "[$(date)] Running Highstate..." + docker exec salt-master salt '*' state.highstate --timeout=120 --async + sleep 10 + done +) & + +# 3. Loop various executions (Wheel, Runner, and Local) +echo "Starting Execution loops..." +( + while true; do + # Stress the runner system + docker exec salt-master salt-run manage.status --async + # Stress the wheel system + docker exec salt-master salt-key -L + # Rapid fire pings + docker exec salt-master salt '*' test.ping --timeout=5 + # Large data returns + docker exec salt-master salt '*' grains.items --async + sleep 2 + done +) & + +# 4. Stress the file server +( + while true; do + docker exec salt-master salt '*' cp.cache_file salt://heavy/heavy_template.jinja + sleep 5 + done +) & + +# 5. Stress the Salt API +( + while true; do + ./stress_api.sh + sleep 1 + done +) & + +# 6. Deploy and Remove software in a loop +( + # First update apt on all minions once + docker exec salt-master salt '*' pkg.refresh_db + while true; do + echo "[$(date)] Deploying software and infra..." + docker exec salt-master salt '*' state.apply heavy.software_install,webserver,loadbalancer --timeout=300 + sleep 5 + echo "[$(date)] Removing software (keeping infra)..." + docker exec salt-master salt '*' state.apply heavy.software_remove --timeout=300 + sleep 5 + done +) & + +echo "Stress test is running in the background." +echo "Monitor memory at http://localhost:19090 or http://localhost:13000" +echo "To stop: kill all background jobs of this script." + +wait diff --git a/tests/pytests/functional/master/test_event_publisher.py b/tests/pytests/functional/master/test_event_publisher.py index 0f4b3fde0c19..ba3f30a4d7ac 100644 --- a/tests/pytests/functional/master/test_event_publisher.py +++ b/tests/pytests/functional/master/test_event_publisher.py @@ -168,7 +168,7 @@ def test_publisher_mem(publisher, publish, listeners, stop_event): try: # After the loader tests run we have a baseline of almost 300MB # assert baseline < 150 - leak_threshold = baseline + (baseline * 0.5) + leak_threshold = baseline + 100 + (baseline * 0.5) while time.time() - start < 60: assert publisher.is_alive() mem = psutil.Process(publisher.pid).memory_info().rss / 1024**2 diff --git a/tests/pytests/unit/netapi/cherrypy/test_login.py b/tests/pytests/unit/netapi/cherrypy/test_login.py index 8066c59dab16..6c70c301d824 100644 --- a/tests/pytests/unit/netapi/cherrypy/test_login.py +++ b/tests/pytests/unit/netapi/cherrypy/test_login.py @@ -30,6 +30,12 @@ def __init__(self, *args, **kwargs): def _is_master_running(self): return True + def __enter__(self): + return self + + def __exit__(self, *args): + pass + class MockResolver: def __init__(self, *args, **kwargs): diff --git a/tests/pytests/unit/netapi/test_netapi_client_runner.py b/tests/pytests/unit/netapi/test_netapi_client_runner.py index a3ff13a39b44..039552c98733 100644 --- a/tests/pytests/unit/netapi/test_netapi_client_runner.py +++ b/tests/pytests/unit/netapi/test_netapi_client_runner.py @@ -32,6 +32,12 @@ class FakeRunner: def __init__(self, opts): self.opts = opts + def __enter__(self): + return self + + def __exit__(self, *exc): + return False + def cmd_sync(self, low, timeout=None, full_return=False): captured["timeout"] = timeout captured["low"] = low @@ -59,6 +65,12 @@ class FakeRunner: def __init__(self, opts): pass + def __enter__(self): + return self + + def __exit__(self, *exc): + return False + def cmd_sync(self, low, timeout=None, full_return=False): captured["timeout"] = timeout return {"return": "ok"} @@ -80,6 +92,12 @@ class FakeRunner: def __init__(self, opts): pass + def __enter__(self): + return self + + def __exit__(self, *exc): + return False + def cmd_sync(self, low, timeout=None, full_return=False): captured["timeout"] = timeout return {"return": "ok"} @@ -101,6 +119,12 @@ class FakeRunner: def __init__(self, opts): pass + def __enter__(self): + return self + + def __exit__(self, *exc): + return False + def cmd_sync(self, low, timeout=None, full_return=False): captured["timeout"] = timeout return {"return": "ok"} diff --git a/tests/unit/test_master.py b/tests/unit/test_master.py index 83b09df7fdb6..a63ccd5ab948 100644 --- a/tests/unit/test_master.py +++ b/tests/unit/test_master.py @@ -721,7 +721,19 @@ def __init__(self): def __call__(self, *args, **kwargs): self.call_times += [mocked_time._current_duration] - mocked__post_fork_init = MockTimedFunc() + main_class = self.main_class + + class MockPostForkInit(MockTimedFunc): + def __call__(self, *args, **kwargs): + # The real _post_fork_init constructs and caches a few helpers + # that the maintenance loop relies on. The unit test bypasses + # the real init, so we have to seed those attributes ourselves + # to satisfy the loop body's references to them. + main_class._cached_mminion = MagicMock() + main_class._cached_loadauth = MagicMock() + return super().__call__(*args, **kwargs) + + mocked__post_fork_init = MockPostForkInit() mocked_clean_old_jobs = MockTimedFunc() mocked_clean_expired_tokens = MockTimedFunc() mocked_clean_pub_auth = MockTimedFunc() diff --git a/tests/unit/test_module_names.py b/tests/unit/test_module_names.py index 15d06e0ed66f..54d9fad2305c 100644 --- a/tests/unit/test_module_names.py +++ b/tests/unit/test_module_names.py @@ -15,6 +15,7 @@ EXCLUDED_DIRS = [ os.path.join("tests", "integration", "cloud", "helpers"), os.path.join("tests", "integration", "files"), + os.path.join("tests", "monitoring"), os.path.join("tests", "perf"), os.path.join("tests", "pkg"), os.path.join("tests", "support"), diff --git a/tests/unit/utils/test_job.py b/tests/unit/utils/test_job.py index 2e824e02351f..91a282de6025 100644 --- a/tests/unit/utils/test_job.py +++ b/tests/unit/utils/test_job.py @@ -24,6 +24,12 @@ def return_mock_jobs(self): def __init__(self, *args, **kwargs): pass + def __enter__(self): + return self + + def __exit__(self, *args): + pass + class JobTest(TestCase): """