diff --git a/CHANGELOG.md b/CHANGELOG.md index ab3f4939..7d744bfe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Change log +## 09/05/2025 + +- add support for container-scoped user delegation SAS (skip account-level container enumeration to prevent 403 AuthorizationFailure) +- parse and expose user delegation key parameters (skt, ske, sktid, skoid) in `SasParameters` +- adjust `BlobManager.Connect` / `EnumerateContainers` logic to validate single container when using user delegation SAS + ## 08/20/2025 - fix deprecated GitHub Actions: upgrade upload-artifact from v1 to v4 and checkout from v1 to v4 diff --git a/docs/kustoQuickStart.md b/docs/kustoQuickStart.md index edddf4b8..3d3f4005 100644 --- a/docs/kustoQuickStart.md +++ b/docs/kustoQuickStart.md @@ -1,39 +1,38 @@ # CollectSFData Quickstart for Kusto Cluster -## Outline +## Outline -[Overview](#use-these-steps-to-setup-and-ingest-service-fabric-diagnostic-data-into-kusto-database) -[Generating Shared Access Signature (Sas Uri / Saskey)](#generating-shared-access-signature-(sas-uri-/-saskey)) +[Overview](#use-these-steps-to-setup-and-ingest-service-fabric-diagnostic-data-into-kusto-database)[Generating Shared Access Signature (Sas Uri / Saskey)](#generating-shared-access-signature-(sas-uri-/-saskey)) -- [Azure Portal](#azure-portal) +- [Azure Portal](#azure-portal) -[CollectSFData GatherTypes](#collectsfdata-gathertypes) +[CollectSFData GatherTypes](#collectsfdata-gathertypes) -- [counter](#gathertype-counter) -- [exception](#gathertype-exception) -- [setup](#gathertype-setup) -- [table](#gathertype-table) -- [trace](#gathertype-trace) -- [any](#gathertype-any) +- [counter](#gathertype-counter) +- [exception](#gathertype-exception) +- [setup](#gathertype-setup) +- [table](#gathertype-table) +- [trace](#gathertype-trace) +- [any](#gathertype-any) -[Download](#download) -[Execute](#execute) -[collectsfdata.options.json](#collectsfdata.options.json) -[Cleanup](#cleanup) -[Kusto 
Commands](#kusto-commands) -[Troubleshooting](#Troubleshooting) -[Reference](#Reference) +[Download](#download) +[Execute](#execute) +[collectsfdata.options.json](#collectsfdata.options.json) +[Cleanup](#cleanup) +[Kusto Commands](#kusto-commands) +[Troubleshooting](#Troubleshooting) +[Reference](#Reference) -## Use these steps to setup and ingest service fabric diagnostic data into kusto database +## Use these steps to setup and ingest service fabric diagnostic data into kusto database 1. download and extract latest release of [collectsfdata](https://github.com/microsoft/CollectServiceFabricData/releases/latest) -1. open cmd/powershell prompt and navigate to extracted directory -1. save [configuration](#collectsfdata.options.json) to extracted directory or generate new config with .\collectsfdata.exe -save collectsfdata.options.json or pass arguments on command line -1. modify saskey, starttimestamp, endtimestamp, kustocluster, kustotable, and gathertype - * service fabric kusto cluster ingest url: `https://ingest-{{cluster}}.{{location}}.kusto.windows.net/{{database}}` - * 30 minutes is always added to EndTimeUtc due to the way sf uploads traces to storage account. -1. execute utility. -1. analyze data `https://dataexplorer.azure.com/clusters/{{cluster}}/databases/{{database}}` +2. open cmd/powershell prompt and navigate to extracted directory +3. save [configuration](#collectsfdata.options.json) to extracted directory or generate new config with .\collectsfdata.exe -save collectsfdata.options.json or pass arguments on command line +4. modify saskey, starttimestamp, endtimestamp, kustocluster, kustotable, and gathertype + * service fabric kusto cluster ingest url: `https://ingest-{{cluster}}.{{location}}.kusto.windows.net/{{database}}` + * 30 minutes is always added to EndTimeUtc due to the way sf uploads traces to storage account. +5. execute utility. +6. 
analyze data `https://dataexplorer.azure.com/clusters/{{cluster}}/databases/{{database}}` ## Generating Shared Access Signature (Sas Uri / Saskey) @@ -41,57 +40,64 @@ CollectSFData uses a sas uri similar to Traceviewer to connect to the 'sflogs' s ### Azure portal -From Azure portal https://portal.azure.com navigate to service fabric cluster resource. There will typically be two storage accounts created for service fabric. One is for windows azure diagnostics (wad) and the other for service fabric diagnostic logs and events. The service fabric storage account is usually prefixed with 'sflogs'. If not, the correct account can be verified by determining which storage account has the three containers: +From Azure portal https://portal.azure.com navigate to service fabric cluster resource. There will typically be two storage accounts created for service fabric. One is for windows azure diagnostics (wad) and the other for service fabric diagnostic logs and events. The service fabric storage account is usually prefixed with 'sflogs'. If not, the correct account can be verified by determining which storage account has the three containers: - fabriccounters-* - fabriccrashdumps-* - fabriclogs-* - - ![](media/azure.portal.1.png) - - ![](media/azure.portal.2.png) +#### Account Shared Access signature + Once correct storage account is identified, select 'Shared access signature' and then 'Generate SAS and connection string'. Copy 'Blob service SAS URL' or 'Connection Sting'. This is the value that will be used for CollectSFData argument 'SasKey'. Ensure at least the following permissions are selected: - Allowed Services + - Blob - Table - - Allowed Resource Types + - Service - Container - Object - - Allowed Permissions + - Read - List +- ![img](media/azure.portal.3.png) +- ![img](media/azure.portal.4.png) + +#### User Delegation Key -- ![](media/azure.portal.3.png) +Alternatively, a user delegation key can be generated for the storage account. 
This requires that the user has been granted the 'Storage Blob Data Contributor' role for the storage account. The user delegation key can be generated from the Azure portal or programmatically using Azure SDKs. The user delegation key will have a start time and expiry time that must be specified when generating the key. User delegation keys are useful for scenarios where a more granular level of access control is needed or when Shared Key has been disabled. -- ![](media/azure.portal.4.png) +This is only available on blob containers and has to be generated from the container view. -## CollectSFData GatherTypes +![user delegation key](media/azure.portal.5.png) -All gather types ingest into Kusto or Log Analytics as a single table per gather type except for gather type 'any'. Any will download files locally. The gather type is also prepended to the table name during ingestion. This is to allow different gather types to be collected without having to change configuration for table name and for table naming constraints. +## CollectSFData GatherTypes -Example: configured table name 'jagilber_0000000000000001' would be prepended with 'counter_' for gather type counter 'counter_jagilber_0000000000000001'. +All gather types ingest into Kusto or Log Analytics as a single table per gather type except for gather type 'any'. Any will download files locally. The gather type is also prepended to the table name during ingestion. This is to allow different gather types to be collected without having to change configuration for table name and for table naming constraints. -### GatherType counter +Example: configured table name 'jagilber_0000000000000001' would be prepended with 'counter_' for gather type counter 'counter_jagilber_0000000000000001'. -#### data type: sf node performance monitor .blg files. +### GatherType counter -#### time range: can typically gather 6 - 24+ hours without issue. +#### data type: sf node performance monitor .blg files. 
-#### start here: +#### time range: can typically gather 6 - 24+ hours without issue. + +#### start here: -- use [perfmon kusto query](https://github.com/microsoft/CollectServiceFabricData/blob/master/KustoQueries/sfcounters-general.csl) from [azure data explorer](https://dataexplorer.azure.com) to graph results - * change value in query from '['counter_serviceFabricLogs']' to '['counter_<%user%>_<%case%>']' for table name to be graphed. +- use [perfmon kusto query](https://github.com/microsoft/CollectServiceFabricData/blob/master/KustoQueries/sfcounters-general.csl) from [azure data explorer](https://dataexplorer.azure.com) to graph results + * change value in query from '['counter_serviceFabricLogs']' to '['counter_<%user%>_<%case%>']' for table name to be graphed. - to view results, uncomment one counter line in query and execute. - * // | where CounterName contains "Avg. Disk Queue Length" and CounterName contains "c:" - * | where CounterName contains "Avg. Disk Queue Length" and CounterName contains "c:" - + * // | where CounterName contains "Avg. Disk Queue Length" and CounterName contains "c:" + * | where CounterName contains "Avg. Disk Queue Length" and CounterName contains "c:" + #### counter record ```text @@ -105,13 +111,13 @@ Example: configured table name 'jagilber_0000000000000001' would be prepended wi } ``` -### GatherType exception +### GatherType exception -- sf fabric*.exe dumps. +- sf fabric*.exe dumps. - adds 'exception_' prefix to table name. -- creates table with records containing url, pid, and process name for dumps from storage account. -- console output lists download url with sas. -- can run for any time range. +- creates table with records containing url, pid, and process name for dumps from storage account. +- console output lists download url with sas. +- can run for any time range. 
#### example exception record @@ -143,11 +149,11 @@ https://dataexplorer.azure.com 1:Execute:total execution time in minutes: 0.41 ``` -### GatherType setup +### GatherType setup -- sf install .trace files. -- adds 'setup_' prefix to table name. -- can run for any time range as data is small but typically only needed for time of issue. +- sf install .trace files. +- adds 'setup_' prefix to table name. +- can run for any time range as data is small but typically only needed for time of issue. #### setup record @@ -164,20 +170,20 @@ https://dataexplorer.azure.com } ``` -### GatherType table +### GatherType table -#### data type: sf cluster and node storage account table events. same as available in sfx explorer events. +#### data type: sf cluster and node storage account table events. same as available in sfx explorer events. -#### time range: can typically gather 2 - 7+ days without issue. +#### time range: can typically gather 2 - 7+ days without issue. -#### start here: +#### start here: - best viewed with kusto function 'TableView('table_%user%_%case%')'. -- Example: - * TableView('table_jagilber_000000000000001') | facet by TaskName - * TableView('table_jagilber_000000000000001') | where TaskName contains "hosting" +- Example: + * TableView('table_jagilber_000000000000001') | facet by TaskName + * TableView('table_jagilber_000000000000001') | where TaskName contains "hosting" -#### table record with TableView where TaskName is 'Hosting': +#### table record with TableView where TaskName is 'Hosting': ```text { @@ -208,7 +214,7 @@ https://dataexplorer.azure.com } ``` -#### table record without TableView: +#### table record without TableView: ```text { @@ -223,40 +229,38 @@ https://dataexplorer.azure.com } ``` -### GatherType trace +### GatherType trace -#### data type: sf node sflog .etl/.dtr/.zip files. same data as traceviewer +#### data type: sf node sflog .etl/.dtr/.zip files. 
same data as traceviewer -#### time range: can typically gather 2 - 4+ hours without issue. +#### time range: can typically gather 2 - 4+ hours without issue. -#### best practice: +#### best practice: - use 'KustoUseBlobAsSource' for fastest ingest. - use 'KustoCompressed' to reduce network traffic - use 'UriFilter' and set to 'fabric_' for fabric only traces - use 'UriFilter' and set to 'lease_' for lease only traces - use 'NodeFilter' to ingest only certain nodes -- use regex / string based 'NodeFilter' and add name of node(s) to gather from if all nodes are not needed. -examples: - * NodeFilter: "\_nt_0" to collect data for only node 0 - * NodeFilter: "\_nt_0|_nt_1" to collect data for only nodes 0 and 1 - * NodeFilter: "\_nt_0|_nt_1|_nt_2" to collect data for only nodes 0 - 2 - * NodeFilter: "\_nt_[0-1]" to collect data for only nodes 0 - 1 - * NodeFilter: "\_nt_[135]" to collect data for only nodes 1, 3, and 5 - * NodeFilter: "\_nt_" to collect data for only nodes for nodetype 'nt' +- use regex / string based 'NodeFilter' and add name of node(s) to gather from if all nodes are not needed. examples: + * NodeFilter: "\_nt_0" to collect data for only node 0 + * NodeFilter: "\_nt_0|_nt_1" to collect data for only nodes 0 and 1 + * NodeFilter: "\_nt_0|_nt_1|_nt_2" to collect data for only nodes 0 - 2 + * NodeFilter: "\_nt_[0-1]" to collect data for only nodes 0 - 1 + * NodeFilter: "\_nt_[135]" to collect data for only nodes 1, 3, and 5 + * NodeFilter: "\_nt_" to collect data for only nodes for nodetype 'nt' #### start here: -- use kusto functions. functions are viewable in tree view from azure data explorer or by typing '.show functions' -from [azure data explorer](https://dataexplorer.azure.com) console, type the name of the function and pass name of table in single quotes. -- Example: TraceSummary('trace_jagilber_000000000000001') +- use kusto functions.
functions are viewable in tree view from azure data explorer or by typing '.show functions' from [azure data explorer](https://dataexplorer.azure.com) console, type the name of the function and pass name of table in single quotes. +- Example: TraceSummary('trace_jagilber_000000000000001') - ![kusto-function-tree-view.png](./media/kusto-function-tree-view.png) + ![kusto-function-tree-view.png](./media/kusto-function-tree-view.png) +- use [kusto queries](https://github.com/microsoft/CollectServiceFabricData/blob/master/KustoQueries/kusto-example-queries.csl) from [azure data explorer](https://dataexplorer.azure.com) to view results -- use [kusto queries](https://github.com/microsoft/CollectServiceFabricData/blob/master/KustoQueries/kusto-example-queries.csl) from [azure data explorer](https://dataexplorer.azure.com) to view results - * change value in queries from '%kusto table name%' to 'trace_<%user%>_<%case%>' for trace table name to be viewed. + * change value in queries from '%kusto table name%' to 'trace_<%user%>_<%case%>' for trace table name to be viewed. -#### trace record: +#### trace record: ```text { @@ -273,18 +277,18 @@ from [azure data explorer](https://dataexplorer.azure.com) console, type the nam ``` -### GatherType any +### GatherType any -#### data type: any blob file. +#### data type: any blob file. -#### time range: can typically gather 6 - 24+ hours without issue. +#### time range: can typically gather 6 - 24+ hours without issue. -#### best practice: +#### best practice: - use 'StartTimeUtc' and 'EndTimeUtc' - use 'UriFilter' to filter files - use 'NodeFilter' to ingest only certain nodes -- use regex / string based 'NodeFilter' and add name of node(s) to gather from if all nodes are not needed. +- use regex / string based 'NodeFilter' and add name of node(s) to gather from if all nodes are not needed. ## download @@ -397,32 +401,28 @@ pause ### collectsfdata.exe commands to delete / drop table when no longer needed 1.
to view all tables: collectsfdata.exe -kp list -1. to drop table: collectsfdata.exe -kp <%table name%> - ex: collectsfdata.exe -kp trace_jagilber_0000000000000001 -1. to verify table has been deleted / dropped (not necessary): collectsfdata.exe -kp list +2. to drop table: collectsfdata.exe -kp <%table name%> ex: collectsfdata.exe -kp trace_jagilber_0000000000000001 +3. to verify table has been deleted / dropped (not necessary): collectsfdata.exe -kp list ### kusto commands to delete / drop table when no longer needed 1. open url: https://dataexplorer.azure.com -1. to view all tables: .show tables -1. to view all tables with name match: .show tables | where TableName contains "<%filter%>" - ex: .show tables | where TableName contains "jagilber" -1. to drop table: .drop table <%table name%> - ex: .drop table trace_jagilber_0000000000000001 - note: on success, output will display all tables remaining -1. to verify table has been deleted / dropped (not necessary): .show table <%table name%> +2. to view all tables: .show tables +3. to view all tables with name match: .show tables | where TableName contains "<%filter%>" ex: .show tables | where TableName contains "jagilber" +4. to drop table: .drop table <%table name%> ex: .drop table trace_jagilber_0000000000000001 note: on success, output will display all tables remaining +5. to verify table has been deleted / dropped (not necessary): .show table <%table name%> ## kusto commands -- to show all tables: .show tables -- query to display previous queries: .show queries +- to show all tables: .show tables +- query to display previous queries: .show queries - drop table: .drop table <%table name%> - ingestion failures: .show ingestion failures - table ingestion time: <%table name%> | top 1 by Timestamp asc | project ingestion_time() -## troubleshooting +## troubleshooting -1. E_WRONG_NUBER_OF_FIELDS. this can be caused by setting 'UseKustoBlobAsSource' to true. some events are still not csv compliant.
to resolve, set 'UseKustoBlobAsSource' to false. +1. E_WRONG_NUMBER_OF_FIELDS. this can be caused by setting 'KustoUseBlobAsSource' to true. some events are still not csv compliant. to resolve, set 'KustoUseBlobAsSource' to false. ```text 5:QueueMonitor:error: Ingestion error total:(46): { @@ -474,16 +474,16 @@ pause ``` -4. Microsoft.Identity.Client.MsalServiceException. - Verify configuration settings are correct: - - azureClientId - - azureTenantId - - azureClientSecret - - azureClientCertificate +4. Microsoft.Identity.Client.MsalServiceException. Verify configuration settings are correct: + + - azureClientId + - azureTenantId + - azureClientSecret + - azureClientCertificate - .net framework will use ADAL and .net core will use MSAL. - If using .net framework (net462), use cmd.exe or powershell.exe to execute collectsfdata.exe. - If using .net core (net6.0+), use pwsh.exe (powershell core) to execute collectsfdata.exe. + .net framework will use ADAL and .net core will use MSAL. + If using .net framework (net462), use cmd.exe or powershell.exe to execute collectsfdata.exe. + If using .net core (net6.0+), use pwsh.exe (powershell core) to execute collectsfdata.exe. ```text Authenticate:exception: AggregateException:System.AggregateException: One or more errors occurred. ---> Microsoft.Identity.Client.MsalServiceException: diff --git a/docs/media/azure.portal.5.png b/docs/media/azure.portal.5.png new file mode 100644 index 00000000..5a10d353 Binary files /dev/null and b/docs/media/azure.portal.5.png differ diff --git a/kusto/functions/sflogs/base/ExtendTableRetentionByDays.csl b/kusto/functions/sflogs/base/ExtendTableRetentionByDays.csl new file mode 100644 index 00000000..47500c32 --- /dev/null +++ b/kusto/functions/sflogs/base/ExtendTableRetentionByDays.csl @@ -0,0 +1,12 @@ +.create-or-alter function with (docstring = "[tableName:string] - Name of the table to modify retention for, [daysToExtend:int] - Number of days to extend retention from current date.
Usage: sflogs.base.ExtendTableRetentionByDays('MyTable', 30)", folder = "sflogs/base") + ExtendTableRetentionByDays(tableName:string, daysToExtend:int) { + let currentTime = now(); + let retentionDate = currentTime + (daysToExtend * 1d); + // Build (but do not run) the '.alter table ... policy retention' command text; SoftDeletePeriod is retentionDate minus currentTime, i.e. daysToExtend days, with Recoverability set to Enabled. NOTE(review): Kusto documents SoftDeletePeriod as measured from ingestion time rather than from now(); confirm this matches the 'from current date' wording in the docstring + print strcat( + ".alter table ['", tableName, "'] policy retention ", + "```{\"SoftDeletePeriod\": \"", + format_timespan(retentionDate - currentTime, "d.hh:mm:ss"), + "\", \"Recoverability\": \"Enabled\"}```" + ) +} diff --git a/kusto/functions/sflogs/base/ExtendTableRetentionToDate.csl b/kusto/functions/sflogs/base/ExtendTableRetentionToDate.csl new file mode 100644 index 00000000..188b4a32 --- /dev/null +++ b/kusto/functions/sflogs/base/ExtendTableRetentionToDate.csl @@ -0,0 +1,11 @@ +.create-or-alter function with (docstring = "[tableName:string] - Name of the table to modify retention for, [extensionDate:datetime] - Specific date until which to retain data. Usage: sflogs.base.ExtendTableRetentionToDate('MyTable', datetime('2025-12-31'))", folder = "sflogs/base") + ExtendTableRetentionToDate(tableName:string, extensionDate:datetime) { + let currentTime = now(); + // Build (but do not run) the '.alter table ... policy retention' command text; SoftDeletePeriod is extensionDate minus currentTime formatted as d.hh:mm:ss, with Recoverability set to Enabled. NOTE(review): an extensionDate that is not after now() produces a non-positive timespan in the command; confirm callers only pass future dates + print strcat( + ".alter table ['", tableName, "'] policy retention ", + "```{\"SoftDeletePeriod\": \"", + format_timespan(extensionDate - currentTime, "d.hh:mm:ss"), + "\", \"Recoverability\": \"Enabled\"}```" + ) +} diff --git a/kusto/functions/sflogs/base/GetTableRetentionCommand.csl b/kusto/functions/sflogs/base/GetTableRetentionCommand.csl new file mode 100644 index 00000000..3811bf64 --- /dev/null +++ b/kusto/functions/sflogs/base/GetTableRetentionCommand.csl @@ -0,0 +1,8 @@ +.create-or-alter function with (docstring = "[tableName:string] - Name of the table to query retention policy for. Returns JSON with key retention fields.
Usage: sflogs.base.GetTableRetentionCommand('MyTable')", folder = "sflogs/base") + GetTableRetentionCommand(tableName:string) { + // Emit one row with two columns: KQLQuery holds the text of a '.show table [...] details' management command projecting MinExtentsCreationTime and RetentionPolicy (the command is only printed here, the caller must copy and run it), and Note holds the follow-up guidance string + print KQLQuery = strcat( + ".show table ['", tableName, "'] details | project MinExtentsCreationTime, RetentionPolicy;\n" + ), + Note = "Copy and run the KQLQuery to get response with Creation and Expiration Days. Use ExtendTableRetentionToDate or ExtendTableRetentionByDays to modify" +} diff --git a/kusto/functions/sflogs/errors/TraceFalsePositiveTabular.csl b/kusto/functions/sflogs/errors/TraceFalsePositiveTabular.csl index ebc4999d..6ec8edb2 100644 --- a/kusto/functions/sflogs/errors/TraceFalsePositiveTabular.csl +++ b/kusto/functions/sflogs/errors/TraceFalsePositiveTabular.csl @@ -107,6 +107,7 @@ // add false positive signatures here using one of the provided functions in format issue description, string pattern, string pattern // use *Matches (regex) only when needed as it is remarkably slower | invoke TextContains("normal ctrl-c process exit", "terminated with exit code 3221225786") + | invoke TextContains("error should be ignored", "ignoring error") | invoke TypeAndTextIMatches("normal application / process exit by service fabric abort", "Hosting", "exitcode\\s?=?\\s?7148") | invoke TypeAndTextIMatches("normal application / process exit by service fabric deactivate", "Hosting", "ExitCode\\s?=?\\s?7147") | invoke TextContains("normal connection close", "FABRIC_E_CONNECTION_CLOSED_BY_REMOTE_END") @@ -139,4 +140,5 @@ | invoke TypeAndTextContains("only issue if using docker.","Hosting.DockerProcessManager","StopDockerNtService: ShutdownDockerService() returned with ErrorCode=FABRIC_E_SERVICE_DOES_NOT_EXIST.") | invoke TypeAndTextContains("only issue if using docker.","Hosting.ContainerHelper","Container Log Root not found at:") | invoke TypeAndTextContains("only issue if using docker.","Hosting.ContainerActivator","OnContainerServiceStarted:
ErrorCode=FABRIC_E_SERVICE_DOES_NOT_EXIST") + | invoke TypeAndTextContains("common warning that usually is noise","ASYNC_REQUEST","An Async request was failed") } diff --git a/kusto/functions/sflogs/errors/TraceKnownIssue.csl b/kusto/functions/sflogs/errors/TraceKnownIssue.csl index 76664d77..9f0236cc 100644 --- a/kusto/functions/sflogs/errors/TraceKnownIssue.csl +++ b/kusto/functions/sflogs/errors/TraceKnownIssue.csl @@ -243,5 +243,6 @@ TypeContains(T, "possible FM deadlock issue. queue full.","FM.QueueFull_Failure"), TypeAndTextContains(T, "customer application code issue.", "Api.Finish", "Error = 2148734227"), TypeAndTextContains(T, "certificate revoked", "Transport.SecurityContext", "FABRIC_E_CONNECTION_DENIED: 0x80092010"), - TypeAndTextContains(T, "possible issue if file:StartStopNode.txt is present on node. file will cause node to be in a Status == Down and HealthState == Error", "TestabilityComponent.NodeTestabilitySubsystem", "Killing node on command RestartNode") + TypeAndTextContains(T, "possible issue if file:StartStopNode.txt is present on node. file will cause node to be in a Status == Down and HealthState == Error", "TestabilityComponent.NodeTestabilitySubsystem", "Killing node on command RestartNode"), + TypeAndTextContains(T, "client application initiated an abort", "Transport.Connection", "0x80072745") } diff --git a/kusto/functions/sflogs/hosting/TraceHostProcessesNew.csl b/kusto/functions/sflogs/hosting/TraceHostProcessesNew.csl new file mode 100644 index 00000000..9abd926c --- /dev/null +++ b/kusto/functions/sflogs/hosting/TraceHostProcessesNew.csl @@ -0,0 +1,395 @@ +.create-or-alter function with (docstring = "[T:string] where T=table name. 
function to search service fabric sflogs for host and guest process id's and names", folder = "sflogs/hosting") + TraceHostProcessesNew(T:string) { + let minTimestamp = toscalar(table(T) | top 1 by Timestamp asc | project Timestamp); + let maxTimestamp = toscalar(table(T) | top 1 by Timestamp desc | project Timestamp); + let allPids = materialize( + table(T) + | where isnotempty(PID) and PID != "0" + | extend userProcess = toint(PID) + | where isnotempty(userProcess) and userProcess > 0 + | summarize + minTime=min(Timestamp), + maxTime=max(Timestamp), + logCount=count(), + sampleTypes=take_any(Type, 3), + sampleText=take_any(Text, 1) + by userProcess, NodeName + | order by userProcess + ); + let processPatterns = materialize( + table(T) + | where isnotempty(PID) and PID != "0" + | extend userProcess = toint(PID) + | where userProcess > 0 + | summarize + logCount=count(), + sampleText=take_any(Text, 2) + by Type + | where logCount > 10 + | order by logCount desc + ); + let fabricHostStart = materialize( + table(T) + | where Type startswith 'Hosting.Hosted' + or Type startswith 'Hosting.CertificateAclingManager' + or Type startswith 'Hosting.FabricActivator' + or Type startswith 'Hosting.ActivationManager' + or Type contains 'FabricHost' + | extend userProcess = toint(PID) + | where userProcess > 0 + | extend processName = case( + Type contains 'FabricActivator', + 'FabricActivator', + Type contains 'ActivationManager', + 'FabricActivationManager', + 'FabricHost' + ) + | summarize Timestamp=min(Timestamp) by userProcess, processName, NodeName + | project startTimestamp=Timestamp, userProcess, processName, NodeName, Timestamp); + let fabricHostEnd = materialize( + table(T) + | where Type startswith 'Hosting.Hosted' + or Type startswith 'Hosting.CertificateAclingManager' + or Type startswith 'Hosting.FabricActivator' + or Type startswith 'Hosting.ActivationManager' + or Type contains 'FabricHost' + or Type contains 'ProcessExit' + or Type contains 'ProcessTerminated' + 
| extend userProcess = toint(PID) + | where userProcess > 0 + | extend processName = case( + Type contains 'FabricActivator', + 'FabricActivator', + Type contains 'ActivationManager', + 'FabricActivationManager', + 'FabricHost' + ) + | summarize Timestamp=max(Timestamp) by userProcess, processName, NodeName + | project endTimestamp=Timestamp, userProcess, NodeName, Timestamp); + let fabricServiceStart = materialize( + table(T) + | where Type startswith 'Hosting.HostedServiceActivated' + or (Type startswith 'Hosting' and Text has 'activated successfully with ProcessId') + or (Type startswith 'Hosting' and Text has 'ProcessId') + or Type startswith 'Hosting.ApplicationServiceActivated' + | extend userProcess = toint( + coalesce( + extract(@".+ProcessId[:\s=]+(\d+)", 1, Text), + extract(@".+process[:\s]+id[:\s=]+(\d+)", 1, Text), + extract(@".+pid[:\s=]+(\d+)", 1, Text) +) + ) + | where userProcess > 0 + | extend processName = coalesce( + extract(@".+_(.+?) activated successfully", 1, Text), + extract(@".+ExeName[:\s=]+([^,\s]+)", 1, Text), + extract(@".+ProcessName[:\s=]+([^,\s]+)", 1, Text), + extract(@".+ServiceName[:\s=]+([^,\s]+)", 1, Text) + ) + | where isnotempty(processName) + | project startTimestamp=Timestamp, userProcess, processName, NodeName, Timestamp); + let fabricServiceEnd = materialize( + table(T) + | where Type startswith 'Hosting.ApplicationService' + or Type startswith 'Hosting.HostedServiceDeactivated' + or (Type startswith 'Hosting' and Text has 'process id') + or Type contains 'ProcessExit' + or Type contains 'ProcessTerminated' + or Type contains 'ServiceDeactivated' + | extend userProcess = toint( + coalesce( + extract(@".+process[:\s]+id[:\s=]+(\d+)", 1, Text), + extract(@".+ProcessId[:\s=]+(\d+)", 1, Text), + extract(@".+pid[:\s=]+(\d+)", 1, Text) +) + ) + | where userProcess > 0 + | summarize endTimestamp=max(Timestamp) by userProcess, NodeName, Timestamp + | project endTimestamp, userProcess, NodeName, Timestamp); + let userStart = 
materialize( + table(T) + | where Type startswith 'Hosting.ApplicationService' + or Type startswith 'Hosting.ProcessActivated' + or (Type startswith 'Hosting' and Text has 'was activated with process id') + or (Type startswith 'Hosting' and Text has 'ProcessId' and Text has 'ExeName') + or Type contains 'ServiceActivated' + or Type contains 'ProcessStart' + | extend userProcess = toint( + coalesce( + extract(@".+process[:\s]+id[:\s=]+(\d+)", 1, Text), + extract(@".+ProcessId[:\s=]+(\d+)", 1, Text), + extract(@".+pid[:\s=]+(\d+)", 1, Text) +) + ) + | where userProcess > 0 + | extend processName = coalesce( + extract(@"ExeName[:\s=]+([^,\s]+)", 1, Text), + extract(@"ProcessName[:\s=]+([^,\s]+)", 1, Text), + extract(@"ServiceName[:\s=]+([^,\s]+)", 1, Text), + extract(@"ApplicationName[:\s=]+([^,\s]+)", 1, Text), + extract(@".+\\([^\\]+?)\.exe", 1, Text) + ) + | where isnotempty(processName) + | project startTimestamp=Timestamp, userProcess, processName, NodeName, Timestamp); + let userEnd = materialize( + table(T) + | where Type contains "Hosting._ApplicationsOps_ProcessExitedOperational" + or Type == 'Lease.ProcessCleanup' + or Type == 'Lease.CleanupApplication' + or Type contains 'ProcessExit' + or Type contains 'ProcessTerminated' + or Type contains 'ServiceDeactivated' + or Type contains 'ApplicationTerminated' + or (Type startswith 'Hosting' and Text has 'process') + or Text has 'exit code' + or Text has 'terminated' + | extend userProcess = toint( + coalesce( + extract(@"ProcessId[:\s=]+(\d+)", 1, Text), + extract(@"PID[:\s=]+(\d+)", 1, Text), + extract(@"process[:\s]+id[:\s=]+(\d+)", 1, Text), + extract(@"pid[:\s=]+(\d+)", 1, Text), + extract(@".+CtrlCSender.+?(\d+?)\.", 1, Text) +) + ) + | where userProcess > 0 + | extend processName = coalesce( + extract(@"ExeName[:\s=]+([^,\s]+)", 1, Text), + extract(@"ProcessName[:\s=]+([^,\s]+)", 1, Text), + extract(@"ServiceName[:\s=]+([^,\s]+)", 1, Text), + extract(@"ApplicationName[:\s=]+([^,\s]+)", 1, Text) + ) + | 
summarize endTimestamp=max(Timestamp), processNames=make_set(processName) by userProcess, NodeName, Timestamp + | extend processName = tostring(processNames[0]) + | project endTimestamp, userProcess, processName, NodeName, Timestamp); + let userStartJoin = materialize ( + userStart + | join kind=leftouter ( + userEnd + | extend joinKey = strcat(userProcess, "|", NodeName) + ) + on $left.userProcess == $right.userProcess, $left.NodeName == $right.NodeName + | where isempty(endTimestamp) or (endTimestamp >= startTimestamp and endTimestamp <= startTimestamp + 1d) + | project + Timestamp=startTimestamp, + startTimestamp, + endTimestamp, + userProcess, + processName, + NodeName); + let userEndJoin = materialize ( + userEnd + | join kind=leftouter ( + userStart + | extend joinKey = strcat(userProcess, "|", NodeName) + ) + on $left.userProcess == $right.userProcess, $left.NodeName == $right.NodeName + | where isempty(startTimestamp) or (startTimestamp <= endTimestamp and startTimestamp >= endTimestamp - 1d) + | project + Timestamp=endTimestamp, + startTimestamp, + endTimestamp, + userProcess, + processName=coalesce(processName, processName1), + NodeName); + let fabricHostStartJoin = materialize ( + fabricHostStart + | join kind=leftouter ( + fabricHostEnd + | extend joinKey = strcat(userProcess, "|", NodeName) + ) + on $left.userProcess == $right.userProcess, $left.NodeName == $right.NodeName + | where isempty(endTimestamp) or (endTimestamp >= startTimestamp and endTimestamp <= startTimestamp + 1d) + | project + Timestamp=startTimestamp, + startTimestamp, + endTimestamp, + userProcess, + processName, + NodeName); + let fabricHostEndJoin = materialize ( + fabricHostEnd + | join kind=leftouter ( + fabricHostStart + | extend joinKey = strcat(userProcess, "|", NodeName) + ) + on $left.userProcess == $right.userProcess, $left.NodeName == $right.NodeName + | where isempty(startTimestamp) or (startTimestamp <= endTimestamp and startTimestamp >= endTimestamp - 1d) + | project + 
Timestamp=endTimestamp, + startTimestamp, + endTimestamp, + userProcess, + processName, + NodeName); + let fabricServiceStartJoin = materialize ( + fabricServiceStart + | join kind=leftouter ( + fabricServiceEnd + | extend joinKey = strcat(userProcess, "|", NodeName) + ) + on $left.userProcess == $right.userProcess, $left.NodeName == $right.NodeName + | where isempty(endTimestamp) or (endTimestamp >= startTimestamp and endTimestamp <= startTimestamp + 1d) + | project + Timestamp=startTimestamp, + startTimestamp, + endTimestamp, + userProcess, + processName, + NodeName); + let fabricServiceEndJoin = materialize ( + fabricServiceEnd + | join kind=leftouter ( + fabricServiceStart + | extend joinKey = strcat(userProcess, "|", NodeName) + ) + on $left.userProcess == $right.userProcess, $left.NodeName == $right.NodeName + | where isempty(startTimestamp) or (startTimestamp <= endTimestamp and startTimestamp >= endTimestamp - 1d) + | project + Timestamp=endTimestamp, + startTimestamp, + endTimestamp, + userProcess, + processName, + NodeName); + union + fabricHostStartJoin, + fabricHostEndJoin, + fabricServiceStartJoin, + fabricServiceEndJoin, + userStartJoin, + userEndJoin + | where isnotempty(processName) + | extend processName = trim(@"[^\w\.]", processName) + | extend processName = case( + processName endswith ".exe", + substring(processName, 0, strlen(processName) - 4), + processName + ) + | union ( + table(T) + | where isnotempty(PID) and PID != "0" + | extend userProcess = toint(PID) + | where userProcess > 0 + | extend processHint = case( + userProcess == 4, + "System", + Type contains "FabricHost", + "FabricHost", + Type contains "BackupRestore", + "BackupRestoreService", + Type contains "Fabric", + "FabricService", + Type contains "DCA", + "DiagnosticsCollectorAgent", + Type contains "FileStoreService", + "FileStoreService", + Type contains "EventStore", + "EventStoreService", + Type contains "RepairManager", + "RepairManagerService", + Type contains 
"CentralSecretService", + "CentralSecretService", + Type contains "System", + "SystemService", + Text has ".exe", + extract(@"([^\\]+?)\.exe", 1, Text), + "UnknownProcess" + ) + | where processHint != "UnknownProcess" + | summarize startTimestamp=min(Timestamp), endTimestamp=max(Timestamp) by userProcess, NodeName, processHint + | project + Timestamp=startTimestamp, + startTimestamp, + endTimestamp, + userProcess, + processName=processHint, + NodeName + | join kind=leftanti ( + union + fabricHostStartJoin, + fabricHostEndJoin, + fabricServiceStartJoin, + fabricServiceEndJoin, + userStartJoin, + userEndJoin + | where isnotempty(processName) + ) + on userProcess, NodeName + ) + | project-away Timestamp + | sort by userProcess, NodeName, startTimestamp + | extend + prevEndTime = prev(endTimestamp, 1), + prevPid = prev(userProcess, 1), + prevNode = prev(NodeName, 1) + | extend isNewInstance = case( + userProcess != prevPid or NodeName != prevNode, + true, + isempty(prevEndTime) or isempty(startTimestamp), + true, + startTimestamp > prevEndTime + 5m, + true, + false + ) + | extend processInstanceId = row_cumsum(case(isNewInstance, 1, 0)) + | summarize + startTime=min(startTimestamp), + endTime=max(endTimestamp), + processNames=make_set(processName), + logSources=make_set(case(isnotempty(startTimestamp), "start", "end")), + recordCount=count() + by userProcess, NodeName, processInstanceId + | extend + duration = case(isnotempty(endTime) and isnotempty(startTime), endTime - startTime, timespan(null)), + hasStart = logSources has "start", + hasEnd = logSources has "end" + | extend processName = case( + array_length(processNames) == 1, + tostring(processNames[0]), + // Prioritize specific service names over generic ones + processNames has "BackupRestoreService", + "BackupRestoreService", + processNames has "DiagnosticsCollectorAgent", + "DiagnosticsCollectorAgent", + processNames has "FileStoreService", + "FileStoreService", + processNames has "EventStoreService", + 
"EventStoreService", + processNames has "RepairManagerService", + "RepairManagerService", + processNames has "CentralSecretService", + "CentralSecretService", + processNames has "FabricHost", + "FabricHost", + processNames has "FabricService", + "FabricService", + // Default to first non-generic name + tostring(processNames[0]) + ) + | extend processStatus = case( + hasStart and hasEnd, + "Complete", + hasStart and not(hasEnd), + "Running/NoExit", + not(hasStart) and hasEnd, + "Orphaned/NoStart", + "Unknown" + ) + | order by userProcess, NodeName, processInstanceId + | extend + nextStartTime = next(startTime, 1), + nextPid = next(userProcess, 1), + nextNode = next(NodeName, 1) + | project + userProcess, + NodeName, + processInstanceId, + processName, + startTime, + endTime, + duration, + processStatus, + recordCount + | order by userProcess, NodeName, processInstanceId + } diff --git a/kusto/functions/sflogs/reconfiguration/TraceReconfigurationPLB.csl b/kusto/functions/sflogs/reconfiguration/TraceReconfigurationPLB.csl index 1f9f5479..5b1618e2 100644 --- a/kusto/functions/sflogs/reconfiguration/TraceReconfigurationPLB.csl +++ b/kusto/functions/sflogs/reconfiguration/TraceReconfigurationPLB.csl @@ -7,14 +7,14 @@ | extend decisionId = extract(@"DecisionId: (?P\S+)", 1, Text) | extend affects = split(extract(@"Affects Services with Metrics: \{ (?P.+?) \}", 1, Text), " ") | extend placement = extract(@"\t\tPlacement was scheduled because:.*?(?P.*?)(Balanc.*? was|Constraint .*? was)", 1, Text) - | extend placementList = split(replace(replacePattern,"",placement),'.') + | extend placementList = split(replace_string(replacePattern,"",placement),'.') | extend balancing = extract(@"\t\tBalance Checking was scheduled because:.*?(?P.*?)(Placement was|Constraint .*? 
was)", 1, Text) - | extend balanceList = split(replace(replacePattern,"",balancing),'.') + | extend balanceList = split(replace_string(replacePattern,"",balancing),'.') | extend constraint = extract(@"\t\tConstraint Violation Checking was scheduled because:.*?(?P.*?)(Balanc.*? was|Placement was)", 1, Text) - | extend constraintList = split(replace(replacePattern,"",constraint),'.') + | extend constraintList = split(replace_string(replacePattern,"",constraint),'.') | extend imbalance = extract(@"Imbalanced Metric Information:.*?Number of Metric Imbalances (?P.*)", 1, Text) | extend imbalanceCount = extract(@"^(\d+?)",1, imbalance) - | extend imbalanceList = split(replace(replacePattern,"",imbalance),'.') + | extend imbalanceList = split(replace_string(replacePattern,"",imbalance),'.') | extend metric = extract(@"(?P--Metric:.*)",1,imbalance) | extend metricList = split(metric,"--Metric: ") | where isnotempty(placement) or isnotempty(balancing) or isnotempty(constraint) diff --git a/src/CollectSFDataDll/Azure/BlobManager.cs b/src/CollectSFDataDll/Azure/BlobManager.cs index ad86df27..4b4a0519 100644 --- a/src/CollectSFDataDll/Azure/BlobManager.cs +++ b/src/CollectSFDataDll/Azure/BlobManager.cs @@ -270,6 +270,22 @@ private List EnumerateContainers(string containerPrefix = " try { + // If we are using a user delegation SAS (container-level), skip attempting to enumerate at the account level. + if (_config.SasEndpointInfo?.Parameters?.IsUserDelegationKey == true) + { + Log.Info("skipping account-level container enumeration for user delegation sas"); + if (!string.IsNullOrEmpty(_config.SasEndpointInfo.AbsolutePath)) + { + BlobContainerClient single = _blobServiceClient.GetBlobContainerClient(_config.SasEndpointInfo.AbsolutePath); + AddContainerToList(single.Name); + } + else + { + Log.Warning("no absolute path specified for user delegation sas. 
unable to enumerate containers."); + } + return ContainerList; + } + Log.Info("account sas"); Log.Info($"containerPrefix:{containerPrefix} containerFilter:{containerFilter}"); blobContainers = _blobServiceClient.GetBlobContainers(BlobContainerTraits.Metadata, diff --git a/src/CollectSFDataDll/Azure/SasEndpoints.cs b/src/CollectSFDataDll/Azure/SasEndpoints.cs index 90498ef2..42009d73 100644 --- a/src/CollectSFDataDll/Azure/SasEndpoints.cs +++ b/src/CollectSFDataDll/Azure/SasEndpoints.cs @@ -112,6 +112,12 @@ public bool IsValid() Log.Error("Sas is not time valid", Parameters); retval = false; } + else if (Parameters.SignedKeyStartUtc > DateTime.Now.ToUniversalTime() + || Parameters.SignedKeyExpiryUtc < DateTime.Now.ToUniversalTime()) // 'else if' keeps the expiry warning below chained to the sas time check above + { + Log.Error("Sas signed key is not time valid", Parameters); + retval = false; + } else if (Parameters.SignedExpiryUtc.AddHours(-1) < DateTime.Now.ToUniversalTime()) { Log.Warning("Sas expiring in less than 1 hour", Parameters); @@ -133,6 +139,15 @@ public bool IsValid() } } + if(Parameters.IsUserDelegationKey) + { + if (string.IsNullOrEmpty(Parameters.SignedKeyId) || string.IsNullOrEmpty(Parameters.SignedKeyObjectId)) + { + Log.Error("Sas user delegation key missing signed key id or object id", Parameters); + retval = false; + } + } + Log.Info($"exit: {retval}"); return retval; } diff --git a/src/CollectSFDataDll/Azure/SasParameters.cs b/src/CollectSFDataDll/Azure/SasParameters.cs index 0331c9ad..17e1f69b 100644 --- a/src/CollectSFDataDll/Azure/SasParameters.cs +++ b/src/CollectSFDataDll/Azure/SasParameters.cs @@ -11,34 +11,25 @@ namespace CollectSFData.Azure public class SasParameters { public string ApiVersion { get; set; } - public bool IsServiceSas { get; set; } - + public bool IsUserDelegationKey { get; set; } public string SasToken { get; private set; } public string Signature { get; set; } - public string SignedExpiry { get; set; } - - public DateTime SignedExpiryLocal { get; set; } = DateTime.MinValue; - - public DateTime 
SignedExpiryUtc { get; set; } = DateTime.MinValue; + public DateTime SignedExpiryLocal { get; set; } = DateTime.MaxValue; + public DateTime SignedExpiryUtc { get; set; } = DateTime.MaxValue; public string SignedIp { get; set; } - + public DateTime SignedKeyStartUtc { get; set; } = DateTime.MinValue; + public DateTime SignedKeyExpiryUtc { get; set; } = DateTime.MaxValue; + public string SignedKeyId { get; set; } + public string SignedKeyObjectId { get; set; } public string SignedPermission { get; set; } - public string SignedProtocol { get; set; } - public string SignedResourceTypes { get; set; } - public string SignedServices { get; set; } - public string SignedStart { get; set; } - public DateTime SignedStartLocal { get; set; } = DateTime.MinValue; - public DateTime SignedStartUtc { get; set; } = DateTime.MinValue; - public string SignedVersion { get; set; } public SasParameters() @@ -83,6 +74,16 @@ public SasParameters(string sasToken) { IsServiceSas = true; } + + if (paramName.Equals("skt") || paramName.Equals("ske") || paramName.Equals("sktid") || paramName.Equals("skoid")) // any signed-key field marks the token as a user delegation sas + { + IsUserDelegationKey = true; + } + if (paramName.Equals("skt")) { SignedKeyStartUtc = ParseDate(paramValue); } + if (paramName.Equals("ske")) { SignedKeyExpiryUtc = ParseDate(paramValue); } + if (paramName.Equals("sktid")) { SignedKeyId = paramValue; } + if (paramName.Equals("skoid")) { SignedKeyObjectId = paramValue; } + } }