From 87879c201caabfed14dfa531d94d68e90c57e9b0 Mon Sep 17 00:00:00 2001
From: kpattaswamy <62078498+kpattaswamy@users.noreply.github.com>
Date: Tue, 12 Jul 2022 15:35:50 -0700
Subject: [PATCH] Add Visualization Type to Descriptions (#75)

* Added visualization type to descriptions

Signed-off-by: Kartik Pattaswamy

* Modified description for script

Signed-off-by: Kartik Pattaswamy

* Modified and verified all script descriptions

Signed-off-by: Kartik Pattaswamy

* Changed wording of script visualization description

Signed-off-by: Kartik Pattaswamy
---
 src/pxl_scripts/http-data-filtered.json                  | 2 +-
 src/pxl_scripts/http-errors-per-service.json             | 2 +-
 src/pxl_scripts/http-request-throughput-per-service.json | 2 +-
 src/pxl_scripts/http-request-throughput.json             | 2 +-
 src/pxl_scripts/http-service-map.json                    | 2 +-
 src/pxl_scripts/inbound-connections-node-graph.json      | 2 +-
 src/pxl_scripts/namespaces-metrics.json                  | 2 +-
 src/pxl_scripts/network-traffic-node-graph.json          | 2 +-
 src/pxl_scripts/node-metrics.json                        | 2 +-
 src/pxl_scripts/outbound-connections-node-graph.json     | 2 +-
 src/pxl_scripts/pods-metrics.json                        | 2 +-
 src/pxl_scripts/service-metrics.json                     | 2 +-
 12 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/pxl_scripts/http-data-filtered.json b/src/pxl_scripts/http-data-filtered.json
index e8c296d..dbbc1b1 100644
--- a/src/pxl_scripts/http-data-filtered.json
+++ b/src/pxl_scripts/http-data-filtered.json
@@ -1,6 +1,6 @@
 {
   "name": "Raw HTTP Events (Long Format)",
-  "description": "Query for raw HTTP events. Shouldn't be used with time series.",
+  "description": "Use with the Table visualization. Query for raw HTTP events. Shouldn't be used with time series.",
   "script": "'''\nThis query outputs a table of HTTP events (request and response pairs).\nIt produces output identical to Pixie's \\`px/http_data_filtered\\` script in the Live UI.\n\nTo filter the HTTP events, uncomment lines 41-43. Alternatively, use Grafana's\ntable column filtering feature:\nhttps://grafana.com/docs/grafana/latest/visualizations/table/filter-table-columns/\n\nThis query is for use with Grafana's Pixie Datasource Plugin only,\nas it uses Grafana macros for adding Grafana dashboard context.\n'''\n\n# $pixieCluster - work around to update the panel if this dashboard variable is present\n\n# Import Pixie's module for querying data.\nimport px\n\n# Import HTTP events table.\ndf = px.DataFrame(table='http_events', start_time=__time_from)\n\n# Add columns for service, pod info.\ndf.svc = df.ctx['service']\ndf.pod = df.ctx['pod']\ndf = df.drop('upid')\n\n# EXAMPLE OPTIONAL FILTERS\n#df = df[px.contains(df.svc, 'catalogue')]\n#df = df[px.contains(df.pod, 'catalogue')]\n#df = df[df.req_path == '/healthz']\n\n# Avoid conversion to wide format\ndf.timestamp = df.time_\ndf = df.drop(columns=['time_'])\n\n# Keep only the selected columns (and order them in the following order)\ndf = df[[$__columns]]\n\n# Output the DataFrame\npx.display(df)\n",
   "isGroupBy": true,
   "isColDisplay": true,
diff --git a/src/pxl_scripts/http-errors-per-service.json b/src/pxl_scripts/http-errors-per-service.json
index d4bb51b..3c9b6fb 100644
--- a/src/pxl_scripts/http-errors-per-service.json
+++ b/src/pxl_scripts/http-errors-per-service.json
@@ -1,5 +1,5 @@
 {
   "name": "HTTP Error Rate by Service (Wide Format)",
-  "description": "",
+  "description": "Use with the Table visualization. Query outputs HTTP error and total request count per service.",
   "script": "'''\nThis query outputs a table of HTTP error and total request count per service.\n\nThis query is for use with Grafana's Pixie Datasource Plugin only,\nas it uses Grafana macros for adding Grafana dashboard context.\n'''\n\n# $pixieCluster - work around to update the panel if this dashboard variable is present\n\n# Import Pixie's module for querying data.\nimport px\n\n# Import HTTP events table.\ndf = px.DataFrame(table='http_events', start_time=__time_from)\n\n# Add columns for service, namespace info.\ndf.namespace = df.ctx['namespace']\ndf.service = df.ctx['service']\n\n# Filter out requests that don't have a service defined.\ndf = df[df.service != '']\n\n# Filter out requests from the Pixie (pl) namespace.\ndf = df[df.namespace != 'pl']\n\n# Add column for HTTP response status errors.\ndf.error = df.resp_status >= 400\n\n# Group HTTP events by service, counting errors and total HTTP events.\ndf = df.groupby(['service']).agg(\n error_count=('error', px.sum),\n total_requests=('resp_status', px.count)\n)\n\n# Output the DataFrame.\npx.display(df)\n"
 }
diff --git a/src/pxl_scripts/http-request-throughput-per-service.json b/src/pxl_scripts/http-request-throughput-per-service.json
index 2794d1d..6673811 100644
--- a/src/pxl_scripts/http-request-throughput-per-service.json
+++ b/src/pxl_scripts/http-request-throughput-per-service.json
@@ -1,5 +1,5 @@
 {
   "name": "HTTP Request Throughput by Service",
-  "description": "Displays time series showing overall HTTP request throughput per service.",
+  "description": "Use with Table or Time series visualization. Displays overall HTTP request throughput per service.",
   "script": "# Copyright 2018- The Pixie Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# SPDX-License-Identifier: Apache-2.0\n\n'''\nThis query outputs a table of time series data showing overall HTTP\nrequest throughput per service.\n\nThis query is for use with Grafana's Pixie Datasource Plugin only,\nas it uses Grafana macros for adding Grafana dashboard context.\n'''\n\n# $pixieCluster - work around to update the panel if this dashboard variable is present\n\n# Import Pixie's module for querying data.\nimport px\n\n# Load data from Pixie's `http_events` table into a Dataframe.\ndf = px.DataFrame(table='http_events', start_time=__time_from)\n\n# Add K8s metadata context.\ndf.service = df.ctx['service']\ndf.namespace = df.ctx['namespace']\n\n# Filter out requests that don't have a service defined.\ndf = df[df.service != '']\n\n# Bin the 'time_' column using the interval provided by Grafana.\ndf.timestamp = px.bin(df.time_, __interval)\n\n# Group data by unique pairings of 'timestamp' and 'service'\n# and count the total number of requests per unique pairing.\nper_ns_df = df.groupby(['timestamp', 'service']).agg(\n throughput_total=('latency', px.count)\n )\n\n# Calculate throughput by dividing # of requests by the time interval.\nper_ns_df.request_throughput = per_ns_df.throughput_total / __interval\nper_ns_df.request_throughput = per_ns_df.request_throughput * 1e9\n\n# Rename 'timestamp' column to 'time_'. The Grafana plugin expects a 'time_'\n# column to display data in a Graph or Time series.\nper_ns_df.time_ = per_ns_df.timestamp\n\n# Output select columns of the DataFrame.\npx.display(per_ns_df['time_', 'service', 'request_throughput'])"
 }
diff --git a/src/pxl_scripts/http-request-throughput.json b/src/pxl_scripts/http-request-throughput.json
index 022acfe..b1c0379 100644
--- a/src/pxl_scripts/http-request-throughput.json
+++ b/src/pxl_scripts/http-request-throughput.json
@@ -1,5 +1,5 @@
 {
   "name": "Raw HTTP Requests",
-  "description": "Query outputs a table of time series data showing overall HTTP request throughput",
+  "description": "Use with Table or Time series visualization. Query outputs a table of time series data showing overall HTTP request throughput.",
   "script": "# Copyright 2018- The Pixie Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# SPDX-License-Identifier: Apache-2.0\n\n'''\nThis query outputs a table of time series data showing overall HTTP request throughput.\n\nThis query is for use with Grafana's Pixie Datasource Plugin only,\nas it uses Grafana macros for adding Grafana dashboard context.\n'''\n\n# $pixieCluster - work around to update the panel if this dashboard variable is present\n\n# Import Pixie's module for querying data.\nimport px\n\n# Load data from Pixie's `http_events` table into a Dataframe.\ndf = px.DataFrame(table='http_events', start_time=__time_from)\n\n# Add context.\ndf.pod = df.ctx['pod']\ndf.service = df.ctx['service']\ndf.namespace = df.ctx['namespace']\ndf.node = df.ctx['node']\n\n# Add optional filtering.\n# df = df[df.service == 'px-sock-shop/front-end']\n# df = df[px.contains(df.pod, 'front-end')]\n\n# Bin the 'time_' column using the interval provided by Grafana.\ndf.timestamp = px.bin(df.time_, __interval)\n\n# Group data by unique 'timestamp' and count the total number of\n# requests per unique timestamp.\nper_ns_df = df.groupby(['timestamp']).agg(\n throughput_total=('latency', px.count)\n )\n\n# Calculate throughput by dividing # of requests by the time interval.\nper_ns_df.request_throughput = per_ns_df.throughput_total / __interval\n\n# Rename 'timestamp' column to 'time_'. The Grafana plugin expects a 'time_'\n# column to display data in a Graph or Time series.\nper_ns_df.time_ = per_ns_df.timestamp\nper_ns_df.request_throughput = per_ns_df.request_throughput * 1e9\n\n# Output select columns of the DataFrame.\npx.display(per_ns_df['time_', 'request_throughput'])"
 }
diff --git a/src/pxl_scripts/http-service-map.json b/src/pxl_scripts/http-service-map.json
index ad5ecd0..5687f48 100644
--- a/src/pxl_scripts/http-service-map.json
+++ b/src/pxl_scripts/http-service-map.json
@@ -1,5 +1,5 @@
 {
   "name": "HTTP Service Map",
-  "description": "This query outputs a graph of the HTTP traffic between the services in your cluster.",
+  "description": "Use with the Node Graph visualization. This query outputs HTTP traffic between the services in your cluster.",
   "script": "# Copyright 2018- The Pixie Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# SPDX-License-Identifier: Apache-2.0\n\n'''\nThis query outputs a graph of the HTTP traffic between the services in\nyour cluster. Use with Grafana's node graph panel.\n\nThis query is for use with Grafana's Pixie Datasource Plugin only,\nas it uses Grafana macros for adding Grafana dashboard context.\nThe functions in this query are pulled from the px/cluster script:\nhttps://github.com/pixie-io/pixie/tree/main/src/pxl_scripts/px/cluster\n'''\n\n# $pixieCluster - work around to update the panel if this dashboard variable is present\n\n# Import Pixie's module for querying data.\nimport px\n\n# Window size to use on time_ column for bucketing.\nns_per_s = 1000 * 1000 * 1000\nns_per_ms = 1000 * 1000\nwindow_ns = px.DurationNanos(10 * ns_per_s)\n\n# Flag to filter out health checks from the data.\nfilter_health_checks = True\n\n# Whether or not to include traffic from IPs that don't resolve to a known pod/service.\ninclude_ips = True\n\n\ndef http_stats():\n ''' Get a dataframe of HTTP events.\n Certain traffic (like health checks) are auto removed, and some standard fields are added.\n '''\n df = px.DataFrame(table='http_events', start_time=__time_from)\n\n # Add K8s metadata.\n df.namespace = df.ctx['namespace']\n df.service = df.ctx['service']\n df.pod = df.ctx['pod']\n\n # Add optional filters.\n df = df[df.namespace == 'px-sock-shop']\n # df = df[df.service == '']\n # df = df[df.pod == '']\n\n # Filter out non-k8s entities.\n df = df[df.pod != '']\n\n # Snap timestamps to bins.\n df.timestamp = px.bin(df.time_, window_ns)\n\n # Additional HTTP fields, pre-computed for convenience.\n df.failure = df.resp_status >= 400\n\n # Remove health checks, and anything with no remote address.\n health_check_req = ((df.req_path == '/healthz' or df.req_path == '/readyz') or\n df.req_path == '/livez')\n filter_out_conds = (health_check_req and filter_health_checks) or (df['remote_addr'] == '-')\n df = df[not filter_out_conds]\n\n return df\n\n\ndef service_let_graph():\n ''' Compute a summary of traffic by requesting service, for requests on services\n in the current cluster. Similar to `inbound_let_summary` but also breaks down\n by pod in addition to service.\n '''\n df = http_stats()\n df = df.groupby(['timestamp', 'service', 'remote_addr', 'pod', 'trace_role']).agg(\n latency_quantiles=('latency', px.quantiles),\n error_rate=('failure', px.mean),\n throughput_total=('latency', px.count),\n inbound_bytes_total=('req_body_size', px.sum),\n outbound_bytes_total=('resp_body_size', px.sum)\n )\n\n # Get the traced and remote pod/service/IP information.\n df.traced_pod = df.pod\n df.traced_svc = df.service\n df.traced_ip = px.pod_name_to_pod_ip(df.pod)\n df.remote_pod = px.pod_id_to_pod_name(px.ip_to_pod_id(df.remote_addr))\n df.remote_svc = px.service_id_to_service_name(px.ip_to_service_id(df.remote_addr))\n df.remote_ip = df.remote_addr\n # If external IPs are excluded in the service graph, then we also exclude any\n # traffic where we don't know the remote pod or remote service name.\n df = df[include_ips or (df.remote_pod != '' or df.remote_svc != '')]\n\n # Associate it with Client/Server roles, based on the trace role.\n df.is_server_side_tracing = df.trace_role == 2\n df.responder_pod = px.select(df.is_server_side_tracing, df.traced_pod, df.remote_pod)\n df.requestor_pod = px.select(df.is_server_side_tracing, df.remote_pod, df.traced_pod)\n df.responder_service = px.select(df.is_server_side_tracing, df.traced_svc, df.remote_svc)\n df.requestor_service = px.select(df.is_server_side_tracing, df.remote_svc, df.traced_svc)\n df.responder_ip = px.select(df.is_server_side_tracing, df.traced_ip, df.remote_ip)\n df.requestor_ip = px.select(df.is_server_side_tracing, df.remote_ip, df.traced_ip)\n\n # Compute statistics about each edge of the service graph.\n df.latency_p50 = px.DurationNanos(px.floor(px.pluck_float64(df.latency_quantiles, 'p50')))\n df.latency_p90 = px.DurationNanos(px.floor(px.pluck_float64(df.latency_quantiles, 'p90')))\n df.latency_p99 = px.DurationNanos(px.floor(px.pluck_float64(df.latency_quantiles, 'p99')))\n df.request_throughput = df.throughput_total / window_ns\n df.inbound_throughput = df.inbound_bytes_total / window_ns\n df.outbound_throughput = df.outbound_bytes_total / window_ns\n df.error_rate = px.Percent(df.error_rate)\n return df.groupby(['responder_pod', 'requestor_pod', 'responder_service',\n 'requestor_service', 'responder_ip', 'requestor_ip']).agg(\n latency_p50=('latency_p50', px.mean),\n latency_p90=('latency_p90', px.mean),\n latency_p99=('latency_p99', px.mean),\n request_throughput=('request_throughput', px.mean),\n error_rate=('error_rate', px.mean),\n inbound_throughput=('inbound_throughput', px.mean),\n outbound_throughput=('outbound_throughput', px.mean),\n throughput_total=('throughput_total', px.sum)\n )\n\n\ndef graphnode_sources():\n df = service_let_graph()\n # Use Pod name for source node id and title. If pod name is not available,\n # use service name or IP address.\n df.source_svc_ip = px.select(df.requestor_service != '', df.requestor_service, df.requestor_ip)\n df.id = px.select(df.requestor_pod != '', df.requestor_pod, df.source_svc_ip)\n df.title = df.id\n df = df.groupby(['id', 'title']).agg()\n return df\n\n\ndef graphnode_targets():\n df = service_let_graph()\n # Use Pod name for target node id and title. If pod name is not available,\n # use service name or IP address.\n df.target_svc_ip = px.select(df.responder_service != '', df.responder_service, df.responder_ip)\n df.id = px.select(df.responder_pod != '', df.responder_pod, df.target_svc_ip)\n df.title = df.id\n df = df.groupby(['id', 'title']).agg()\n return df\n\n\ndef nodes():\n node_sources = graphnode_sources()\n node_targets = graphnode_targets()\n df = node_sources.append(node_targets)\n return df\n\n\ndef edges():\n df = service_let_graph()\n df.source_svc_ip = px.select(df.requestor_service != '', df.requestor_service, df.requestor_ip)\n df.source = px.select(df.requestor_pod != '', df.requestor_pod, df.source_svc_ip)\n df.target_svc_ip = px.select(df.responder_service != '', df.responder_service, df.responder_ip)\n df.target = px.select(df.responder_pod != '', df.responder_pod, df.target_svc_ip)\n df.id = df.source + '-' + df.target\n df.mainStat = df.error_rate * 100\n df.secondaryStat = df.latency_p90 / ns_per_ms\n return df[['id', 'source', 'target', 'mainStat', 'secondaryStat']]\n\n\nnodes_table = nodes()\nedges_table = edges()\npx.display(nodes_table, \"nodes\")\npx.display(edges_table, \"edges\")"
 }
diff --git a/src/pxl_scripts/inbound-connections-node-graph.json b/src/pxl_scripts/inbound-connections-node-graph.json
index 53b7ca8..e1b9879 100644
--- a/src/pxl_scripts/inbound-connections-node-graph.json
+++ b/src/pxl_scripts/inbound-connections-node-graph.json
@@ -1,5 +1,5 @@
 {
   "name": "Inbound Connections in Cluster",
-  "description": "This query outputs a graph of the network connections to, from and within your cluster.",
+  "description": "Use with the Node Graph visualization. This query outputs inbound connections to your cluster (connections made from external IPs).",
   "script": "# Copyright 2018- The Pixie Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# SPDX-License-Identifier: Apache-2.0\n\n'''\nThis query outputs a graph of the network connections to, from and within your cluster.\nUse this query with Grafana's Node Graph panel.\n\nThis query is for use with Grafana's Pixie Datasource Plugin only,\nas it uses Grafana macros for adding Grafana dashboard context.\nThis query is a modification of the px/inbound_conns script:\nhttps://github.com/pixie-io/pixie/tree/main/src/pxl_scripts/px/inbound_conns\n'''\n\n# $pixieCluster - work around to update the panel if this dashboard variable is present\n\n# Import Pixie's module for querying data.\nimport px\n\n\ndef inbound_conns():\n df = px.DataFrame(table='conn_stats', start_time=__time_from)\n\n df.namespace = df.ctx['namespace']\n df.service = df.ctx['service']\n df.node = df.ctx['node']\n df.pod = df.ctx['pod']\n\n # # Add optional filters:\n # Filter IP address of sender.\n # df = df[df.remote_addr == '10.38.0.15']\n # # Filter namespace, service, node or pod name of receiving pod.\n # # Pixie formats service and pod names in the / format.\n # df = df[df.service == 'px-sock-shop/front-end']\n # df = df[df.pod == '']\n # df = df[df.node == '']\n # df = df[df.namespace == '']\n\n # Trace-role of 2 means server-side tracing.\n df = df[df.trace_role == 2]\n\n # Filter out any connections from known pods.\n df.remote_pod_id = px.ip_to_pod_id(df.remote_addr)\n df.remote_service_id = px.ip_to_service_id(df.remote_addr)\n df = df[df.remote_pod_id == '' and df.remote_service_id == '']\n\n # Filter out connections from localhost.\n df = df[not df.remote_addr == '127.0.0.1']\n\n # Calculate connection stats for each process for each unique pod / remote_addr pair.\n df = df.groupby(['pod', 'upid', 'remote_addr']).agg(\n # The fields below are counters per UPID, so we take\n # the min (starting value) and the max (ending value) to subtract them.\n conn_open_min=('conn_open', px.min),\n conn_open_max=('conn_open', px.max),\n bytes_sent_min=('bytes_sent', px.min),\n bytes_sent_max=('bytes_sent', px.max),\n bytes_recv_min=('bytes_recv', px.min),\n bytes_recv_max=('bytes_recv', px.max),\n )\n\n # Calculate connection stats over the time window.\n df.conn_open = df.conn_open_max - df.conn_open_min\n df.bytes_sent = df.bytes_sent_max - df.bytes_sent_min\n df.bytes_recv = df.bytes_recv_max - df.bytes_recv_min\n\n # Calculate connection stats for each unique pod / remote_addr pair. Since there\n # may be multiple processes per pod we perform an additional aggregation to\n # consolidate those into one entry.\n df = df.groupby(['pod', 'remote_addr']).agg(\n connections_open=('conn_open', px.sum),\n bytes_sent=('bytes_sent', px.sum),\n bytes_recv=('bytes_recv', px.sum),\n )\n\n # Convert to kilobytes.\n df.kbytes_sent = df.bytes_sent / 1000\n df.kbytes_recv = df.bytes_recv / 1000\n df.kbytes_total = df.kbytes_sent + df.kbytes_recv\n\n # Resolve remote addresses to public domain\n df.domain = px.nslookup(df.remote_addr)\n df.domain = px.select(df.domain == df.remote_addr, '', df.domain)\n\n return df[['pod', 'remote_addr', 'domain', 'connections_open', 'kbytes_sent',\n 'kbytes_recv', 'kbytes_total']]\n\n\n# Construct the nodes table for the Node Graph panel.\n# https://grafana.com/docs/grafana/next/visualizations/node-graph/#node-parameters\ndef nodes():\n df1 = inbound_conns()\n df1.id = df1.pod\n df1.title = df1.pod\n df1 = df1.groupby(['id', 'title']).agg()\n df2 = inbound_conns()\n df2.id = df2.remote_addr\n df2.title = df2.remote_addr\n df2 = df2.groupby(['id', 'title']).agg()\n return df1.append(df2)\n\n\n# Construct the edges table for the Node Graph panel.\n# https://grafana.com/docs/grafana/next/visualizations/node-graph/#edge-parameters\ndef edges():\n df = inbound_conns()\n df.source = df.remote_addr\n df.target = df.pod\n df.id = df.source + '-' + df.target\n df.mainStat = df.kbytes_sent\n df.secondaryStat = df.kbytes_recv\n return df[['id', 'source', 'target', 'mainStat', 'secondaryStat']]\n\n\n# Display the tables.\nnodes = nodes()\npx.display(nodes, \"nodes\")\n\nedges = edges()\npx.display(edges, \"edges\")"
 }
diff --git a/src/pxl_scripts/namespaces-metrics.json b/src/pxl_scripts/namespaces-metrics.json
index 9f8c1ab..d72041e 100644
--- a/src/pxl_scripts/namespaces-metrics.json
+++ b/src/pxl_scripts/namespaces-metrics.json
@@ -1,6 +1,6 @@
 {
   "name": "Namespace Metrics",
-  "description": "Gets a overview of namespaces in the current cluster since start time",
+  "description": "Use with the Table visualization. Gets an overview of namespaces in the current cluster since start time.",
   "script": "# $pixieCluster - work around to update the panel if this dashboard variable is present\n\nimport px\n\ndef process_stats_by_entity(start_time: int, entity: str):\n ''' Gets the windowed process stats (CPU, memory, etc) per node or pod.\n Args:\n @start_time Starting time of the data to examine.\n @entity: Either pod or node_name.\n '''\n # Window size to use on time_ column for bucketing.\n ns_per_s = 1000 * 1000 * 1000\n window_ns = px.DurationNanos(10 * ns_per_s)\n\n df = px.DataFrame(table='process_stats', start_time=start_time)\n df[entity] = df.ctx[entity]\n df.timestamp = px.bin(df.time_, window_ns)\n # First calculate CPU usage by process (UPID) in each k8s_object\n # over all windows.\n df = df.groupby([entity, 'upid', 'timestamp']).agg(\n rss=('rss_bytes', px.mean),\n vsize=('vsize_bytes', px.mean),\n # The fields below are counters, so we take the min and the max to subtract them.\n cpu_utime_ns_max=('cpu_utime_ns', px.max),\n cpu_utime_ns_min=('cpu_utime_ns', px.min),\n cpu_ktime_ns_max=('cpu_ktime_ns', px.max),\n cpu_ktime_ns_min=('cpu_ktime_ns', px.min),\n read_bytes_max=('read_bytes', px.max),\n read_bytes_min=('read_bytes', px.min),\n write_bytes_max=('write_bytes', px.max),\n write_bytes_min=('write_bytes', px.min),\n rchar_bytes_max=('rchar_bytes', px.max),\n rchar_bytes_min=('rchar_bytes', px.min),\n wchar_bytes_max=('wchar_bytes', px.max),\n wchar_bytes_min=('wchar_bytes', px.min),\n )\n # Next calculate cpu usage and memory stats per window.\n df.cpu_utime_ns = df.cpu_utime_ns_max - df.cpu_utime_ns_min\n df.cpu_ktime_ns = df.cpu_ktime_ns_max - df.cpu_ktime_ns_min\n df.read_bytes = df.read_bytes_max - df.read_bytes_min\n df.write_bytes = df.write_bytes_max - df.write_bytes_min\n df.rchar_bytes = df.rchar_bytes_max - df.rchar_bytes_min\n df.wchar_bytes = df.wchar_bytes_max - df.wchar_bytes_min\n # Sum by UPID.\n df = df.groupby([entity, 'timestamp']).agg(\n cpu_ktime_ns=('cpu_ktime_ns', px.sum),\n cpu_utime_ns=('cpu_utime_ns', px.sum),\n read_bytes=('read_bytes', px.sum),\n write_bytes=('write_bytes', px.sum),\n rchar_bytes=('rchar_bytes', px.sum),\n wchar_bytes=('wchar_bytes', px.sum),\n rss=('rss', px.sum),\n vsize=('vsize', px.sum),\n )\n df.actual_disk_read_throughput = df.read_bytes / window_ns\n df.actual_disk_write_throughput = df.write_bytes / window_ns\n df.total_disk_read_throughput = df.rchar_bytes / window_ns\n df.total_disk_write_throughput = df.wchar_bytes / window_ns\n # Now take the mean value over the various timestamps.\n df = df.groupby(entity).agg(\n cpu_ktime_ns=('cpu_ktime_ns', px.mean),\n cpu_utime_ns=('cpu_utime_ns', px.mean),\n actual_disk_read_throughput=('actual_disk_read_throughput', px.mean),\n actual_disk_write_throughput=('actual_disk_write_throughput', px.mean),\n total_disk_read_throughput=('total_disk_read_throughput', px.mean),\n total_disk_write_throughput=('total_disk_write_throughput', px.mean),\n avg_rss=('rss', px.mean),\n avg_vsize=('vsize', px.mean),\n )\n # Finally, calculate total (kernel + user time) percentage used over window.\n df.cpu_usage = px.Percent((df.cpu_ktime_ns + df.cpu_utime_ns) / window_ns)\n return df.drop(['cpu_ktime_ns', 'cpu_utime_ns'])\n \n''' Gets a overview of namespaces in the current cluster since `start_time`.\nArgs:\n@start_time Start time of the data to examine.\n'''\ndf = px.DataFrame(table='process_stats', start_time=$__from)\ndf.service = df.ctx['service_name']\ndf.pod = df.ctx['pod_name']\ndf.namespace = df.ctx['namespace']\nagg = df.groupby(['service', 'pod', 'namespace']).agg()\npod_count = agg.groupby(['namespace', 'pod']).agg()\npod_count = pod_count.groupby('namespace').agg(pod_count=('pod', px.count))\nsvc_count = agg.groupby(['namespace', 'service']).agg()\nsvc_count = svc_count.groupby('namespace').agg(service_count=('service', px.count))\npod_and_svc_count = pod_count.merge(svc_count, how='inner',\n left_on='namespace', right_on='namespace',\n suffixes=['', '_x'])\nprocess_stats = process_stats_by_entity($__from, 'namespace')\ndf = process_stats.merge(pod_and_svc_count, how='inner', left_on='namespace',\n right_on='namespace', suffixes=['', '_y'])\ndf = df[[$__columns]]\npx.display(df)",
   "columnNames": ["namespace", "pod_count", "service_count", "avg_vsize", "avg_rss"],
   "groupByColumns": ["namespace", "pod_count", "service_count", "avg_vsize", "avg_rss"],
diff --git a/src/pxl_scripts/network-traffic-node-graph.json b/src/pxl_scripts/network-traffic-node-graph.json
index 5abacbe..61bbb11 100644
--- a/src/pxl_scripts/network-traffic-node-graph.json
+++ b/src/pxl_scripts/network-traffic-node-graph.json
@@ -1,5 +1,5 @@
 {
   "name": "Network Connections in Cluster",
-  "description": "This query outputs a graph of the network connections to, from and within your cluster.",
+  "description": "Use with the Node Graph visualization. This query outputs network connections to, from and within your cluster.",
   "script": "# Copyright 2018- The Pixie Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# SPDX-License-Identifier: Apache-2.0\n\n'''\nThis query outputs a graph of the network connections to, from and\nwithin your cluster. Use this query with Grafana's Node Graph panel.\n\nThis query is for use with Grafana's Pixie Datasource Plugin only,\nas it uses Grafana macros for adding Grafana dashboard context.\nThis query is a modification of the px/net_flow_graph script:\nhttps://github.com/pixie-io/pixie/tree/main/src/pxl_scripts/px/net_flow_graph\n'''\n\n# $pixieCluster - work around to update the panel if this dashboard variable is present\n\n# Import Pixie's module for querying data.\nimport px\n\n\ndef net_flow_graph():\n df = px.DataFrame(table='conn_stats', start_time=__time_from)\n\n # Add context\n df.namespace = df.ctx['namespace']\n df.service = df.ctx['service']\n df.node = df.ctx['node']\n df.pod = df.ctx['pod']\n\n # Filter out any non k8s sources.\n df = df[df.pod != '']\n\n # Filter for client side requests.\n df = df[df.trace_role == 1]\n\n # Use aggregate to pick the first and last sample for any given client-server pair.\n # We do this by picking the min/max of the stats, since they are all counters.\n df = df.groupby(['pod', 'upid', 'remote_addr']).agg(\n bytes_sent_min=('bytes_sent', px.min),\n bytes_sent_max=('bytes_sent', px.max),\n bytes_recv_min=('bytes_recv', px.min),\n bytes_recv_max=('bytes_recv', px.max),\n )\n df.bytes_sent = df.bytes_sent_max - df.bytes_sent_min\n df.bytes_recv = df.bytes_recv_max - df.bytes_recv_min\n df.bytes_total = df.bytes_sent + df.bytes_recv\n df = df.drop(['bytes_sent_max', 'bytes_sent_min', 'bytes_recv_max', 'bytes_recv_min'])\n\n # To create a graph, add 'from' and 'to' entities.\n df.from_entity = df.pod\n df.to_entity = px.nslookup(df.remote_addr)\n\n # Since there may be multiple processes per pod,\n # perform an additional aggregation to consolidate those into one entry.\n df = df.groupby(['from_entity', 'to_entity']).agg(\n bytes_sent=('bytes_sent', px.sum),\n bytes_recv=('bytes_recv', px.sum),\n bytes_total=('bytes_total', px.sum),\n )\n\n # Convert to kilobytes.\n df.kbytes_sent = df.bytes_sent / 1000\n df.kbytes_recv = df.bytes_recv / 1000\n df.kbytes_total = df.bytes_total / 1000\n\n # # Add optional filters:\n # df = df[px.contains(df.from_entity, 'px-sock-shop')]\n # df = df[px.contains(df.to_entity, '')]\n\n return df\n\n\n# Construct the nodes table for the Node Graph panel.\n# https://grafana.com/docs/grafana/next/visualizations/node-graph/#node-parameters\ndef nodes():\n df1 = net_flow_graph()\n df1.id = df1.to_entity\n df1.title = df1.to_entity\n df1 = df1.groupby(['id', 'title']).agg()\n df2 = net_flow_graph()\n df2.id = df2.from_entity\n df2.title = df2.from_entity\n df2 = df2.groupby(['id', 'title']).agg()\n return df1.append(df2)\n\n\n# Construct the edges table for the Node Graph panel.\n# https://grafana.com/docs/grafana/next/visualizations/node-graph/#edge-parameters\ndef edges():\n df = net_flow_graph()\n df.source = df.from_entity\n df.target = df.to_entity\n df.id = df.source + '-' + df.target\n df.mainStat = df.kbytes_sent\n df.secondaryStat = df.kbytes_recv\n return df[['id', 'source', 'target', 'mainStat', 'secondaryStat']]\n\n\n# Display the tables.\nnodes_table = nodes()\nedges_table = edges()\npx.display(nodes_table, \"nodes\")\npx.display(edges_table, \"edges\")"
 }
diff --git a/src/pxl_scripts/node-metrics.json b/src/pxl_scripts/node-metrics.json
index 39625c9..b1c654a 100644
--- a/src/pxl_scripts/node-metrics.json
+++ b/src/pxl_scripts/node-metrics.json
@@ -1,6 +1,6 @@
 {
   "name": "Node Metrics",
-  "description": "Displays metrics about each nodes in the cluster",
+  "description": "Use with the Table visualization. Displays metrics about each node in the cluster.",
   "script": "# $pixieCluster - work around to update the panel if this dashboard variable is present\n\nimport px\n\ndef process_stats_by_entity(start_time: int, entity: str):\n ''' Gets the windowed process stats (CPU, memory, etc) per node or pod.\n Args:\n @start_time Starting time of the data to examine.\n @entity: Either pod or node_name.\n '''\n # Window size to use on time_ column for bucketing.\n ns_per_s = 1000 * 1000 * 1000\n window_ns = px.DurationNanos(10 * ns_per_s)\n\n df = px.DataFrame(table='process_stats', start_time=start_time)\n df[entity] = df.ctx[entity]\n df.timestamp = px.bin(df.time_, window_ns)\n # First calculate CPU usage by process (UPID) in each k8s_object\n # over all windows.\n df = df.groupby([entity, 'upid', 'timestamp']).agg(\n rss=('rss_bytes', px.mean),\n vsize=('vsize_bytes', px.mean),\n # The fields below are counters, so we take the min and the max to subtract them.\n cpu_utime_ns_max=('cpu_utime_ns', px.max),\n cpu_utime_ns_min=('cpu_utime_ns', px.min),\n cpu_ktime_ns_max=('cpu_ktime_ns', px.max),\n cpu_ktime_ns_min=('cpu_ktime_ns', px.min),\n read_bytes_max=('read_bytes', px.max),\n read_bytes_min=('read_bytes', px.min),\n write_bytes_max=('write_bytes', px.max),\n write_bytes_min=('write_bytes', px.min),\n rchar_bytes_max=('rchar_bytes', px.max),\n rchar_bytes_min=('rchar_bytes', px.min),\n wchar_bytes_max=('wchar_bytes', px.max),\n wchar_bytes_min=('wchar_bytes', px.min),\n )\n # Next calculate cpu usage and memory stats per window.\n df.cpu_utime_ns = df.cpu_utime_ns_max - df.cpu_utime_ns_min\n df.cpu_ktime_ns = df.cpu_ktime_ns_max - df.cpu_ktime_ns_min\n df.read_bytes = df.read_bytes_max - df.read_bytes_min\n df.write_bytes = df.write_bytes_max - df.write_bytes_min\n df.rchar_bytes = df.rchar_bytes_max - df.rchar_bytes_min\n df.wchar_bytes = df.wchar_bytes_max - df.wchar_bytes_min\n # Sum by UPID.\n df = df.groupby([entity, 'timestamp']).agg(\n cpu_ktime_ns=('cpu_ktime_ns', px.sum),\n cpu_utime_ns=('cpu_utime_ns', px.sum),\n read_bytes=('read_bytes', px.sum),\n write_bytes=('write_bytes', px.sum),\n rchar_bytes=('rchar_bytes', px.sum),\n wchar_bytes=('wchar_bytes', px.sum),\n rss=('rss', px.sum),\n vsize=('vsize', px.sum),\n )\n df.actual_disk_read_throughput = df.read_bytes / window_ns\n df.actual_disk_write_throughput = df.write_bytes / window_ns\n df.total_disk_read_throughput = df.rchar_bytes / window_ns\n df.total_disk_write_throughput = df.wchar_bytes / window_ns\n # Now take the mean value over the various timestamps.\n df = df.groupby(entity).agg(\n cpu_ktime_ns=('cpu_ktime_ns', px.mean),\n cpu_utime_ns=('cpu_utime_ns', px.mean),\n actual_disk_read_throughput=('actual_disk_read_throughput', px.mean),\n actual_disk_write_throughput=('actual_disk_write_throughput', px.mean),\n total_disk_read_throughput=('total_disk_read_throughput', px.mean),\n total_disk_write_throughput=('total_disk_write_throughput', px.mean),\n avg_rss=('rss', px.mean),\n avg_vsize=('vsize', px.mean),\n )\n # Finally, calculate total (kernel + user time) percentage used over window.\n df.cpu_usage = px.Percent((df.cpu_ktime_ns + df.cpu_utime_ns) / window_ns)\n return df.drop(['cpu_ktime_ns', 'cpu_utime_ns'])\n \n\n''' Gets a list of nodes in the current cluster since `start_time`.\nArgs:\n@start_time Start time of the data to examine.\n'''\ndf = px.DataFrame(table='process_stats', start_time=$__from)\ndf.node = df.ctx['node_name']\ndf.pod = df.ctx['pod_name']\nagg = df.groupby(['node', 'pod']).agg()\nnodes = agg.groupby('node').agg(pod_count=('pod', px.count))\nprocess_stats = process_stats_by_entity($__from, 'node')\noutput = process_stats.merge(nodes, how='inner', left_on='node', right_on='node',\n suffixes=['', '_x'])\npx.display(output[[$__columns]])",
   "columnNames": ["node", "cpu_usage", "pod_count"],
   "isColDisplay": true
diff --git a/src/pxl_scripts/outbound-connections-node-graph.json b/src/pxl_scripts/outbound-connections-node-graph.json
index 4db4739..0b7f73b 100644
--- a/src/pxl_scripts/outbound-connections-node-graph.json
+++ b/src/pxl_scripts/outbound-connections-node-graph.json
@@ -1,5 +1,5 @@
 {
   "name": "Outbound Connections in Cluster",
-  "description": "This query outputs a graph of the inbound connections to your cluster (connections made from external IPs).",
+  "description": "Use with the Node Graph visualization. This query outputs outbound connections from your cluster (connections made to external IPs).",
   "script": "# Copyright 2018- The Pixie Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n# SPDX-License-Identifier: Apache-2.0\n\n'''\nThis query outputs a graph of the inbound connections to your cluster\n(connections made from external IPs). Use this query with Grafana's Node Graph panel.\n\nThis query is for use with Grafana's Pixie Datasource Plugin only,\nas it uses Grafana macros for adding Grafana dashboard context.\nThis query is a modification of the px/outbound_conns script:\nhttps://github.com/pixie-io/pixie/tree/main/src/pxl_scripts/px/outbound_conns\n'''\n\n# $pixieCluster - work around to update the panel if this dashboard variable is present\n\n# Import Pixie's module for querying data.\nimport px\n\n\ndef outbound_conns():\n df = px.DataFrame(table='conn_stats', start_time=__time_from)\n\n df.namespace = df.ctx['namespace']\n df.service = df.ctx['service']\n df.node = df.ctx['node']\n df.pod = df.ctx['pod']\n\n # # Add optional filters:\n # Filter IP address of outbound reciever.\n # df = df[df.remote_addr == '10.38.0.15']\n # # Filter namespace, service, node or pod name of sending pod.\n # # Pixie formats service and pod names in the / format.\n # df = df[df.service == 'px-sock-shop/front-end']\n # df = df[df.pod == '']\n # df = df[df.node == '']\n # df = df[df.namespace == '']\n\n # Filter for outbound traffic only.\n # Trace-role of 1 means client-side tracing. Pixie only traces\n # on the client side when traffic is leaving the cluster.\n df = df[df.trace_role == 1]\n\n # Filter out any connections from known pods.\n df.remote_pod_id = px.ip_to_pod_id(df.remote_addr)\n df.remote_service_id = px.ip_to_service_id(df.remote_addr)\n df = df[df.remote_pod_id == '' and df.remote_service_id == '']\n\n # Filter out connections from localhost.\n df = df[not df.remote_addr == '127.0.0.1']\n df = df[not df.remote_addr == '0.0.0.0']\n\n # Calculate connection stats for each process for each unique pod / remote_addr pair.\n df = df.groupby(['pod', 'upid', 'remote_addr', 'remote_port']).agg(\n # The fields below are counters per UPID, so we take\n # the min (starting value) and the max (ending value) to subtract them.\n conn_open_min=('conn_open', px.min),\n conn_open_max=('conn_open', px.max),\n bytes_sent_min=('bytes_sent', px.min),\n bytes_sent_max=('bytes_sent', px.max),\n bytes_recv_min=('bytes_recv', px.min),\n bytes_recv_max=('bytes_recv', px.max),\n )\n\n # Calculate connection stats over the time window.\n df.conn_open = df.conn_open_max - df.conn_open_min\n df.bytes_sent = df.bytes_sent_max - df.bytes_sent_min\n df.bytes_recv = df.bytes_recv_max - df.bytes_recv_min\n\n # Calculate connection stats for each unique pod / remote_addr pair. Since there\n # may be multiple processes per pod we perform an additional aggregation to\n # consolidate those into one entry.\n df = df.groupby(['pod', 'remote_addr', 'remote_port']).agg(\n connections_open=('conn_open', px.sum),\n bytes_sent=('bytes_sent', px.sum),\n bytes_recv=('bytes_recv', px.sum),\n )\n\n df.kbytes_sent = df.bytes_sent / 1000\n df.kbytes_recv = df.bytes_recv / 1000\n df.kbytes_total = df.kbytes_sent + df.kbytes_recv\n\n # Resolve remote addresses to public domain.\n df.domain = px.nslookup(df.remote_addr)\n df.domain = px.select(df.domain == df.remote_addr, '', df.domain)\n\n return df[['pod', 'remote_addr', 'remote_port', 'domain', 'connections_open', 'kbytes_sent',\n 'kbytes_recv', 'kbytes_total']]\n\n\n# Construct the nodes table for the Node Graph panel.\n# https://grafana.com/docs/grafana/next/visualizations/node-graph/#node-parameters\ndef nodes():\n df1 = outbound_conns()\n df1.id = df1.pod\n df1.title = df1.pod\n df1 = df1.groupby(['id', 'title']).agg()\n df2 = outbound_conns()\n df2.id = df2.remote_addr\n df2.title = df2.remote_addr\n df2 = df2.groupby(['id', 'title']).agg()\n return df1.append(df2)\n\n\n# Construct the edges table for the Node Graph panel.\n# https://grafana.com/docs/grafana/next/visualizations/node-graph/#edge-parameters\ndef edges():\n df = outbound_conns()\n df.source = df.pod\n df.target = df.remote_addr\n df.id = df.source + '-' + df.target\n df.mainStat = df.kbytes_sent\n df.secondaryStat = df.kbytes_recv\n return df[['id', 'source', 'target', 'mainStat', 'secondaryStat']]\n\n\n# Display the tables.\nnodes = nodes()\npx.display(nodes, \"nodes\")\n\nedges = edges()\npx.display(edges, \"edges\")"
 }
diff --git a/src/pxl_scripts/pods-metrics.json b/src/pxl_scripts/pods-metrics.json
index 31bed81..0eaa051 100644
--- a/src/pxl_scripts/pods-metrics.json
+++ b/src/pxl_scripts/pods-metrics.json
@@ -1,6 +1,6 @@
 {
   "name": "Pod Metrics",
-  "description": "Displays metrics about each pod in the cluster",
+  "description": "Use with the Table visualization. Displays metrics about each pod in the cluster.",
   "script": "# $pixieCluster - work around to update the panel if this dashboard variable is present\n\nimport px\n\ndef process_stats_by_entity(start_time: int, entity: str):\n ''' Gets the windowed process stats (CPU, memory, etc) per node or pod.\n Args:\n @start_time Starting time of the data to examine.\n @entity: Either pod or node_name.\n '''\n # Window size to use on time_ column for bucketing.\n ns_per_s = 1000 * 1000 * 1000\n window_ns = px.DurationNanos(10 * ns_per_s)\n\n df = px.DataFrame(table='process_stats', start_time=start_time)\n df[entity] = df.ctx[entity]\n df.timestamp = px.bin(df.time_, window_ns)\n # First calculate CPU usage by process (UPID) in each k8s_object\n # over all windows.\n df = df.groupby([entity, 'upid', 'timestamp']).agg(\n rss=('rss_bytes', px.mean),\n vsize=('vsize_bytes', px.mean),\n # The fields below are counters, so we take the min and the max to subtract them.\n cpu_utime_ns_max=('cpu_utime_ns', px.max),\n cpu_utime_ns_min=('cpu_utime_ns', px.min),\n cpu_ktime_ns_max=('cpu_ktime_ns', px.max),\n cpu_ktime_ns_min=('cpu_ktime_ns', px.min),\n read_bytes_max=('read_bytes', px.max),\n read_bytes_min=('read_bytes', px.min),\n write_bytes_max=('write_bytes', px.max),\n write_bytes_min=('write_bytes', px.min),\n rchar_bytes_max=('rchar_bytes', px.max),\n rchar_bytes_min=('rchar_bytes', px.min),\n wchar_bytes_max=('wchar_bytes', px.max),\n wchar_bytes_min=('wchar_bytes', px.min),\n )\n # Next calculate cpu usage and memory stats per window.\n df.cpu_utime_ns = df.cpu_utime_ns_max - df.cpu_utime_ns_min\n df.cpu_ktime_ns = df.cpu_ktime_ns_max - df.cpu_ktime_ns_min\n df.read_bytes = df.read_bytes_max - df.read_bytes_min\n df.write_bytes = df.write_bytes_max - df.write_bytes_min\n df.rchar_bytes = df.rchar_bytes_max - df.rchar_bytes_min\n df.wchar_bytes = df.wchar_bytes_max - df.wchar_bytes_min\n # Sum by UPID.\n df = df.groupby([entity, 'timestamp']).agg(\n cpu_ktime_ns=('cpu_ktime_ns', px.sum),\n cpu_utime_ns=('cpu_utime_ns', px.sum),\n read_bytes=('read_bytes', px.sum),\n write_bytes=('write_bytes', px.sum),\n rchar_bytes=('rchar_bytes', px.sum),\n wchar_bytes=('wchar_bytes', px.sum),\n rss=('rss', px.sum),\n vsize=('vsize', px.sum),\n )\n df.actual_disk_read_throughput = df.read_bytes / window_ns\n df.actual_disk_write_throughput = df.write_bytes / window_ns\n df.total_disk_read_throughput = df.rchar_bytes / window_ns\n df.total_disk_write_throughput = df.wchar_bytes / window_ns\n # Now take the mean value over the various timestamps.\n df = df.groupby(entity).agg(\n cpu_ktime_ns=('cpu_ktime_ns', px.mean),\n cpu_utime_ns=('cpu_utime_ns', px.mean),\n actual_disk_read_throughput=('actual_disk_read_throughput', px.mean),\n actual_disk_write_throughput=('actual_disk_write_throughput', px.mean),\n total_disk_read_throughput=('total_disk_read_throughput', px.mean),\n total_disk_write_throughput=('total_disk_write_throughput', px.mean),\n avg_rss=('rss', px.mean),\n avg_vsize=('vsize', px.mean),\n )\n # Finally, calculate total (kernel + user time) percentage used over window.\n df.cpu_usage = px.Percent((df.cpu_ktime_ns + df.cpu_utime_ns) / window_ns)\n return df.drop(['cpu_ktime_ns', 'cpu_utime_ns'])\n\n''' A list of pods in `namespace`.\nArgs:\n@start_time: The timestamp of data to start at.\n@namespace: The name of the namespace to filter on.\n'''\ndf = px.DataFrame(table='process_stats', start_time=$__from)\ndf.pod = df.ctx['pod_name']\ndf.node = df.ctx['node_name']\ndf.container = df.ctx['container_name']\ndf = df.groupby(['pod', 'node', 'container']).agg()\ndf = df.groupby(['pod', 'node']).agg(container_count=('container', px.count))\ndf.start_time = px.pod_name_to_start_time(df.pod)\ndf.status = px.pod_name_to_status(df.pod)\n\n\nprocess_stats = process_stats_by_entity($__from, 'pod')\noutput = process_stats.merge(df, how='inner', left_on='pod', right_on='pod',\n suffixes=['', '_x'])\n\n\ndf=output[[$__columns]]\npx.display(df)",
   "columnNames": ["pod", "cpu_usage", "total_disk_read_throughput", "total_disk_write_throughput", "container_count", "node", "start_time", "status"],
   "groupByColumns": ["pod", "cpu_usage", "total_disk_read_throughput", "total_disk_write_throughput", "container_count", "node", "start_time", "status"],
diff --git a/src/pxl_scripts/service-metrics.json b/src/pxl_scripts/service-metrics.json
index 6edee7c..7bb5c9c 100644
--- a/src/pxl_scripts/service-metrics.json
+++ b/src/pxl_scripts/service-metrics.json
@@ -1,6 +1,6 @@
 {
   "name": "Service Metrics",
-  "description": "Displays metrics about each service in the cluster",
+  "description": "Use with the Table visualization. Displays metrics about each service in the cluster.",
   "script": "# $pixieCluster - work around to update the panel if this dashboard variable is present\n\nimport px\n\nfilter_health_checks = True\n\ndef get_time_window(start_time: int):\n ''' Converts the start_time string into a table with a single column and single row.\n The approach is hacky, and will round to roughly 1 second.\n '''\n df = px.DataFrame('process_stats', start_time=start_time)\n\n df = df.agg(\n time_min=('time_', px.min),\n time_max=('time_', px.max),\n )\n\n df.window = px.DurationNanos(df.time_max - df.time_min)\n df = df[['window']]\n\n return df\n \ndef add_time_window_column(df, start_time):\n tw = get_time_window(start_time)\n df = df.merge(tw, how='inner', left_on=[], right_on=[])\n return df\n \ndef http_stats(start_time: int):\n ''' Get a dataframe of HTTP events.\n Certain traffic (like health checks) are auto removed, and some standard fields are added.\n Args:\n @start_time: The timestamp of data to start at.\n '''\n df = px.DataFrame(table='http_events', start_time=start_time)\n\n # Add K8s metadata.\n df.service = df.ctx['service']\n df.pod = df.ctx['pod']\n\n # Filter out non-k8s entities.\n df = df[df.pod != '']\n\n # Additional HTTP fields, pre-computed for convenience.\n df.failure = df.resp_status >= 400\n\n # Remove health checks, and anything with no remote address.\n health_check_req = ((df.req_path == '/healthz' or df.req_path == '/readyz') or df.req_path == '/livez')\n filter_out_conds = (health_check_req and filter_health_checks) or (df['remote_addr'] == '-')\n df = df[not filter_out_conds]\n\n return df\n \ndef conn_stats(start_time: int):\n ''' Get a dataframe of connection stats.\n For each client-server pair, the resulting data frame has the bytes sent and received.\n Args:\n @start_time: The timestamp of data to start at.\n '''\n df = px.DataFrame(table='conn_stats', start_time=start_time)\n\n df.pod = df.ctx['pod']\n df.service = df.ctx['service']\n\n df = df[df.service != '']\n\n # Find min/max bytes transferred over the selected time window per pod.\n df = df.groupby(['upid', 'remote_addr', 'remote_port', 'pod', 'service', 'trace_role']).agg(\n bytes_recv_min=('bytes_recv', px.min),\n bytes_recv_max=('bytes_recv', px.max),\n bytes_sent_min=('bytes_sent', px.min),\n bytes_sent_max=('bytes_sent', px.max),\n )\n\n # Calculate bytes transferred over the time window\n df.bytes_sent = df.bytes_sent_max - df.bytes_sent_min\n df.bytes_recv = df.bytes_recv_max - df.bytes_recv_min\n df = df.drop(['bytes_recv_min', 'bytes_recv_max', 'bytes_sent_min', 'bytes_sent_max'])\n\n return df\n \ndef http_stats_by_service(start_time: int):\n ''' Get a data frame of HTTP stats per service. The HTTP stats are for inbound traffic,\n and includes HTTP request count, error count and latency quantiles.\n Args:\n @start_time: The timestamp of data to start at.\n '''\n df = http_stats(start_time)\n\n # Filter only to inbound service traffic (server-side).\n # Don't include traffic initiated by this service to an external location.\n df = df[df.trace_role == 2]\n\n # Compute HTTP metrics.\n df = df.groupby(['service']).agg(\n http_req_count_in=('latency', px.count),\n http_error_count_in=('failure', px.sum),\n http_latency_in=('latency', px.quantiles)\n )\n\n return df\n \ndef conn_stats_by_service(start_time: int):\n ''' Get a dataframe of connection stats aggregated by service.\n For each service, the resulting data frame contains rx/tx stats for server-side and client-side connections.\n Args:\n @start_time: The timestamp of data to start at.\n '''\n df = conn_stats(start_time)\n\n # Group by service and trace role.\n # Do this after computing bytes sent/received by conn_stats key ({upid, remote_addr, remote_port}).\n # Keeping trace_role allows us to see which traffic was part of server duties vs client duties.\n df = df.groupby(['service', 'trace_role']).agg(\n bytes_recv=('bytes_recv', px.sum),\n bytes_sent=('bytes_sent', px.sum),\n )\n\n # Get RX/TX stats for the server side connections.\n server_df = df[df.trace_role == 2]\n server_df.rx_server = server_df.bytes_recv\n server_df.tx_server = server_df.bytes_sent\n server_df = server_df[['service', 'rx_server', 'tx_server']]\n\n # Get RX/TX stats for the client side connections.\n client_df = df[df.trace_role == 1]\n client_df.rx_client = client_df.bytes_recv\n client_df.tx_client = client_df.bytes_sent\n client_df = client_df[['service', 'rx_client', 'tx_client']]\n\n # Create a dataframe that contains both server-side and client-side RX/TX stats.\n df = server_df.merge(client_df,\n how='left',\n left_on='service',\n right_on='service',\n suffixes=['', '_x'])\n df = df['service', 'rx_server', 'tx_server', 'rx_client', 'tx_client']\n\n return df\n \ndef service_let_summary(start_time: int):\n ''' Compute a summary of traffic by requesting service, for requests\n on services in the current cluster..\n Args:\n @start_time: The timestamp of data to start at.\n '''\n conn_stats_df = conn_stats_by_service(start_time)\n http_stats_df = http_stats_by_service(start_time)\n\n # Merge conn_stats_df and http_stats_df.\n df = conn_stats_df.merge(http_stats_df,\n how='left',\n left_on='service',\n right_on='service',\n suffixes=['', '_x'])\n\n # Compute time window for the query and add it as a column.\n df = add_time_window_column(df, start_time)\n\n # Compute throughput values.\n df.http_req_throughput_in = df.http_req_count_in / df.window\n df.http_error_rate_in = px.Percent(\n px.select(df.http_req_count_in != 0, df.http_error_count_in / df.http_req_count_in, 0.0))\n df.inbound_conns = (df.rx_server + df.tx_server) / df.window\n df.outbound_conns = (df.tx_client + df.rx_client) / df.window\n\n return df[['service', 'http_latency_in', 'http_req_throughput_in', 'http_error_rate_in',\n 'inbound_conns', 'outbound_conns']]\n \n''' Compute a summary of traffic by requesting service, for requests\n on services in the current cluster..\nArgs:\n@start_time: The timestamp of data to start at.\n'''\nconn_stats_df = conn_stats_by_service(start_time=$__from)\nhttp_stats_df = http_stats_by_service($__from)\n\n# Merge conn_stats_df and http_stats_df.\ndf = conn_stats_df.merge(http_stats_df,\n how='left',\n left_on='service',\n right_on='service',\n suffixes=['', '_x'])\n\n# Compute time window for the query and add it as a column.\ndf = add_time_window_column(df, $__from)\n\n# Compute throughput values.\ndf.http_req_throughput_in = df.http_req_count_in / df.window\ndf.http_error_rate_in = px.Percent(\n px.select(df.http_req_count_in != 0, df.http_error_count_in / df.http_req_count_in, 0.0))\ndf.inbound_conns = (df.rx_server + df.tx_server) / df.window\ndf.outbound_conns = (df.tx_client + df.rx_client) / df.window\n\ndf = df[[$__columns]]\npx.display(df)",
   "columnNames": ["service", "http_latency_in", "http_req_throughput_in", "http_error_rate_in","inbound_conns", "outbound_conns"],
   "groupByColumns": ["service", "http_latency_in", "http_req_throughput_in", "http_error_rate_in","inbound_conns", "outbound_conns"],