Commit bcea60cf authored by Kishore Battula
Browse files

Merge branch 'airflow-monitoring-alerts-fixes' into 'master'

Fixed bugs in the Airflow Monitoring and Alerts

See merge request !444
parents 0d6c5e2e 12ece23c
Pipeline #58609 passed with stages in 1 minute and 57 seconds
......@@ -17,8 +17,8 @@
"content": {
"settings": {
"content": "# Airflow Dag Metrics",
"title": "",
"subtitle": ""
"subtitle": "",
"title": ""
}
}
}
......@@ -75,7 +75,7 @@
},
{
"name": "Query",
"value": "customMetrics\n| where name has \"dag_processing.last_runtime\" \n| parse kind=regex name with @\"([0-9a-zA-Z_])*\\.\" dataPartitionId @\"\\.dag_processing\\.last_runtime\\.([0-9a-zA-Z_])*\"\n| parse kind=regex name with @\"([0-9a-zA-Z_\\.])*\\.dag_processing\\.last_runtime\\.\" dagName\n| summarize DagProcessingTime = max(value) by bin(timestamp, 10m), MetricName = \"dag_processing.last_runtime\", dataPartitionId, dagName\n| render timechart \n",
"value": "customMetrics\n| where name has \"dag_processing.last_runtime\" \n| parse kind=regex name with @\"([0-9a-zA-Z_])*\\.\" clusterName @\"\\.dag_processing\\.last_runtime\\.([0-9a-zA-Z_])*\"\n| parse kind=regex name with @\"([0-9a-zA-Z_\\.])*\\.dag_processing\\.last_runtime\\.\" dagName\n| summarize DagProcessingTime = max(value) by timestamp, MetricName = \"dag_processing.last_runtime\", clusterName, dagName\n| render timechart \n",
"isOptional": true
},
{
......@@ -101,6 +101,13 @@
{
"name": "Dimensions",
"value": {
"aggregation": "Sum",
"splitBy": [
{
"name": "MetricName",
"type": "string"
}
],
"xAxis": {
"name": "timestamp",
"type": "datetime"
......@@ -110,14 +117,7 @@
"name": "DagProcessingTime",
"type": "real"
}
],
"splitBy": [
{
"name": "MetricName",
"type": "string"
}
],
"aggregation": "Sum"
]
},
"isOptional": true
},
......@@ -138,13 +138,28 @@
"type": "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart",
"settings": {
"content": {
"Query": "customMetrics\n| where name has \"dag_processing.last_runtime\" \n| parse kind=regex name with @\"([0-9a-zA-Z_])*\\.\" partitionId @\"\\.dag_processing\\.last_runtime\\.([0-9a-zA-Z_])*\"\n| extend dataPartitionId = case(partitionId == \"\", \"common-cluster\",\n partitionId)\n| parse kind=regex name with @\"([0-9a-zA-Z_\\.])*\\.dag_processing\\.last_runtime\\.\" dagName\n| summarize DagProcessingTime = max(value) by bin(timestamp, 10m), MetricName = \"dag_processing.last_runtime\", dataPartitionId, dagName\n| render timechart \n\n",
"PartTitle": "Time taken for processing Dag File"
"Dimensions": {
"aggregation": "Sum",
"splitBy": [
{
"name": "dagName",
"type": "string"
}
],
"xAxis": {
"name": "timestamp",
"type": "datetime"
},
"yAxis": [
{
"name": "DagProcessingTime",
"type": "real"
}
]
},
"PartTitle": "Time taken for processing Dag File",
"Query": "customMetrics\n| where name has \"dag_processing.last_runtime\" \n| parse kind=regex name with @\"([0-9a-zA-Z_])*\\.\" partitionId @\"\\.dag_processing\\.last_runtime\\.([0-9a-zA-Z_])*\"\n| extend clusterName = case(partitionId == \"\", \"common-cluster\",\n partitionId)\n| parse kind=regex name with @\"([0-9a-zA-Z_\\.])*\\.dag_processing\\.last_runtime\\.\" dagName\n| summarize DagProcessingTime = max(value) by timestamp, dagName, MetricName = \"dag_processing.last_runtime\", clusterName\n| render timechart \n\n"
}
},
"savedContainerState": {
"partTitle": "Time taken for processing Dag File",
"assetName": "${centralGroupPrefix}-ai"
}
}
},
......@@ -199,7 +214,7 @@
},
{
"name": "Query",
"value": "customMetrics\n| where name has \"dagrun.duration.success\" or name has \"dagrun.duration.failed\"\n| parse kind=regex name with @\"([0-9a-zA-Z_])*\\.\" dataPartitionId @\"\\.dagrun\\.duration\\.([0-9a-zA-Z_\\.])*\"\n| parse kind=regex name with @\"([0-9a-zA-Z_])*\\.dagrun\\.duration\\.([0-9a-zA-Z_])*\\.\" dagName\n| extend duration = value/1000\n| summarize DagrunTime = max(duration) by bin(timestamp, 10m), MetricName = \"dagrun.duration\", dataPartitionId, dagName\n| render timechart \n\n",
"value": "customMetrics\n| where name has \"dagrun.duration.success\" or name has \"dagrun.duration.failed\"\n| parse kind=regex name with @\"([0-9a-zA-Z_])*\\.\" clusterName @\"\\.dagrun\\.duration\\.([0-9a-zA-Z_\\.])*\"\n| parse kind=regex name with @\"([0-9a-zA-Z_])*\\.dagrun\\.duration\\.([0-9a-zA-Z_])*\\.\" dagName\n| extend duration = value/1000\n| summarize DagrunTime = max(duration) by timestamp, MetricName = \"dagrun.duration\", clusterName, dagName\n| render timechart \n\n",
"isOptional": true
},
{
......@@ -225,6 +240,13 @@
{
"name": "Dimensions",
"value": {
"aggregation": "Sum",
"splitBy": [
{
"name": "MetricName",
"type": "string"
}
],
"xAxis": {
"name": "timestamp",
"type": "datetime"
......@@ -234,14 +256,7 @@
"name": "DagrunTime",
"type": "real"
}
],
"splitBy": [
{
"name": "MetricName",
"type": "string"
}
],
"aggregation": "Sum"
]
},
"isOptional": true
},
......@@ -262,14 +277,30 @@
"type": "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart",
"settings": {
"content": {
"Query": "customMetrics\n| where name has \"dagrun.duration.success\" or name has \"dagrun.duration.failed\"\n| parse kind=regex name with @\"([0-9a-zA-Z_])*\\.\" partitionId @\"\\.dagrun\\.duration\\.([0-9a-zA-Z_\\.])*\"\n| extend dataPartitionId = case(partitionId == \"\", \"common-cluster\",\n partitionId)\n| parse kind=regex name with @\"([0-9a-zA-Z_])*\\.dagrun\\.duration\\.([0-9a-zA-Z_])*\\.\" dagName\n| extend duration = value/1000\n| summarize DagrunTime = max(duration) by bin(timestamp, 10m), MetricName = \"dagrun.duration\", dataPartitionId, dagName\n| render timechart \n\n",
"Query": "customMetrics\n| where name has \"dagrun.duration.success\" or name has \"dagrun.duration.failed\"\n| parse kind=regex name with @\"([0-9a-zA-Z_])*\\.\" partitionId @\"\\.dagrun\\.duration\\.([0-9a-zA-Z_\\.])*\"\n| extend clusterName = case(partitionId == \"\", \"common-cluster\",\n partitionId)\n| parse kind=regex name with @\"([0-9a-zA-Z_])*\\.dagrun\\.duration\\.([0-9a-zA-Z_])*\\.\" dagName\n| extend duration = value/1000\n| summarize DagrunTime = max(duration) by timestamp, dagName, MetricName = \"dagrun.duration\", clusterName\n| render columnchart \n\n",
"ControlType": "FrameControlChart",
"PartTitle": "Dagrun Duration"
"SpecificChart": "StackedColumn",
"PartTitle": "Dagrun Duration",
"Dimensions": {
"aggregation": "Sum",
"splitBy": [
{
"name": "dagName",
"type": "string"
}
],
"xAxis": {
"name": "timestamp",
"type": "datetime"
},
"yAxis": [
{
"name": "DagrunTime",
"type": "real"
}
]
}
}
},
"savedContainerState": {
"partTitle": "Dagrun Duration",
"assetName": "${centralGroupPrefix}-ai"
}
}
},
......@@ -324,7 +355,7 @@
},
{
"name": "Query",
"value": "customMetrics\n| where name matches regex @\"dag\\.([0-9a-zA-Z_])*\\.([0-9a-zA-Z_])*\\.duration\\z\" \n| parse kind=regex name with @\"([0-9a-zA-Z_])*\\.\" dataPartitionId @\"\\.dag\\.([0-9a-zA-Z_\\.])*\\.duration\"\n| parse kind=regex name with @\"([0-9a-zA-Z_])*\\.dag\\.\" dagIdTaskId @\"\\.duration\"\n| extend duration = value/1000, dagName = split(dagIdTaskId,\".\")[0]\n| summarize TaskRunDuration = max(duration) by bin(timestamp, 10m), MetricName = \"TaskRun Duration\", tostring(dagName), dataPartitionId\n| render timechart \n\n",
"value": "customMetrics\n| where name matches regex @\"dag\\.([0-9a-zA-Z_])*\\.([0-9a-zA-Z_])*\\.duration\\z\" \n| parse kind=regex name with @\"([0-9a-zA-Z_])*\\.\" clusterName @\"\\.dag\\.([0-9a-zA-Z_\\.])*\\.duration\"\n| parse kind=regex name with @\"([0-9a-zA-Z_])*\\.dag\\.\" dagIdTaskId @\"\\.duration\"\n| extend duration = value/1000, dagName = split(dagIdTaskId,\".\")[0]\n| summarize TaskRunDuration = max(duration) by timestamp, MetricName = \"TaskRun Duration\", tostring(dagName), clusterName\n| render timechart \n\n",
"isOptional": true
},
{
......@@ -350,6 +381,13 @@
{
"name": "Dimensions",
"value": {
"aggregation": "Sum",
"splitBy": [
{
"name": "MetricName",
"type": "string"
}
],
"xAxis": {
"name": "timestamp",
"type": "datetime"
......@@ -359,14 +397,7 @@
"name": "TaskRunDuration",
"type": "real"
}
],
"splitBy": [
{
"name": "MetricName",
"type": "string"
}
],
"aggregation": "Sum"
]
},
"isOptional": true
},
......@@ -387,14 +418,30 @@
"type": "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart",
"settings": {
"content": {
"Query": "customMetrics\n| where name matches regex @\"dag\\.([0-9a-zA-Z_])*\\.([0-9a-zA-Z_])*\\.duration\\z\" \n| parse kind=regex name with @\"([0-9a-zA-Z_])*\\.\" partitionId @\"\\.dag\\.([0-9a-zA-Z_\\.])*\\.duration\"\n| extend dataPartitionId = case(partitionId == \"\", \"common-cluster\",\n partitionId)\n| parse kind=regex name with @\"([0-9a-zA-Z_])*\\.dag\\.\" dagIdTaskId @\"\\.duration\"\n| extend duration = value/1000, dagName = split(dagIdTaskId,\".\")[0], taskId = split(dagIdTaskId,\".\")[1]\n| summarize TaskRunDuration = max(duration) by bin(timestamp, 10m), MetricName = \"TaskRun Duration\", tostring(dagName), dataPartitionId, tostring(taskId)\n| render timechart \n\n",
"Query": "customMetrics\n| where name matches regex @\"dag\\.([0-9a-zA-Z_])*\\.([0-9a-zA-Z_])*\\.duration\\z\" \n| parse kind=regex name with @\"([0-9a-zA-Z_])*\\.\" partitionId @\"\\.dag\\.([0-9a-zA-Z_\\.])*\\.duration\"\n| extend clusterName = case(partitionId == \"\", \"common-cluster\",\n partitionId)\n| parse kind=regex name with @\"([0-9a-zA-Z_])*\\.dag\\.\" dagIdTaskId @\"\\.duration\"\n| extend duration = value\n| summarize TaskRunDuration = max(duration) by timestamp, taskId = tostring(dagIdTaskId), clusterName, MetricName = \"TaskRun Duration\"\n| render columnchart \n\n",
"ControlType": "FrameControlChart",
"PartTitle": "TaskRun Duration"
"SpecificChart": "StackedColumn",
"PartTitle": "TaskRun Duration",
"Dimensions": {
"aggregation": "Sum",
"splitBy": [
{
"name": "taskId",
"type": "string"
}
],
"xAxis": {
"name": "timestamp",
"type": "datetime"
},
"yAxis": [
{
"name": "TaskRunDuration",
"type": "real"
}
]
}
}
},
"savedContainerState": {
"partTitle": "TaskRun Duration",
"assetName": "${centralGroupPrefix}-ai"
}
}
}
......@@ -428,24 +475,24 @@
"value": "Past 3 days"
},
"filteredPartIds": [
"StartboardPart-LogsDashboardPart-5f661255-7b67-43c0-9ef2-0d49e36dc7c9",
"StartboardPart-LogsDashboardPart-5f661255-7b67-43c0-9ef2-0d49e36dc7cb",
"StartboardPart-LogsDashboardPart-5f661255-7b67-43c0-9ef2-0d49e36dc7cd"
"StartboardPart-LogsDashboardPart-71118fe6-96f2-4bfe-bdff-41dde9ad700a",
"StartboardPart-LogsDashboardPart-71118fe6-96f2-4bfe-bdff-41dde9ad700c",
"StartboardPart-LogsDashboardPart-71118fe6-96f2-4bfe-bdff-41dde9ad700e"
]
},
"dynamicFilter_dataPartitionId": {
"dynamicFilter_clusterName": {
"model": {
"operator": "equals",
"values": []
},
"displayCache": {
"name": "dataPartitionId",
"name": "clusterName",
"value": "none"
},
"filteredPartIds": [
"StartboardPart-LogsDashboardPart-5f661255-7b67-43c0-9ef2-0d49e36dc7c9",
"StartboardPart-LogsDashboardPart-5f661255-7b67-43c0-9ef2-0d49e36dc7cb",
"StartboardPart-LogsDashboardPart-5f661255-7b67-43c0-9ef2-0d49e36dc7cd"
"StartboardPart-LogsDashboardPart-c875862f-6b48-4cb6-bc31-930249a611f2",
"StartboardPart-LogsDashboardPart-c875862f-6b48-4cb6-bc31-930249a611f4",
"StartboardPart-LogsDashboardPart-c875862f-6b48-4cb6-bc31-930249a611f6"
]
},
"dynamicFilter_dagName": {
......@@ -458,9 +505,8 @@
"value": "none"
},
"filteredPartIds": [
"StartboardPart-LogsDashboardPart-5f661255-7b67-43c0-9ef2-0d49e36dc7c9",
"StartboardPart-LogsDashboardPart-5f661255-7b67-43c0-9ef2-0d49e36dc7cb",
"StartboardPart-LogsDashboardPart-5f661255-7b67-43c0-9ef2-0d49e36dc7cd"
"StartboardPart-LogsDashboardPart-c875862f-6b48-4cb6-bc31-930249a611f2",
"StartboardPart-LogsDashboardPart-c875862f-6b48-4cb6-bc31-930249a611f4"
]
},
"dynamicFilter_taskId": {
......@@ -473,7 +519,7 @@
"value": "none"
},
"filteredPartIds": [
"StartboardPart-LogsDashboardPart-5f661255-7b67-43c0-9ef2-0d49e36dc7cd"
"StartboardPart-LogsDashboardPart-c875862f-6b48-4cb6-bc31-930249a611f6"
]
}
}
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment