How to split the App Insights table data to multiple datasets - azure-application-insights

I have an Application Insights table with around a million rows and want to split the data equally into 4 datasets/queries and then merge them all in the report. I have an insertDate column that can be used to partition the data into different datasets without overlapping data between the queries. Using insertDate >= ago(180d) and insertDate < ago(180d) gives me only 2 partitions of this data. How can I split the data equally into, say, 5 datasets, meaning 5 queries each of which returns roughly 2 months of data?
Note: I don't know the underlying dates, so I cannot hard-code dates with something like the between operator. This is sample data; some months have more data than others, but I need to split the data equally to stay under the App Insights REST API's maximum row limit of 500K rows.
Context: the Application Insights REST API connector to Power BI does not support more than 500K rows, so the above is one workaround I am considering.
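For reference, the two-way split mentioned above amounts to two queries with complementary half-open filters over the same table, e.g. (a sketch against the sample table T defined below):

// dataset 1: everything older than 180 days
T
| where insertDate < ago(180d)

// dataset 2: everything from the last 180 days
T
| where insertDate >= ago(180d)

The goal of the answer below is to compute more than two such boundaries dynamically, so that each of the resulting queries returns roughly the same number of rows.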
let T = datatable(release:string, insertDate:datetime )
[
"2205", datetime(2022-01-05),
"2205", datetime(2022-01-10),
"2205", datetime(2022-02-10),
"2204", datetime(2022-03-25),
"2203", datetime(2022-04-15),
"2205", datetime(2022-04-12),
"2205", datetime(2022-05-12),
"2206", datetime(2022-06-23),
"2207", datetime(2022-07-27),
"2209", datetime(2022-08-12),
"2201", datetime(2022-09-26),
"2201", datetime(2022-10-08),
];
T
| take 100

let T = datatable(release:string, insertDate:datetime )
[
"2205", datetime(2022-01-05),
"2205", datetime(2022-01-10),
"2205", datetime(2022-02-10),
"2204", datetime(2022-03-25),
"2203", datetime(2022-04-15),
"2205", datetime(2022-04-12),
"2205", datetime(2022-05-12),
"2206", datetime(2022-06-23),
"2207", datetime(2022-07-27),
"2209", datetime(2022-08-12),
"2201", datetime(2022-09-26),
"2201", datetime(2022-10-08),
];
let groups = 5; // number of partitions/datasets
let total_rows = toscalar(T | count);
let group_rows = 1.0 * total_rows / groups; // target number of rows per partition
// Row positions (after sorting by insertDate) at which to cut the data
let edge_rows = toscalar
(
range x from group_rows to total_rows - group_rows step group_rows
| extend x = round(x)
| summarize make_list(x)
);
// Take the insertDate values at those positions as boundaries, add a minimal sentinel date,
// and pair each boundary with the next one to form half-open [fromDate, toDate) ranges
union (print insertDate = datetime(0001))
,(T
| order by insertDate asc
| where row_number() in (edge_rows)
| project insertDate
)
| order by insertDate asc
| project fromDate = insertDate
,toDate = next(insertDate, 1, datetime(3000))
fromDate | toDate
0001-01-01T00:00:00Z | 2022-01-10T00:00:00Z
2022-01-10T00:00:00Z | 2022-04-12T00:00:00Z
2022-04-12T00:00:00Z | 2022-05-12T00:00:00Z
2022-05-12T00:00:00Z | 2022-08-12T00:00:00Z
2022-08-12T00:00:00Z | 3000-01-01T00:00:00Z
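Each of the 5 Power BI datasets can then filter the source table with one of these half-open [fromDate, toDate) ranges, for example (boundary values copied from the output above, using the sample table T):

T
| where insertDate >= datetime(2022-01-10) and insertDate < datetime(2022-04-12)

Because the ranges are adjacent and half-open, the 5 queries cover every row exactly once and can simply be appended back together in the report.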

Related

Filter columns by dashboard multi-select parameter

I'm trying to render a time series, but I have too many columns to show by default. To remedy this I figured I would present the user with a multi-select of all the columns and downselect the columns I render to that list, but I can't for the life of me figure out or find an answer on how to do it.
I have data with, say, columns Time, X1, X2, ... X120, and a multi-select parameter _columns whose values come from that table | getschema | project ColumnName | where ColumnName != "Time". I want to project to Time and the contents of _columns.
I can only find how to filter rows based on some column's value vs the multi-select. I feel like I'm missing something very simple.
Updated
There is also a simple solution for data that looks something like that:
This kind of data might be created by a make-series operator with multiple aggregation functions, e.g. -
make-series series_001 = count(), series_002 = min(x), series_003 = sum(x), series_004 = avg(x), series_005 = countif(type == 1), series_006 = countif(subtype == 123) on Timestamp from ago(7d) to now() step 1d
// Data sample generation, including series creation.
// Not part of the solution.
let p_series_num = 100;
let data = materialize
(
range i from 1 to p_series_num step 1
| project series_name = strcat("series_", substring(strcat("00", i), -3))
| mv-apply range(1, 7, 1) on (summarize make_list(rand()))
| evaluate pivot(series_name, take_any(list_))
| extend Timestamp = range(now() - 6d, now(), 1d)
| project-reorder Timestamp, * granny-asc
);
// Solution starts here
// We assume the creation of a parameter named _series, in the dashboard
// Uncomment the following line when executed outside the context of the dashboard
let ['_series'] = 'series_001';
data
| project Timestamp, column_ifexists(['_series'], real(null))
| render timechart
Timestamp | series_001
["2022-10-08T15:59:51.4634127Z","2022-10-09T15:59:51.4634127Z","2022-10-10T15:59:51.4634127Z","2022-10-11T15:59:51.4634127Z","2022-10-12T15:59:51.4634127Z","2022-10-13T15:59:51.4634127Z","2022-10-14T15:59:51.4634127Z"] | ["0.35039128090096505","0.79027849410190631","0.023939659111657484","0.14207071795033441","0.91242141133745414","0.33016368441829869","0.50674771943297525"]
This solution supports multi-selection.
The original data looks something like that:
// Data sample generation. Not part of the solution.
let p_start_time = startofday(ago(1d));
let p_interval = 5m;
let p_rows = 15;
let p_cols = 120;
let data = materialize
(
range Timestamp from p_start_time to p_start_time + p_rows * p_interval step p_interval
| mv-expand MetricID = range(1, p_cols) to typeof(int)
| extend MetricVal = rand(), MetricName = strcat("x", tostring(MetricID))
| evaluate pivot(MetricName, take_any(MetricVal), Timestamp)
| project-reorder Timestamp, * granny-asc
);
// Solution starts here
// We assume the creation of a parameter named _MetricName, in the dashboard
// Uncomment the following line when executed outside the context of the dashboard
let ['_series'] = dynamic(['x1', 'x3', 'x7', 'x100', 'x120']);
data
| project Timestamp, pa = pack_all()
| project Timestamp, cols = bag_remove_keys(pa, set_difference(bag_keys(pa), _series))
| evaluate bag_unpack(cols)
| render timechart
Timestamp | x1 | x100 | x120 | x3 | x7
2022-10-20T00:00:00Z | 0.40517703772298719 | 0.86952520047109094 | 0.67458442932790974 | 0.20662741864260561 | 0.19230161743580523
2022-10-20T00:05:00Z | 0.098438188653858671 | 0.14095230636982198 | 0.10269711129443576 | 0.99361020447683746 | 0.093624077251808144
2022-10-20T00:10:00Z | 0.3779198279036311 | 0.095647188329308852 | 0.38967218915903867 | 0.62601873006422182 | 0.18486009896828509
2022-10-20T00:15:00Z | 0.141551736845493 | 0.64623737123678782 | ... etc.
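The key step in the solution above is the pack_all() / bag_keys() / set_difference() / bag_remove_keys() combination, which drops every key that is not in the selection. A tiny standalone illustration with hypothetical values:

print pa = dynamic({"Time": 1, "x1": 2, "x2": 3}), keep = dynamic(["x1"])
| project kept = bag_remove_keys(pa, set_difference(bag_keys(pa), keep))
// kept == {"x1": 2}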
It might be very simple if our tabular data (after the series has been created) looks something like this:
This kind of data might be created by a make-series operator with a by clause, e.g. -
make-series count() on Timestamp from ago(7d) to now() step 1d by series_name
In that case, all we need to do is add a filter on the series name, E.g. -
// Data sample generation, including series creation.
// Not part of the solution.
let p_series_num = 100;
let data = materialize
(
range i from 1 to 1000000 step 1
| extend Timestamp = ago(rand()*7d)
,series_name = strcat("series_", substring(strcat("00", tostring(toint(rand(p_series_num)))), -3))
| make-series count() on Timestamp from ago(7d) to now() step 1d by series_name
);
// Solution starts here
// We assume the creation of a parameter named _series, in the dashboard
// Uncomment the following line when executed outside the context of the dashboard
let ['_series'] = 'series_001';
data
| where series_name == _series
| render timechart
series_name | count_ | Timestamp
series_001 | [1434,1439,1430,1428,1422,1372,1475] | ["2022-10-07T15:54:57.3677580Z","2022-10-08T15:54:57.3677580Z","2022-10-09T15:54:57.3677580Z","2022-10-10T15:54:57.3677580Z","2022-10-11T15:54:57.3677580Z","2022-10-12T15:54:57.3677580Z","2022-10-13T15:54:57.3677580Z"]
Here is a solution that matches the data structure in your scenario.
* It is the same as the other solution I posted, just modified for the different source data structure; I posted it as an additional answer for learning purposes.
The original data looks something like that:
The code is actually very simple, leveraging column_ifexists():
// Data sample generation. Not part of the solution.
let p_start_time = startofday(ago(1d));
let p_interval = 5m;
let p_rows = 15;
let p_cols = 120;
let data = materialize
(
range Timestamp from p_start_time to p_start_time + p_rows * p_interval step p_interval
| mv-expand MetricID = range(1, p_cols) to typeof(int)
| extend MetricVal = rand(), MetricName = strcat("x", tostring(MetricID))
| evaluate pivot(MetricName, take_any(MetricVal), Timestamp)
| project-reorder Timestamp, * granny-asc
);
// Solution starts here
// We assume the creation of a parameter named _MetricName, in the dashboard
// Uncomment the following line when executed outside the context of the dashboard
let ['_MetricName'] = "x42";
data
| project Timestamp, column_ifexists(['_MetricName'], real(null))
| render timechart
Timestamp | x42
2022-10-13T00:00:00Z | 0.89472385054721115
2022-10-13T00:05:00Z | 0.11275174098360444
2022-10-13T00:10:00Z | 0.96233152692333268
2022-10-13T00:15:00Z | 0.21751913633816042
2022-10-13T00:20:00Z | 0.69591667527850931
2022-10-13T00:25:00Z | 0.36802228024058203
2022-10-13T00:30:00Z | 0.29060518653083045
2022-10-13T00:35:00Z | 0.13362332423562559
2022-10-13T00:40:00Z | 0.013920161307282448
2022-10-13T00:45:00Z | 0.05909880950497
2022-10-13T00:50:00Z | 0.146454957813311
2022-10-13T00:55:00Z | 0.318823204227693
2022-10-13T01:00:00Z | 0.020087435985750794
2022-10-13T01:05:00Z | 0.31110660126024159
2022-10-13T01:10:00Z | 0.75531136771424379
2022-10-13T01:15:00Z | 0.99289833682620265

Limiting Azure data explorer update policy input

We have a use case where we are saving telemetry and statistic data from the machines but the update policy, which is supposed to process the raw data, is giving us trouble and running out of memory.
Aggregation over string column exceeded the memory budget of 8GB during evaluation
We have two tables, the 'ingest-table' where the data is initially being ingested to and the 'main-table' where it should end up.
We are in a process of migrating from another solution to ADX and have to ingest a high volume of data.
The raw data is in a matrix format, which means that one message from a machine will end up as multiple rows/records in the ADX database. We use mv-expand for the breakdown, and the query is pretty much doing just that, along with some data formatting.
So, our update policy looks like the following:
['ingest-table']
| mv-expand Counter = Data.Counters
| mv-expand with_itemindex = r Row = Rows
| mv-expand Column = Rows[r].Data
| project ...
I don't see how I could improve the processing query itself, so I'm looking for a way to limit the number of records the update policy function receives.
I've tried playing around with the ingestion batching policy (MaximumNumberOfItems = 1000) and also the sharding policy (MaxRowCount = 1000) for the 'ingest-table', but it does not have any effect on the number of records the update policy pulls in at once.
My idea is to let only 1000 items at a time be processed by the update policy function, because I've tested manually and it works fine up to about 5K records but fails shortly above that.
Any suggestions on what we could do in this case, and how I can achieve that?
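For reference, the ingestion batching policy mentioned above is set with a control command along these lines (MaximumNumberOfItems = 1000 is the value from the question; the timespan and size values here are just placeholders):

.alter table ['ingest-table'] policy ingestionbatching '{"MaximumBatchingTimeSpan": "00:05:00", "MaximumNumberOfItems": 1000, "MaximumRawDataSizeMB": 1024}'

As noted above, though, this did not cap the number of records the update policy processes in one invocation.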
EDIT:
An example raw message which has to be processed by the update policy.
The number of rows the policy has to generate is the number of COUNTERS * ROWS * COLUMNS. In this case (2 counters * 50 rows * 12 columns) it means that we end up with ~1200 rows after this single message is processed.
I do not see any other way than doing an mv-expand here.
{
"Name": "StatisicName",
"TimeInterval": {
"StartUtc": 1654221156.285,
"EndUtc": 1654221216.286
},
"Legend": {
"RowLabels": [
"0",
"0.04",
"0.08",
"0.12",
"0.16",
"0.2",
"0.24",
"0.28",
"0.32",
"0.36",
"0.4",
"0.44",
"0.48",
"0.52",
"0.56",
"0.6",
"0.64",
"0.68",
"0.72",
"0.76",
"0.8",
"0.84",
"0.88",
"0.92",
"0.96",
"1",
"1.04",
"1.08",
"1.12",
"1.16",
"1.2",
"1.24",
"1.28",
"1.32",
"1.36",
"1.4",
"1.44",
"1.48",
"1.52",
"1.56",
"1.6",
"1.64",
"1.68",
"1.72",
"1.76",
"1.8",
"1.84",
"1.88",
"1.92",
"1.96"
],
"ColumnLabels": [
"Material1",
"Material2",
"Material3",
"Material4",
"Material5",
"Material6",
"Material7",
"Material8",
"Material9",
"Material10",
"Material11",
"Material12"
]
},
"Counters": [
{
"Type": "Cumulative",
"Matrix": {
"Rows": [
{
"Data": [
6.69771873292923,
0,
0,
0,
0.01994649920463562,
0.017650499296188355,
0.007246749711036683,
0.003443999862670899,
0.1422802443265915,
0,
0,
0.0008609999656677247
]
}
//,{...} ... for each row of the matrix
]
}
},
{
"Type": "Count",
"Matrix": {
"Rows": [
{
"Data": [
0.0001434999942779541,
0,
0,
0,
0.0001434999942779541,
0.0001434999942779541,
0.0001317590856552124,
0.0001434999942779541,
0.00014285165093031273,
0,
0,
0.0001434999942779541
]
}
//,{...} ... for each row of the matrix
]
}
}
]
}
The main issue I see in your code is this:
| mv-expand with_itemindex = r Row = Rows
| mv-expand Column = Rows[r].Data
You explode Rows and get the exploded values in a new column called Row, but then instead of working with Row.Data, you keep using the original unexploded Rows, traversing through the elements using r.
This leads to unnecessary duplication of Rows and it is probably what creates the memory pressure.
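In other words, the fix is to read Data from the exploded Row rather than indexing back into the unexploded Rows. A minimal sketch of that pattern, keeping the question's column names (the project step stays elided):

['ingest-table']
| mv-expand Counter = Data.Counters
| mv-expand Row = Rows
| mv-expand Column = Row.Data
| project ...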
Check out the following code.
You can use the whole code and get the data formatted as a table with columns Material1, Material2, etc., or exclude the last 2 lines of the query (the pivot and project-reorder) and simply get the exploded values, each in a separate row.
// Data sample generation. Not part of the solution
let p_matrixes = 3;
let p_columns = 12;
let p_rows = 50;
let ['ingest-table'] =
range i from 1 to p_matrixes step 1
| extend StartUtc = floor((ago(28d + rand()*7d) - datetime(1970))/1ms/1000,0.001)
| extend EndUtc = floor((ago(rand()*7d) - datetime(1970))/1ms/1000,0.001)
| extend RowLabels = toscalar(range x from todecimal(0) to todecimal(0.04 * (p_rows - 1)) step todecimal(0.04) | summarize make_list(tostring(x)))
| extend ColumnLabels = toscalar(range x from 1 to p_columns step 1 | summarize make_list(strcat("Material",tostring(x))))
| extend Counters_Cumulative = toscalar(range x from 1 to p_rows step 1 | mv-apply range(1, p_columns) on (summarize Data = pack_dictionary("Data", make_list(rand()))) | summarize make_list(Data))
| extend Counters_Count = toscalar(range x from 1 to p_rows step 1 | mv-apply range(1, p_columns) on (summarize Data = pack_dictionary("Data", make_list(rand()))) | summarize make_list(Data))
| project i, Data = pack_dictionary("Name", "StatisicName", "TimeInterval", pack_dictionary("StartUtc", StartUtc, "EndUtc",EndUtc), "Legend", pack_dictionary("RowLabels", RowLabels, "ColumnLabels", ColumnLabels), "Counters", pack_array(pack_dictionary("Type", "Cumulative", "Matrix", pack_dictionary("Rows", Counters_Cumulative)), pack_dictionary("Type", "Count", "Matrix", pack_dictionary("Rows", Counters_Count))))
;
// Solution starts here
// Explode values
['ingest-table']
| project Name = tostring(Data.Name), StartUtc = todecimal(Data.TimeInterval.StartUtc), EndUtc = todecimal(Data.TimeInterval.EndUtc), RowLabels = Data.Legend.RowLabels, ColumnLabels = Data.Legend.ColumnLabels, Counters = Data.Counters
| mv-apply Counters on (project Type = tostring(Counters.Type), Rows = Counters.Matrix.Rows)
| mv-apply RowLabels to typeof(decimal), Rows on (project RowLabels, Data = Rows.Data)
| mv-expand ColumnLabels to typeof(string), Data to typeof(real)
// Format as table
| evaluate pivot(ColumnLabels, take_any(Data))
| project-reorder Name, StartUtc, EndUtc, RowLabels, Type, * granny-asc
"Explode values" sample
Name | StartUtc | EndUtc | ColumnLabels | RowLabels | Type | Data
StatisicName | 1658601891.654 | 1660953273.898 | Material4 | 0.88 | Count | 0.33479977032253788
StatisicName | 1658601891.654 | 1660953273.898 | Material7 | 0.6 | Cumulative | 0.58620965468565811
StatisicName | 1658801257.201 | 1660941025.56 | Material1 | 0.72 | Count | 0.23164306814350025
StatisicName | 1658601891.654 | 1660953273.898 | Material4 | 1.68 | Cumulative | 0.47149864409592157
StatisicName | 1658601891.654 | 1660953273.898 | Material12 | 1.08 | Cumulative | 0.777589612330022
"Format as table" Sample
Name | StartUtc | EndUtc | RowLabels | Type | Material1 | Material2 | Material3 | Material4 | Material5 | Material6 | Material7 | Material8 | Material9 | Material10 | Material11 | Material12
StatisicName | 1658581605.446 | 1660891617.665 | 0.52 | Cumulative | 0.80568785763966921 | 0.69112398516227513 | 0.45844947991605256 | 0.87975011678339887 | 0.19607303271777138 | 0.76728212781319993 | 0.27520162657976527 | 0.48612400400362971 | 0.23810927904958085 | 0.53986865017468966 | 0.31225384042818344 | 0.99380179164514848
StatisicName | 1658581605.446 | 1660891617.665 | 0.72 | Count | 0.77601864161716061 | 0.351768361021601 | 0.59345888695494731 | 0.92329751241805491 | 0.80811999338933449 | 0.49117503870065837 | 0.97871902062153937 | 0.94241064167069055 | 0.52950523227349289 | 0.39281849330041424 | 0.080759530370922858 | 0.8995622227351241
StatisicName | 1658345203.482 | 1660893443.968 | 1.92 | Count | 0.78327575542772387 | 0.16795871437570925 | 0.01201541525964204 | 0.96029371013283549 | 0.60248327254185241 | 0.019315208353334352 | 0.4828009899119266 | 0.75923221663483853 | 0.29630236707606555 | 0.23977292819044668 | 0.94531978804572625 | 0.54626985282267437
StatisicName | 1658345203.482 | 1660893443.968 | 1 | Count | 0.65268575186841382 | 0.61471913013853441 | 0.80536656853846211 | 0.380104887115314 | 0.84979344481966745 | 0.68790819414895632 | 0.80862491082567767 | 0.083687871352600765 | 0.16707928827946666 | 0.4071460045501768 | 0.94115460659910444 | 0.25011225557898314
StatisicName | 1658581605.446 | 1660891617.665 | 1.6 | Count | 0.75532393959433786 | 0.71081551001527776 | 0.9757484452705758 | 0.55510969429009 | 0.055800808878012885 | 0.74924458240427783 | 0.78706505608871058 | 0.18745675452118818 | 0.70192553697345517 | 0.39429935579653647 | 0.4048784200404818 | 0.14888395753558561

Is it possible to iterate over the row values of a column in KQL to feed each value through a function

I am applying the series_decompose_anomalies algorithm to time data coming from multiple meters. Currently, I am using the ADX dashboard feature to feed my meter identifier as a parameter into the algorithm and return my anomalies and scores as a table.
let dt = 3hr;
Table
| where meter_ID == dashboardParameter
| make-series num=avg(value) on timestamp from _startTime to _endTime step dt
| extend (anomalies,score,baseline) = series_decompose_anomalies( num, 3,-1, 'linefit')
| mv-expand timestamp, num, baseline, anomalies, score
| where anomalies ==1
| project dashboardParameter, todatetime(timestamp), toreal(num), toint(anomalies), toreal(score)
I would like to bulk process all my meters in one go and return a table with all anomalies found across them. Is it possible to feed an array as an iterable in KQL or something similar to allow my parameter to change multiple times in a single run?
Simply add by meter_ID to make-series
(and remove | where meter_ID == dashboardParameter)
| make-series num=avg(value) on timestamp from _startTime to _endTime step dt by meter_ID
P.S.
Anomaly can be positive (num > baseline => flag = 1) or negative (num < baseline => flag = -1)
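So if, like in the original query, only positive anomalies (spikes) are of interest, the flag != 0 filter in the demo below can be tightened accordingly, e.g.:

| where flag == 1 // keep only points where num is above the baseline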
Demo
let _step = 1h;
let _endTime = toscalar(TransformedServerMetrics | summarize max(Timestamp));
let _startTime = _endTime - 12h;
TransformedServerMetrics
| make-series num = avg(Value) on Timestamp from _startTime to _endTime step _step by SQLMetrics
| extend (flag, score, baseline) = series_decompose_anomalies(num , 3,-1, 'linefit')
| mv-expand Timestamp to typeof(datetime), num to typeof(real), flag to typeof(int), score to typeof(real), baseline to typeof(real)
| where flag != 0
SQLMetrics | num | Timestamp | flag | score | baseline
write_bytes | 169559910.91717172 | 2022-06-14T15:00:30.2395884Z | -1 | -3.4824039875238131 | 170205132.25708669
cpu_time_ms | 17.369556143036036 | 2022-06-14T17:00:30.2395884Z | 1 | 7.8874529842826 | 11.04372634506527
percent_complete | 0.04595588235294118 | 2022-06-14T22:00:30.2395884Z | 1 | 25.019464868749985 | 0.004552738927738928
blocking_session_id | -5 | 2022-06-14T22:00:30.2395884Z | -1 | -25.019464868749971 | -0.49533799533799527
pending_disk_io_count | 0.0019675925925925924 | 2022-06-14T23:00:30.2395884Z | 1 | 6.4686836384225685 | 0.00043773741690408352

How to convert to dynamic type/ apply multiple functions on same 'pack' in KQL/Kusto

I am absolutely in love with ADX time-series capabilities, having worked a lot with sensor data in Python. Below are the requirements for my case:
Handle Sensor data tags at different frequencies -- bring them to all to 1 sec frequency (if in milliseconds, aggregate over a 1sec interval)
Convert stacked data to unstacked data.
Join with another dataset which has multiple "string-labels" by timestamp, after unstack.
Do linear interpolation on some columns, and forward fill in others (around 10-12 in all).
I think the query below gets the first three done, but I am unable to use series_fill_linear directly on a column. The docs say this function requires a dynamic type as input. The error message is helpful:
series_fill_linear(): argument #1 was not of an expected data type: dynamic
Is it possible to apply series_fill_linear where I'm already using pack, instead of using pack again? How can I apply this function selectively by tag, and make my overall query more readable? It's important to note that only the sensor_data table requires both series_fill_linear and series_fill_forward; label_data only requires series_fill_forward.
sensor_data
| where timestamp > datetime(2020-11-24 00:59:59) and timestamp <datetime(2020-11-24 12:00:00)
| where device_number =='PRESSURE_599'
| where tag_name in ("tag1", "tag2", "tag3", "tag4")
| make-series agg_value = avg(value) default = double(null) on timestamp in range (datetime(2020-11-24 00:59:59), datetime(2020-11-24 12:00:00), 1s) by tag_name
| extend series_fill_linear(agg_value, double(null), false) //EDIT
| mv-expand timestamp to typeof(datetime), agg_value to typeof(double)
| summarize b = make_bag(pack(tag_name, agg_value)) by timestamp
| evaluate bag_unpack(b)
|join kind = leftouter (label_data
| where timestamp > datetime(2020-11-24 00:58:59) and timestamp <datetime(2020-11-24 12:00:01)
| where device_number =='PRESSURE_599'
| where tag != "PRESSURE_599_label_Raw"
| summarize x = make_bag(pack(tag, value)) by timestamp
| evaluate bag_unpack(x)) on timestamp
| project timestamp,
MY_LINEAR_COL_1 = series_fill_linear(tag1, double(null), false),
MY_LINEAR_COL_2 = series_fill_forward(tag2),
MY_LABEL_1 = series_fill_forward(PRESSURE_599_label_level1),
MY_LABEL_2 = series_fill_forward(PRESSURE_599_label_level2)
EDIT: I ended up using extend with case to handle different cases of interpolation.
// let forward_tags = dynamic({"tags": ["tag2","tag4"]}); unable to use this in query as "forward_tags.tags"
sensor_data
| where timestamp > datetime(2020-11-24 00:59:59) and timestamp <datetime(2020-11-24 12:00:00)
| where device_number == "PRESSURE_599"
| where tag_name in ("tag1", "tag2", "tag3", "tag4") // use a variable here instead?
| make-series agg_value = avg(value)
default = double(null)
on timestamp
in range (datetime(2020-11-24 00:59:59), datetime(2020-11-24 12:00:00), 1s)
by tag_name
| extend agg_value = case (tag_name in ("tag2", "tag3"), // use a variable here instead?
series_fill_forward(agg_value, double(null)),
series_fill_linear(agg_value, double(null), false)
)
| mv-expand timestamp to typeof(datetime), agg_value to typeof(double)
| summarize b = make_bag(pack(tag_name, agg_value)) by timestamp
| evaluate bag_unpack(b)
| join kind = leftouter (
label_data // don't want to use make-series here, will be unecessary data generation since already in 'ss' format.
| where timestamp > datetime(2020-11-24 00:58:59) and timestamp <datetime(2020-11-24 12:00:01)
| where tag != "PRESSURE_599_label_Raw"
| summarize x = make_bag(pack(tag, value)) by timestamp
| evaluate bag_unpack(x)
)
on timestamp
I was wondering if it is possible in KQL to pass a list of strings inside a query/fxn to use as shown below. I have commented where I think a list of strings could be passed to make the code more readable.
Now I just need to fill-forward the label columns (MY_LABEL_1, MY_LABEL_2), which are a result of the query below. I would prefer that the code is added on to the main query, and the final result is a table with all columns. Here is a sample table based on my case's result:
datatable (timestamp:datetime, tag1:double, tag2:double, tag3:double, tag4:double, MY_LABEL_1: string, MY_LABEL_2: string)
[
datetime(2020-11-24T00:01:00Z), 1, 3, 6, 9, "x", "foo",
datetime(2020-11-24T00:01:01Z), 1, 3, 6, 9, "", "",
datetime(2020-11-24T00:01:02Z), 1, 3, 6, 9,"", "",
datetime(2020-11-24T00:01:03Z), 1, 3, 6, 9,"y", "bar",
datetime(2020-11-24T00:01:04Z), 1, 3, 6, 9,"", "",
datetime(2020-11-24T00:01:05Z), 1, 3, 6, 9,"", "",
]
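Regarding passing a list of strings into the query: a let-bound dynamic array can be used directly with the in operator (and inside the case() expression), so the commented-out forward_tags idea could be written roughly like this (a sketch using the tag names from the question):

let forward_tags = dynamic(["tag2", "tag4"]);
sensor_data
| where tag_name in (forward_tags) // the same array can also drive the case() between fill-forward and fill-linear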
Series functions in ADX only work on dynamic arrays. You can apply a selective fill function using the case() function, by replacing this line:
| extend series_fill_linear(agg_value, double(null), false) //EDIT
With something like the following:
| extend agg_value = case(
tag_name == "tag1", series_fill_linear(agg_value, double(null), false),
tag_name == "tag2", series_fill_forward(agg_value),
series_fill_forward(agg_value)
)
Edit:
Here is an example of string column fill-forward workaround:
let T = datatable ( Timestamp: datetime, Employee: string )
[ datetime(2021-01-01), "Bob",
datetime(2021-01-02), "",
datetime(2021-01-03), "Alice",
datetime(2021-01-04), "",
datetime(2021-01-05), "",
datetime(2021-01-06), "Alan",
datetime(2021-01-07), "",
datetime(2021-01-08), "" ]
| sort by Timestamp asc;
let employeeLookup = toscalar(T | where isnotempty(Employee) | summarize make_list(Employee));
T
| extend idx = row_cumsum(tolong(isnotempty(Employee)))
| extend EmployeeFilled = employeeLookup[idx - 1]
| project-away idx
Timestamp | Employee | EmployeeFilled
2021-01-01 00:00:00.0000000 | Bob | Bob
2021-01-02 00:00:00.0000000 |  | Bob
2021-01-03 00:00:00.0000000 | Alice | Alice
2021-01-04 00:00:00.0000000 |  | Alice
2021-01-05 00:00:00.0000000 |  | Alice
2021-01-06 00:00:00.0000000 | Alan | Alan
2021-01-07 00:00:00.0000000 |  | Alan
2021-01-08 00:00:00.0000000 |  | Alan
Regarding your requirement to convert time series at different frequencies to a common one, have a look at the series_downsample_fl() function from the functions library.

is it possible for better optimization of my kusto query

Below is my Kusto query. It takes 2+ minutes in the Lens dashboard to show the data. I have already optimized the query by using materialize() in the let statements and replacing contains with has. Is there any other way to optimize it further?
let C_masfunteams = materialize(find withsource=source in (cluster(X).database('oci-*').['TextFileLogs']) where AttemptedIngestTime > ago(7d)
and FileLineContent has "<li>Build Number:" | summarize min(AttemptedIngestTime) by source, FileLineContent);//, AttemptedIngestTime
let n = C_masfunteams | extend databaseName = extract(#"""(oci-[^""]*)""", 1, source)
| extend BuildNumber = extract(#"([A-Z]\w*\.[0-9]\d*\.[0-9]\d*\.[0-9]\d*)",1,FileLineContent)
| extend StampVersion = extract(#"([0-9]\d*\.[0-9]\d*\.[0-9]\d*\.[0-9]\d*)",1,FileLineContent)
|extend cluster ='masfunteams'
| project BuildNumber , StampVersion , min_AttemptedIngestTime
| summarize NumberOfRuns=count() , ingestedtime = min(min_AttemptedIngestTime) by BuildNumber,StampVersion;
let C_masfun= materialize(find withsource=source in (cluster(Y).database('oci-*').['TextFileLogs']) where AttemptedIngestTime > ago(7d)
and FileLineContent has "<li>Build Number:" | summarize min(AttemptedIngestTime) by source, FileLineContent);//, AttemptedIngestTime
let m = C_masfun | extend databaseName = extract(#"""(oci-[^""]*)""", 1, source)
| extend BuildNumber = extract(#"([A-Z]\w*\.[0-9]\d*\.[0-9]\d*\.[0-9]\d*)",1,FileLineContent)
| extend StampVersion = extract(#"([0-9]\d*\.[0-9]\d*\.[0-9]\d*\.[0-9]\d*)",1,FileLineContent)
|extend cluster ='masfunteams'
| project BuildNumber , StampVersion , min_AttemptedIngestTime
| summarize NumberOfRuns=count() , ingestedtime = min(min_AttemptedIngestTime) by BuildNumber,StampVersion;
let C_masvaas = materialize(find withsource=source in (cluster(z).database('oci-*').['TextFileLogs']) where AttemptedIngestTime > ago(7d)
and FileLineContent has "<li>Build Number:" | summarize min(AttemptedIngestTime) by source, FileLineContent);//, AttemptedIngestTime
let o= C_masvaas | extend databaseName = extract(#"""(oci-[^""]*)""", 1, source)
| extend BuildNumber = extract(#"([A-Z]\w*\.[0-9]\d*\.[0-9]\d*\.[0-9]\d*)",1,FileLineContent)
| extend StampVersion = extract(#"([0-9]\d*\.[0-9]\d*\.[0-9]\d*\.[0-9]\d*)",1,FileLineContent)
|extend cluster ='masfunteams'
| project BuildNumber , StampVersion , min_AttemptedIngestTime
| summarize NumberOfRuns=count() , ingestedtime = min(min_AttemptedIngestTime) by BuildNumber,StampVersion;
union isfuzzy=true m,n,o
| summarize Ingestedtime =min(ingestedtime) by BuildNumber,StampVersion
Hi, the query is quite complex, and without running it on the actual cluster it is hard to figure out what the expected results are. So here are a few tips:
Consider starting the union operator as the first operator with a uniform logic for the filtering, parsing and summarize operations
Consider removing the materialize() if you are only using each dataset only once
Consider removing the 'find', as you are not searching across multiple columns. If you are using it to get the source table in your output record set, consider adding "withsource" to the union statement
If possible consider using the 'parse' operator instead of the regular expression
Hope this helps!
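Putting the tips together, a sketch of what the restructured query could look like (the cluster names, database pattern, and regular expressions are taken from the question; switching extract() to parse would additionally require knowing the exact format of FileLineContent):

union withsource=source
    cluster(X).database('oci-*').['TextFileLogs'],
    cluster(Y).database('oci-*').['TextFileLogs'],
    cluster(Z).database('oci-*').['TextFileLogs']
| where AttemptedIngestTime > ago(7d) and FileLineContent has "<li>Build Number:"
| summarize min_AttemptedIngestTime = min(AttemptedIngestTime) by source, FileLineContent
| extend BuildNumber = extract(@"([A-Z]\w*\.[0-9]\d*\.[0-9]\d*\.[0-9]\d*)", 1, FileLineContent)
| extend StampVersion = extract(@"([0-9]\d*\.[0-9]\d*\.[0-9]\d*\.[0-9]\d*)", 1, FileLineContent)
| summarize NumberOfRuns = count(), Ingestedtime = min(min_AttemptedIngestTime) by BuildNumber, StampVersion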
