Sum all columns that match wildcard patterns in Kusto - azure-data-explorer

I need to capture the aggregated number of items in each subcategory and category. Each category has the same finite set of subcategories, and there is a finite set of categories as well. I intend to capture the aggregation using this schema:
| Timestamp  | Count_CatA_Sub1 | Count_CatA_Sub2 | ... | Count_CatD_Sub6 | Count_CatD_Sub7 |
|------------|-----------------|-----------------|-----|-----------------|-----------------|
| 2022-11-29 | 1               | 2               | ... | 27              | 28              |
But I need to visualize the number of items in a category as well. With the above sample row, I need the output:
| Timestamp  | CountA | CountB | CountC | CountD |
|------------|--------|--------|--------|--------|
| 2022-11-29 | 28     | 77     | 126    | 175    |
I would rather not have to type out CountA = Count_CatA_Sub1 + Count_CatA_Sub2 + ... + Count_CatA_Sub7. Is it possible to do this with wildcards in Kusto? Or is there a better schema?

One way would be to use the narrow() plugin plus some conditional summarizing.
datatable(Timestamp: datetime, CatA_Sub1: int, CatA_Sub2: int, CatB_Sub1: int) [
    datetime(2022-11-28 11:50:00), 1, 2, 3,
    datetime(2022-11-28 11:51:00), 4, 5, 6,
    datetime(2022-11-28 11:52:00), 7, 8, 9,
    datetime(2022-11-28 11:53:00), 10, 11, 12,
    datetime(2022-11-28 11:54:00), 13, 14, 15,
    datetime(2022-11-28 11:55:00), 16, 17, 18
]
| evaluate narrow()
| summarize Timestamp = maxif(todatetime(Value), Column == 'Timestamp'),
            TotalCount = sumif(toint(Value), Column != 'Timestamp') by Row
| project-away Row
Output:
| Timestamp                   | TotalCount |
|-----------------------------|------------|
| 2022-11-28 11:50:00.0000000 | 6          |
| 2022-11-28 11:51:00.0000000 | 15         |
| 2022-11-28 11:52:00.0000000 | 24         |
| 2022-11-28 11:53:00.0000000 | 33         |
| 2022-11-28 11:54:00.0000000 | 42         |
| 2022-11-28 11:55:00.0000000 | 51         |
Not sure if there is a more elegant solution.
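As for the "Or a better schema?" part of the question: if restructuring the data is an option, a narrow layout that stores the category and subcategory as their own columns sidesteps the wildcard problem entirely, because both rollups become a plain summarize. A minimal sketch, assuming hypothetical column names rather than the actual table:
// Narrow-schema sketch: Category and Subcategory are real columns, so no
// column-name parsing or wildcard summing is needed (names are hypothetical).
datatable(Timestamp: datetime, Category: string, Subcategory: string, ItemCount: long) [
    datetime(2022-11-29), "A", "Sub1", 1,
    datetime(2022-11-29), "A", "Sub2", 2,
    datetime(2022-11-29), "D", "Sub7", 28
]
| summarize CategoryCount = sum(ItemCount) by Timestamp, Category
// add Subcategory to the by-clause to get the per-subcategory breakdown instead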

// Sample data generation. Not part of the solution.
let t = materialize(
    print Timestamp = range(datetime(2022-11-29), datetime(2022-12-03), 1d),
          Cat = range(0, 3),
          Sub = range(1, 7)
    | mv-expand Timestamp to typeof(datetime)
    | mv-expand Cat to typeof(int)
    | mv-expand Sub to typeof(int)
    | serialize Val = row_number()
    | extend Col = strcat("Count_Cat", tostring(make_string(to_utf8("A")[0] + Cat)), "_Sub", Sub)
    | evaluate pivot(Col, take_any(Val), Timestamp));
// Solution starts here
t
// Pack every Count_* column of the row into a single property bag (Timestamp excluded)
| project Timestamp, Metrics = bag_remove_keys(pack_all(), dynamic(["Timestamp"]))
| mv-apply Metrics on
  (
    // Split each key into its Category / Subcategory parts and sum per category
      extend k = tostring(bag_keys(Metrics)[0])
    | extend v = tolong(Metrics[k])
    | parse k with "Count_Cat" Cat "_Sub" Sub:int
    | summarize sum(v) by Cat
    | summarize make_bag(bag_pack(strcat("Count", Cat), sum_v))
  )
// Turn the per-category bag back into CountA..CountD columns
| evaluate bag_unpack(bag_)
| Timestamp            | CountA | CountB | CountC | CountD |
|----------------------|--------|--------|--------|--------|
| 2022-11-29T00:00:00Z | 28     | 77     | 126    | 175    |
| 2022-11-30T00:00:00Z | 224    | 273    | 322    | 371    |
| 2022-12-01T00:00:00Z | 420    | 469    | 518    | 567    |
| 2022-12-02T00:00:00Z | 616    | 665    | 714    | 763    |
| 2022-12-03T00:00:00Z | 812    | 861    | 910    | 959    |

// Sample data generation. Not part of the solution.
let t = materialize(
    print Timestamp = range(datetime(2022-11-29), datetime(2022-12-03), 1d),
          Cat = range(0, 3),
          Sub = range(1, 7)
    | mv-expand Timestamp to typeof(datetime)
    | mv-expand Cat to typeof(int)
    | mv-expand Sub to typeof(int)
    | serialize Val = row_number()
    | extend Col = strcat("Count_Cat", tostring(make_string(to_utf8("A")[0] + Cat)), "_Sub", Sub)
    | evaluate pivot(Col, take_any(Val), Timestamp));
// Solution starts here
t
| project Timestamp, Metrics = bag_remove_keys(pack_all(), dynamic(["Timestamp"]))
| mv-expand kind=array Metrics
| extend k = tostring(Metrics[0]), v = tolong(Metrics[1])
| parse k with "Count_Cat" Cat "_Sub" Sub:int
| extend Col = strcat("Count", Cat)
| evaluate pivot(Col, sum(v), Timestamp)
| Timestamp            | CountA | CountB | CountC | CountD |
|----------------------|--------|--------|--------|--------|
| 2022-11-29T00:00:00Z | 28     | 77     | 126    | 175    |
| 2022-11-30T00:00:00Z | 224    | 273    | 322    | 371    |
| 2022-12-01T00:00:00Z | 420    | 469    | 518    | 567    |
| 2022-12-02T00:00:00Z | 616    | 665    | 714    | 763    |
| 2022-12-03T00:00:00Z | 812    | 861    | 910    | 959    |
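Since the original goal is visualization, the pivoted result can be charted directly. A minimal sketch, reusing the pivot variant above (same sample table t) and assuming a timechart is the desired visual:
t
| project Timestamp, Metrics = bag_remove_keys(pack_all(), dynamic(["Timestamp"]))
| mv-expand kind=array Metrics
| extend k = tostring(Metrics[0]), v = tolong(Metrics[1])
| parse k with "Count_Cat" Cat "_Sub" Sub:int
| extend Col = strcat("Count", Cat)
| evaluate pivot(Col, sum(v), Timestamp)
| render timechart // one series per CountX column, Timestamp on the x-axis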

// Sample data generation. Not part of the solution.
let t = materialize(
    print Timestamp = range(datetime(2022-11-29), datetime(2022-12-03), 1d),
          Cat = range(0, 3),
          Sub = range(1, 7)
    | mv-expand Timestamp to typeof(datetime)
    | mv-expand Cat to typeof(int)
    | mv-expand Sub to typeof(int)
    | serialize Val = row_number()
    | extend Col = strcat("Count_Cat", tostring(make_string(to_utf8("A")[0] + Cat)), "_Sub", Sub)
    | evaluate pivot(Col, take_any(Val), Timestamp));
// Solution starts here
t
| evaluate narrow()
| parse Column with "Count_Cat" Cat "_Sub" Sub:int
| summarize Count = sumif(tolong(Value), Column != "Timestamp"),
            Timestamp = todatetime(take_anyif(Value, Column == "Timestamp"))
            by Row, Cat
| summarize take_any(Timestamp), make_bag(bag_pack(Cat, Count)) by Row
| evaluate bag_unpack(bag_, "Count")
| project-away Row
| Timestamp            | CountA | CountB | CountC | CountD |
|----------------------|--------|--------|--------|--------|
| 2022-11-29T00:00:00Z | 28     | 77     | 126    | 175    |
| 2022-11-30T00:00:00Z | 224    | 273    | 322    | 371    |
| 2022-12-01T00:00:00Z | 420    | 469    | 518    | 567    |
| 2022-12-02T00:00:00Z | 616    | 665    | 714    | 763    |
| 2022-12-03T00:00:00Z | 812    | 861    | 910    | 959    |

// Sample data generation. Not part of the solution.
let t = materialize(
    print Timestamp = range(datetime(2022-11-29), datetime(2022-12-03), 1d),
          Cat = range(0, 3),
          Sub = range(1, 7)
    | mv-expand Timestamp to typeof(datetime)
    | mv-expand Cat to typeof(int)
    | mv-expand Sub to typeof(int)
    | serialize Val = row_number()
    | extend Col = strcat("Count_Cat", tostring(make_string(to_utf8("A")[0] + Cat)), "_Sub", Sub)
    | evaluate pivot(Col, take_any(Val), Timestamp));
// Solution starts here
let cat = 4; // number of categories (A-D)
let sub = 7; // subcategories per category
let categories = toscalar
(
    t
    | getschema
    | parse-where ColumnName with "Count_Cat" Cat "_Sub" Sub:int
    | summarize make_set(Cat)
);
t
| project Timestamp, Count = array_split(array_slice(pack_array(*), 1, cat*sub), range(sub, sub*(cat-1), sub))
| mv-expand with_itemindex=i Count
| extend Cat = strcat("Count", tostring(categories[i]))
| evaluate pivot(Cat, take_any(tolong(array_sum(Count))), Timestamp)
| Timestamp            | CountA | CountB | CountC | CountD |
|----------------------|--------|--------|--------|--------|
| 2022-11-29T00:00:00Z | 28     | 77     | 126    | 175    |
| 2022-11-30T00:00:00Z | 224    | 273    | 322    | 371    |
| 2022-12-01T00:00:00Z | 420    | 469    | 518    | 567    |
| 2022-12-02T00:00:00Z | 616    | 665    | 714    | 763    |
| 2022-12-03T00:00:00Z | 812    | 861    | 910    | 959    |

Related

In SQLite3, is it possible to list out specific rows?

Let's say I have a table:
| ID  | Score | email         | add       |
|-----|-------|---------------|-----------|
| 123 | 88    | 123#gmail.com | somewhere |
| 456 | 77    | 123#gmail.com | somewhere |
| 789 | 88    | 123#gmail.com | somewhere |
| 111 | 77    | 123#gmail.com | somewhere |
| 555 | 77    | 123#gmail.com | somewhere |
| 444 | 88    | 123#gmail.com | somewhere |
| 222 | 77    | 123#gmail.com | somewhere |
| 333 | 88    | 123#gmail.com | somewhere |
My question: is it possible to select the Score column and print out only the first three 88 scores and the first three 77 scores?
I tried the query below, but it seems to give me only three 88 scores:
SELECT Score
FROM Table_Name
WHERE Score = '88' OR Score = '77'
LIMIT 3
First filter the table so that only the rows with the scores you want are returned, then use the ROW_NUMBER() window function to rank the rows of each Score based on ID, so that you can filter out the rows beyond the first 3 of each group:
SELECT ID, Score, email, add
FROM (
    SELECT *, ROW_NUMBER() OVER (PARTITION BY Score ORDER BY ID) rn
    FROM Table_Name
    WHERE Score = '88' OR Score = '77'
)
WHERE rn <= 3;
For versions of SQLite prior to 3.25.0 that do not support window functions use a correlated subquery:
SELECT t1.*
FROM Table_Name t1
WHERE (t1.Score = '88' OR t1.Score = '77')
  AND (SELECT COUNT(*) FROM Table_Name t2
       WHERE t2.Score = t1.Score AND t2.ID <= t1.ID) <= 3;

Sqlite checking if value in one row is between values in next rows without loop

Let's say that I have an SQL table like this (the last column is the expected result):
| id  | val_1 | val_2 |       |
|-----|-------|-------|-------|
| 1   | 50    | 130   | -     |
| 2   | 70    | 110   | False |
| 3   | 60    | 135   | True  |
| 4   | 40    | 70    | True  |
| ... |       |       |       |
Now, I want to check whether (val_1 50 is between 70 and 110) OR (val_2 130 is between 70 and 110), which gives False; then the next row, which gives True because 130 is between 60 and 135; and so on. Is that possible with window functions?
You can use the lag function to get the previous row:
SELECT id, val_1, val_2,
       LAG(val_1) OVER (ORDER BY id ASC) BETWEEN val_1 AND val_2 OR
       LAG(val_2) OVER (ORDER BY id ASC) BETWEEN val_1 AND val_2
FROM mytable

Merging two data frames without duplicating metric values

I have two data frames and I want to merge them by leader values, so that I can see the total runs and walks for each group. Each leader can have multiple members on their team, but the problem I'm having is that when I merge them, the metrics also get duplicated onto the newly added rows.
Here is an example of the two data sets that I have:
Data set 1:
| leader name | leader id | total runs | total walks |
|-------------|-----------|------------|-------------|
| ab          | 11        | 4          | 9           |
| tg          | 47        | 8          | 3           |
Data set 2:
| leader name | leader id | member name | member id |
|-------------|-----------|-------------|-----------|
| ab          | 11        | gfh         | 589       |
| ab          | 11        | tyu         | 739       |
| tg          | 47        | rtf         | 745       |
| tg          | 47        | jke         | 996       |
I want to merge the two datasets so that they become like this:
| leader name | leader id | member name | member id | total runs | total walks |
|-------------|-----------|-------------|-----------|------------|-------------|
| ab          | 11        | gfh         | 589       | 4          | 9           |
| ab          | 11        | tyu         | 739       |            |             |
| tg          | 47        | rtf         | 745       | 8          | 3           |
| tg          | 47        | jke         | 996       |            |             |
But right now I keep getting:
| leader name | leader id | member name | member id | total runs | total walks |
|-------------|-----------|-------------|-----------|------------|-------------|
| ab          | 11        | gfh         | 589       | 4          | 9           |
| ab          | 11        | tyu         | 739       | 4          | 9           |
| tg          | 47        | rtf         | 745       | 8          | 3           |
| tg          | 47        | jke         | 996       | 8          | 3           |
It doesn't matter if they're blank, NAs, or 0s, as long as the values aren't duplicated. Is there a way to achieve this?
We can do a replace on those 'total' columns after a left_join
library(dplyr)
left_join(df2, df1) %>%
  group_by(leadername) %>%
  mutate_at(vars(starts_with('total')), ~ replace(., row_number() > 1, NA))
# A tibble: 4 x 6
# Groups: leadername [2]
# leadername leaderid membername memberid totalruns totalwalks
# <chr> <dbl> <chr> <dbl> <dbl> <dbl>
#1 ab 11 gfh 589 4 9
#2 ab 11 tyu 739 NA NA
#3 tg 47 rtf 745 8 3
#4 tg 47 jke 996 NA NA
Or without using the group_by
left_join(df2, df1) %>%
  mutate_at(vars(starts_with('total')),
            ~ replace(., duplicated(leadername), NA))
Or a base R option is
out <- merge(df2, df1, all.x = TRUE)
i1 <- duplicated(out$leadername)
out[i1, c("totalruns", "totalwalks")] <- NA
out
# leadername leaderid membername memberid totalruns totalwalks
#1 ab 11 gfh 589 4 9
#2 ab 11 tyu 739 NA NA
#3 tg 47 rtf 745 8 3
#4 tg 47 jke 996 NA NA
data
df1 <- structure(list(leadername = c("ab", "tg"), leaderid = c(11, 47
), totalruns = c(4, 8), totalwalks = c(9, 3)), class = "data.frame", row.names = c(NA,
-2L))
df2 <- structure(list(leadername = c("ab", "ab", "tg", "tg"), leaderid = c(11,
11, 47, 47), membername = c("gfh", "tyu", "rtf", "jke"), memberid = c(589,
739, 745, 996)), class = "data.frame", row.names = c(NA, -4L))

R: Looping over rows until condition is met, then start over in next row

I have a table with orders per customer, with timestamps. I want to know which orders happen within timeframe x after an order, and once the timeframe is over, start again with timeframe x from the next order. The new column should always say which one was the first order.
It's best to look at the example below.
I already tried some for loops with next, but could not make it work at all.
The data looks like this:
x <- data.frame("Customer" =c(123,123,123,123,123,123,123,567), "Order_nr" = c(1,2,3,4,5,6,7,1), "Order_datetime" = c('2018-11-24 00:00:25','2018-11-24 15:58:23','2018-11-24 19:10:29','2018-11-24 21:29:04','2018-11-24 22:03:59','2018-11-24 22:26:59','2018-11-24 22:36:13','2018-11-24 12:00:55'))
x
| Customer | Order_nr | Order_datetime      |
|----------|----------|---------------------|
| 123      | 1        | 2018-11-24 00:00:25 |
| 123      | 2        | 2018-11-24 15:58:23 |
| 123      | 3        | 2018-11-24 19:10:29 |
| 123      | 4        | 2018-11-24 21:29:04 |
| 123      | 5        | 2018-11-24 22:03:59 |
| 123      | 6        | 2018-11-24 22:26:59 |
| 123      | 7        | 2018-11-24 22:36:13 |
| 567      | 1        | 2018-11-24 12:00:55 |
If I wanted to know the orders within a 1h timeframe, I would like the outcome in the column "1h bundle first order"; if it were 3h, it should be the outcome of the column "3h bundle first order":
| Customer | Order_nr | Order_datetime      | 1h bundle first order | 3h bundle first order |
|----------|----------|---------------------|-----------------------|-----------------------|
| 123      | 1A       | 2018-11-24 00:00:25 | 1A                    | 1A                    |
| 123      | 2A       | 2018-11-24 15:58:23 | 2A                    | 2A                    |
| 123      | 3A       | 2018-11-24 19:10:29 | 3A                    | 3A                    |
| 123      | 4A       | 2018-11-24 21:29:04 | 4A                    | 3A                    |
| 123      | 5A       | 2018-11-24 22:03:59 | 4A                    | 3A                    |
| 123      | 6A       | 2018-11-24 22:26:59 | 4A                    | 4A                    |
| 123      | 7A       | 2018-11-24 22:36:13 | 5A                    | 4A                    |
| 567      | 1B       | 2018-11-24 12:00:55 | 1B                    | 1B                    |
So, in the "1h bundle first order" example, I need to know that orders 4A, 5A, and 6A happened within 1h starting from order 4A.
A colleague helped me find the solution:
Order_nr_3h = []
order1 = df['Order_nr']
initial_order = order1[0]            # initials needed so the loop can start with a value
time = df['Order_datetime']
initial = time[0]
customer1 = df['customer']
initial_customer = customer1[0]
for i in range(-1, len(time) - 1):
    delta = (time[i+1] - initial).seconds / 3600   # timeframe in hours (1h here)
    if customer1[i+1] != initial_customer:         # new customer: restart the bundle
        initial_order = order1[i+1]
        initial_customer = customer1[i+1]
    if delta > 1:                                  # timeframe exceeded: start a new bundle
        initial = time[i+1]
        initial_order = order1[i+1]
        Order_nr_3h.append(initial_order)
    else:
        Order_nr_3h.append(initial_order)
        continue
df['Order_nr_3h'] = Order_nr_3h
I then had another loop around it for each time interval I needed (1h, 3h, etc.).
library(data.table)   # rbindlist() is used below

so <- data.frame("Customer" = c(123, 123, 123, 123, 123, 123, 123, 567),
                 "Order_nr" = c(1, 2, 3, 4, 5, 6, 7, 1),
                 "Order_datetime" = c('2018-11-24 00:00:25', '2018-11-24 15:58:23',
                                      '2018-11-24 19:10:29', '2018-11-24 21:29:04',
                                      '2018-11-24 22:03:59', '2018-11-24 22:26:59',
                                      '2018-11-24 22:36:13', '2018-11-24 12:00:55'))

learn <- function(date_time, df, hr.within, i){
  subject <- abs(difftime(date_time, df$Order_datetime, units = "hours"))
  ifelse(i == 1,
         thatrow <- which((subject <= hr.within) == TRUE),
         thatrow <- intersect(which((subject <= hr.within) == TRUE),
                              which((subject >= hr.within - 1) == TRUE)))
  if (identical(thatrow, integer(0))) return()
  else {
    R2 <- df[thatrow, c("Customer", "Order_nr", "Order_datetime")]
    R2$x <- NA
    R2[, "x"] <- paste0(hr.within, "A")
    colnames(R2)[4] <- paste0(hr.within, "A bundle first order")
    return(R2)
  }
}

learn.wrapper <- function(date_time, df, hr.within = seq(1, 100, 1)){
  learn.out <- list()
  for (i in 1:length(hr.within)){
    learn.out[[i]] <- learn(date_time, so, hr.within[i], i)
  }
  return(rbindlist(learn.out, fill = TRUE))
}

learnery <- learn.wrapper('2018-11-24 19:00:00', df = so)   # first argument is the time you want to reference against
learnery
This assumes all activity happens within 100 hours; you may set a proper duration window via hr.within = seq(1, 100, 1) and rerun.
You may then row-merge the results yourself after reviewing the output.

I have fortnightly (fourteen) dates, I want output as 14 dates and amounts based on deal date and maturity date

My question is:
I have two tables (A and B) as below.
A
| deal_no | deal_date | deal_amnt |
|---------|-----------|-----------|
| 501     | 20180525  | 10        |
| 502     | 20180526  | 20        |
| 601     | 20180528  | 30        |
| 602     | 20180529  | 40        |
B
| deal_type | maturity_date |
|-----------|---------------|
| 501       | 20180525      |
| 502       | 20180527      |
| 601       | 20180530      |
| 602       | 20180530      |
For the same deal_no (deal_type), if deal_date from A = maturity_date from B, then the deal_amnt for the maturity_date should be the same.
E.g. for 501: if (20180525 = 20180525) then the amount = 10.
For the same deal_no (deal_type), if deal_date from A < maturity_date from B, then the deal_amnt for the maturity_date should be the same, and that same amount should also be shown under the deal_date.
E.g. for 502: if (20180526 < 20180527) then the amount for 20180527 = 20, and the amount for 20180526 should also be 20.
However, if more than one deal_date shares the same maturity_date, then the amount for the maturity_date should be the sum of those two or more deal_dates.
E.g. for 601: if (20180528 < 20180530), and for 602: if (20180529 < 20180530), then the amount for 20180528 = 30, the amount for 20180529 should be ((previous 30) + (current 40)) = 70, and the amount for 20180530 should be 70 as outstanding.
I have fortnightly (fourteen) dates; I want the output as 14 dates with the amount based on deal date and maturity date.
The output should be:
| date     | deal_amnt |
|----------|-----------|
| 20180516 | 0         |
| 20180517 | 0         |
| 20180518 | 0         |
| 20180519 | 0         |
| 20180520 | 0         |
| 20180521 | 0         |
| 20180522 | 0         |
| 20180523 | 0         |
| 20180524 | 0         |
| 20180525 | 10        |
| 20180526 | 20        |
| 20180527 | 20        |
| 20180528 | 30        |
| 20180529 | 70        |
| 20180530 | 70        |
I need help; I am using PL/SQL.
Here's one solution in pure SQL, no PL/SQL needed.
Query 1:
with dts(n, dt) as (
    select 1 n
         , date '2018-05-16'
    from dual
    union all
    select n+1
         , date '2018-05-16' + n
    from dts where n < 15
)
select dt
     , sum(deal_amnt)
from a
join b
    on a.deal_no = b.deal_type
right join dts
    on dts.dt between a.deal_date and b.maturity_date
group by dt
Results:
| DT | SUM(DEAL_AMNT) |
|----------------------|----------------|
| 2018-05-16T00:00:00Z | (null) |
| 2018-05-17T00:00:00Z | (null) |
| 2018-05-18T00:00:00Z | (null) |
| 2018-05-19T00:00:00Z | (null) |
| 2018-05-20T00:00:00Z | (null) |
| 2018-05-21T00:00:00Z | (null) |
| 2018-05-22T00:00:00Z | (null) |
| 2018-05-23T00:00:00Z | (null) |
| 2018-05-24T00:00:00Z | (null) |
| 2018-05-25T00:00:00Z | 10 |
| 2018-05-26T00:00:00Z | 20 |
| 2018-05-27T00:00:00Z | 20 |
| 2018-05-28T00:00:00Z | 30 |
| 2018-05-29T00:00:00Z | 70 |
| 2018-05-30T00:00:00Z | 70 |
