how can i melt a data.table with concatenated column names - r

I'm using dcast.data.table to convert a long data.table to a wide data.table
library(data.table)
library(reshape2)
set.seed(1234)
dt.base <- data.table(A = rep(c(1:3),2), B = rep(c(1:2),3), C=c(1:4,1,2),thevalue=rnorm(6))
#from long to wide using dcast.data.table()
dt.cast <- dcast.data.table(dt.base, A ~ B + C, value.var = "thevalue", fun = sum)
#now some stuff happens e.g., please do not bother what happens between dcast and melt
setkey(dt.cast, A)
dt.cast[2, c(2,3,4):=1,with = FALSE]
now i want to melt the data.table back again to the original column layout and here i'm stuck, how do I separate the concatenated columnames from the casted data.table, this is my problem
dt.melt <- melt(dt.cast,id.vars = c("A"), value.name = "thevalue")
I need two columns instead of one
the result that i'm looking for can be produced with this code
#update
dt.base[A==2 & B == 1 & C == 1, thevalue :=1]
dt.base[A==2 & B == 2 & C == 2, thevalue :=1]
#insert (2,1,3 was not there in the base data.table)
dt.newrow <- data.table(A=2, B=1, C=3, thevalue = 1)
dt.base <-rbindlist(list(dt.base, dt.newrow))
dt.base
As always any help is appreciated

Would that work for you?
colnames <- c("B", "C")
dt.melt[, (colnames) := (colsplit(variable, "_", colnames))][, variable := NULL]
subset(dt.melt, thevalue != 0)
# or dt.melt[thevalue != 0, ]
# A thevalue B C
#1: 1 -1.2070657 1 1
#2: 2 1.0000000 1 1
#3: 2 1.0000000 1 3
#4: 3 1.0844412 1 3
#5: 2 1.0000000 2 2
#6: 3 0.5060559 2 2
#7: 1 -2.3456977 2 4
If your data set isn't representable and there could be zeros in valid rows, here's alternative approach
colnames <- c("B", "C")
setkey(dt.melt[, (colnames) := (colsplit(variable, "_",colnames))][, variable := NULL], A, B, C)
setkey(dt.base, A, B, C)
dt.base <- dt.melt[rbind(dt.base, data.table(A = 2, B = 1, C = 3), fill = T)]
dt.base[, thevalue.1 := NULL]
## A B C thevalue
## 1: 1 1 1 -1.2070657
## 2: 1 2 4 -2.3456977
## 3: 2 1 1 1.0000000
## 4: 2 2 2 1.0000000
## 5: 3 1 3 1.0844412
## 6: 3 2 2 0.5060559
## 7: 2 1 3 1.0000000
Edit
As. suggested by #Arun, the most efficient way would be to use #AnandaMahto cSplit function, as it is using data.table too, i.e,
cSplit(dt.melt, "variable", "_")
Second Edit
In order to save the manual merges, you can set fill = NA (for example) while dcasting and then do everything in one go with csplit, e.g.
dt.cast <- dcast.data.table(dt.base, A ~ B + C, value.var = "thevalue", fun = sum, fill = NA)
setkey(dt.cast, A)
dt.cast[2, c(2,3,4):=1,with = FALSE]
dt.melt <- melt(dt.cast,id.vars = c("A"), value.name = "thevalue")
dt.cast <- cSplit(dt.melt, "variable", "_")[!is.na(thevalue)]
setnames(dt.cast, 3:4, c("B","C"))
# A thevalue B C
# 1: 1 -1.2070657 1 1
# 2: 2 1.0000000 1 1
# 3: 2 1.0000000 1 3
# 4: 3 1.0844412 1 3
# 5: 2 1.0000000 2 2
# 6: 3 0.5060559 2 2
# 7: 1 -2.3456977 2 4

Related

Replace a sequence of values by group depending on preceeding values

I have a data table of this form (2000000+ rows, 1000+groups):
set.seed(1)
dt <- data.table(id = rep(1:3, each = 5), values = sample(c("a", "b","c"), 15, TRUE))
> dt
id values
1: 1 a
2: 1 c
3: 1 a
4: 1 b
5: 1 a
6: 2 c
7: 2 c
8: 2 b
9: 2 b
10: 2 c
11: 3 c
12: 3 a
13: 3 a
14: 3 a
15: 3 b
I want to, within each ID group, replace the entire sequence of character "a", that precedes the character "b", and I want to replace them with "b". So the condition is that if "a" or a sequence of "a"s appear before "b", replace all the "a"s. (actually, in my real table, it's when "b" is preceded by "a","x", or"y", preceding character should be replaced, but I should be able to generalize)
In the example above,the value of "a" in row 3 should be replaced (easy to do with (shift) in data.table), as well as all the "a"s in rows 12-14 (not sure how to do). So, the desired output is this:
> dt
id values
1: 1 a
2: 1 c
3: 1 b
4: 1 b
5: 1 a
6: 2 c
7: 2 c
8: 2 b
9: 2 b
10: 2 c
11: 3 c
12: 3 b
13: 3 b
14: 3 b
15: 3 b
What comes to my mind is looping from the last index, but I am not exactly sure how to do that with if I have multiple groupings (say, ID and DATE), and anyway, this doesn't seem to be the fastest dt solution.
Here's another data.table approach:
dt[, x := rleid(values), by = .(id)]
dt[dt[values == "b", .(id, x=x-1, values="a")],
on = .(id, x, values),
values := "b"
][, x := NULL]
create a new column "x" with the run length ids per value grouped by id
join on itself while modifying the run length ids (x) to be the preceeding value and values to be "a" (the specific value you want to change), then update values with "b"
delete column x afterwards
The result is:
dt
# id values
# 1: 1 a
# 2: 1 c
# 3: 1 b
# 4: 1 b
# 5: 1 a
# 6: 2 c
# 7: 2 c
# 8: 2 b
# 9: 2 b
# 10: 2 c
# 11: 3 c
# 12: 3 b
# 13: 3 b
# 14: 3 b
# 15: 3 b
And here's a generalization to the case where you want to replace values "a", "x", or "y" followed by "b" with "b":
dt[, x := rleid(values), by = .(id)]
dt[dt[values == "b", .(values=c("a", "x", "y")), by = .(id, x=x-1)],
on = .(id, x, values),
values := "b"
][, x := NULL]
Late to the party and several nice run length alternatives were already provided ;) So here I try nafill instead.
(1) Create a variable 'v2' which is NA when 'values' are "a". (2) Fill missing values by next observation carried backward. (3) When the original 'values' are "a" and the corresponding filled values in 'v2' are "b", update 'v' with 'v2'.
# 1
dt[values != "a" , v2 := values]
# 2
d1[, v2 := v2[nafill(replace(seq_len(.N), is.na(v2), NA), type = "nocb")], by = id]
# 3
dt[values == "a" & v2 == "b", values := v2]
# clean-up
dt[ , v2 := NULL]
Currently, nafill only works with numeric variables, hence replace step in chunk # 2 (modified from #chinsoon12 in the issue nafill, setnafill for character, factor and other types).
The NA replacement code may be slightly shortened by using zoo::nalocf:
dt[, v2 := zoo::na.locf(v2, fromLast = TRUE, na.rm = FALSE), by = id]
However, note that na.locf is slower.
When comparing the answers on larger data (data.table(id = rep(1:1e4, each = 1e4, replace = TRUE), values = sample(c("a", "b", "c"), 1e8, replace = TRUE)), it turns out that this alternative actually is faster than the others.
This is not pretty but I think this is what you are after:
dt[, .N, by = .(id, values = paste0(values, rleid(values)))
][, values := sub("[0-9]+", "", values)
][, values := fifelse(values == "a" & shift(values, -1L) == "b" & !is.na(shift(values, -1L)), "b", values), by = id
][, .SD[rep(seq_len(.N), N)]
][, !"N"]
id values
1: 1 a
2: 1 c
3: 1 b
4: 1 b
5: 1 a
6: 2 c
7: 2 c
8: 2 b
9: 2 b
10: 2 c
11: 3 c
12: 3 b
13: 3 b
14: 3 b
15: 3 b
You can use rle().
Note: To avoid ambiguity, I rename the "values" column to "var" because the rle() function also produces a list containing a vector named "values".
dt[, new := with(rle(var), rep(ifelse(values == "a" & c(values[-1], "") == "b", "b", values), lengths)), by = id]
dt
# id var new
# 1: 1 a a
# 2: 1 c c
# 3: 1 a b
# 4: 1 b b
# 5: 1 a a
# 6: 2 c c
# 7: 2 c c
# 8: 2 b b
# 9: 2 b b
# 10: 2 c c
# 11: 3 c c
# 12: 3 a b
# 13: 3 a b
# 14: 3 a b
# 15: 3 b b

Match in lagged group in data.table

I'm trying to create a new column that indicates if an ID was present in a previous group. Here's my data:
data <- data.table(ID = c(1:3, c(9,2,3,4),c(5,1)),
groups = c(rep(c("a", "b", "c"), c(3, 4,2))))
ID groups
1: 1 a
2: 2 a
3: 3 a
4: 9 b
5: 2 b
6: 3 b
7: 4 b
8: 5 c
9: 1 c
I'm not sure how to specify lagged groups. I tried to use shift, but it's not working:
data[,.(ID=ID,match_lagged=ID %in% shift(ID)),by=groups]
Here's my desired result.
The first 3 lines are not matched because there is no previous group. FALSE would also work for these three rows. ID=4 (in group b) is not matched in group a. ID=5 (in group c) is not matched in group b.
Note that ID 1 in group c is not matched in group b so it should be false even though it exists in group a. This is why duplicated(data$ID) does not work. Data from a group has to be matched from the previous group.
groups ID match_lagged
1: a 1 NA
2: a 2 NA
3: a 3 NA
4: b 9 FALSE
5: b 2 TRUE
6: b 3 TRUE
7: b 4 FALSE
8: c 5 FALSE
9: c 1 FALSE
A dplyr solution would also work.
Number the groups, and then check if the diff is equal to one for each ID.
data[, grp.id := .GRP, by = groups]
data[, match_lagged := c(FALSE, diff(grp.id) == 1), by = ID][
grp.id == 1, match_lagged := NA][]
# ID groups grp.id match_lagged
#1: 1 a 1 NA
#2: 2 a 1 NA
#3: 3 a 1 NA
#4: 9 b 2 FALSE
#5: 2 b 2 TRUE
#6: 3 b 2 TRUE
#7: 4 b 2 FALSE
#8: 5 c 3 FALSE
#9: 1 c 3 FALSE
This assumes that you find each ID only once in each group. If that's not the case you can unique, do the above, and then merge in.
This works. There is probably an easier solution:
data <- data.frame(ID = c(1:3, 1:4,c(5,1)),
groups = c(rep(c("a", "b", "c"), c(3, 4,2))))
z <- data %>% group_by(groups) %>% summarize(all_vals = list(ID))
z <- z %>% mutate(lagged_id = lag(all_vals,1))
match_lagged <- lapply(1:nrow(z) , function(x) {
(z$all_vals[x] %>% unlist) %in% (z$lagged_id[x] %>% unlist)
})
data$match_lagged = match_lagged %>% unlist

Column order of `.SD` in j argument differs when `get()` is used

I very often transform subsets of data using the .SDcols option in data.table. It makes sense that the .SD columns sent to j are in the same order as the original data.table.
EDITED to properly identify the issue
It's nice that .SD columns have the same order as that specified in the .SDcols argument. This does not happen when get is used in the j argument (inside an lapply call, at least). In this case, the .SD table columns maintain their original order.
Is there any way to override this behaviour?
An example without get works fine
# library(data.table)
dt = data.table(col1 = rep(LETTERS[1:3], 4),
b = rnorm(12),
a = 1:12,
c = LETTERS[1:12])
# columns I want to do something to
d.vars = c('a', 'b') #' names in different order than names(dt)
# Generate columns of first differences by group
dt[, paste('d', d.vars, sep='.') :=
lapply(.SD, function(L) L - shift(L, n = 1, type='lag') ),
keyby = col1, .SDcols = d.vars]
The result is assigns differenced values to the "wrong" column because my named vector (d.vars) is ordered differently than the columns in dt. The result is:
The results are as expected, the .SD table's columns are ordered the same way as the names in d.vars.
> dt
col1 b a c d.a d.b
1: A -0.28901751 1 A NA NA
2: A 0.65746901 4 D 3 0.94648651
3: A -0.10602462 7 G 3 -0.76349362
4: A -0.38406252 10 J 3 -0.27803790
5: B -1.06963450 2 B NA NA
6: B 0.35137273 5 E 3 1.42100723
7: B 0.43394046 8 H 3 0.08256772
8: B 0.82525042 11 K 3 0.39130996
9: C 0.50421710 3 C NA NA
10: C -1.09493665 6 F 3 -1.59915375
11: C -0.04858163 9 I 3 1.04635501
12: C 0.45867279 12 L 3 0.50725443
Which is the expected output because lapply in j processed column a first and b second, in spite of the column order in dt.
Example with get behaves differently
dt2 = data.table(col1 = rep(LETTERS[1:3], 4),
b = rnorm(12),
a = 1:12,
neg = -1,
c = LETTERS[1:12])
# columns I want to do something to
d.vars = c('a', 'b') #' names in different order than names(dt)
# name of variable to be called in j.
negate <- 'neg'
dt2[, paste('d', d.vars, sep='.') :=
lapply(.SD, function(L) {(L - shift(L, n = 1, type='lag') ) * get(negate) }),
keyby = col1, .SDcols = d.vars]
Now the naming of the newly created columns doesn't align with the name order in d.vars:
> dt2
col1 b a neg c d.a d.b
1: A -0.3539066 1 -1 A NA NA
2: A 0.2702374 4 -1 D -0.62414408 -3
3: A -0.7834941 7 -1 G 1.05373150 -3
4: A -1.2765652 10 -1 J 0.49307118 -3
5: B -0.2936422 2 -1 B NA NA
6: B -0.2451996 5 -1 E -0.04844252 -3
7: B -1.6577614 8 -1 H 1.41256181 -3
8: B 1.0668059 11 -1 K -2.72456737 -3
9: C -0.1160938 3 -1 C NA NA
10: C -0.7940771 6 -1 F 0.67798333 -3
11: C 0.2951743 9 -1 I -1.08925140 -3
12: C -0.4508854 12 -1 L 0.74605969 -3
In this second example the b column is processed by lapply first and therefore assigned to d.a.
If I refer to neg directly (i.e., I don't use get) then the results are as expected: lapply processes the .SD columns in the order given in d.vars.
p.s. Thanks data.table team! I love this package!
Based on the description, we can use match to match the 'd.vars' and the column names of 'dt' ('d.vars1') and then use it to get the order right
d.vars1 <- d.vars[match(names(dt), d.vars, nomatch = 0)]
dt[, paste0("d.",d.vars1) := lapply(.SD, function(L)
L - shift(L, n = 1, type='lag') ), keyby = col1, .SDcols = d.vars1]
dt
# col1 b a c d.b d.a
# 1: A -0.28901751 1 A NA NA
# 2: A 0.65746901 4 D 0.94648652 3
# 3: A -0.10602462 7 G -0.76349363 3
# 4: A -0.38406252 10 J -0.27803790 3
# 5: B -1.06963450 2 B NA NA
# 6: B 0.35137273 5 E 1.42100723 3
# 7: B 0.43394046 8 H 0.08256773 3
# 8: B 0.82525042 11 K 0.39130996 3
# 9: C 0.50421710 3 C NA NA
#10: C -1.09493665 6 F -1.59915375 3
#11: C -0.04858163 9 I 1.04635502 3
#12: C 0.45867279 12 L 0.50725442 3
Update
Based on the new dataset
d.vars1 <- d.vars[match(names(dt2), d.vars, nomatch = 0)]
dt2[, paste0('d.', d.vars1) := lapply(.SD, function(L)
L - shift(L, n = 1, type='lag') * get(negate) ),
keyby = col1, .SDcols = d.vars1]
dt2
# col1 b a neg c d.b d.a
# 1: A -0.3539066 1 -1 A NA NA
# 2: A 0.2702374 4 -1 D -0.0836692 5
# 3: A -0.7834941 7 -1 G -0.5132567 11
# 4: A -1.2765652 10 -1 J -2.0600593 17
# 5: B -0.2936422 2 -1 B NA NA
# 6: B -0.2451996 5 -1 E -0.5388418 7
# 7: B -1.6577614 8 -1 H -1.9029610 13
# 8: B 1.0668059 11 -1 K -0.5909555 19
# 9: C -0.1160938 3 -1 C NA NA
#10: C -0.7940771 6 -1 F -0.9101709 9
#11: C 0.2951743 9 -1 I -0.4989028 15
#12: C -0.4508854 12 -1 L -0.1557111 21

ifelse function group in group in R

I have data set
ID <- c(1,1,2,2,2,2,3,3,3,3,3,4,4,4)
Eval <- c("A","A","B","B","A","A","A","A","B","B","A","A","A","B")
med <- c("c","d","k","k","h","h","c","d","h","h","h","c","h","k")
df <- data.frame(ID,Eval,med)
> df
ID Eval med
1 1 A c
2 1 A d
3 2 B k
4 2 B k
5 2 A h
6 2 A h
7 3 A c
8 3 A d
9 3 B h
10 3 B h
11 3 A h
12 4 A c
13 4 A h
14 4 B k
I try to create variable x and y, group by ID and Eval. For each ID, if Eval = A, and med = "h" or "k", I set x = 1, other wise x = 0, if Eval = B and med = "h" or "k", I set y = 1, other wise y = 0. I use the way I don't like it, I got answer but it seem like not that great
df <- data.table(df)
setDT(df)[, count := uniqueN(med) , by = .(ID,Eval)]
setDT(df)[Eval == "A", x:= ifelse(count == 1 & med %in% c("k","h"),1,0), by=ID]
setDT(df)[Eval == "B", y:= ifelse(count == 1 & med %in% c("k","h"),1,0), by=ID]
ID Eval med count x y
1: 1 A c 2 0 NA
2: 1 A d 2 0 NA
3: 2 B k 1 NA 1
4: 2 B k 1 NA 1
5: 2 A h 1 1 NA
6: 2 A h 1 1 NA
7: 3 A c 3 0 NA
8: 3 A d 3 0 NA
9: 3 B h 1 NA 1
10: 3 B h 1 NA 1
11: 3 A h 3 0 NA
12: 4 A c 2 0 NA
13: 4 A h 2 0 NA
14: 4 B k 1 NA 1
Then I need to collapse the row to get unique ID, I don't know how to collapse rows, any idea?
The output
ID x y
1 0 0
2 1 1
3 0 1
4 0 1
We create the 'x' and 'y' variables grouped by 'ID' without the NA elements directly coercing the logical vector to binary (as.integer)
df[, x := as.integer(Eval == "A" & count ==1 & med %in% c("h", "k")) , by = ID]
and similarly for 'y'
df[, y := as.integer(Eval == "B" & count ==1 & med %in% c("h", "k")) , by = ID]
and summarise it, using any after grouping by "ID"
df[, lapply(.SD, function(x) as.integer(any(x))) , ID, .SDcols = x:y]
# ID x y
#1: 1 0 0
#2: 2 1 1
#3: 3 0 1
#4: 4 0 1
If we need a compact approach, instead of assinging (:=), we summarise the output grouped by "ID", "Eval" based on the conditions and then grouped by 'ID', we check if there is any TRUE values in 'x' and 'y' by looping over the columns described in the .SDcols.
setDT(df)[, if(any(uniqueN(med)==1 & med %in% c("h", "k"))) {
.(x= Eval=="A", y= Eval == "B") } else .(x=FALSE, y=FALSE),
by = .(ID, Eval)][, lapply(.SD, any) , by = ID, .SDcols = x:y]
# ID x y
#1: 1 FALSE FALSE
#2: 2 TRUE TRUE
#3: 3 FALSE TRUE
#4: 4 FALSE TRUE
If needed, we can convert to binary similar to the approach showed in the first solution.
The OP's goal...
"I try to create variable x and y, group by ID and Eval. For each ID, if Eval = A, and med = "h" or "k", I set x = 1, other wise x = 0, if Eval = B and med = "h" or "k", I set y = 1, other wise y = 0. [...] Then I need to collapse the row to get unique ID"
can be simplified to...
For each ID and Eval, flag if all med values are h or all med values are k.
setDT(df) # only do this once
df[, all(med=="k") | all(med=="h"), by=.(ID,Eval)][, dcast(.SD, ID ~ Eval, fun=any)]
ID A B
1: 1 FALSE FALSE
2: 2 TRUE TRUE
3: 3 FALSE TRUE
4: 4 FALSE TRUE
To see what dcast is doing, read ?dcast and try running just the first part on its own, df[, all(med=="k") | all(med=="h"), by=.(ID,Eval)].
The change to use x and y instead of A and B is straightforward but ill-advised (since unnecessary renaming can be confusing and lead to extra work when there are new Eval values); and ditto the change for 1/0 instead of TRUE/FALSE (since the values captured are actually boolean).
Here is my dplyr solution since I find it more readable than data.table.
library(dplyr)
df %>%
group_by(ID, Eval) %>%
mutate(
count = length(unique(med)),
x = ifelse(Eval == "A" &
count == 1 & med %in% c("h", "k"), 1, 0),
y = ifelse(Eval == "B" &
count == 1 & med %in% c("h", "k"), 1, 0)
) %>%
group_by(ID) %>%
summarise(x1 = max(unique(x)),
y1 = max(unique(y)))
A one liner solution for collapsing the rows of your result :
df[,lapply(.SD,function(i) {ifelse(1 %in% i,ifelse(!0 %in% i,1,0),0)}),.SDcols=x:y,by=ID]
ID x y
1: 1 0 0
2: 2 1 1
3: 3 0 1
4: 4 0 1

How to reorder data.table columns (without copying)

I'd like to reorder columns in my data.table x, given a character vector of column names, neworder:
library(data.table)
x <- data.table(a = 1:3, b = 3:1, c = runif(3))
neworder <- c("c", "b", "a")
Obviously I could do:
x[ , neworder, with = FALSE]
# or
x[ , ..neworder]
# c b a
# 1: 0.8476623 3 1
# 2: 0.4787768 2 2
# 3: 0.3570803 1 3
but that would require copying the entire dataset again. Is there another way to do this?
Use setcolorder():
library(data.table)
x <- data.table(a = 1:3, b = 3:1, c = runif(3))
x
# a b c
# [1,] 1 3 0.2880365
# [2,] 2 2 0.7785115
# [3,] 3 1 0.3297416
setcolorder(x, c("c", "b", "a"))
x
# c b a
# [1,] 0.2880365 3 1
# [2,] 0.7785115 2 2
# [3,] 0.3297416 1 3
From ?setcolorder:
In data.table parlance, all set* functions change their input by reference. That is, no copy is made at all, other than temporary working memory, which is as large as one column.
so should be pretty efficient. See ?setcolorder for details.
One may find it easier to use the above solution, but instead sort by column number. For example:
library(data.table)
> x <- data.table(a = 1:3, b = 3:1, c = runif(3))
> x
a b c
[1,] 1 3 0.2880365
[2,] 2 2 0.7785115
[3,] 3 1 0.3297416
> setcolorder(x, c(3,2,1))
> x
c b a
[1,] 0.2880365 3 1
[2,] 0.7785115 2 2
[3,] 0.3297416 1 3

Resources