Merge 2 columns in R - r

I have a data set with columns I'd like to merge similar to this:
library(data.table)
DF <- as.data.table(list(ID = c(1,2,3,4,5), Product = c('Y', NA, NA, 'Z', NA), Type = c(NA, 'D', 'G', NA, NA)))
DF
ID Product Type
1 Y NA
2 NA D
3 NA G
4 Z NA
5 NA NA
which I would like to look like this:
DF
ID Product Type Category
1 Y NA Y
2 NA D D
3 NA G G
4 Z NA Z
5 NA NA NA
My Code is:
DF[,Category := na.omit(c(Product,Type)), by = ID][,c("Product","Type"):=NULL]
The problem that I have is that I would like to have for the Category to be NA when both Product and Type are NAs. Also, I don't know if my code works because my data set has over 200,000 rows.

DF[ , Category := ifelse(is.na(Product), Type, Product)]
# ID Product Type Category
#1: 1 Y NA Y
#2: 2 NA D D
#3: 3 NA G G
#4: 4 Z NA Z
#5: 5 NA NA NA
This is assuming if there are values for both Product and Type, you want Product in Category

We can do this in two assignments and avoid ifelse, because assignment in place (:=) is faster and more efficient.
DF[, Category := Product][is.na(Product), Category := Type][]
# ID Product Type Category
#1: 1 Y NA Y
#2: 2 NA D D
#3: 3 NA G G
#4: 4 Z NA Z
#5: 5 NA NA NA
Or if we assume that there will be only a maximum 1 non-NA value per row for Product/Type, then pmax can be used.
DF[, Category := pmax(Product, Type, na.rm = TRUE)][]
# ID Product Type Category
#1: 1 Y NA Y
#2: 2 NA D D
#3: 3 NA G G
#4: 4 Z NA Z
#5: 5 NA NA NA
Benchmarks
DF1 <- DF[rep(1:nrow(DF), 1e6)]
DF2 <- copy(DF1)
DF3 <- copy(DF1)
system.time(DF1[, Category := Product][is.na(Product), Category := Type])
# user system elapsed
# 0.16 0.06 0.17
system.time(DF2[ , Category := ifelse(is.na(Product), Type, Product)])
# user system elapsed
# 1.35 0.19 1.53
system.time(DF3[ ,Category := pmax(Product, Type, na.rm = TRUE)])
# user system elapsed
# 0.04 0.02 0.06
EDIT: Updated with the benchmarks and it clearly shows both the methods mentioned in my post are efficient.

Related

loop over 2 datasets in R to match the value of all rows from one dataset with only one column of another dataset

I am trying to write a loop in R to perform some iteration on two datasets called datasetA and datasetB.
datasetA has 600 entries and datasetB has 200’000 entries.
For each entry in datasetA, I want to perform the following:
If the value of V2 in both datasets are equal,
then calculate the ppm:
(datasetA$V3 - datasetB$V3) / datasetA$V3 * 1000000
If |ppm| < 10, then paste the ppm value into column V4 of datasetB and paste the corresponding name from datasetA$V1 into column V1 of datasetB.
Say this is datasetA with 600 entries:
# Example rows of datasetA; read.table() auto-names the columns V1, V2, V3.
datasetA <- read.table(text='Alex 1 50.00042
John 1 60.000423
Janine 3 88.000123
Aline 3 117
Mark 2 79.9999')
# R is case-sensitive: the object is `datasetA`, not `DatasetA`.
datasetA
and this is an example of datasetB with 200000 entries:
# Example rows of datasetB; read.table() auto-names the columns V1..V4.
# V1 and V4 are all NA here and are the columns to be filled in.
datasetB <- read.table(text='NA 1 50.0001 NA
NA 1 50.00032 NA
NA 2 70 NA
NA 2 80 NA
NA 3 88.0004 NA
NA 3 100 NA
NA 3 101 NA
NA 2 102 NA')
# R is case-sensitive: the object is `datasetB`, not `DatasetB`.
datasetB
The final table should look like this:
datasetC <- read.table(text='Alex 1 50.0001 6.459945
Alex 1 50.00032 2.059983
NA 2 70 NA
Mark 2 80 -1.25
Janine 3 88.0004 -3.14772
NA 3 100 NA
NA 3 101 NA
NA 2 102 NA')
This is the loop I have tried so far:
# Fill V1 (name) and V4 (ppm) of a copy of datasetB from matching datasetA rows.
# A datasetA row i matches a datasetB row j when both share the same V2 and
# the relative deviation of V3, in parts per million of datasetA's V3, is
# below 10 in absolute value. If several datasetA rows match the same
# datasetB row, the last one wins (same as the original loop order).
data <- datasetB
# Iterate over the actual number of rows instead of hard-coded 5 and 8, so
# the loop also works on the full-size tables (and on empty ones).
for (i in seq_len(nrow(datasetA))) {
  for (j in seq_len(nrow(datasetB))) {
    # Compute the ppm once and reuse it for both the test and the assignment.
    ppm <- (datasetA$V3[i] - datasetB$V3[j]) / datasetA$V3[i] * 10^6
    # `&&` is the scalar, short-circuiting operator intended for `if`.
    if (datasetA$V2[i] == datasetB$V2[j] && abs(ppm) < 10) {
      data[j, 1] <- datasetA[i, 1]
      data[j, 4] <- ppm
    }
  }
}
data
Try this: I am a R noob but let me know if this works for you.
library(data.table)
datasetA<- read.table(text='Alex 1 50.00042
John 1 60.000423
Janine 3 88.000123
Aline 3 117
Mark 2 79.9999')
datasetB<- read.table(text='NA 1 50.0001 NA
NA 1 50.00032 NA
NA 2 70 NA
NA 2 80 NA
NA 3 88.0004 NA
NA 3 100 NA
NA 3 101 NA
NA 2 102 NA')
# I renamed columns for my own reference, V1,V2,.. were a bit confusing
names(datasetA) <- c("Name", "ID", "ValueA")
names(datasetB) <- c("V1", "ID", "ValueB", "V4")
# Create a key for each row in datasetB
datasetB$key <- seq(nrow(datasetB))
# Left join A to B on column ID, but first set them as data table
datasetB <- as.data.table(datasetB)
datasetA <- as.data.table(datasetA)
# Using base join but you can also use data table left join see below
datasetC <- merge(x = datasetB, y = datasetA, by = c("ID"), all.x = TRUE)
# Create PPM column
datasetC[, c("ppm") := 1000000*(ValueA - ValueB)/ValueA, ]
# Filter on PPM and keep columns we need
datasetC <- datasetC[abs(ppm) < 10, list(key,Name,ppm)]
# Left join to datasetB on key
setkey(datasetC, key)
setkey(datasetB, key)
datasetB <- datasetC[datasetB]
# Keep columns we need and rename to V1,... as requested
datasetB <- datasetB[, list(V1 = Name, V2 = ID, V3 = ValueB, V4 = ppm)]
The following answer seems to do what the question asks for but I am failing to get 2 of the computed values, final column V4.
AV2 <- sort(unique(datasetA$V2))
res <- lapply(AV2, function(v2){
inx_a <- datasetA[['V2']] == v2
inx_b <- datasetB[['V2']] == v2
mrg <- merge(datasetA[inx_a, ], datasetB[inx_b, ], by = 'V2')
ppm <- ((mrg$V3.x - mrg$V3.y)/mrg$V3.x)*1000000
cbind(mrg[abs(ppm) < 10, c(2, 1, 5)], ppm = ppm[abs(ppm) < 10])
})
res <- do.call(rbind, res)
names(res) <- paste0('V', 1:4)
row.names(res) <- NULL
final <- merge(res, datasetB, by = c('V2', 'V3'), all.y = TRUE)[c(3, 1, 2, 4)]
names(final) <- paste0('V', 1:4)
final
# V1 V2 V3 V4
#1 Alex 1 50.00010 6.399946
#2 Alex 1 50.00032 1.999983
#3 <NA> 2 70.00000 NA
#4 Mark 2 80.00000 -1.250002
#5 <NA> 2 102.00000 NA
#6 Janine 3 88.00040 -3.147723
#7 <NA> 3 100.00000 NA
#8 <NA> 3 101.00000 NA
If I understand correctly, the question is asking for a join with a complex condition. This can be implemented using data.table:
library(data.table)
# Join datasetA (prefix x.) to every row of datasetB (prefix i.) on V2,
# keeping only the first matching datasetA row per datasetB row.
setDT(datasetA)[setDT(datasetB), on = "V2", {
  # ppm is relative to datasetA's value, as defined in the question:
  # (datasetA$V3 - datasetB$V3) / datasetA$V3 * 1e6
  ppm <- (x.V3 - i.V3) / x.V3 * 1E6
  # Keep name and ppm only where the deviation is within +/- 10 ppm.
  list(V1 = ifelse(abs(ppm) < 10, x.V1, NA_character_),
       V2,
       V3 = i.V3,
       V4 = ifelse(abs(ppm) < 10, ppm, NA_real_))
}, mult = "first"]
V1 V2 V3 V4
1: Alex 1 50.00010 6.399946
2: Alex 1 50.00032 1.999983
3: <NA> 2 70.00000 NA
4: Mark 2 80.00000 -1.250002
5: Janine 3 88.00040 -3.147723
6: <NA> 3 100.00000 NA
7: <NA> 3 101.00000 NA
8: <NA> 2 102.00000 NA
Here is an alternative approach which updates datasetB in place by an update join:
library(data.table)
# Build the candidate matches: all datasetA/datasetB pairs sharing V2.
# In the joined table V3 is datasetA's value and i.V3 is datasetB's value.
tmp <- setDT(datasetA)[setDT(datasetB), on = "V2"][
  # ppm relative to datasetA's V3, as defined in the question
  , V4 := (V3 - i.V3) / V3 * 1E6][abs(V4) < 10][, i.V1 := NULL]
# Make the target columns typed so they can receive names and ppm values.
datasetB[, `:=`(V1 = as.character(V1), V4 = as.double(V4))]
# Update datasetB by reference from the matches found above.
datasetB[tmp, on = .(V2, V3 = i.V3), `:=`(V1 = i.V1, V4 = i.V4)][]
V1 V2 V3 V4
1: Alex 1 50.00010 6.399946
2: Alex 1 50.00032 1.999983
3: <NA> 2 70.00000 NA
4: Mark 2 80.00000 -1.250002
5: Janine 3 88.00040 -3.147723
6: <NA> 3 100.00000 NA
7: <NA> 3 101.00000 NA
8: <NA> 2 102.00000 NA

Match Dataframes Excluding Last Non-NA Value and disregarding order

I have two dataframes:
Partner<-c("Alpha","Beta","Zeta")
COL1<-c("A","C","M")
COL2<-c("B","D","K")
COL3<-c("C","F",NA)
COL4<-c("D",NA,NA)
df1<-data.frame(Partner,COL1,COL2,COL3,COL4)
lift<-c(9,10,11,12,12,23,12,24)
RULE1<-c("B","B","D","A","C","K","M","K")
RULE2<-c("A","A","C","B","A","M","T","M")
RULE3<-c("G","D","M","C" ,"M", "E",NA,NA)
RULE4<-c(NA,NA,"K","D" ,NA, NA,NA,NA)
df2<-data.frame(lift,RULE1,RULE2,RULE3,RULE4)
df1
Partner COL1 COL2 COL3 COL4
Alpha A B C D
Beta C D F NA
Zeta M K NA NA
df2
lift RULE1 RULE2 RULE3 RULE4
9 B A G NA
10 B A D NA
11 D C M K
12 A B C D
12 C A M NA
23 K M E NA
12 M T NA NA
24 K M NA NA
This is a market basket analysis. df1 is the customer/partner that bought each of the items listed: A, B, C...etc.
df2 are the recommendations associated with the items bought in the past.
The last value in each of the df2 rows represent the recommendation. So the preceding values in each row from the last non-NA value are the "baskets".
So for example in the first row of df2, it is stating: If B and A are bought together, recommend G.
I want to be able to figure out if each partner from df1 bought ALL the values in each row excluding the final value since that is the recommendation. Then add that recommendation to the end of each row of the new dataframe.
For example:
For partner: Alpha, would it be good to recommend value G from the first row? Answer would be yes because they bought all the values from that row in df2 (A and B).
For partner: Beta, it would not be good to recommend value G because not all of the values from the first row of df2 are found in the Beta row.
Final Output:
Partner COL1 COL2 COL3 COL4 lift RULE1 RULE2 RULE3 RULE4 Does Last Non-NA Value Exist in Row?
Alpha A B C D 9 B A G NA No
Alpha A B C D 10 B A D NA Yes
Alpha A B C D 12 A B C D Yes
Alpha A B C D 12 C A M NA No
Zeta M K NA NA 23 K M E NA No
Zeta M K NA NA 12 M T NA NA No
Zeta M K NA NA 24 K M NA NA Yes
Written out results for clarity:
df3
row1 outputs "No" because G is not found in Alpha Partner and all values before G show up in Alpha Partner (B,A)
row2 outputs "Yes" because D is found in Alpha Partner and all values before D show up in Alpha Partner (B,A)
row3 outputs "Yes" because D is found in Alpha Partner and all values before D show up in Alpha Partner (A,B,C)
row4 outputs "No" because M is not found in Alpha Partner and all values before M show up in Alpha Partner (C,A)
row5 outputs "No" because E is not found in Zeta Partner and all values before E show up in Zeta Partner (K,M)
row6 outputs "No" because T is not found in Zeta Partner and all values before T show up in Zeta Partner (M)
row7 outputs "Yes" because M is found in Zeta Partner and all values before M show up in Zeta Partner (K)
I think that has to be a join or a match of some kind but can't figure out how to do it.
This would be extremely helpful if someone can help me out with this.
Thanks.
This was the attempt:
df1<-cbind(df1_id=1:nrow(df1),df1)
df2 <- cbind(df2_id=1:nrow(df2),df2)
d11 <- df1 %>% gather(Col, Value,starts_with("C")) #Long
d11 <- d11 %>% na.omit() %>%group_by(df1_id) %>% slice(-n()) #remove last non NA
d22 <- df2 %>% gather(Rule, Value,starts_with("R"))
res <- inner_join(d11,d22)
rm(d22)
rm(d11)
final<-cbind(df1[res$df1_id,],df2[res$df2_id,])
final$Exist <- apply(final, 1, FUN = function(x)
c("No", "Yes")[(anyDuplicated(x[!is.na(x) & x != "" ])!=0) +1])
But this didn't work because it didn't take all of the values into account, only if one of them matched...not all.
This is quite tricky because the purchases of n customers have to be compared to a set of m rules. Besides this, there are two points which add to the complexity:
The last non-NA RULE column in df2 is semantically different from the others. Unfortunately, the given data structure doesn't reflect this. So, df2 is missing an explicit recommended column.
Finally, it has to be determined whether a partner already has purchased the recommended item.
The approach below relies on melt(), dcast() and join operations of the data.table package for performance reasons. However, in order to avoid creation of cartesian crossproduct of n * m rows, a loop is used.
EDIT The dcast() has been moved out of the lapply() function.
Prepare data for n:m join
library(data.table)
# convert to data.table and add row numbers
# here, a copy is used instead of setDT() in order to rename the data.tables
purchases <- as.data.table(df1)[, rnp := seq_len(.N)]
rules <- as.data.table(df2)[, rnr := seq_len(.N)]
# prepare purchases for joins
lp <- melt(purchases, id.vars = c("rnp", "Partner"), na.rm = TRUE)
wp <- dcast(lp, rnp ~ value, drop = FALSE)
wp
# rnp A B C D F K M
#1: 1 A B C D NA NA NA
#2: 2 NA NA C D F NA NA
#3: 3 NA NA NA NA NA K M
# prepare rules
lr <- melt(rules, id.vars = c("rnr", "lift"), na.rm = TRUE)
# identify last column of each rule which becomes the recommendation
rn_of_last_col <- lr[, last(.I), by = rnr][, V1]
# reshape from long to wide without recommendation
wr <- dcast(lr[-rn_of_last_col], rnr ~ value)
# add column with recommendations (kind of cbind, no join)
wr[, recommended := lr[rn_of_last_col, value]]
wr
# rnr A B C D K M recommended
#1: 1 A B NA NA NA NA G
#2: 2 A B NA NA NA NA D
#3: 3 NA NA C D NA M K
#4: 4 A B C NA NA NA D
#5: 5 A NA C NA NA NA M
#6: 6 NA NA NA NA K M E
#7: 7 NA NA NA NA NA M T
#8: 8 NA NA NA NA K NA M
Combine rules and purchases
combi <- rbindlist(
# implied loop over rules to find matching purchases for each rule
lapply(seq_len(nrow(rules)), function(i) {
# get col names except last col which is the recommendation
cols <- lr[rnr == i, value[-.N]]
# join single rule with all partners on relevant cols for this rule
wp[wr[i, .SD, .SDcols = c(cols, "rnr", "recommended")], on = cols, nomatch = 0]
})
)
# check if recommendation was purchased already
combi[, already_purchased := Reduce(`|`, lapply(.SD, function(x) x == recommended)),
.SDcols = -c("rnp", "rnr", "recommended")]
# clean up already purchased
combi[is.na(already_purchased), already_purchased := FALSE
][, already_purchased := ifelse(already_purchased, "Yes", "No")]
combi
# rnp A B C D F K M rnr recommended already_purchased
#1: 1 A B C D NA NA NA 1 G No
#2: 1 A B C D NA NA NA 2 D Yes
#3: 1 A B C D NA NA NA 4 D Yes
#4: 1 A B C D NA NA NA 5 M No
#5: 3 NA NA NA NA NA K M 6 E No
#6: 3 NA NA NA NA NA K M 7 T No
#7: 3 NA NA NA NA NA K M 8 M Yes
In creating combi, the trick is to join only on those columns which are included in each rule. This is why the join needs to be done for each rule separately.
Essentially, we are done now. However, it doesn't look like the desired output.
Final joins
tmp_rules <- rules[combi[, .(rnp, rnr, recommended, already_purchased)], on = "rnr"]
tmp_purch <- purchases[combi[, .(rnp, rnr)], on = "rnp"]
result <- tmp_purch[tmp_rules, on = c("rnp", "rnr")]
result[, (c("rnp", "rnr")) := NULL]
result
# Partner COL1 COL2 COL3 COL4 lift RULE1 RULE2 RULE3 RULE4 recommended already_purchased
#1: Alpha A B C D 9 B A G NA G No
#2: Alpha A B C D 10 B A D NA D Yes
#3: Alpha A B C D 12 A B C D D Yes
#4: Alpha A B C D 12 C A M NA M No
#5: Zeta M K NA NA 23 K M E NA E No
#6: Zeta M K NA NA 12 M T NA NA T No
#7: Zeta M K NA NA 24 K M NA NA M Yes

Column order of `.SD` in j argument differs when `get()` is used

I very often transform subsets of data using the .SDcols option in data.table. It makes sense that the .SD columns sent to j are in the same order as the original data.table.
EDITED to properly identify the issue
It's nice that .SD columns have the same order as that specified in the .SDcols argument. This does not happen when get is used in the j argument (inside an lapply call, at least). In this case, the .SD table columns maintain their original order.
Is there any way to override this behaviour?
An example without get works fine
# library(data.table)
dt = data.table(col1 = rep(LETTERS[1:3], 4),
b = rnorm(12),
a = 1:12,
c = LETTERS[1:12])
# columns I want to do something to
d.vars = c('a', 'b') #' names in different order than names(dt)
# Generate columns of first differences by group
dt[, paste('d', d.vars, sep='.') :=
lapply(.SD, function(L) L - shift(L, n = 1, type='lag') ),
keyby = col1, .SDcols = d.vars]
The results are as expected: even though my named vector (d.vars) is ordered differently than the columns in dt, the .SD table's columns are ordered the same way as the names in d.vars, so the differenced values are assigned to the right columns. The result is:
> dt
col1 b a c d.a d.b
1: A -0.28901751 1 A NA NA
2: A 0.65746901 4 D 3 0.94648651
3: A -0.10602462 7 G 3 -0.76349362
4: A -0.38406252 10 J 3 -0.27803790
5: B -1.06963450 2 B NA NA
6: B 0.35137273 5 E 3 1.42100723
7: B 0.43394046 8 H 3 0.08256772
8: B 0.82525042 11 K 3 0.39130996
9: C 0.50421710 3 C NA NA
10: C -1.09493665 6 F 3 -1.59915375
11: C -0.04858163 9 I 3 1.04635501
12: C 0.45867279 12 L 3 0.50725443
Which is the expected output because lapply in j processed column a first and b second, in spite of the column order in dt.
Example with get behaves differently
dt2 = data.table(col1 = rep(LETTERS[1:3], 4),
b = rnorm(12),
a = 1:12,
neg = -1,
c = LETTERS[1:12])
# columns I want to do something to
d.vars = c('a', 'b') #' names in different order than names(dt)
# name of variable to be called in j.
negate <- 'neg'
dt2[, paste('d', d.vars, sep='.') :=
lapply(.SD, function(L) {(L - shift(L, n = 1, type='lag') ) * get(negate) }),
keyby = col1, .SDcols = d.vars]
Now the naming of the newly created columns doesn't align with the name order in d.vars:
> dt2
col1 b a neg c d.a d.b
1: A -0.3539066 1 -1 A NA NA
2: A 0.2702374 4 -1 D -0.62414408 -3
3: A -0.7834941 7 -1 G 1.05373150 -3
4: A -1.2765652 10 -1 J 0.49307118 -3
5: B -0.2936422 2 -1 B NA NA
6: B -0.2451996 5 -1 E -0.04844252 -3
7: B -1.6577614 8 -1 H 1.41256181 -3
8: B 1.0668059 11 -1 K -2.72456737 -3
9: C -0.1160938 3 -1 C NA NA
10: C -0.7940771 6 -1 F 0.67798333 -3
11: C 0.2951743 9 -1 I -1.08925140 -3
12: C -0.4508854 12 -1 L 0.74605969 -3
In this second example the b column is processed by lapply first and therefore assigned to d.a.
If I refer to neg directly (i.e., I don't use get) then the results are as expected: lapply processes the .SD columns in the order given in d.vars.
p.s. Thanks data.table team! I love this package!
Based on the description, we can use match to match the 'd.vars' and the column names of 'dt' ('d.vars1') and then use it to get the order right
d.vars1 <- d.vars[match(names(dt), d.vars, nomatch = 0)]
dt[, paste0("d.",d.vars1) := lapply(.SD, function(L)
L - shift(L, n = 1, type='lag') ), keyby = col1, .SDcols = d.vars1]
dt
# col1 b a c d.b d.a
# 1: A -0.28901751 1 A NA NA
# 2: A 0.65746901 4 D 0.94648652 3
# 3: A -0.10602462 7 G -0.76349363 3
# 4: A -0.38406252 10 J -0.27803790 3
# 5: B -1.06963450 2 B NA NA
# 6: B 0.35137273 5 E 1.42100723 3
# 7: B 0.43394046 8 H 0.08256773 3
# 8: B 0.82525042 11 K 0.39130996 3
# 9: C 0.50421710 3 C NA NA
#10: C -1.09493665 6 F -1.59915375 3
#11: C -0.04858163 9 I 1.04635502 3
#12: C 0.45867279 12 L 0.50725442 3
Update
Based on the new dataset
# Reorder d.vars to follow the column order of dt2, so the new column names
# line up with the order in which .SD columns are processed when get() is used.
d.vars1 <- d.vars[match(names(dt2), d.vars, nomatch = 0)]
# Parenthesise the difference before multiplying: `*` binds tighter than `-`,
# so without the parentheses only the lagged value would be multiplied.
dt2[, paste0('d.', d.vars1) := lapply(.SD, function(L)
  (L - shift(L, n = 1, type='lag')) * get(negate) ),
  keyby = col1, .SDcols = d.vars1]
dt2
# col1 b a neg c d.b d.a
# 1: A -0.3539066 1 -1 A NA NA
# 2: A 0.2702374 4 -1 D -0.62414408 -3
# 3: A -0.7834941 7 -1 G 1.05373150 -3
# 4: A -1.2765652 10 -1 J 0.49307118 -3
# 5: B -0.2936422 2 -1 B NA NA
# 6: B -0.2451996 5 -1 E -0.04844252 -3
# 7: B -1.6577614 8 -1 H 1.41256181 -3
# 8: B 1.0668059 11 -1 K -2.72456737 -3
# 9: C -0.1160938 3 -1 C NA NA
#10: C -0.7940771 6 -1 F 0.67798333 -3
#11: C 0.2951743 9 -1 I -1.08925140 -3
#12: C -0.4508854 12 -1 L 0.74605969 -3

Maximum value of one data.table column based on other columns

I have a R data.table
DT = data.table(x=rep(c("b","a",NA_character_),each=3), y=rep(c('A', NA_character_, 'C'), each=3), z=c(NA_character_), v=1:9)
DT
# x y z v
#1: b A NA 1
#2: b A NA 2
#3: b A NA 3
#4: a NA NA 4
#5: a NA NA 5
#6: a NA NA 6
#7: NA C NA 7
#8: NA C NA 8
#9: NA C NA 9
For each column if the value is not NA, I want to extract the max value from column v. I am using
sapply(DT, function(x) { ifelse(all(is.na(x)), NA_integer_, max(DT[['v']][!is.na(x)])) })
#x y z v
#6 9 NA 9
Is there a simpler way to achieve this?
here is a way, giving you -Inf (and a warning) if all values of the column are NA (you can later replace that by NA if you prefer):
DT[, lapply(.SD, function(x) max(v[!is.na(x)]))]
# x y z v
# 1: 6 9 -Inf 9
As suggested by @DavidArenburg, to ensure that everything goes well even when all values are NA (no warning and directly NA as result), you can do:
DT[, lapply(.SD, function(x) {
temp <- v[!is.na(x)]
if(!length(temp)) NA else max(temp)
})]
# x y z v
#1: 6 9 NA 9
We can use summarise_each from dplyr
library(dplyr)
DT %>%
summarise_each(funs(max(v[!is.na(.)])))
# x y z v
#1: 6 9 -Inf 9

List columns with NA values for each row of a data.table in R

I'd like to add a column to a data.table object which lists the column names that are NA for that row. For example let's say I have the following data.table:
dt <- data.table(a = c(1, 2, 3, NA),
b = c(1, 2, NA, NA),
c = c(NA, 2, NA, 4))
a b c
1: 1 1 NA
2: 2 2 2
3: 3 NA NA
4: NA NA 4
I'd like to add a column with these values, resulting in the below data.table:
dt[, na.cols := c("c", "", "b,c", "a,b")]
a b c na.cols
1: 1 1 NA c
2: 2 2 2
3: 3 NA NA b,c
4: NA NA 4 a,b
How can I add this column dynamically?
Here is an approach that will avoid using apply on a data.table (which coerces to matrix internally)
# Add a column listing, per row, the comma-separated names of the columns
# that are NA in that row ("" when none are).
dt[, na.cols := {
  # One character vector per column: the column's name where its value is NA,
  # an empty string otherwise.
  flags <- lapply(seq_along(.SD), function(x) {
    ifelse(is.na(.SD[[x]]), names(.SD)[x], '')
  })
  # Paste the per-column flags together row by row.
  joined <- do.call(paste, c(flags, sep = ','))
  # Non-NA columns leave empty slots: collapse runs of commas in the middle
  # (e.g. "a,,c" -> "a,c") and then drop a leading or trailing comma.
  gsub('^,|,$', '', gsub(',+', ',', joined))
}]
# a b c na.cols
# 1: 1 1 NA c
# 2: 2 2 2
# 3: 3 NA NA b,c
# 4: NA NA 4 a,b
You could do it this way:
dt[, na.cols :=
apply(dt, 1, function(row) paste(names(row)[which(is.na(row))],
collapse=","))]
Details: basically, you're using apply along margin 1 (i.e. along the rows) and then, for each row, pasting together column names that are NA.

Resources