library(data.table)
data = data.table("cat" = c(0,5,NA,0,0,0),
"horse" = c(0,4,2,1,1,3),
"fox" = c(2,2,NA,NA,7,0))
I wish to replace the values of 'cat' and 'fox' that are equal to 0 or 2 with -99.
I can do it one column at a time, but how can I do both at once?
# Replace 0 and 2 in 'fox' with -99 by reference (the table is named `data`);
# NA entries do not match and are left as NA.
data[fox %in% c(0, 2), fox := -99]
Another approach with data.table is using a for(...) set(...)-approach, which is in this case both fast and memory efficient:
# Columns in which the values 0 and 2 are replaced by -99.
cols <- c('fox', 'cat')
# option 1: get(j) looks up the column named by j for the row filter,
# and (j) := assigns into that same column
for (j in cols) d[get(j) %in% c(0, 2), (j) := -99]
# option 2 (thanks to @Cole): compute the matching row numbers with which()
# and pass them to set() together with the column name
for (j in cols) set(d, which(d[[j]] %in% c(0, 2)), j, value = -99)
# option 3 (thanks to @Frank): join d against the values 0 and 2 on column j
# and update the matched rows
for (j in cols) d[.(c(0,2)), on = j, (j) := -99]
which gives:
> d
cat horse fox
1: -99 0 -99
2: 5 4 -99
3: NA 2 NA
4: -99 1 NA
5: -99 1 7
6: -99 3 -99
d <- data.table("cat" = c(0,5,NA,0,0,0),
"horse" = c(0,4,2,1,1,3),
"fox" = c(2,2,NA,NA,7,0))
Here's a not-so-elegant way of doing this:
> data
cat horse fox
1: 0 0 2
2: 5 4 2
3: NA 2 NA
4: 0 1 NA
5: 0 1 7
6: 0 3 0
> # Test each column against its own values so 'fox' and 'cat' are recoded independently
> data[, c('fox', 'cat') := list(ifelse(fox %in% c(0,2), -99, fox), ifelse(cat %in% c(0,2), -99, cat))]
> data
cat horse fox
1: -99 0 -99
2: 5 4 -99
3: NA 2 NA
4: -99 1 NA
5: -99 1 7
6: -99 3 -99
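I'm listing the columns (c('cat', 'fox')) explicitly, but you could store them in a character vector mycols and assign with the := operator: data[, (mycols) := ...]. The parentheses are required; without them := would create a single column literally named mycols.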
Similarly, I'm passing a list explicitly based on the conditions - this could be better done using a function instead.
If I understand, this would work as well:
# Columns to recode; each is rewritten in place with 0 and 2 mapped to -99.
cols <- c("cat", "fox")
data[
  ,
  (cols) := lapply(.SD, function(v) fifelse(v %in% c(0, 2), -99, v)),
  .SDcols = cols
]
Related
library(data.table)
data=data.table("cat"=c(0,5,NA,0,0,0),
"fox"=c(2,0,NA,NA,7,0))
data[, Count0 := cat + fox]
data$WANT = c(1,1,NA,1,1,2)
I wish to count the 0 values in 'cat' and 'fox' for each row. My attempt is shown as 'Count0', but the desired output is 'WANT'.
library(data.table)
data <- data.table(
  cat = c(0, 5, NA, 0, 0, 0),
  fox = c(2, 0, NA, NA, 7, 0)
)
cols <- c("cat", "fox")
# Per-row number of zeros across `cols`; NA cells are ignored in the count.
data[, count0 := rowSums(data[, ..cols] == 0, na.rm = TRUE)]
# Rows where every one of `cols` is NA get NA instead of a count.
data[rowSums(!is.na(data[, ..cols])) == 0, count0 := NA_integer_]
Created on 2020-04-25 by the reprex package (v0.3.0)
We can use :
library(data.table)
cols <- c("cat", "fox")
# Count zeros per row across `cols`. Only rows in which every column is NA
# become NA; a row with values but no zeros keeps a genuine count of 0
# (blanking every `ans == 0` would wrongly turn such rows into NA).
data[
  ,
  ans := fifelse(
    rowSums(!is.na(.SD)) == 0,
    NA_real_,
    rowSums(.SD == 0, na.rm = TRUE)
  ),
  .SDcols = cols
]
data
# cat fox ans
#1: 0 2 1
#2: 5 0 1
#3: NA NA NA
#4: 0 NA 1
#5: 0 7 1
#6: 0 0 2
This question already has answers here:
Fastest way to replace NAs in a large data.table
(10 answers)
Closed 5 years ago.
A lot comes together in this question. First of all, I would like to segment the data by column c. The subsets are given by the factor c, whose levels are 1 to 4, so there are 4 distinct segments.
Next I have two columns. Column a and b.
I would like to replace the NAs with the maximum value of that column within each segment. So, for example, the NA at row 2 of column 'a' would become 30, (b,3) would be 80, (b,8) would be 50 and (a,5) would be 80.
I have created the code below that does the job, but now I need to make it automatic (like a for loop) for all segments and columns. How could I do this?
a <- c(10,NA,30,40,NA,60,70,80,90,90,80,90,10,40)
b <- c(80,70,NA,50,40,30,20,NA,0,0,10,69, 40, 90)
c <- c(1,1,1,2,2,2,2,2,3,3,3,4,4,4)
a b c
1: 10 80 1
2: NA 70 1
3: 30 NA 1
4: 40 50 2
5: NA 40 2
6: 60 30 2
7: 70 20 2
8: 80 NA 2
9: 90 0 3
10: 90 0 3
11: 80 10 3
12: 90 69 4
13: 10 40 4
14: 40 90 4
mytable <- data.table(a,b,c)
mytable[which(is.na(mytable[c == 1][,1, with = FALSE]) == TRUE),1] <- max(mytable[c==1,1], na.rm = TRUE)
Unfortunately, this try results in an error:
for(i in unique(mytable$c)){
for(j in unique(c(1:2))){
mytable[which(is.na(mytable[c == i][,j, with = FALSE]) == TRUE),j, with = FALSE] <- max(mytable[c==i][,j, with = FALSE], na.rm = TRUE)
}
}
Error in [<-.data.table(*tmp*, which(is.na(mytable[c == i][, j, with = FALSE]) == :
unused argument (with = FALSE)
Surprisingly, this results in an error as well:
for(i in unique(mytable$c)){
for(j in unique(c(1:2))){
mytable[which(is.na(mytable[c == i][,j]) == TRUE),j] <- max(mytable[c==i,j], na.rm = TRUE)
}
}
Error in [.data.table(mytable, c == i, j) :
j (the 2nd argument inside [...]) is a single symbol but column name 'j' is not found. Perhaps you intended DT[,..j] or DT[,j,with=FALSE]. This difference to data.frame is deliberate and explained in FAQ 1.1.
library("data.table")
mytable <- data.table(
a=c(10,NA,30,40,NA,60,70,80,90,90,80,90,10,40),
b=c(80,70,NA,50,40,30,20,NA,0,0,10,69, 40, 90),
c=c(1,1,1,2,2,2,2,2,3,3,3,4,4,4))
# Replace missing values in `x` with the maximum of its non-missing values.
#
# x: a vector (here, one column within one group).
# Returns `x` with every NA replaced by the maximum of the non-NA entries.
# If `x` has no non-missing values it is returned unchanged: there is no
# maximum to fill with, and max(x, na.rm = TRUE) would warn and yield -Inf.
foo <- function(x) {
  is_missing <- is.na(x)
  if (all(is_missing)) {
    return(x)
  }
  x[is_missing] <- max(x[!is_missing])
  x
}
mytable[, .(A=foo(a), B=foo(b)), by=c]
result:
> mytable[, .(A=foo(a), B=foo(b)), by=c]
# c A B
# 1: 1 10 80
# 2: 1 30 70
# 3: 1 30 80
# 4: 2 40 50
# 5: 2 80 40
# 6: 2 60 30
# 7: 2 70 20
# 8: 2 80 50
# 9: 3 90 0
#10: 3 90 0
#11: 3 80 10
#12: 4 90 69
#13: 4 10 40
#14: 4 40 90
or for direct substitution of a and b:
mytable[, `:=`(a=foo(a), b=foo(b)), by=c] # or
mytable[, c("a", "b") := (lapply(.SD, foo)), by = c] # from #Sotos
or the safer variant (tnx to #Frank for the remark):
cols <- c("a", "b")
mytable[, (cols) := lapply(.SD, foo), by=c, .SDcols=cols]
Using data.table
library(data.table)
mytable[, a := ifelse(is.na(a), max(a, na.rm = TRUE), a), by = c]
mytable[, b := ifelse(is.na(b), max(b, na.rm = TRUE), b), by = c]
Or in a single command
mytable[, c("a", "b") := lapply(.SD, function(x) ifelse(is.na(x), max(x, na.rm = TRUE), x)), .SDcols = c("a", "b"), by = c]
Use ddply() from package plyr:
# Data frame built from the vectors a, b and c, with the segment id as a factor.
df <- data.frame(a, b, c = as.factor(c))
library(plyr)
# Within each level of c, replace NA in a and b with that segment's maximum.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
df2 <- ddply(df, .(c), transform,
             a = ifelse(is.na(a), max(a, na.rm = TRUE), a),
             b = ifelse(is.na(b), max(b, na.rm = TRUE), b))
I have data set
ID <- c(1,1,2,2,2,2,3,3,3,3,3,4,4,4)
Eval <- c("A","A","B","B","A","A","A","A","B","B","A","A","A","B")
med <- c("c","d","k","k","h","h","c","d","h","h","h","c","h","k")
df <- data.frame(ID,Eval,med)
> df
ID Eval med
1 1 A c
2 1 A d
3 2 B k
4 2 B k
5 2 A h
6 2 A h
7 3 A c
8 3 A d
9 3 B h
10 3 B h
11 3 A h
12 4 A c
13 4 A h
14 4 B k
I try to create variable x and y, group by ID and Eval. For each ID, if Eval = A, and med = "h" or "k", I set x = 1, other wise x = 0, if Eval = B and med = "h" or "k", I set y = 1, other wise y = 0. I use the way I don't like it, I got answer but it seem like not that great
df <- data.table(df)
setDT(df)[, count := uniqueN(med) , by = .(ID,Eval)]
setDT(df)[Eval == "A", x:= ifelse(count == 1 & med %in% c("k","h"),1,0), by=ID]
setDT(df)[Eval == "B", y:= ifelse(count == 1 & med %in% c("k","h"),1,0), by=ID]
ID Eval med count x y
1: 1 A c 2 0 NA
2: 1 A d 2 0 NA
3: 2 B k 1 NA 1
4: 2 B k 1 NA 1
5: 2 A h 1 1 NA
6: 2 A h 1 1 NA
7: 3 A c 3 0 NA
8: 3 A d 3 0 NA
9: 3 B h 1 NA 1
10: 3 B h 1 NA 1
11: 3 A h 3 0 NA
12: 4 A c 2 0 NA
13: 4 A h 2 0 NA
14: 4 B k 1 NA 1
Then I need to collapse the row to get unique ID, I don't know how to collapse rows, any idea?
The output
ID x y
1 0 0
2 1 1
3 0 1
4 0 1
We create the 'x' and 'y' variables grouped by 'ID' without the NA elements directly coercing the logical vector to binary (as.integer)
df[, x := as.integer(Eval == "A" & count ==1 & med %in% c("h", "k")) , by = ID]
and similarly for 'y'
df[, y := as.integer(Eval == "B" & count ==1 & med %in% c("h", "k")) , by = ID]
and summarise it, using any after grouping by "ID"
df[, lapply(.SD, function(x) as.integer(any(x))) , ID, .SDcols = x:y]
# ID x y
#1: 1 0 0
#2: 2 1 1
#3: 3 0 1
#4: 4 0 1
If we need a compact approach, instead of assigning (:=), we summarise the output grouped by "ID", "Eval" based on the conditions and then, grouped by 'ID', we check if there are any TRUE values in 'x' and 'y' by looping over the columns described in the .SDcols.
setDT(df)[, if(any(uniqueN(med)==1 & med %in% c("h", "k"))) {
.(x= Eval=="A", y= Eval == "B") } else .(x=FALSE, y=FALSE),
by = .(ID, Eval)][, lapply(.SD, any) , by = ID, .SDcols = x:y]
# ID x y
#1: 1 FALSE FALSE
#2: 2 TRUE TRUE
#3: 3 FALSE TRUE
#4: 4 FALSE TRUE
If needed, we can convert to binary similar to the approach showed in the first solution.
The OP's goal...
"I try to create variable x and y, group by ID and Eval. For each ID, if Eval = A, and med = "h" or "k", I set x = 1, other wise x = 0, if Eval = B and med = "h" or "k", I set y = 1, other wise y = 0. [...] Then I need to collapse the row to get unique ID"
can be simplified to...
For each ID and Eval, flag if all med values are h or all med values are k.
setDT(df) # only do this once
df[, all(med=="k") | all(med=="h"), by=.(ID,Eval)][, dcast(.SD, ID ~ Eval, fun=any)]
ID A B
1: 1 FALSE FALSE
2: 2 TRUE TRUE
3: 3 FALSE TRUE
4: 4 FALSE TRUE
To see what dcast is doing, read ?dcast and try running just the first part on its own, df[, all(med=="k") | all(med=="h"), by=.(ID,Eval)].
The change to use x and y instead of A and B is straightforward but ill-advised (since unnecessary renaming can be confusing and lead to extra work when there are new Eval values); and ditto the change for 1/0 instead of TRUE/FALSE (since the values captured are actually boolean).
Here is my dplyr solution since I find it more readable than data.table.
library(dplyr)
df %>%
group_by(ID, Eval) %>%
mutate(
count = length(unique(med)),
x = ifelse(Eval == "A" &
count == 1 & med %in% c("h", "k"), 1, 0),
y = ifelse(Eval == "B" &
count == 1 & med %in% c("h", "k"), 1, 0)
) %>%
group_by(ID) %>%
summarise(x1 = max(unique(x)),
y1 = max(unique(y)))
A one liner solution for collapsing the rows of your result :
df[,lapply(.SD,function(i) {ifelse(1 %in% i,ifelse(!0 %in% i,1,0),0)}),.SDcols=x:y,by=ID]
ID x y
1: 1 0 0
2: 2 1 1
3: 3 0 1
4: 4 0 1
DT0 = data.table(x=rep(c(NA,NA,NA)), y=c(0,1,NA), v=c(0, 0, NA), l=c(1,1,1))
DT0
# x y v l
#1: NA 0 0 1
#2: NA 1 0 1
#3: NA NA NA 1
Based on the first three cols x, y and v I want to add a new col with following output
#1: No
#2: Yes
#3: NA
NA if all three columns are NA in that row; Yes if any of them is 1; otherwise No. My current approach is
relevant_cols <- c('x', 'y', 'v')
new <- data.table(apply(DT0[, relevant_cols, with=F], 1, function(val) { ifelse(all(is.na(val)), NA_character_, ifelse(any(val == TRUE, na.rm = TRUE), 'Yes', 'No')) }))
DT0[, new:= new]
DT0
# x y v l new
#1: NA 0 0 1 No
#2: NA 1 0 1 Yes
#3: NA NA NA 1 NA
However, as the actual data.table is large, is there a better way to do this?
Edit:
Often the data.table entries are non-numeric hence it would be quite helpful if I can have a more general solution than using pmax e.g.,
DT = data.table(x=rep(c(NA,NA,NA)), y=c('No','Yes',NA), v=c('No', 'No', NA), l=c(1,1,1))
DT
# x y v l
#1: NA No No 1
#2: NA Yes No 1
#3: NA NA NA 1
Here's one option:
# Row-wise label over columns x..v:
#   "Yes" if any column equals "Yes",
#   "No"  if no column is "Yes" but at least one is non-missing,
#   NA    if every column is missing.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
DT[, new := ifelse(rowSums(.SD == "Yes", na.rm = TRUE) > 0,
                   'Yes',
                   ifelse(rowSums(is.na(.SD)) != ncol(.SD), "No", NA))
   , .SDcols = x:v]
# x y v l new
#1: NA No No 1 No
#2: NA Yes No 1 Yes
#3: NA NA NA 1 NA
pmax is pretty well-suited to this problem.
First example In the 0/1 case...
DT0[, do.call(pmax, c(na.rm=TRUE, .SD)), .SDcols=x:v]
# 0 1 NA
Second example If you've encoded 0/1 as my_lvls = c("No","Yes") instead...
DT[,
factor(
labels = my_lvls,
x = do.call(pmax, c(na.rm=TRUE, lapply(.SD, function(x)
as.integer(factor(x, levels=my_lvls)))))
)
, .SDcols=x:v]
# [1] No Yes <NA>
# Levels: No Yes
As shown in #eddi's answer, to add a new col, you can put the x:v in .SDcols and use .SD.
I'm using dcast.data.table to convert a long data.table to a wide data.table
library(data.table)
library(reshape2)
set.seed(1234)
dt.base <- data.table(A = rep(c(1:3),2), B = rep(c(1:2),3), C=c(1:4,1,2),thevalue=rnorm(6))
#from long to wide using dcast.data.table()
dt.cast <- dcast.data.table(dt.base, A ~ B + C, value.var = "thevalue", fun = sum)
#now some stuff happens e.g., please do not bother what happens between dcast and melt
setkey(dt.cast, A)
# Set columns 2-4 of row 2 to 1. The parenthesised LHS selects columns by
# position; combining `with = FALSE` with `:=` is deprecated in data.table.
dt.cast[2, (c(2, 3, 4)) := 1]
now i want to melt the data.table back again to the original column layout and here i'm stuck, how do I separate the concatenated columnames from the casted data.table, this is my problem
dt.melt <- melt(dt.cast,id.vars = c("A"), value.name = "thevalue")
I need two columns instead of one
the result that i'm looking for can be produced with this code
#update
dt.base[A==2 & B == 1 & C == 1, thevalue :=1]
dt.base[A==2 & B == 2 & C == 2, thevalue :=1]
#insert (2,1,3 was not there in the base data.table)
dt.newrow <- data.table(A=2, B=1, C=3, thevalue = 1)
dt.base <-rbindlist(list(dt.base, dt.newrow))
dt.base
As always any help is appreciated
Would that work for you?
colnames <- c("B", "C")
dt.melt[, (colnames) := (colsplit(variable, "_", colnames))][, variable := NULL]
subset(dt.melt, thevalue != 0)
# or dt.melt[thevalue != 0, ]
# A thevalue B C
#1: 1 -1.2070657 1 1
#2: 2 1.0000000 1 1
#3: 2 1.0000000 1 3
#4: 3 1.0844412 1 3
#5: 2 1.0000000 2 2
#6: 3 0.5060559 2 2
#7: 1 -2.3456977 2 4
If your data set isn't representative and there could be zeros in valid rows, here's an alternative approach
colnames <- c("B", "C")
setkey(dt.melt[, (colnames) := (colsplit(variable, "_",colnames))][, variable := NULL], A, B, C)
setkey(dt.base, A, B, C)
dt.base <- dt.melt[rbind(dt.base, data.table(A = 2, B = 1, C = 3), fill = T)]
dt.base[, thevalue.1 := NULL]
## A B C thevalue
## 1: 1 1 1 -1.2070657
## 2: 1 2 4 -2.3456977
## 3: 2 1 1 1.0000000
## 4: 2 2 2 1.0000000
## 5: 3 1 3 1.0844412
## 6: 3 2 2 0.5060559
## 7: 2 1 3 1.0000000
Edit
As suggested by @Arun, the most efficient way would be to use @AnandaMahto's cSplit function (from the splitstackshape package), as it uses data.table too, i.e.,
cSplit(dt.melt, "variable", "_")
Second Edit
In order to save the manual merges, you can set fill = NA (for example) while dcasting and then do everything in one go with cSplit, e.g.
# Cast long -> wide; fill = NA is passed so that cells can be filtered out
# again with !is.na(thevalue) after melting.
dt.cast <- dcast.data.table(dt.base, A ~ B + C, value.var = "thevalue",
                            fun.aggregate = sum, fill = NA)
setkey(dt.cast, A)
# Set columns 2-4 of row 2 to 1. The parenthesised LHS selects columns by
# position; combining `with = FALSE` with `:=` is deprecated in data.table.
dt.cast[2, (c(2, 3, 4)) := 1]
dt.melt <- melt(dt.cast, id.vars = c("A"), value.name = "thevalue")
# Split `variable` on "_" and keep only rows with a non-missing value.
dt.cast <- cSplit(dt.melt, "variable", "_")[!is.na(thevalue)]
# Rename the two columns produced by the split (positions 3 and 4).
setnames(dt.cast, 3:4, c("B", "C"))
# A thevalue B C
# 1: 1 -1.2070657 1 1
# 2: 2 1.0000000 1 1
# 3: 2 1.0000000 1 3
# 4: 3 1.0844412 1 3
# 5: 2 1.0000000 2 2
# 6: 3 0.5060559 2 2
# 7: 1 -2.3456977 2 4