library(data.table)
data=data.table("cat"=c(0,5,NA,0,0,0),
"fox"=c(2,0,NA,NA,7,0))
data[, Count0 := cat + fox]
data$WANT = c(1,1,NA,1,1,2)
I wish to count the 0 values in 'cat' and 'fox'. My attempt is shown as 'Count0', but the desired output is 'WANT'.
library(data.table)
data=data.table("cat"=c(0,5,NA,0,0,0),
"fox"=c(2,0,NA,NA,7,0))
cols <- c("cat","fox")
data[, count0 := rowSums(data[, .SD, .SDcols = cols] == 0, na.rm = TRUE)]
data[rowMeans(is.na(data[, ..cols])) == 1, count0 := NA_integer_]
We can use:
library(data.table)
cols <- c("cat","fox")
data[,ans := rowSums(.SD == 0, na.rm = TRUE), .SDcols = cols]
data[ans == 0, ans := NA]
data
#   cat fox ans
#1:   0   2   1
#2:   5   0   1
#3:  NA  NA  NA
#4:   0  NA   1
#5:   0   7   1
#6:   0   0   2
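Note that the ans == 0 step turns every zero count into NA. That is only correct here because each row contains either a zero or nothing but NAs; if a row could hold only non-zero, non-missing values, guard on the all-NA condition explicitly instead (as the first answer does):
# set ans to NA only where every counted column is missing
data[rowMeans(is.na(data[, ..cols])) == 1, ans := NA]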
I have the below df:
df <- data.table(user = c('a', 'a', 'a', 'b', 'b')
, spend = 1:5
, shift_by = c(1,1,2,1,1)
); df
user spend shift_by
1: a 1 1
2: a 2 1
3: a 3 2
4: b 4 1
5: b 5 1
I am looking to create a lagged column, only this time the n parameter of data.table's shift function is dynamic and takes df$shift_by as input. My expected result is:
df[, spend_shifted := c(NA, 1, 1, NA, 4)]; df
user spend shift_by spend_shifted
1: a 1 1 NA
2: a 2 1 1
3: a 3 2 1
4: b 4 1 NA
5: b 5 1 4
However, with the below attempt it gives:
df[, spend_shifted := shift(x=spend, n=shift_by, type="lag"), user]; df
user spend shift_by spend_shifted
1: a 1 1 NA
2: a 2 1 NA
3: a 3 2 NA
4: b 4 1 NA
5: b 5 1 NA
This is the closest example I could find; however, I need a group-by, and I am after a data.table solution because of speed. I truly look forward to any ideas.
I believe this will work. You can drop the newindex column afterwards, as shown below.
df[, newindex := rowid(user) - shift_by]
df[newindex < 0, newindex := 0]
df[newindex > 0, spend_shifted := df[, spend[newindex], by = .(user)]$V1]
# user spend shift_by newindex spend_shifted
# 1: a 1 1 0 NA
# 2: a 2 1 1 1
# 3: a 3 2 1 1
# 4: b 4 1 0 NA
# 5: b 5 1 1 4
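Dropping the helper column afterwards is a one-line delete by reference:
df[, newindex := NULL]  # remove the helper column by reference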
Here's another approach, using a data.table join. I use two helper columns to join on:
df[, row := .I, by = .(user)]
df[, match_row := row - shift_by]
df[df, on = .(user, match_row = row), x := i.spend]
df[, c('row', 'match_row') := NULL]
# user spend shift_by spend_shifted x
# 1: a 1 1 NA NA
# 2: a 2 1 1 1
# 3: a 3 2 1 1
# 4: b 4 1 NA NA
# 5: b 5 1 4 4
Using matrix subsetting of data.frames:
df[,
spend_shifted :=
data.frame(shift(spend, n = unique(sort(shift_by))))[cbind(1:.N, shift_by)],
by = user]
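For intuition: shift(spend, n = unique(sort(shift_by))) returns a list of lagged vectors, data.frame() binds them into columns, and the index matrix cbind(1:.N, shift_by) picks one element per row, namely the column matching that row's shift_by (which here happens to coincide with the column positions). A minimal standalone sketch of the matrix-subsetting idiom, with an illustrative data.frame m:
m <- data.frame(a = c(10, 20, 30), b = c(40, 50, 60))
m[cbind(1:3, c(1, 2, 1))]  # picks (1,a), (2,b), (3,a)
# [1] 10 50 30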
Another solution (in addition to Wimpel's) without shift:
df[, {rows <- 1:nrow(.SD) - shift_by; .SD[replace(rows, rows <= 0, NA), spend]},
by = user]
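As written this only returns the shifted values per group; to store them in a column, the same indexing can sit on the right-hand side of := (a sketch, indexing spend directly):
df[, spend_shifted := {rows <- seq_len(.N) - shift_by
                       spend[replace(rows, rows <= 0, NA)]},
   by = user]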
Maybe this could help:
> df[, spend_shifted := spend[replace(seq(.N) - shift_by, seq(.N) <= shift_by, NA)], user][]
user spend shift_by spend_shifted
1: a 1 1 NA
2: a 2 1 1
3: a 3 2 1
4: b 4 1 NA
5: b 5 1 4
I have carried out a benchmark test, as scalability is very important for me. df is the same as the original, only repeated 10,000,000 times, giving 50,000,000 rows:
x <- 1e7
df <- data.table(user = rep(c('a', 'a', 'a', 'b', 'b'), x)
, spend = rep(1:5, x)
, shift_by = rep(c(1,1,2,1,1), x)
); df
user spend shift_by
1: a 1 1
2: a 2 1
3: a 3 2
4: b 4 1
5: b 5 1
benchmark:
library(microbenchmark)
library(ggplot2)  # provides the autoplot() generic used below
a <-
microbenchmark(wimpel = {df[, newindex := rowid(user) - shift_by]
df[newindex < 0, newindex := 0]
df[newindex > 0, spend_shifted := df[, spend[newindex], by = .(user)]$V1]
}
, r2evans = {df[, spend_shifted := spend[{o <- seq_len(.N) - shift_by; o[o<1] <- NA; o; }], by = user]}
, sindri_1 = {df[, spend_shifted := data.frame(shift(spend, n = unique(sort(shift_by))))[cbind(1:.N, shift_by)], by = user]}
, sindri_2 = {df[, {rows <- 1:nrow(.SD) - shift_by; .SD[replace(rows, rows == 0, NA), spend]}, by = user]}
, talat = {df[, row := .I, by = .(user)]
df[, match_row := row - shift_by]
df[df, on = .(user, match_row = row), x := i.spend]
df[, c('row', 'match_row') := NULL]
}
, thomas = {df[, spend_shifted := spend[replace(seq(.N) - shift_by, seq(.N) <= shift_by, NA)], user]}
, times = 20
)
autoplot(a)
@ThomasIsCoding's and @r2evans' methods are almost identical.
a[, .(mean = mean(time)), expr][order(mean)]
       expr       mean
1:   thomas 1974759530
2:  r2evans 2121604845
3: sindri_2 2530492745
4:   wimpel 4337907900
5: sindri_1 4585692780
6:    talat 7252938170
(mean times are in nanoseconds)
I am still in the process of parsing the logic of all the methods provided. I cannot thank you all enough for the many contributions. I shall vote for an answer in due course.
Given the data.table dt <- data.table(a=c(1,NA,3), b = c(4:6))
a b
1: 1 4
2: NA 5
3: 3 6
... , the result for dt[is.na(a), a := sum(a, na.rm = T)] is:
a b
1: 1 4
2: 0 5
3: 3 6
... , instead of the expected:
a b
1: 1 4
2: 4 5
3: 3 6
What is going on? I am using data.table 1.12.8.
In dt[is.na(a), a := sum(a, na.rm = TRUE)], the j expression is evaluated only on the rows selected by i, so a there contains just the NA entries and sum(a, na.rm = TRUE) over them is 0. To fill with the sum of the whole column, we could use fcoalesce:
library(data.table)
dt[, a := fcoalesce(a, sum(a, na.rm = TRUE))]
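This works because j here runs over all rows, so sum(a, na.rm = TRUE) is the full-column sum (4), and fcoalesce fills it into the NAs of a. A sketch of an equivalent fix that keeps the subset-assign form by computing the sum before subsetting:
total <- dt[, sum(a, na.rm = TRUE)]  # sum over the full column: 4
dt[is.na(a), a := total]             # assign it only into the NA rows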
library(data.table)
data = data.table("cat" = c(0,5,NA,0,0,0),
"horse" = c(0,4,2,1,1,3),
"fox" = c(2,2,NA,NA,7,0))
I wish to replace the values of 'cat' and 'fox' that are equal to 0 or 2 with -99. I can do it one column at a time, but how do I do both?
data[fox == 0 | fox == 2, fox := -99]
Another option with data.table is a for(...) set(...) approach, which is in this case both fast and memory-efficient:
cols <- c('fox', 'cat')
# option 1
for (j in cols) d[get(j) %in% c(0, 2), (j) := -99]
# option 2 (thx to #Cole for highlighting)
for (j in cols) set(d, which(d[[j]] %in% c(0, 2)), j, value = -99)
# option 3 (thx to #Frank for highlighting)
for (j in cols) d[.(c(0,2)), on = j, (j) := -99]
which gives:
> d
cat horse fox
1: -99 0 -99
2: 5 4 -99
3: NA 2 NA
4: -99 1 NA
5: -99 1 7
6: -99 3 -99
using this data:
d <- data.table("cat" = c(0,5,NA,0,0,0),
"horse" = c(0,4,2,1,1,3),
"fox" = c(2,2,NA,NA,7,0))
Here's a not-so-elegant way of doing this:
> data
cat horse fox
1: 0 0 2
2: 5 4 2
3: NA 2 NA
4: 0 1 NA
5: 0 1 7
6: 0 3 0
> data[, c('fox', 'cat') := list(ifelse(fox %in% c(0, 2), -99, fox),
                                 ifelse(cat %in% c(0, 2), -99, cat))]
> data
   cat horse fox
1: -99     0 -99
2:   5     4 -99
3:  NA     2  NA
4: -99     1  NA
5: -99     1   7
6: -99     3 -99
I'm specifying c('fox', 'cat') explicitly, but you could save the names as mycols and assign with the := operator: data[, (mycols) := ...]
Similarly, I'm passing a list built from per-column conditions; this could be done more cleanly with a function instead.
If I understand correctly, this would work as well:
cols = c("cat", "fox")
data[, (cols) := lapply(.SD, function (x) fifelse(x %in% c(0, 2), -99, x)), .SDcols = cols]
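Applied to the example data, this produces the same result as the set()-based options above:
> data
   cat horse fox
1: -99     0 -99
2:   5     4 -99
3:  NA     2  NA
4: -99     1  NA
5: -99     1   7
6: -99     3 -99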
I want to change the default value (which is 255) to NA.
dt <- data.table(x = c(1,5,255,0,NA), y = c(1,7,255,0,0), z = c(4,2,7,8,255))
coords <- c('x', 'y')
This gives the following table:
x y z
1: 1 1 4
2: 5 7 2
3: 255 255 7
4: 0 0 8
5: NA 0 255
The furthest I came up with is this:
dt[.SD == 255, (.SD) := NA, .SDcols = coords]
Please note that column z should stay the same: only the specified columns should change, not all of them. But my attempt doesn't get me to the desired result:
x y z
1: 1 1 4
2: 5 7 2
3: NA NA 7
4: 0 0 8
5: NA 0 255
I am looking for a scalable solution, because the original dataset has a couple of million rows.
EDIT:
I have found a solution, but it is quite ugly and definitely too slow: it takes almost 10 seconds on a data.frame of 22009 x 86. Does anyone have a better solution?
The code:
dt[, replace(.SD, .SD == 255, NA), .SDcols = coords, by = c(colnames(dt)[!colnames(dt) %in% coords])]
Here is how you can leave the columns outside .SDcols untouched:
library(data.table)
dt[, (coords) := replace(.SD, .SD == 255, NA), .SDcols = coords]
which gives:
x y z
1: 1 1 4
2: 5 7 2
3: NA NA 7
4: 0 0 8
5: NA 0 255
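Given the mention of millions of rows, a for()/set() loop over the selected columns is another fast, memory-efficient option; a sketch using the same coords:
# update each selected column by reference, finding the 255 rows per column
for (j in coords) set(dt, i = which(dt[[j]] == 255), j = j, value = NA)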
You could also do:
require(data.table)
dt[ ,
(coords) := lapply(.SD, function(x) fifelse(x == 255, NA_real_, x)),
.SDcols = coords ]
Having compared it to Sotos' answer, it also seems a little bit faster.
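That comparison isn't shown above; a minimal sketch of how it could be run, copying the table in each expression so := doesn't carry mutations across iterations (microbenchmark assumed installed):
library(microbenchmark)
microbenchmark(
  replace = { d1 <- copy(dt)
              d1[, (coords) := replace(.SD, .SD == 255, NA), .SDcols = coords] },
  fifelse = { d2 <- copy(dt)
              d2[, (coords) := lapply(.SD, function(x) fifelse(x == 255, NA_real_, x)),
                 .SDcols = coords] },
  times = 100
)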
DT0 = data.table(x = rep(NA, 3), y = c(0, 1, NA), v = c(0, 0, NA), l = c(1, 1, 1))
DT0
# x y v l
#1: NA 0 0 1
#2: NA 1 0 1
#3: NA NA NA 1
Based on the first three columns x, y and v, I want to add a new column with the following output:
#1: No
#2: Yes
#3: NA
NA if all values in the row are NA; 'Yes' if any of them is 1; otherwise 'No'. My current approach is:
relevant_cols <- c('x', 'y', 'v')
new <- data.table(apply(DT0[, relevant_cols, with = FALSE], 1, function(val) {
  ifelse(all(is.na(val)), NA_character_,
         ifelse(any(val == TRUE, na.rm = TRUE), 'Yes', 'No'))
}))
DT0[, new:= new]
DT0
# x y v l new
#1: NA 0 0 1 No
#2: NA 1 0 1 Yes
#3: NA NA NA 1 NA
However, as the actual data.table is large, is there a better way to do this?
Edit:
Often the data.table entries are non-numeric, hence a solution more general than pmax would be quite helpful, e.g.:
DT = data.table(x=rep(c(NA,NA,NA)), y=c('No','Yes',NA), v=c('No', 'No', NA), l=c(1,1,1))
DT
# x y v l
#1: NA No No 1
#2: NA Yes No 1
#3: NA NA NA 1
Here's one option:
DT[, new := ifelse(rowSums(.SD == "Yes", na.rm = T) > 0,
'Yes',
ifelse(rowSums(is.na(.SD)) != ncol(.SD), "No", NA))
, .SDcols = x:v]
# x y v l new
#1: NA No No 1 No
#2: NA Yes No 1 Yes
#3: NA NA NA 1 NA
pmax is pretty well-suited to this problem.
First example: in the 0/1 case...
DT0[, do.call(pmax, c(na.rm=TRUE, .SD)), .SDcols=x:v]
# 0 1 NA
Second example: if you've encoded 0/1 as my_lvls = c("No","Yes") instead...
DT[,
factor(
labels = my_lvls,
x = do.call(pmax, c(na.rm=TRUE, lapply(.SD, function(x)
as.integer(factor(x, levels=my_lvls)))))
)
, .SDcols=x:v]
# [1] No Yes <NA>
# Levels: No Yes
As shown in @eddi's answer, to add a new column you can put x:v in .SDcols and use .SD; a sketch of that follows.
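A sketch combining both ideas into a column assignment, indexing my_lvls with the pmax result so that an NA index stays NA (my_lvls defined here for completeness):
my_lvls <- c("No", "Yes")
DT[, new := my_lvls[do.call(pmax, c(na.rm = TRUE, lapply(.SD, function(x)
       as.integer(factor(x, levels = my_lvls)))))],
   .SDcols = x:v]
# DT$new is now c("No", "Yes", NA)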