R Indicate If Group Has Value - r

data=data.frame(GROUP=c(1,1,1,2,2,2,3,3,3,4,4,4),
VAR=c('A','B','C','A','B','C','A','B','C','A','B','C'),
SCORE=c(3,6,NA,NA,NA,NA,1,NA,5,5,2,NA),
NEWVAR=c(1,1,1,NA,NA,NA,2,2,2,1,1,1))
score1 = c(2,3,7)
score2 = c(0,5,6)
I have 'data' with all columns but 'NEWVAR' and I wish to create it like this:
If score1 %in% SCORE for a particular 'GROUP' then a value '1' is given to 'NEWVAR'
If score2 %in% SCORE & score1 is not in SCORE for a particular 'GROUP' then a value '2' is given to 'NEWVAR'

Using dplyr:
input <- data %>% select(-NEWVAR)
input %>%
group_by(GROUP) %>%
mutate(NEWVAR=case_when(any(SCORE %in% score1) ~ 1,
any(SCORE %in% score2 & !(SCORE %in% score1)) ~ 2))

Since you have tagged this data.table using ifelse in data.table syntax.
library(data.table)
setDT(data)[, NEWVAR := ifelse(any(SCORE %in% score1), 1,
ifelse(any(SCORE %in% score2), 2, NA_integer_)), GROUP]
data
# GROUP VAR SCORE NEWVAR
# 1: 1 A 3 1
# 2: 1 B 6 1
# 3: 1 C NA 1
# 4: 2 A NA NA
# 5: 2 B NA NA
# 6: 2 C NA NA
# 7: 3 A 1 2
# 8: 3 B NA 2
# 9: 3 C 5 2
#10: 4 A 5 1
#11: 4 B 2 1
#12: 4 C NA 1
I think there is also fcase in development version of data.table which is similar to case_when in dplyr and makes writing such nested ifelse easy.

scores = list(score1, score2)
ave(data$SCORE, data$GROUP, FUN = function(x){
chk = sapply(scores, function(y) any(y %in% x))
seq_along(scores)[chk][1]
})
# [1] 1 1 1 NA NA NA 2 2 2 1 1 1

We can use fifelse in data.table
library(data.table)
setDT(data)[, NEWVAR := fifelse(any(SCORE %in% score1), 1,
fifelse(any(SCORE %in% score2), 2, NA_integer_)), GROUP]
data
# GROUP VAR SCORE NEWVAR
# 1: 1 A 3 1
# 2: 1 B 6 1
# 3: 1 C NA 1
# 4: 2 A NA NA
# 5: 2 B NA NA
# 6: 2 C NA NA
# 7: 3 A 1 2
# 8: 3 B NA 2
# 9: 3 C 5 2
#10: 4 A 5 1
#11: 4 B 2 1
#12: 4 C NA 1
Or with fcase from the devel version of data.table
setDT(data)[, NEWVAR := fcase(any(SCORE %in% score1), 1,
any(SCORE %in% score2), 2,
default = NA_integer_), GROUP]

Related

How to calculate yearly retention rate by group in R?

I have a large data set with individuals located in counties over a period of multiple years. Each year, some individuals move to a different county or leave the data set and new individuals join.
I would like to count the number of individuals that stayed in the same county from year to year and from year 1. Here is the question I found that comes closest to this task (without the additional grouping by counties): Month-over-month Customer Retention Rate in R
Here is a simplified version of the data set:
dt <- setDT(data.frame(ID = rep(c('a', 'b', 'c', 'd', 'a', 'c', 'd', 'e', 'c', 'e', 'f'),2),
CTY = rep(c(1, 2), each = 11),
YEAR = rep(c(1,1,1,1,2,2,2,2,3,3,3),2)))
My solution, so far, relies on a loop
x =matrix(NA, 2,3)
y =matrix(NA, 2,3)
for (i in 1:2) {
for (j in 1:3) {
x[i,j] = ifelse(j == 1, NA, sum(dt[CTY == i & YEAR == j, ID] %in% dt[CTY == i & YEAR == j-1, ID] == T))
y[i,j] = ifelse(j == 1, NA, sum(dt[CTY == i & YEAR == 1, ID] %in% dt[CTY == i & YEAR == j, ID] == T))
}
}
Which gives after joining
colnames(x) <- unique(dt$YEAR)
rownames(x) <- unique(dt$CTY)
x <- reshape2::melt(x)
names(x) <- c("CTY", "YEAR", "stayed")
x <- x[order(x$CTY),]
colnames(y) <- unique(dt$YEAR)
rownames(y) <- unique(dt$CTY)
y <- reshape2::melt(y)
names(y) <- c("CTY", "YEAR", "stayed2")
y <- y[order(y$CTY),]
dt <-dt[x, on = c("CTY", "YEAR")]
dt <-dt[y, on = c("CTY", "YEAR")]
dt
# ID CTY YEAR stayed stayed2
# 1: a 1 1 NA NA
# 2: b 1 1 NA NA
# 3: c 1 1 NA NA
# 4: d 1 1 NA NA
# 5: a 1 2 3 3
# 6: c 1 2 3 3
# 7: d 1 2 3 3
# 8: e 1 2 3 3
# 9: c 1 3 2 1
# 10: e 1 3 2 1
# 11: f 1 3 2 1
# 12: a 2 1 NA NA
# 13: b 2 1 NA NA
# 14: c 2 1 NA NA
# 15: d 2 1 NA NA
# 16: a 2 2 3 3
# 17: c 2 2 3 3
# 18: d 2 2 3 3
# 19: e 2 2 3 3
# 20: c 2 3 2 1
# 21: e 2 3 2 1
# 22: f 2 3 2 1
This is the right final table but it requires manipulation of the loop output that seems unnecessary; in sum, this works but it is clunky and slow.
I have experimented with data.table and dplyr solutions but can't seem to make it work.
Try sapply function like this:
fx <- function(x) ifelse(x$YEAR == 1, NA, sum(dt[CTY == x$CTY & YEAR == x$YEAR, ID] %in% dt[CTY == x$CTY & YEAR == x$YEAR-1, ID] == T))
fy <- function(y) ifelse(y$YEAR == 1, NA, sum(dt[CTY == y$CTY & YEAR == 1, ID] %in% dt[CTY == y$CTY & YEAR == y$YEAR, ID] == T))
x <- merge(data.frame(CTY=1:2),data.frame(YEAR=1:3))
s <- data.frame(x,stayed=sapply(split(x,1:nrow(x)),fx))
s <- data.frame(s,stayed2=sapply(split(x,1:nrow(x)),fy))
merge(dt,s)
# CTY YEAR ID stayed stayed2
# 1: 1 1 a NA NA
# 2: 1 1 b NA NA
# 3: 1 1 c NA NA
# 4: 1 1 d NA NA
# 5: 1 2 a 3 3
# 6: 1 2 c 3 3
# 7: 1 2 d 3 3
# 8: 1 2 e 3 3
# 9: 1 3 c 2 1
# 10: 1 3 e 2 1
# 11: 1 3 f 2 1
# 12: 2 1 a NA NA
# 13: 2 1 b NA NA
# 14: 2 1 c NA NA
# 15: 2 1 d NA NA
# 16: 2 2 a 3 3
# 17: 2 2 c 3 3
# 18: 2 2 d 3 3
# 19: 2 2 e 3 3
# 20: 2 3 c 2 1
# 21: 2 3 e 2 1
# 22: 2 3 f 2 1

Data Table Solution In R To Find Group Min/Max

Data
data=data.frame("student"=c(1,1,1,2,2,2,3,3,3),
"score"=c(NA,7,6,6,1,4,8,NA,3),
"min"=c(6,6,6,1,1,1,3,3,3),
"max"=c(7,7,7,6,6,6,8,8,8))
I have columns 'student' and 'score' and wish to use data.table to create 'min' and 'max' which simply put are the minimum and maximum values for each student IGNORING NA Values. If all values are NA then simply list 'NA' as the min/max.
Using data.table
library(data.table)
setDT(data)
data[, c("min", "max"):= list(min(score, na.rm = TRUE),
max(score, na.rm = TRUE)), student]
data
# student score min max
#1: 1 NA 6 7
#2: 1 7 6 7
#3: 1 6 6 7
#4: 2 6 1 6
#5: 2 1 1 6
#6: 2 4 1 6
#7: 3 8 3 8
#8: 3 NA 3 8
#9: 3 3 3 8
Or with dplyr
library(dplyr)
data %>%
group_by(student) %>%
mutate(min = min(score, na.rm = TRUE), max = max(score, na.rm = TRUE))
But OP wanted to return NA if all scores for any student were NA. This solution fixes the Inf problem.
data=data.frame("student"=c(1,1,1,2,2,2,3,3,3),
"score"=c(NA,NA,NA,6,1,4,8,NA,3))
> dt <- data.table(data); dt
student score
1: 1 NA
2: 1 NA
3: 1 NA
4: 2 6
5: 2 1
6: 2 4
7: 3 8
8: 3 NA
9: 3 3
Create a function to handle the case where all values are NA, to return NA
min.na = function(x) if (all(is.na(x))) x[NA_integer_] else min(x, na.rm = TRUE)
max.na = function(x) if (all(is.na(x))) x[NA_integer_] else max(x, na.rm = TRUE)
dt[, c("min", "max") := list(min.na(score), max.na(score)), by=student]
dt
student score min max
1: 1 NA NA NA
2: 1 NA NA NA
3: 1 NA NA NA
4: 2 6 1 6
5: 2 1 1 6
6: 2 4 1 6
7: 3 8 3 8
8: 3 NA 3 8
9: 3 3 3 8
Edit: And I'm not sure why you'd want to do this anyway. Combining summary statistics to the original data is bad practice. It results in redundancy/duplication. Surely you just want a separate result for each student:
dt[, .(min=min.na(score), max=max.na(score)), by=student]
student min max
1: 1 NA NA
2: 2 1 6
3: 3 3 8
I know this last part is not what was asked, but I always check that what they ask for is what they really wanted. ;)
Another data.table option:
setDT(data)[, c("min","max") := as.list(range(score, na.rm=TRUE)), student]
You can do this using the function ave:
data=data.frame("student"=c(1,1,1,2,2,2,3,3,3),
"score"=c(NA,7,6,6,1,4,8,NA,3))
data$min = ave(data$score, data$student, FUN = function(x){ min(x, na.rm = T) })
data$max = ave(data$score, data$student, FUN = function(x){ max(x, na.rm = T) })
Result:
> data
student score min max
1 1 NA 6 7
2 1 7 6 7
3 1 6 6 7
4 2 6 1 6
5 2 1 1 6
6 2 4 1 6
7 3 8 3 8
8 3 NA 3 8
9 3 3 3 8
The function ave takes a numeric vector as the first parameter and all following vectors are the grouping variables. The FUN parameter is the function you wish to apply.

data.table fill missing values from other rows by group

# have
> aDT <- data.table(colA = c(1,1,1,1,2,2,2,2,3,3,3,3), colB = c(4,NA,NA,1,4,3,NA,NA,4,NA,2,NA))
> aDT
colA colB
1: 1 4
2: 1 NA
3: 1 NA
4: 1 1
5: 2 4
6: 2 3
7: 2 NA
8: 2 NA
9: 3 4
10: 3 NA
11: 3 2
12: 3 NA
# want
> bDT <- data.table(colA = c(1,1,1,1,2,2,2,2,3,3,3,3), colB = c(4,1,1,1,4,3,3,3,4,2,2,2))
> bDT
colA colB
1: 1 4
2: 1 1
3: 1 1
4: 1 1
5: 2 4
6: 2 3
7: 2 3
8: 2 3
9: 3 4
10: 3 2
11: 3 2
12: 3 2
Would like to fill missing values according to the algorithm below:
within each group ('colA'),
use the value from one row below, if it's still NA, keeps going until the last row within that group
if all NAs in rows below, look at rows above (go up 1 row at a time)
if all NAs, then NA
Since the dataset is quite large, algorithmic efficiency is part of consideration. Not sure if there's any package for this type of operation already. How to do it?
With data.table and zoo:
library(data.table)
library(zoo)
# Last observation carried forward from last row of group
dt <- dt[, colB := na.locf0(colB, fromLast = TRUE), by = colA]
# Last observation carried forward for first row of group
dt[, colB := na.locf(colB), by = colA][]
Or in a single chain:
dt[, colB := na.locf0(colB, fromLast = TRUE), by = colA][
, colB := na.locf(colB), by = colA][]
Both return:
colA colB
1: 1 4
2: 1 1
3: 1 1
4: 1 1
5: 2 4
6: 2 3
7: 2 3
8: 2 3
9: 3 4
10: 3 2
11: 3 2
12: 3 2
Data:
text <- "colA colB
1 4
1 NA
1 NA
1 1
2 4
2 3
2 NA
2 NA
3 4
3 NA
3 2
3 NA"
dt <- fread(input = text, stringsAsFactors = FALSE)
Here is one way using tidyverse and zoo::na.locf:
library(tidyverse);
library(zoo);
df %>%
group_by(colA) %>%
arrange(colA) %>%
mutate(colB = na.locf(colB, na.rm = F, fromLast = TRUE)) %>%
mutate(colB = na.locf(colB, na.rm = F));
## A tibble: 12 x 2
## Groups: colA [3]
# colA colB
# <dbl> <dbl>
# 1 1.00 4.00
# 2 1.00 1.00
# 3 1.00 1.00
# 4 1.00 1.00
# 5 2.00 4.00
# 6 2.00 3.00
# 7 2.00 3.00
# 8 2.00 3.00
# 9 3.00 4.00
#10 3.00 2.00
#11 3.00 2.00
#12 3.00 2.00
Or the data.table way:
library(data.table);
dt[, .(na.locf(na.locf(colB, na.rm = F, fromLast = T), na.rm = F)), by = .(colA)];
# colA V1
# 1: 1 4
# 2: 1 1
# 3: 1 1
# 4: 1 1
# 5: 2 4
# 6: 2 3
# 7: 2 3
# 8: 2 3
# 9: 3 4
#10: 3 2
#11: 3 2
#12: 3 2
The key in both cases is to apply na.locf twice: First to replace NAs from the bottom, then replace the remaining NAs from the top.
Sample data
# As data.frame
df <- data.frame(colA = c(1,1,1,1,2,2,2,2,3,3,3,3), colB = c(4,NA,NA,1,4,3,NA,NA,4,NA,2,NA));
# As data.table
dt <- data.table(colA = c(1,1,1,1,2,2,2,2,3,3,3,3), colB = c(4,NA,NA,1,4,3,NA,NA,4,NA,2,NA));
library(tidyverse)
aDT%>%group_by(colA)%>%fill(colB,.direction="up")%>%fill(colB)
# A tibble: 12 x 2
# Groups: colA [3]
colA colB
<dbl> <dbl>
1 1 4
2 1 1
3 1 1
4 1 1
5 2 4
6 2 3
7 2 3
8 2 3
9 3 4
10 3 2
11 3 2
12 3 2

Relative reference to rows in large data set

I have a very large data set (millions of rows) where I need to turn into NA certain rows when a var1 equals "Z". However, I also need to turn into NA the preceding row to a row with var1="Z".
E.g.:
id var1
1 A
1 B
1 Z
1 S
1 A
1 B
2 A
2 B
3 A
3 B
3 A
3 B
4 A
4 B
4 A
4 B
In this case, the second row and the third row for id==1 should be NA.
I have tried a loop but it doesn't work as the data set is very large.
for (i in 1:length(df$var1)){
if(df$var1[i] =="Z"){
df[i,] <- NA
df[(i-1),] <-- NA
}
}
I have also tried to use data.table package unsuccessfully. Do you have any idea of how I could do it or what is the right term to look for info on what I am trying to do?
Maybe do it like this using data.table:
df <- as.data.table(read.table(header=T, file='clipboard'))
df$var1 <- as.character(df$var1)
#find where var1 == Z
index <- df[, which(var1 == 'Z')]
#add the previous lines too
index <- c(index, index-1)
#convert to NA
df[index, var1 := NA ]
Or in one call:
df[c(which(var1 == 'Z'), which(var1 == 'Z') - 1), var1 := NA ]
Output:
> df
id var1
1: 1 A
2: 1 NA
3: 1 NA
4: 1 S
5: 1 A
6: 1 B
7: 2 A
8: 2 B
9: 3 A
10: 3 B
11: 3 A
12: 3 B
13: 4 A
14: 4 B
15: 4 A
16: 4 B
If you want to take in count the preceding indices only if they are from the same id, I would suggest to use the .I and by combination which will make sure that you are not taking indecies from previous id
setDT(df)[, var1 := as.character(var1)]
indx <- df[, {indx <- which(var1 == "Z") ; .I[c(indx - 1L, indx)]}, by = id]$V1
df[indx, var1 := NA_character_]
df
# id var1
# 1: 1 A
# 2: 1 NA
# 3: 1 NA
# 4: 1 S
# 5: 1 A
# 6: 1 B
# 7: 2 A
# 8: 2 B
# 9: 3 A
# 10: 3 B
# 11: 3 A
# 12: 3 B
# 13: 4 A
# 14: 4 B
# 15: 4 A
# 16: 4 B
You can have a base R approach:
x = var1=='Z'
df[x | c(x[-1],F), 'var1'] <- NA
# id var1
#1 1 A
#2 1 <NA>
#3 1 <NA>
#4 1 S
#5 1 A
#6 1 B
#7 2 A
#8 2 B
#9 3 A
#10 3 B
#11 3 A
#12 3 B
#13 4 A
#14 4 B
#15 4 A
#16 4 B

R: How to calculate lag for multiple columns by group for data table

I would like to calculate the diff of variables in a data table, grouped by id. Here is some sample data. The data is recorded at a sample rate of 1 Hz. I would like to estimate the first and second derivatives (speed, acceleration)
df <- read.table(text='x y id
1 2 1
2 4 1
3 5 1
1 8 2
5 2 2
6 3 2',header=TRUE)
dt<-data.table(df)
Expected output
# dx dy id
# NA NA 1
# 1 2 1
# 1 1 1
# NA NA 2
# 4 -6 2
# 1 1 2
Here's what I've tried
dx_dt<-dt[, diff:=c(NA,diff(dt[,'x',with=FALSE])),by = id]
Output is
Error in `[.data.frame`(dt, , `:=`(diff, c(NA, diff(dt[, "x", with = FALSE]))), :
unused argument (by = id)
As pointed out by Akrun, the 'speed' terms (dx, dy) can be obtained using either data table or plyr. However, I'm unable to understand the calculation well enough to extend it to acceleration terms. So, how to calculate the 2nd lag terms?
dt[, c('dx', 'dy'):=lapply(.SD, function(x) c(NA, diff(x))),
+ by=id]
produces
x y id dx dy
1: 1 2 1 NA NA
2: 2 4 1 1 2
3: 3 5 1 1 1
4: 1 8 2 NA NA
5: 5 2 2 4 -6
6: 6 3 2 1 1
How to expand to get a second diff, or the diff of dx, dy?
x y id dx dy dx2 dy2
1: 1 2 1 NA NA NA NA
2: 2 4 1 1 2 NA NA
3: 3 5 1 1 1 0 -1
4: 1 8 2 NA NA NA NA
5: 5 2 2 4 -6 NA NA
6: 6 3 2 1 1 -3 7
You can try
setnames(dt[, lapply(.SD, function(x) c(NA,diff(x))), by=id],
2:3, c('dx', 'dy'))[]
# id dx dy
#1: 1 NA NA
#2: 1 1 2
#3: 1 1 1
#4: 2 NA NA
#5: 2 4 -6
#6: 2 1 1
Another option would be to use dplyr
library(dplyr)
df %>%
group_by(id) %>%
mutate_each(funs(c(NA,diff(.))))%>%
rename(dx=x, dy=y)
Update
You can repeat the step twice
dt[, c('dx', 'dy'):=lapply(.SD, function(x) c(NA, diff(x))), by=id]
dt[,c('dx2', 'dy2'):= lapply(.SD, function(x) c(NA, diff(x))),
by=id, .SDcols=4:5]
dt
# x y id dx dy dx2 dy2
#1: 1 2 1 NA NA NA NA
#2: 2 4 1 1 2 NA NA
#3: 3 5 1 1 1 0 -1
#4: 1 8 2 NA NA NA NA
#5: 5 2 2 4 -6 NA NA
#6: 6 3 2 1 1 -3 7
Or we can use the shift function from data.table
dt[, paste0("d", c("x", "y")) := .SD - shift(.SD), by = id
][, paste0("d", c("x2", "y2")) := .SD - shift(.SD) , by = id, .SDcols = 4:5 ]

Resources