change variable values based on preceding value - r

I have the following dataset:
df <- data.frame(subject = c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3),
time = c(1,2,3,4,5,6,7,8,9,10,11,12,1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7,8,9,10,11),
performance = c(1,0,-1,-1,0,1,1,-1,0,0,0,1,1,1,-1,0,1,1,-1,0,0,1,-1,1,1,0,1,1,-1,0,-1,-1,0))
What I would like to do is to change some of the entries in the performance variable. More specifically, if a "-1" entry is preceded by a "1", I want to change the "-1" to "0".
However, this should be done within subjects only, but not across subjects (all of the subjects have a varying number of sessions).
So, this is what I'd like to have in the end:
df2 =data.frame(subject = c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3),
time = c(1,2,3,4,5,6,7,8,9,10,11,12,1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7,8,9,10,11),
performance = c(1,0,-1,-1,0,1,1,0,0,0,0,1,1,1,0,0,1,1,0,0,0,1,-1,1,1,0,1,1,-1,0,-1,-1,0))
Does anyone have an idea how to do this?
Thanks in advance!
S.

Using dplyr,
df %>%
group_by(subject) %>%
mutate(performance = replace(performance, which(performance + lag(performance)==0 & performance == -1), 0))

Here's a data.table approach, where I first create a flag column which is then used to subset the data and update the performance column by reference.
library(data.table)
dt <- as.data.table(df) # or setDT(df)
dt[, flag := performance == -1 & shift(performance, 1L) == 1, by = subject]
dt[(flag), performance := 0][, flag := NULL]
I chose to do it with an intermediate flag-column because I expect that to perform very well for large data sets. If performance is not your concern, you could of course use ifelse or replace instead.

This is ugly, but should work:
dftest <- df
for (i in 2:nrow(dftest)) {
if(
dftest$performance[i] == -1 && dftest$performance[i - 1] == 1
){
if(
dftest$subject[i] == dftest$subject[i - 1]
) {
dftest$performance[i] <- 0
}
}
}
all.equal(df2, dftest) # ONE ERROR
This gives an error in line 29 - can you check whether your example df2 is correct here? If I understand the question correctly df2$performance[29] should be 0?

A base R solution using by and sapply:
gr <- do.call(c, by(df, df$subject, function(x) {
c(FALSE, unlist(sapply(1:length(x$performance),
function(y) (x$performance[y] == -1) & (x$performance[y-1] == 1))))
}))
df[gr, 3] <- 0
cbind(df, df2)

Related

R dates comparison using loop

I have a dataframe with 1 date column (converted as.Date).
I am trying to write a loop to create a value in another column to "check" the death date compare to a fix value (today's date).
fix_date= as.Date(2021-10-28)
for (i in 1:length(df$Death.date)) {
if (df$Death.date[i] < as.Date(fix_date)){
df$death_check[i]<-"good"
}
}
So for each row if Death.date < fix_date, fill death_check column with "good".
It is giving me this error code:
Error in if (new_possible_population$Death.date[i] <
as.Date(exploratory_date)) { : missing value where TRUE/FALSE
needed
Is this the correct way to code for the loop concerning date values? or is there a better way than using loops for this?
You definitely want to use vectorised functions for this, check out the dplyr package:
df %>%
mutate(death_check = case_when(Death.date < as.Date("2021-10-28") ~ "good"))
As you can see I added "" around the date as well, this is neccessary. If your df$Death.date is not actually in Date format you can change that here as well.
library(data.table)
df <- data.table(
Death.date = sample(seq(as.Date("2020-01-01"), by = "month", length.out = 25))
)
# just a TRUE for "good" which makes FALSE "bad"
df[, death_check_1 := Death.date < Sys.time()]
# written "good"
df[Death.date < Sys.time(), death_check_2 := "good"]
Here's another option using sapply and an ifelse:
# make df using Merijn's code
df <- data.frame(Death.date = sample(seq(as.Date("2020-01-01"),
by = "month",
length.out = 25)))
# set the date to check against
fix_date <- as.Date("2021-10-28")
# make the comparison, return "good" or NA
df$death_check <- sapply(df$Death.date, function(x) {
ifelse(x < fix_date, "good", NA)
})
df

R: remove duplicate rows with full overlap of non-missing variables

Many previous questions highlight various ways to remove duplicate rows with missing values, however none deal with the following case. Example starting data:
df <- data.frame(x = c(1, NA, 1), y=c(NA, 1, 1), z=c(0, NA, NA))
print(df)
Desired output:
df2 <- data.frame(x = c(1, 1), y=c(NA, 1), z=c(0, NA))
print(df2)
In this case the second row was removed because it was a perfect subset of row 3. In the real application I want to remove rows that contain all redundant info in non-missing columns, and keep the row that has less missing overall.
I thought this might be accomplished using dplyr and a rowwise application of distinct(), but to no avail. I could do this with a very slow for loop, but with hundreds of columns and thousands of rows this is a poor option.
Here is another option using data.table:
library(data.table)
#convert into long format and discard NAs
mDT <- melt(setDT(df)[, rn := .I], id.var="rn", na.rm=TRUE)[, cnt := .N , rn]
#self join and filter for rows that match to other rows
merged <- mDT[mDT, on=.(variable, value), {
diffrow <- i.rn!=x.rn
.(irn=i.rn[diffrow], xrn=x.rn[diffrow], icnt=i.cnt[diffrow])
}]
#count the occurrence and delete rows where all values are matched to another row
ix <- merged[, xcnt := .N, .(irn, xrn)][
icnt==xcnt]$irn
#delete dupe rows
df[-ix]
I'm not sure how to do it with dplyr, but here is soultion with loop. Also I'm not sure that dplyr solution can be faster than loop one (at the end it must use some loop), here you can at least control loop flow.
Subset vector function determines if vector a is subset of vector b (return 1) or if vector b is subset of vector a (returns 2) otherwise it returns 0. Then I loop over all rows of data.frame and remove subset rows.
subsetVector <- function(a, b){
na_a <- which(is.na(a))
na_b <- which(is.na(b))
if(all(na_a %in% na_b)){
if(all(a[-na_b] == b[-na_b])) return(2)
}else if(all(na_b %in% na_a)){
if(all(b[-na_a] == a[-na_a])) return(1)
}
return(0)
}
i <- 1
while(i < nrow(df)){
remove_rows <- NULL
for(j in (i+1):nrow(df)){
p <- subsetVector(df[i,], df[j,])
if(p == 1){
remove_rows <- c(remove_rows, i)
break()
}else if(p == 2){
remove_rows <- c(remove_rows, j)
}
}
if(length(remove_rows) > 0)
df <- df[-remove_rows,]
if(!1 %in% remove_rows)
i <- i + 1
}

how to insert sequential rows in data.table in R (Example given)?

df is data.table and df_expected is desired data.table . I want to add hour column from 0 to 23 and visits value would be filled as 0 for hours newly added .
df<-data.table(customer=c("x","x","x","y","y"),location_id=c(1,1,1,2,3),hour=c(2,5,7,0,4),visits=c(40,50,60,70,80))
df_expected<-data.table(customer=c("x","x","x","x","x","x","x","x","x","x","x","x","x","x","x","x","x","x","x","x","x","x","x","x",
"y","y","y","y","y","y","y","y","y","y","y","y","y","y","y","y","y","y","y","y","y","y","y","y",
"y","y","y","y","y","y","y","y","y","y","y","y","y","y","y","y","y","y","y","y","y","y","y","y"),
location_id=c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3),
hour=c(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23),
visits=c(0,0,40,0,0,50,0,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
70,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0))
This is what I tried to obtain my result , but it did not work
df1<-df[,':='(hour=seq(0:23)),by=(customer)]
Error in `[.data.table`(df, , `:=`(hour = seq(0L:23L)), by = (customer)) :
Type of RHS ('integer') must match LHS ('double'). To check and coerce would impact
performance too much for the fastest cases. Either change the type of the target column, or
coerce the RHS of := yourself (e.g. by using 1L instead of 1)
Here's an approach that creates the target and then uses a join to add in the visits information. The ifelse statement just helps up clean up the NA from the merge. You could also leave them in and replace them with := in the new data.table.
target <- data.table(
customer = rep(unique(df$customer), each = 24),
hour = 0:23)
df_join <- df[target, on = c("customer", "hour"),
.(customer, hour, visits = ifelse(is.na(visits), 0, visits))
]
all.equal(df_expected, df_join)
Edit:
This addresses the request to include the location_id column. One way to do this is with by=location in the creation of the target. I've also added in some of the code from chinsoon12's answer.
target <- df[ , .("customer" = rep(unique(customer), each = 24L),
"hour" = rep(0L:23L, times = uniqueN(customer))),
by = location_id]
df_join <- df[target, on = .NATURAL,
.(customer, location_id, hour, visits = fcoalesce(visits, 0))]
all.equal(df_expected, df_join)
Another option using CJ to generate your universe, on=.NATURAL for joining on identically named columns, and fcoalesce to handle NAs:
df[CJ(customer, hour=0L:23L, unique=TRUE), on=.NATURAL, allow.cartesian=TRUE,
.(customer=i.customer, hour=i.hour, visits=fcoalesce(visits, 0))]
here's a for-loop answer.
df_final <- data.table()
for(i in seq(24)){
if(i %in% df[,hour]){
a <- df[hour==i]
}else{
a <- data.table(customer="x", hour=i, visits=0)}
df_final <- rbind(df_final, a)
}
df_final
You can wrap this in another for-loop to have your multiple customers x, y, etc. (the following loop isnt very clean but gets the job done).
df_final <- data.table()
for(j in unique(df[,customer])){
for(i in seq(24)){
if(i %in% df[,hour]){
if(df[hour==i,customer] %in% j){
a <- df[hour==i]
}else{
a <- data.table(customer=j, hour=i, visits=0)
}
}else{
a <- data.table(customer=j, hour=i, visits=0)
}
df_final <- rbind(df_final, a)
}
}
df_final

Removing duplicate rows from a data frame in R, keeping those with a smaller/larger value

I am trying to remove duplicate rows in an R data frame, but I want the condition that the row with a smaller or larger value (not bothered for the purpose of this question) in a certain column should be kept.
I can remove duplicate rows normally (from either side) like this:
df = data.frame( x = c(1,1,2,3,4,5,5,6,1,2,3,3,4,5,6),
y = c(rnorm(4),NA,rnorm(10)),
id = c(rep(1,8), rep(2,7)))
splitID <- split(df , df$id)
lapply(splitID, function(x) x[!duplicated(x$x),] )
How can I condition the removal of duplicate rows?
Thanks!
Use ave() to return a logical index to subset your data.frame
idx = as.logical(ave(df$y, df$x, df$id, FUN=fun))
df[idx,, drop=FALSE]
Some possible fun include
fun1 = function(x)
!is.na(x) & !duplicated(x) & (x == min(x, na.rm=TRUE))
fun2 = function(x) {
res = logical(length(x))
res[which.min(x)] = TRUE
res
}
The dplyr version of this might be
df %>% group_by(x, id) %>% filter(fun2(y))
We may need to order before applying the duplicated
lapply(splitID, function(x) x[!duplicated(x[order(x$x, x$y),]$x),] )
and for the reverse, i.e. keeping the larger values, order with decreasing = TRUE

How do I make an indicator variable based upon what happened first?

Using R I am attempting to create an indicator variable based upon which of the two indicators in the linked data table "afdcwvr" or "tanfimp" occurred first for a given fips code (in other words, in which column did a "1" appear first for a give st_fips). How would I go about implementing the process?
Data
Using library(data.table)
#read data
dt <- fread('FSP_State_FYear_ProblemSet3.csv')
#find indexes of events
res <- dt[, list(i1 = which(tanfimp == 1)[1], i2 = which(afdcwvr == 1)[1]), by = st_fips]
#finding answer
res[, first := pmin(i1, i2, na.rm = TRUE)]
res[i1 == first & !is.na(i1), ans := 1]
res[is.na(ans), ans := 2]
So ans is 1 if first 1 is in tanfimp and 2 otherwise.

Resources