I have a data frame with dates, like the one below, with approximately 1 million rows:
id date variable
1 1 2015-01-01 NA
2 1 2015-01-02 -1.1874087
3 1 2015-01-03 -0.5936396
4 1 2015-01-04 -0.6131957
5 1 2015-01-05 1.0291688
6 1 2015-01-06 -1.5810152
Here is a reproducible example:
# create an example data set
Df <- data.frame(id = factor(rep(1:3, each = 10)),
                 date = rep(seq.Date(from = as.Date('2015-01-01'),
                                     to = as.Date('2015-01-10'), by = 1), 3),
                 variable = rnorm(30))
Df$variable[c(1, 7, 12, 18, 22, 23, 29)] <- NA
What I want to do is replace NA values in variable with the value from the previous date for each id. I wrote a loop that works but is very slow (you can find it below). Can you please advise a fast alternative for this task? Thank you!
library(dplyr)
# create a new variable
Df$variableNew <- Df$variable
# create a vector of row numbers
Df$n <- 1:dim(Df)[1]
# order the data frame by date
Df <- arrange(Df, date)
for (id in levels(Df$id)) {
  I <- Df$n[Df$id == id] # vector of row numbers for this specific id
  for (row in 1:length(I)) {
    # if variable is NA for the first date, change it to the mean value
    if (is.na(Df$variableNew[I[1]])) {
      Df$variableNew[I[row]] <- mean(Df$variable, na.rm = TRUE)
    }
    # if variable is NA, assign the value from the previous date
    if (is.na(Df$variableNew[I[row]])) {
      Df$variableNew[I[row]] <- Df$variableNew[I[row - 1]]
    }
  }
}
This data.table solution should be extremely fast.
library(zoo)        # for na.locf(...)
library(data.table)
setDT(Df)[, variable := na.locf(variable, na.rm = FALSE), by = id]
Df[, variable := if (is.na(variable[1])) c(mean(variable, na.rm = TRUE), variable[-1]) else variable, by = id]
Df
# id date variable
# 1: 1 2015-01-01 -0.288720759
# 2: 1 2015-01-02 -0.005344028
# 3: 1 2015-01-03 0.707310667
# 4: 1 2015-01-04 1.034107735
# 5: 1 2015-01-05 0.223480415
# 6: 1 2015-01-06 -0.878707613
# 7: 1 2015-01-07 -0.878707613
# 8: 1 2015-01-08 -2.000164945
# 9: 1 2015-01-09 -0.544790740
# 10: 1 2015-01-10 -0.255670709
# ...
So this replaces all embedded NAs using locf by id, and then makes a second pass replacing any leading NAs with the average of variable for that id. Note that if you do this in the reverse order you may get a different answer, because the group mean would then be computed before the embedded NAs were filled.
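To make that caveat concrete, here is a sketch of the reverse order, assuming Df as freshly created in the question (i.e. before any filling); the per-id mean is then taken over the original non-NA values only:
library(zoo)
library(data.table)
# reverse order (a sketch): fill the leading NA with the per-id mean first...
setDT(Df)[, variable := if (is.na(variable[1])) c(mean(variable, na.rm = TRUE), variable[-1]) else variable, by = id]
# ...then carry values forward; the mean above ignores values locf would have filled
Df[, variable := na.locf(variable, na.rm = FALSE), by = id]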
If you get the dev version of tidyr (0.3.0) available on GitHub, there is a function fill which does exactly this:
# devtools::install_github("hadley/tidyr")
library(tidyr)
library(dplyr)
Df %>%
  group_by(id) %>%
  fill(variable)
It will not fill the first value; we can handle that with mutate() and replace():
Df %>%
  group_by(id) %>%
  mutate(variable = ifelse(is.na(variable) & row_number() == 1,
                           replace(variable, 1, mean(variable, na.rm = TRUE)),
                           variable)) %>%
  fill(variable)
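For what it's worth, fill() is now on CRAN, and in current tidyr it also takes a .direction argument; the default "down" matches locf:
library(tidyr)
library(dplyr)
# current tidyr: no dev install needed; "down" carries values forward
Df %>%
  group_by(id) %>%
  fill(variable, .direction = "down")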
Related
I have measured hourly data of ground-level O3, but with some missing values (marked as NA). I want to calculate the daily maximum, but only when there are more than 17 hourly measurements for that date; when there are fewer than 18 measurements for a date, I want to write NA.
head(o3sat)
date hour O3
1/1/2010 0 50.2
1/1/2010 1 39.8
1/1/2010 2 41.8
1/1/2010 3 NA
1/1/2010 4 9.2
1/1/2010 5 6.0
Is there a way to add an argument to this function to require that at least 75% of the data be available in a day for the value to be calculated, and otherwise return NA?
maximums <- aggregate(o3sat["dnevnik"], list(Date = as.Date(o3sat$datum)), max, na.rm = TRUE)
It is better to provide a reproducible example when asking a question. Here, I created an example data frame based on the information you provided. This data frame contains hourly O3 measurements from 2010-01-01 to 2010-01-03.
library(dplyr)
library(tidyr)
library(lubridate)
o3sat <- read.table(text = " date hour O3
'1/1/2010' 0 50.2
'1/1/2010' 1 39.8
'1/1/2010' 2 41.8
'1/1/2010' 3 NA
'1/1/2010' 4 9.2
'1/1/2010' 5 6.0 ",
stringsAsFactors = FALSE, header = TRUE)
set.seed(1234)
o3sat_ex <- o3sat %>%
mutate(date = mdy(date)) %>%
complete(date = seq.Date(ymd("2010-01-01"), ymd("2010-01-03"), 1), hour = 0:23) %>%
mutate(O3 = c(o3sat$O3, rnorm(66, 30, 10))) %>%
mutate(O3 = ifelse(row_number() %in% sample(7:72, 18), NA, O3))
We can count the non-NA values per day using the following code.
o3sat_ex %>%
  group_by(date) %>%
  summarize(sum(!is.na(O3)))
# # A tibble: 3 x 2
# date `sum(!is.na(O3))`
# <date> <int>
# 1 2010-01-01 18
# 2 2010-01-02 17
# 3 2010-01-03 18
Based on your description, we want to calculate the maximum for 2010-01-01 and 2010-01-03, but not for 2010-01-02, as it only contains 17 non-NA values.
Here is one way to achieve the task. We can define a function, max_helper, that returns the maximum only if the count of non-NA values reaches a threshold (here, 18).
max_helper <- function(x, threshold) {
  if (sum(!is.na(x)) >= threshold) {
    r <- max(x, na.rm = TRUE)
  } else {
    r <- NA
  }
  return(r)
}
We can apply this function in a dplyr pipeline to get the answer.
o3sat_ex2 <- o3sat_ex %>%
  group_by(date) %>%
  summarize(O3 = max_helper(O3, 18))
o3sat_ex2
# # A tibble: 3 x 2
# date O3
# <date> <dbl>
# 1 2010-01-01 50.2
# 2 2010-01-02 NA
# 3 2010-01-03 47.8
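If you prefer not to define a separate helper, the same logic can be inlined in summarize(); a minimal sketch (NA_real_ keeps the column numeric):
o3sat_ex %>%
  group_by(date) %>%
  summarize(O3 = if (sum(!is.na(O3)) >= 18) max(O3, na.rm = TRUE) else NA_real_)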
I have the following dataset:
I want to measure the cumulative total at a daily level, so the result would look something like:
I can use cumsum with dplyr, but the count for "missing days" won't show up. For example, the date 1/3/18 does not exist in the original data frame. I want this missing date to appear in the resulting data frame, with its cumulative sum equal to that of the last known date, i.e. 1/2/18, so the sum is 5.
Any help is appreciated! I am new to the language.
I'll use this second data.frame to fill out the missing dates:
daterange <- data.frame(Date = seq(min(x$Date), max(x$Date), by = "1 day"))
Base R:
transform(merge(x, daterange, all = TRUE),
          Count = cumsum(ifelse(is.na(Count), 0, Count)))
# Date Count
# 1 2018-01-01 2
# 2 2018-01-02 5
# 3 2018-01-03 5
# 4 2018-01-04 5
# 5 2018-01-05 10
# 6 2018-01-06 10
# 7 2018-01-07 10
# 8 2018-01-08 11
# ...
# 32 2018-02-01 17
dplyr:
library(dplyr)
x %>%
  right_join(daterange) %>%
  mutate(Count = cumsum(if_else(is.na(Count), 0, Count)))
Data:
x <- data.frame(Date = as.Date(c("1/1/18", "1/2/18", "1/5/18", "1/8/18", "2/1/18"),
                               format = "%m/%d/%y"),
                Count = c(2, 3, 5, 1, 6))
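A tidyr variant of the same idea, as a sketch: complete() can expand to the full date range and zero-fill the new rows in one step:
library(dplyr)
library(tidyr)
x %>%
  complete(Date = seq(min(Date), max(Date), by = "1 day"),
           fill = list(Count = 0)) %>%
  mutate(Count = cumsum(Count))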
I am trying to create a flag for unique people (defined by id) that have a flight duration that is over 14 hours and they have another flight greater than or equal to 25 days after the 14-hour flight.
To tackle this, I used an if-else statement in which the row date is subtracted from the maximum date per id, but the flagging only seems to work when the first flight is over 14 hours.
# set up the data frame
id <- c(1, 1, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5)
flght_dur <- c(27, 13, 13, 17, 19, 12, 7, 9, 27, 14, 13, 45)
flght_dt <- as.Date(c("2016-03-29", "2016-09-01", "2015-07-23", "2016-06-16",
                      "2015-11-12", "2016-03-25", "2015-12-23", "2016-05-19",
                      "2016-08-18", "2016-09-27", "2016-08-18", "2016-09-27"))
df <- data.frame(id, flght_dur, flght_dt)
df2 <- df %>%
  group_by(id) %>%
  mutate(flag = ifelse(flght_dur >= 14 &&
                         (max(as.Date(flght_dt)) - as.Date(flght_dt)) >= 25, 1, 0))
df2
Any suggestions on next steps would be appreciated.
You are using the scalar AND operator && with vectors; it only looks at the first element of each vector. To evaluate the condition across the whole group and return a single value per group, use the vectorized & and then reduce the logical result with any():
df2 <- df %>%
  group_by(id) %>%
  mutate(flag = +any(flght_dur >= 14 & max(as.Date(flght_dt)) - as.Date(flght_dt) >= 25))
  # ^ the unary + converts the logical to 1/0, shorter than if/else
df2
# A tibble: 12 x 4
# Groups: id [5]
# id flght_dur flght_dt flag
# <dbl> <dbl> <date> <int>
# 1 1. 27. 2016-03-29 1
# 2 1. 13. 2016-09-01 1
# 3 2. 13. 2015-07-23 0
# 4 2. 17. 2016-06-16 0
# 5 3. 19. 2015-11-12 1
# 6 3. 12. 2016-03-25 1
# 7 4. 7. 2015-12-23 1
# 8 4. 9. 2016-05-19 1
# 9 4. 27. 2016-08-18 1
#10 4. 14. 2016-09-27 1
#11 5. 13. 2016-08-18 0
#12 5. 45. 2016-09-27 0
Try using chaining with data.table as follows:
DF[, longHaul := ifelse(flght_dur > 14, TRUE, FALSE)
   ][, maxFlight_DATE := max(flght_dt), by = "id"
   ][longHaul == TRUE & (maxFlight_DATE - flght_dt > 25), ]
This is after converting your data.frame to data.table with DF = data.table(df)
It gives me the following output, which appears to follow the logic you want.
id flght_dur flght_dt longHaul maxFlight_DATE
1: 1 27 2016-03-29 TRUE 2016-09-01
2: 3 19 2015-11-12 TRUE 2016-03-25
3: 4 27 2016-08-18 TRUE 2016-09-27
You can avoid loops by using zoo's rollapply to compare each row with the next, as below.
library(zoo)
# does the next row belong to the same id?
df$sameid <- c(rollapply(df$id, width = 2, by = 1,
                         FUN = function(x) x[1] == x[2], align = "right"), NA)
# is the next flight more than 25 days later?
df$nextdurcondition <- c(diff(df$flght_dt) > 25, NA)
# combine with the 14-hour condition from the question
df$flag <- df$flght_dur >= 14 & df$sameid & df$nextdurcondition
df
However, for these rolling comparisons, I personally always use loops.
I have a data set consisting of dates when a person left the network. A person can leave a network multiple times, as they may join the network again after leaving it. The following code replicates the scenario.
library(data.table)
Leaving_Date <- data.table(Id = c(1, 2, 3, 4, 3, 5),
                           Date = as.Date(c("2017-01-01", "2017-02-03", "2017-01-01",
                                            "2017-03-10", "2017-02-09", "2017-02-05")))
(Ids repeat in this table because a person can leave the network multiple times, provided they joined it again.)
> Leaving_Date
Id Date
1: 1 2017-01-01
2: 2 2017-02-03
3: 3 2017-01-01
4: 4 2017-03-10
5: 3 2017-02-09
6: 5 2017-02-05
I have another data set giving the dates when a particular person was followed up, which can be before or after they left the network. The following code replicates the scenario.
FOLLOWUPs <- data.table(Id = c(1,2,3,2,2,3,3,4,1,5),
Date =as.Date(c("2016-10-01","2017-02-04",
"2017-01-17","2017-02-23", "2017-03-03",
"2017-02-10","2017-02-11","2017-01-01",
"2017-01-15","2017-01-01")))
> FOLLOWUPs
Id Date
1: 1 2016-10-01
2: 2 2017-02-04
3: 3 2017-01-17
4: 2 2017-02-23
5: 2 2017-03-03
6: 3 2017-02-10
7: 3 2017-02-11
8: 4 2017-01-01
9: 1 2017-01-15
10: 5 2017-01-01
Now I want to look up each case in Leaving_Date, find the dates when they were followed up, and create three columns (SevenDay, FourteenDay, ThirtyDay) indicating the follow-up time period (in case there was any) as 0s and 1s. I am using the following code:
SEVENDAY_FOLLOWUP <- vector()
FOURTEEN_FOLLOWUP <- vector()
THIRTYDAY_FOLLOWUP <- vector()
for (i in 1:nrow(Leaving_Date)) {
  sub_data <- FOLLOWUPs[Id == Leaving_Date[i, 1]]
  if (nrow(sub_data[Date > Leaving_Date[i, Date] &
                    Date < (Leaving_Date[i, Date] + 7)]) == 0) {
    SEVENDAY_FOLLOWUP <- rbind(SEVENDAY_FOLLOWUP, 0)
  } else {
    SEVENDAY_FOLLOWUP <- rbind(SEVENDAY_FOLLOWUP, 1)
  }
  if (nrow(sub_data[Date > Leaving_Date[i, Date] &
                    Date < (Leaving_Date[i, Date] + 14)]) == 0) {
    FOURTEEN_FOLLOWUP <- rbind(FOURTEEN_FOLLOWUP, 0)
  } else {
    FOURTEEN_FOLLOWUP <- rbind(FOURTEEN_FOLLOWUP, 1)
  }
  if (nrow(sub_data[Date > Leaving_Date[i, Date] &
                    Date < (Leaving_Date[i, Date] + 30)]) == 0) {
    THIRTYDAY_FOLLOWUP <- rbind(THIRTYDAY_FOLLOWUP, 0)
  } else {
    THIRTYDAY_FOLLOWUP <- rbind(THIRTYDAY_FOLLOWUP, 1)
  }
}
Leaving_Date$SEVENDAY <- as.vector(SEVENDAY_FOLLOWUP)
Leaving_Date$FOURTEENDAY <- as.vector(FOURTEEN_FOLLOWUP)
Leaving_Date$THIRTYDAY <- as.vector(THIRTYDAY_FOLLOWUP)
Final Data
> Leaving_Date
Id Date SEVENDAY FOURTEENDAY THIRTYDAY
1: 1 2017-01-01 0 0 1
2: 2 2017-02-03 1 1 1
3: 3 2017-01-01 0 0 1
4: 4 2017-03-10 0 0 0
5: 3 2017-02-09 1 1 1
6: 5 2017-02-05 0 0 0
This code is very inefficient, as I have to run it for 100k observations, and it takes a lot of time. Is there an efficient way to do this?
Using a non-equi join:
setorder(FOLLOWUPs, Id, Date)
Leaving_Date[, n :=
FOLLOWUPs[.SD, on=.(Id, Date > Date), mult = "first", x.Date - i.Date]
]
Id Date n
1: 1 2017-01-01 14 days
2: 2 2017-02-03 1 days
3: 3 2017-01-01 16 days
4: 4 2017-03-10 NA days
5: 3 2017-02-09 1 days
6: 5 2017-02-05 NA days
Switching from Date to IDate will probably make this about twice as fast. See ?IDate.
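A minimal sketch of that switch, assuming the two tables from above:
# store dates as integer-backed IDate before joining
Leaving_Date[, Date := as.IDate(Date)]
FOLLOWUPs[, Date := as.IDate(Date)]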
I think it's best to stop here, but n can be compared against 7, 14 and 30 if necessary, like:
Leaving_Date[, bin := c(7, 14, 30)[ findInterval(n, c(0, 7, 14, 30)) ]]
Id Date n bin
1: 1 2017-01-01 14 days 30
2: 2 2017-02-03 1 days 7
3: 3 2017-01-01 16 days 30
4: 4 2017-03-10 NA days NA
5: 3 2017-02-09 1 days 7
6: 5 2017-02-05 NA days NA
Side note: please don't give tables all-caps names like FOLLOWUPs.
I think this does what you are looking for using dplyr.
It does an 'inner join' by Id, generating all combinations of dates in the two data frames for a given Id; it then calculates the date differences, groups by Id, and checks whether there are values falling in the ranges for your three categories.
library(dplyr)
Leaving_Date2 <- Leaving_Date %>%
  inner_join(FOLLOWUPs %>% rename(FU_Date = Date)) %>%
  mutate(datediff = as.numeric(FU_Date - Date)) %>%
  group_by(Id, Date) %>%
  summarise(SEVENDAY = as.numeric(any(datediff %in% 0:6)),
            FOURTEENDAY = as.numeric(any(datediff %in% 0:13)),
            THIRTYDAY = as.numeric(any(datediff %in% 0:29)))
We can do this as a query instead of a loop. First, I cleaned up your data.tables a bit, because I was getting confused by the variable names.
To make things easier for the comparison step, we first pre-compute the follow-up date limits for the 7-, 14- and 30-day thresholds.
library(dplyr)
dt_leaving_neat = Leaving_Date %>%
mutate(.id = 1:n()) %>%
mutate(limit_07 = Date + 7) %>%
mutate(limit_14 = Date + 14) %>%
mutate(limit_30 = Date + 30) %>%
rename(id = .id, id_person = Id, leaving_date = Date)
dt_follow_neat = FOLLOWUPs %>%
select(id_person = Id, followed_up_date = Date)
The actual operation is just a query. It's written out in dplyr for readability, but if speed is a main concern of yours, you could translate it to data.table. I'd recommend running each step in the pipeline to make sure you understand what's going on.
dt_followed_up = dt_leaving_neat %>%
tidyr::gather(follow_up, limit_date, limit_07:limit_30) %>%
left_join(dt_follow_neat, by = "id_person") %>%
mutate(followed_up = (followed_up_date > leaving_date) & (followed_up_date < limit_date)) %>%
select(id, id_person, leaving_date, follow_up, followed_up) %>%
filter(followed_up == TRUE) %>%
unique() %>%
tidyr::spread(follow_up, followed_up, fill = 0) %>%
select(id, id_person, leaving_date, limit_07, limit_14, limit_30)
The idea is to join the leaving dates to the follow up dates and check whether the follow up date is within the threshold (and also after the leaving date, as presumably you can't follow up before leaving).
Then some final cleaning to return your desired format. You can use select or rename to change the column names back too.
dt_result = dt_leaving_neat %>%
select(id, id_person, leaving_date) %>%
left_join(dt_followed_up, by = c("id", "id_person", "leaving_date"))
dt_result[is.na(dt_result)] = 0
Result
> dt_result
id id_person leaving_date limit_07 limit_14 limit_30
1 1 1 2017-01-01 0 0 1
2 2 2 2017-02-03 1 1 1
3 3 3 2017-01-01 0 0 1
4 4 4 2017-03-10 0 0 0
5 5 3 2017-02-09 1 1 1
6 6 5 2017-02-05 0 0 0
And following Andrew's answer, an equivalent data.table solution, written as a single chain, is:
FOLLOWUPs[Leaving_Date, on = "Id",
          .(Id, follow_date = Date, leaving_date = i.Date)
          ][, diff := follow_date - leaving_date
          ][, .(seven = any(diff %in% 0:6),
                fourteen = any(diff %in% 0:13),
                thirty = any(diff %in% 0:29)),
            by = .(Id, leaving_date)]
df <- data.frame(group = c(1, 2, 4, 2, 1, 4, 2, 3, 3),
                 ts = c("2014-02-13", "2014-06-01", "2014-02-14", "2014-02-11",
                        "2013-02-01", "2014-02-02", "2014-03-21", "2014-12-01",
                        "2014-02-11"),
                 letter = letters[1:9])
df$ts <- as.Date(df$ts, format = '%Y-%m-%d')
I want an operation that produces the complete rows containing the minimum timestamp per group; in this case:
group ts letter
1 2013-02-01 e
4 2014-02-02 f
2 2014-02-11 d
3 2014-02-11 i
A quick and dirty (and slow) base R solution would be:
dfo <- data.frame(df[order(df$ts, decreasing = FALSE), ], index = seq_len(nrow(df)))
mins <- tapply(dfo$index, dfo$group, min)
dfo[dfo$index %in% mins, ]
Intuitively, I think if there was a way to add an order index by group then I could just filter to where that column's value is 1, but I'm not sure how to execute it without lots of subsetting and rejoining.
You could use dplyr
library(dplyr)
group_by(df, group) %>% summarise(min = min(ts), letter = letter[which.min(ts)])
# group min letter
# 1 1 2013-02-01 e
# 2 2 2014-02-11 d
# 3 3 2014-02-11 i
# 4 4 2014-02-02 f
You could also slice the ranked rows
group_by(df, group) %>%
  mutate(rank = row_number(ts)) %>%
  arrange(rank) %>%
  slice(1)
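This is exactly the "order index by group" idea from the question; an equivalent filter() sketch that skips the intermediate rank column:
# keep only the row ranked first by ts within each group
group_by(df, group) %>%
  filter(row_number(ts) == 1)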
Here's a data.table solution. You seem to want the result ordered by ts, not by group; this does that.
library(data.table)
setDT(df)[,.SD[which.min(ts)],by=group][order(ts)]
# group ts letter
# 1: 1 2013-02-01 e
# 2: 4 2014-02-02 f
# 3: 2 2014-02-11 d
# 4: 3 2014-02-11 i
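An equivalent idiom, as a sketch, subsets the ordered table instead; because by visits groups in order of first appearance, the result also comes back sorted by ts:
# order by ts first, then keep the first row seen for each group
setDT(df)[order(ts), head(.SD, 1), by = group]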
Here's a one-liner using base R.
df[sapply(split(df,df$group), function(x) row.names(x)[which.min(x$ts)] ),]
Breaking it down some:
list.by.group <- split(df, df$group)
# a vector of the row names corresponding to the earliest date in each group
names.of.which.min <- sapply(list.by.group, function(x) row.names(x)[which.min(x$ts)])
# subset the data frame by row name
df[names.of.which.min, ]