I have a huge data frame like this:
df <- read.table(text="
id date
1 1 2016-12-01
2 2 2016-12-02
3 4 2017-01-03
4 6 2016-11-04
5 7 2017-11-05
6 9 2017-12-06", header=TRUE)
I randomly generate 1 or 0 for each id with this code:
library(dplyr)

set.seed(5)
df %>%
  arrange(id) %>%
  mutate(
    rn = runif(n()),
    discount = if_else(rn < 0.5, 0, 1)
  )
It works perfectly until I add new rows to my data frame; then the random numbers change.
What I need is not just to generate a random number for each id, but for each id's number to stay the same even when new rows are added.
That means:
id date discount
1 1 2016-12-01 1
2 2 2016-12-02 0
3 4 2017-01-03 0
4 6 2016-11-04 1
5 7 2017-11-05 1
6 9 2017-12-06 1
When new rows are added:
id date discount
1 1 2016-12-01 1
2 2 2016-12-02 0
3 4 2017-01-03 0
4 6 2016-11-04 1
5 7 2017-11-05 1
6 9 2017-12-06 1
7 12 2017-12-06 0
8 13 2017-12-06 1
You need to reset the same seed before the call on the new data frame:
set.seed(5) # first call
df %>%
  arrange(id) %>%
  mutate(
    rn = runif(n()),
    discount = if_else(rn < 0.5, 0, 1)
  )
# id date rn discount
# 1 1 2016-12-01 0.2002145 0
# 2 2 2016-12-02 0.6852186 1
# 3 4 2017-01-03 0.9168758 1
# 4 6 2016-11-04 0.2843995 0
# 5 7 2017-11-05 0.1046501 0
# 6 9 2017-12-06 0.7010575 1
set.seed(5) # added two rows, reset the seed
df2 %>%
  arrange(id) %>%
  mutate(
    rn = runif(n()),
    discount = if_else(rn < 0.5, 0, 1)
  )
# id date rn discount
# 1 1 2016-12-01 0.2002145 0
# 2 2 2016-12-02 0.6852186 1
# 3 4 2017-01-03 0.9168758 1
# 4 6 2016-11-04 0.2843995 0
# 5 7 2017-11-05 0.1046501 0
# 6 9 2017-12-06 0.7010575 1
# 7 12 2017-12-06 0.5279600 1
# 8 13 2017-12-06 0.8079352 1
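Note that resetting the seed keeps the existing draws stable only because the new ids sort after the old ones; rows inserted in the middle would shift every draw after the insertion point. A minimal sketch of a row-order-independent alternative (not part of the original answer): seed the RNG from each id itself, so a given id always maps to the same draw. The helper name runif_by_id is hypothetical, and this approach clobbers the global RNG state on every call:

library(dplyr)

# Derive one uniform draw deterministically from each id
runif_by_id <- function(ids) {
  vapply(ids, function(i) {
    set.seed(i) # the id itself is the seed
    runif(1)
  }, numeric(1))
}

df2 %>%
  mutate(
    rn = runif_by_id(id),
    discount = if_else(rn < 0.5, 0, 1)
  )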
Data:
df <- read.table(text="
id date
1 1 2016-12-01
2 2 2016-12-02
3 4 2017-01-03
4 6 2016-11-04
5 7 2017-11-05
6 9 2017-12-06", header=TRUE)
df2 <- read.table(text="
id date
1 1 2016-12-01
2 2 2016-12-02
3 4 2017-01-03
4 6 2016-11-04
5 7 2017-11-05
6 9 2017-12-06
7 12 2017-12-06
8 13 2017-12-06", header=TRUE)
Consider the following sample dataset, where id is an individual identifier.
rm(list = ls()); set.seed(1)
n <- 100
X <- rbinom(n, 1, 0.5) # binary covariate
j <- rep(1:n)
dat <- data.frame(id = 1:n, X)
ntp <- rep(4, n)
mat <- matrix(ncol = 3, nrow = 1)
m <- 0; w <- mat
for (l in ntp) {
  m <- m + 1
  ft <- seq(from = 2, to = 8, length.out = l)
  # ft <- seq(from = 1, to = 9, length.out = l)
  ft <- sort(ft)
  seq <- rep(ft, each = 2)
  seq <- c(0, seq, 10)
  matid <- cbind(matrix(seq, ncol = 2, nrow = l + 1, byrow = TRUE), m)
  w <- rbind(w, matid)
}
d <- data.frame(w[-1, ])
colnames(d) <- c("time1", "time2", "id")
D <- round(merge(d, dat, by = "id"), 2) # merging datasets
nr <- nrow(D)
D$Survival_time <- round(rexp(nr, 0.1) + 1, 3)
head(D, 15)
id time1 time2 X Survival_time
1 1 0 2 0 21.341
2 1 2 4 0 18.987
3 1 4 6 0 4.740
4 1 6 8 0 13.296
5 1 8 10 0 6.397
6 2 0 2 0 10.566
7 2 2 4 0 2.470
8 2 4 6 0 14.907
9 2 6 8 0 8.620
10 2 8 10 0 13.376
11 3 0 2 1 45.239
12 3 2 4 1 11.545
13 3 4 6 1 11.352
14 3 6 8 1 19.760
15 3 8 10 1 7.547
How can I obtain the value at which Survival_time is less than time2 for the very first time per individual? I should end up with the following values:
id Survival_time
1 4.740
2 2.470
3 7.547
Also, how can I subset the data so that each individual's rows stop when this condition first occurs, i.e. obtain:
id time1 time2 X Survival_time
1 1 0 2 0 21.341
2 1 2 4 0 18.987
3 1 4 6 0 4.740
6 2 0 2 0 10.566
7 2 2 4 0 2.470
11 3 0 2 1 45.239
12 3 2 4 1 11.545
13 3 4 6 1 11.352
14 3 6 8 1 19.760
15 3 8 10 1 7.547
Using data.table:
library(data.table)
# For each id, keep rows up to and including the first row where Survival_time < time2
setDT(D)[, .SD[seq_len(.N) <= which(Survival_time < time2)[1]], id]
Output:
id time1 time2 X Survival_time
1: 1 0 2 0 21.341
2: 1 2 4 0 18.987
3: 1 4 6 0 4.740
4: 2 0 2 0 10.566
5: 2 2 4 0 2.470
6: 3 0 2 1 45.239
7: 3 2 4 1 11.545
8: 3 4 6 1 11.352
9: 3 6 8 1 19.760
10: 3 8 10 1 7.547
Slight variation:
library(dplyr)
D %>% # Take D, and then
group_by(id) %>% # group by id, and then
filter(Survival_time < time2) %>% # keep Survival times < time2, and then
slice(1) %>% # keep the first row per id, and then
ungroup() # ungroup
You can use:
library(dplyr)
D %>%
  group_by(id) %>%
  summarise(Survival_time = Survival_time[match(TRUE, Survival_time < time2)])

# Also using which.max:
# summarise(Survival_time = Survival_time[which.max(Survival_time < time2)])
# id Survival_time
# <int> <dbl>
#1 1 4.74
#2 2 2.47
#3 3 7.55
To select the rows up to that point you may use:
D %>%
  group_by(id) %>%
  filter(row_number() <= match(TRUE, Survival_time < time2)) %>%
  ungroup()
# id time1 time2 X Survival_time
# <int> <int> <int> <int> <dbl>
# 1 1 0 2 0 21.3
# 2 1 2 4 0 19.0
# 3 1 4 6 0 4.74
# 4 2 0 2 0 10.6
# 5 2 2 4 0 2.47
# 6 3 0 2 1 45.2
# 7 3 2 4 1 11.5
# 8 3 4 6 1 11.4
# 9 3 6 8 1 19.8
#10 3 8 10 1 7.55
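For completeness, a base R sketch of the same subsetting (an alternative to the answers above; subset_until is a hypothetical helper name):

# Split by id, keep rows 1..k where k marks the first row meeting the condition,
# then stitch the pieces back together
subset_until <- function(d) {
  k <- match(TRUE, d$Survival_time < d$time2)
  d[seq_len(k), ]
}
do.call(rbind, lapply(split(D, D$id), subset_until))

If an id never meets the condition, match() returns NA, so a real implementation would need a fallback such as replacing NA with nrow(d).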
DF <- data.frame(id = c(1, 2, 3, 4, 5, 6),
                 date = c("2001-01-01", "NA", "2005-05-05", "2006-06-06", "2007-07-07", "NA"))
id date
1 1 2001-01-01
2 2 NA
3 3 2005-05-05
4 4 2006-06-06
5 5 2007-07-07
6 6 NA
DF1 <- data.frame(id = c(1, 3, 7, 8, 9, 11),
                  cdate = c("2000-12-20", "2002-02-02", "2003-03-03", "2004-04-04", "2005-05-05", "2006-06-06"),
                  atc = c("AAA1", "BBB", "AAA", "DDD", "CCC", "AAA"))
id cdate atc
1 1 2000-12-20 AAA1
2 3 2002-02-02 BBB
3 7 2003-03-03 AAA
4 8 2004-04-04 DDD
5 9 2005-05-05 CCC
6 11 2006-06-06 AAA
DF2 <- data.frame(id = c(1, 2, 3, 4, 8, 9),
                  cdate = c("2000-12-20", "2002-02-02", "2005-04-05", "2004-04-04", "2005-05-05", "2006-06-06"),
                  op = c("AA3", "BB", "AA", "DD", "CC", "AA"))
id cdate op
1 1 2000-12-20 AA3
2 2 2002-02-02 BB
3 3 2005-04-05 AA
4 4 2004-04-04 DD
5 8 2005-05-05 CC
6 9 2006-06-06 AA
LM<-c("AAA","AA","BB")
Now I want to check whether DF1 and DF2 contain any id from DF where atc/op starts with one of the values in LM. Additionally, cdate should not be more than 90 days older than date.
Desired output:
id opatc
1 1 1
2 2 0
3 3 1
4 4 0
5 5 0
6 6 0
What is the smartest way (shortest code) of solving this problem?
Using the tidyverse, you could do:
library(tidyverse)

DF %>%
  left_join(DF1, "id") %>%
  left_join(DF2, "id") %>%
  mutate(cdate.x = as.Date(cdate.x),
         cdate.y = as.Date(cdate.y),
         date = as.Date(date),
         opatc = date - pmax(coalesce(cdate.x, cdate.y), cdate.y) <= 90) %>%
  group_by(id) %>%
  mutate(opatc = +(opatc & invoke("|", across(c(atc, op),
                                              ~ any(sub("\\d+$", "", .x) %in% LM))))) %>%
  replace_na(list(opatc = 0)) %>%
  select(id, opatc)
# A tibble: 6 x 2
# Groups: id [6]
id opatc
<dbl> <dbl>
1 1 1
2 2 0
3 3 1
4 4 0
5 5 0
6 6 0
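Note that sub("\\d+$", "", .x) strips trailing digits before the %in% comparison, which is why "AAA1" and "AA3" count as matches for "AAA" and "AA"; this is exact matching after stripping digits, not general prefix matching. If true prefix matching were needed, a hedged alternative would be something like grepl(paste0("^(", paste(LM, collapse = "|"), ")"), .x).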
I have a data.table with firm information.
library(data.table)
DT <- fread("
iso Firm GDP year
A 1 1 1
A 2 1 1
A 3 1 1
A 4 1 1
A 5 3 2
A 6 3 2
A 7 3 2
A 8 3 2
B 9 2 1
B 10 2 1
B 11 2 1
B 12 2 1
B 13 4 1
B 14 4 1
B 15 4 1
B 16 4 1",
header = TRUE)
I want to calculate GDP growth per country from one year to the next and add it to the dataset ((new - old) / old). However, if I do:
DT <- DT[, GDPgrowth := (GDP - shift(GDP)) / shift(GDP), by = iso]
the outcome will be zero, because it subtracts consecutive firm observations from each other.
How can I make sure it calculates growth for the whole group of firms belonging to the country together?
Desired output:
library(data.table)
DT <- fread("
iso Firm GDP GDPgrowth year
A 1 1 NA 1
A 2 1 NA 1
A 3 1 NA 1
A 4 1 NA 1
A 5 3 2 2
A 6 3 2 2
A 7 3 2 2
A 8 3 2 2
B 9 2 NA 1
B 10 2 NA 1
B 11 2 NA 1
B 12 2 NA 1
B 13 4 1 1
B 14 4 1 1
B 15 4 1 1
B 16 4 1 1",
header = TRUE)
Here is one way, continuing from your current approach:
library(data.table)
DT[, GDPgrowth := (GDP - shift(GDP)) / shift(GDP), by = iso] # row-level growth; zero within a period
DT[GDPgrowth == 0, GDPgrowth := NA]                          # blank out the within-period zeros
DT[, GDPgrowth := zoo::na.locf(GDPgrowth, na.rm = FALSE), .(iso, year)] # carry the real value through each period
DT
# iso Firm GDP year GDPgrowth
# 1: A 1 1 1 NA
# 2: A 2 1 1 NA
# 3: A 3 1 1 NA
# 4: A 4 1 1 NA
# 5: A 5 3 2 2
# 6: A 6 3 2 2
# 7: A 7 3 2 2
# 8: A 8 3 2 2
# 9: B 9 2 1 NA
#10: B 10 2 1 NA
#11: B 11 2 1 NA
#12: B 12 2 1 NA
#13: B 13 4 1 1
#14: B 14 4 1 1
#15: B 15 4 1 1
#16: B 16 4 1 1
Using dplyr and tidyr::fill, it can be done as:
library(dplyr)
DT %>%
  group_by(iso) %>%
  mutate(GDPgrowth = (GDP - lag(GDP)) / lag(GDP),
         GDPgrowth = replace(GDPgrowth, GDPgrowth == 0, NA)) %>%
  group_by(iso, year) %>%
  tidyr::fill(GDPgrowth)
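Both answers above patch the row-level shift after the fact. A different sketch (not from the original answers, assuming each distinct (iso, year, GDP) combination identifies one country-period): compute growth once per country-period, then join it back to the firm rows:

library(data.table)

# One row per country-period, in the original row order
gdp <- unique(DT[, .(iso, year, GDP)])
gdp[, GDPgrowth := (GDP - shift(GDP)) / shift(GDP), by = iso]

# Join the period-level growth back onto the firm-level rows
DT[gdp, GDPgrowth := i.GDPgrowth, on = .(iso, year, GDP)]

This avoids producing the spurious zero growth rates in the first place, since the shift happens on one row per period.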
I want to sum the two prior observations for each ID and place them in a new column named 'prior_work'. It may sound strange, but here is an example that should clarify what I'm trying to do. My data frame:
ID Week Hours
1 1 .00
1 2 24.00
1 3 25.00
1 4 22.00
1 5 19.00
1 6 20.00
2 1 .00
2 2 .00
2 3 .00
2 4 .00
2 5 16.00
2 6 16.00
What I need:
ID Week Hours Hours_prior_two_weeks
1 1 .00 NA
1 2 24.00 NA
1 3 25.00 24.00
1 4 22.00 49.00
1 5 19.00 47.00
1 6 20.00 41.00
2 1 .00 NA #new ID / person here
2 2 .00 NA
2 3 .00 .00
2 4 .00 .00
2 5 16.00 .00
2 6 16.00 16.00
I tried basic aggregation and such, but I can't figure out how to sum prior observations. Thanks!
library(dplyr)
df <- data.frame(ID = rep(1:2, each = 6),
                 Week = rep(1:6, times = 2),
                 Hours = c(0, 24, 25, 22, 19, 20, 0, 0, 0, 0, 16, 16))
df
#    ID Week Hours
# 1   1    1     0
# 2   1    2    24
# 3   1    3    25
# 4   1    4    22
# 5   1    5    19
# 6   1    6    20
# 7   2    1     0
# 8   2    2     0
# 9   2    3     0
# 10  2    4     0
# 11  2    5    16
# 12  2    6    16
df %>% group_by(ID) %>% mutate(Hours_Prior_Two_Weeks = lag(Hours, 2) + lag(Hours, 1))
# Source: local data frame [12 x 4]
# Groups: ID [2]
#
#       ID  Week Hours Hours_Prior_Two_Weeks
#    (int) (int) (dbl)                 (dbl)
# 1      1     1     0                    NA
# 2      1     2    24                    NA
# 3      1     3    25                    24
# 4      1     4    22                    49
# 5      1     5    19                    47
# 6      1     6    20                    41
# 7      2     1     0                    NA
# 8      2     2     0                    NA
# 9      2     3     0                     0
# 10     2     4     0                     0
# 11     2     5    16                     0
# 12     2     6    16                    16
The above code uses dplyr to group by your ID variable and then uses lag to look back at the last two values.
You can use the ave function with zoo::rollsum; you need to change the defaults of the fill and align arguments to get the structure you requested:
library(zoo)
dat$Hours_prior_two_weeks <- with(dat, ave(Hours, ID,
    FUN = function(x) rollsum(x, k = 3, fill = NA, align = "right")))
dat
ID Week Hours Hours_prior_two_weeks
1 1 1 0 NA
2 1 2 24 NA
3 1 3 25 49
4 1 4 22 71
5 1 5 19 66
6 1 6 20 61
7 2 1 0 NA
8 2 2 0 NA
9 2 3 0 0
10 2 4 0 0
11 2 5 16 16
12 2 6 16 32
But that sums three values including the current row; you want the sum of the two prior values, so use k = 2 and shift the result down by one within each group (prepend an NA and drop the last value):
dat$Hours_prior_two_weeks <- with(dat, ave(Hours, ID,
    FUN = function(x) c(NA, head(rollsum(x, k = 2, fill = NA, align = "right"), -1))))
dat
#-----------
ID Week Hours Hours_prior_two_weeks
1 1 1 0 NA
2 1 2 24 NA
3 1 3 25 24
4 1 4 22 49
5 1 5 19 47
6 1 6 20 41
7 2 1 0 NA
8 2 2 0 NA
9 2 3 0 0
10 2 4 0 0
11 2 5 16 0
12 2 6 16 16
An option using data.table
library(data.table)
setDT(df1)[, Hours_prior_two_weeks := Reduce(`+`, shift(Hours, 1:2)), by = ID]
df1
# ID Week Hours Hours_prior_two_weeks
# 1: 1 1 0 NA
# 2: 1 2 24 NA
# 3: 1 3 25 24
# 4: 1 4 22 49
# 5: 1 5 19 47
# 6: 1 6 20 41
# 7: 2 1 0 NA
# 8: 2 2 0 NA
# 9: 2 3 0 0
#10: 2 4 0 0
#11: 2 5 16 0
#12: 2 6 16 16
I have an R data frame such as:
df <- data.frame(period = rep(1:4, 2),
                 farm = c(rep('A', 4), rep('B', 4)),
                 cumVol = c(1, 5, 15, 31, 10, 12, 16, 24),
                 other = 1:8)
period farm cumVol other
1 1 A 1 1
2 2 A 5 2
3 3 A 15 3
4 4 A 31 4
5 1 B 10 5
6 2 B 12 6
7 3 B 16 7
8 4 B 24 8
How do I find the change in cumVol at each farm in each period, ignoring the 'other' column? I would like a data frame like this (optionally with the cumVol column remaining):
period farm volume other
1 1 A 0 1
2 2 A 4 2
3 3 A 10 3
4 4 A 16 4
5 1 B 0 5
6 2 B 2 6
7 3 B 4 7
8 4 B 8 8
In practice there may be many 'farm'-like columns, and many 'other'-like (ie. ignored) columns. I'd like to be able to specify all the column names using variables.
I am using the dplyr package.
In dplyr:
require(dplyr)
df %>%
  group_by(farm) %>%
  mutate(volume = cumVol - lag(cumVol, default = cumVol[1]))
Source: local data frame [8 x 5]
Groups: farm
period farm cumVol other volume
1 1 A 1 1 0
2 2 A 5 2 4
3 3 A 15 3 10
4 4 A 31 4 16
5 1 B 10 5 0
6 2 B 12 6 2
7 3 B 16 7 4
8 4 B 24 8 8
Perhaps the desired output should actually be as follows?
df %>%
  group_by(farm) %>%
  mutate(volume = cumVol - lag(cumVol, default = 0))
period farm cumVol other volume
1 1 A 1 1 1
2 2 A 5 2 4
3 3 A 15 3 10
4 4 A 31 4 16
5 1 B 10 5 10
6 2 B 12 6 2
7 3 B 16 7 4
8 4 B 24 8 8
Edit: Following up on your comments, I think you are looking for arrange(). If that is not the case, it might be best to start a new question.
df1 <- data.frame(period = rep(1:4, 4),
                  farm = rep(c(rep('A', 4), rep('B', 4)), 2),
                  crop = c(rep('apple', 8), rep('pear', 8)),
                  cumCropVol = c(1, 5, 15, 31, 10, 12, 16, 24, 11, 15, 25, 31, 20, 22, 26, 34),
                  other = rep(1:8, 2))
df1 %>%
  arrange(desc(period), desc(farm)) %>%
  group_by(period, farm) %>%
  summarise(cumVol = sum(cumCropVol))
Edit: Follow up #2
df <- df1 %>%
  arrange(desc(period), desc(farm)) %>%
  group_by(period, farm) %>%
  summarise(cumVol = sum(cumCropVol))

ungroup(df) %>%
  arrange(farm) %>%
  group_by(farm) %>%
  mutate(volume = cumVol - lag(cumVol, default = 0))
Source: local data frame [8 x 4]
Groups: farm
period farm cumVol volume
1 1 A 12 12
2 2 A 20 8
3 3 A 40 20
4 4 A 62 22
5 1 B 30 30
6 2 B 34 4
7 3 B 42 8
8 4 B 58 16
In dplyr, so you don't have to replace NAs:
library(dplyr)
df %>%
  group_by(farm) %>%
  mutate(volume = c(0, diff(cumVol)))
period farm cumVol other volume
1 1 A 1 1 0
2 2 A 5 2 4
3 3 A 15 3 10
4 4 A 31 4 16
5 1 B 10 5 0
6 2 B 12 6 2
7 3 B 16 7 4
8 4 B 24 8 8
Would creating a new column in your original dataset be an option?
Here is an option using the data.table operator :=.
require("data.table")
DT <- data.table(df)
DT[, volume := c(0,diff(cumVol)), by="farm"]
or
diff_2 <- function(x) c(0,diff(x))
DT[, volume := diff_2(cumVol), by="farm"]
Output:
# > DT
# period farm cumVol other volume
# 1: 1 A 1 1 0
# 2: 2 A 5 2 4
# 3: 3 A 15 3 10
# 4: 4 A 31 4 16
# 5: 1 B 10 5 0
# 6: 2 B 12 6 2
# 7: 3 B 16 7 4
# 8: 4 B 24 8 8
tapply and transform?
transform(df, volume = unlist(tapply(cumVol, farm, function(x) c(0, diff(x)))))
   period farm cumVol other volume
A1 1 A 1 1 0
A2 2 A 5 2 4
A3 3 A 15 3 10
A4 4 A 31 4 16
B1 1 B 10 5 0
B2 2 B 12 6 2
B3 3 B 16 7 4
B4 4 B 24 8 8
ave is a better option; see @thelatemail's comment:
with(df, ave(cumVol, farm, FUN = function(x) c(0, diff(x))))
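None of the answers address the follow-up in the question about specifying the column names as variables. A minimal sketch using dplyr's across()/all_of() and the .data pronoun (group_cols and value_col are hypothetical names; 'farm' stands in for possibly many grouping columns):

library(dplyr)

group_cols <- c("farm") # could hold several 'farm'-like column names
value_col <- "cumVol"

df %>%
  group_by(across(all_of(group_cols))) %>%
  mutate(volume = .data[[value_col]] -
           lag(.data[[value_col]], default = .data[[value_col]][1])) %>%
  ungroup()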