I have a dataframe of this form
ID panelid dummy1 dummy2
1 1 0 1
1 2 1 0
2 1 1 0
2 2 0 1
3 1 1 0
3 2 1 0
4 1 0 1
4 2 0 1
I want to generate a dummy variable that equals 1 only in the panelid==2 row, and only if the same individual has dummy1 equal to 1 in panelid==1 and dummy2 equal to 1 in panelid==2. That is, I want to obtain something like this:
ID panelid dummy1 dummy2 result
1 1 0 1 0
1 2 1 0 0
2 1 1 0 0
2 2 0 1 1
3 1 1 0 0
3 2 1 0 0
4 1 0 1 0
4 2 0 1 0
Can someone help me with this?
Many thanks to everyone
This is almost identical to @Cole's solution.
# Rebuild the example panel from the question: each ID has one row for
# panelid 1 and one row for panelid 2.
dataset <- read.table(
  header = TRUE,
  text = "ID panelid dummy1 dummy2
1 1 0 1
1 2 1 0
2 1 1 0
2 2 0 1
3 1 1 0
3 2 1 0
4 1 0 1
4 2 0 1"
)
# IDs whose panelid == 1 row has dummy1 == 1.
temp_ID <- with(dataset, ID[panelid == 1 & dummy1 == 1])
# Flag the panelid == 2 rows with dummy2 == 1 that belong to one of those
# IDs; the logical mask is stored as 0/1 integers.
dataset$result <- with(
  dataset,
  as.integer(panelid == 2 & dummy2 == 1 & ID %in% temp_ID)
)
dataset
ID panelid dummy1 dummy2 result
1 1 1 0 1 0
2 1 2 1 0 0
3 2 1 1 0 0
4 2 2 0 1 1
5 3 1 1 0 0
6 3 2 1 0 0
7 4 1 0 1 0
8 4 2 0 1 0
Here's a base R approach:
# IDs whose panelid == 1 row has dummy1 == 1.
dummy1_in_panelid <- df$ID[df$panelid == 1 & df$dummy1 == 1]
# Start with every row unflagged.
df$result <- 0
# Set the flag on panelid == 2 rows with dummy2 == 1 whose ID is in the
# list above; which() turns the mask into row positions.
df$result[which(
  df$panelid == 2 & df$dummy2 == 1 & df$ID %in% dummy1_in_panelid
)] <- 1
df
ID panelid dummy1 dummy2 result
1 1 1 0 1 0
2 1 2 1 0 0
3 2 1 1 0 0
4 2 2 0 1 1
5 3 1 1 0 0
6 3 2 1 0 0
7 4 1 0 1 0
8 4 2 0 1 0
And the data...
# Parse the example table from an inline string with data.table::fread and
# convert the result to a plain data.frame.
df <- as.data.frame(
  x = data.table::fread(
    input = "
ID panelid dummy1 dummy2
1 1 0 1
1 2 1 0
2 1 1 0
2 2 0 1
3 1 1 0
3 2 1 0
4 1 0 1
4 2 0 1"
  )
)
Problem:
I am trying to create a variable x2 that equals 1 on the rows, within each ID group, where x1 switches from 1 to 0 over time.
In addition, x2 should stay 1 for every consecutive 0 in the run that follows the switch.
I tried to do this with library(dplyr), but could not figure out how to look at previous records within the group.
Input Data:
# Example panel: IDs "1".."5" observed for 5, 4, 3, 2 and 3 periods.
# All three columns are built as character vectors.
ID <- rep(as.character(1:5), times = c(5, 4, 3, 2, 3))
# Period counter restarts at "1" for each ID.
time <- as.character(sequence(c(5, 4, 3, 2, 3)))
x1 <- c(
  "0", "1", "1", "1", "1",
  "0", "0", "0", "0",
  "1", "0", "0",
  "1", "1",
  "1", "0", "1"
)
df <- data.frame(ID = ID, time = time, x1 = x1)
Required Output:
ID time x1 x2
1 1 0 0
1 2 1 0
1 3 1 0
1 4 1 0
1 5 1 0
2 1 0 0
2 2 0 0
2 3 0 0
2 4 0 0
3 1 1 0
3 2 0 1
3 3 0 1
4 1 1 0
4 2 1 0
5 1 1 0
5 2 0 1
5 3 1 0
It is better to have 'x1' as a numeric column:
library(data.table)
# Within each ID, x2 is 1 on every row where x1 is 0 and a 1 has already
# been seen (cummax(x1) stays at 1 from the first 1 onwards), else 0.
# This covers the row where x1 switches from 1 to 0 and every following 0
# of that run, however many 1s came before the switch and however many
# switches occur. x1 must be numeric (0/1) for cummax() to work.
setDT(df)[, x2 := as.integer(x1 == 0 & cummax(x1) == 1), by = ID]
df
# ID time x1 x2
# 1: 1 1 0 0
# 2: 1 2 1 0
# 3: 1 3 1 0
# 4: 1 4 1 0
# 5: 1 5 1 0
# 6: 2 1 0 0
# 7: 2 2 0 0
# 8: 2 3 0 0
# 9: 2 4 0 0
#10: 3 1 1 0
#11: 3 2 0 1
#12: 3 3 0 1
#13: 4 1 1 0
#14: 4 2 1 0
#15: 5 1 1 0
#16: 5 2 0 1
#17: 5 3 1 0
data
# Same example panel as in the question, but with x1 stored as integer
# so that cumulative functions can be applied to it.
ID <- as.character(rep(1:5, times = c(5, 4, 3, 2, 3)))
# Period counter restarts at "1" for each ID.
time <- as.character(unlist(lapply(c(5, 4, 3, 2, 3), seq_len)))
x1 <- c(
  0L, 1L, 1L, 1L, 1L,
  0L, 0L, 0L, 0L,
  1L, 0L, 0L,
  1L, 1L,
  1L, 0L, 1L
)
df <- data.frame(ID = ID, time = time, x1 = x1)
If you want a dplyr answer, you can use @akrun's code in mutate after grouping by ID:
library(dplyr)
ID <- c("1","1","1","1","1","2","2","2","2","3","3","3","4","4","5","5","5")
time <- c("1","2","3","4","5","1","2","3","4","1","2","3","1","2","1","2","3")
x1 <- as.integer(c("0","1","1","1","1","0","0","0","0","1","0","0","1","1","1","0","1"))
df <- data.frame(ID, time, x1)

# Return 1 for every 0 in `x` that comes after at least one 1, else 0.
# cummax(x) stays at 1 from the first 1 onwards, so this marks the row
# where x switches from 1 to 0 and every following 0 of that run, no
# matter how many 1s preceded the switch or how many switches occur.
# `x` must be a numeric 0/1 vector; the result is an integer vector of
# the same length.
flag_zero_after_one <- function(x) {
  as.integer(x == 0 & cummax(x) == 1)
}

df <- df %>%
  group_by(ID) %>%
  mutate(x2 = flag_zero_after_one(x1)) %>%
  # Drop the grouping so later operations on df are not silently per-ID.
  ungroup()
df
# ID time x1 x2
# 1 1 0 0
# 1 2 1 0
# 1 3 1 0
# 1 4 1 0
# 1 5 1 0
# 2 1 0 0
# 2 2 0 0
# 2 3 0 0
# 2 4 0 0
# 3 1 1 0
# 3 2 0 1
# 3 3 0 1
# 4 1 1 0
# 4 2 1 0
# 5 1 1 0
# 5 2 0 1
# 5 3 1 0
My data set contains three variables:
# Three groups (id 1, 2 and 5) with 6, 4 and 6 rows respectively.
id <- rep(c(1, 2, 5), times = c(6, 4, 6))
# ind is 0 for the first group and 1 for the other two.
ind <- rep(c(0, 1), times = c(6, 10))
# price counts up from 1 within each group.
price <- as.numeric(sequence(c(6, 4, 6)))
mdata <- data.frame(id = id, ind = ind, price = price)
I need to create a new variable (ind2) such that if ind=0, then ind2=0.
Also, if ind=1, then ind2=0, unless price is the maximum within its id, in which case ind2=1.
The new data looks like:
id ind ind2 price
1 0 0 1
1 0 0 2
1 0 0 3
1 0 0 4
1 0 0 5
1 0 0 6
2 1 0 1
2 1 0 2
2 1 0 3
2 1 1 4
5 1 0 1
5 1 0 2
5 1 0 3
5 1 0 4
5 1 0 5
5 1 1 6
library(dplyr)
# Per id, mark the rows where ind is 1 and price equals the group's
# maximum price; the logical result is stored as 0/1 integers.
mdata %>%
  group_by(id) %>%
  mutate(
    ind2 = as.integer(ind == 1L & price == max(price))
  )
# id ind price ind2
# 1 1 0 1 0
# 2 1 0 2 0
# 3 1 0 3 0
# 4 1 0 4 0
# 5 1 0 5 0
# 6 1 0 6 0
# 7 2 1 1 0
# 8 2 1 2 0
# 9 2 1 3 0
# 10 2 1 4 1
# 11 5 1 1 0
# 12 5 1 2 0
# 13 5 1 3 0
# 14 5 1 4 0
# 15 5 1 5 0
# 16 5 1 6 1
Or if you prefer data.table
# Convert mdata to a data.table in place and add ind2 by reference:
# 1 where ind is 1 and price is the maximum within its id, else 0.
setDT(mdata)[
  ,
  ind2 := as.integer(ind == 1L & price == max(price)),
  by = "id"
]
Or with base R
# Flag rows where ind is 1 and price equals the maximum price of the
# row's id. ave() returns each group's maximum aligned to the original
# row order, so the result is correct even when mdata is not sorted by
# id (split() followed by unlist() returns values grouped by id, which
# only lines up with the rows when the data are already sorted).
mdata$ind2 <- as.integer(
  mdata$ind == 1L & mdata$price == ave(mdata$price, mdata$id, FUN = max)
)
I have a sample dataframe sample.data as follows:
x y z
1 0 1
1 0 1
1 0 1
1 0 1
1 0 2
1 0 2
1 0 2
1 0 2
1 0 2
0 1 2
I need to find the max and sum of x and y for each category of z (z is like 1,2,...600). I use ddply from plyr for this:
library(plyr)
# One summary row per value of z with the max and sum of x and y.
z.group <- ddply(
  sample.data, "z", summarize,
  max_x = max(x),
  max_y = max(y),
  sum_x = sum(x),
  sum_y = sum(y)
)
z.group
z max_x max_y sum_x sum_y
1 1 0 4 0
2 1 1 5 1
Now I need to add sum_x, sum_y, max_x, and max_y as columns of sample.data, repeated on every row of the corresponding z group. For example, if max_x is 1 for z=1, then max_x should be 1 on all rows with z=1. The expected output is:
x y z max_x max_y sum_x sum_y
1 0 1 1 0 4 0
1 0 1 1 0 4 0
1 0 1 1 0 4 0
1 0 1 1 0 4 0
1 0 2 1 1 5 1
1 0 2 1 1 5 1
1 0 2 1 1 5 1
1 0 2 1 1 5 1
1 0 2 1 1 5 1
0 1 2 1 1 5 1
How do I get the expected output?
You can do it directly in one step, using transform:
# transform keeps every original row and appends the per-z aggregates as
# new columns, so no separate summary table or merge is needed.
# The result is stored under the name that is printed below.
z.group <- ddply(
  sample.data, .(z), transform,
  max_x = max(x), max_y = max(y),
  sum_x = sum(x), sum_y = sum(y)
)
z.group
x y z max_x max_y sum_x sum_y
1 1 0 1 1 0 4 0
2 1 0 1 1 0 4 0
3 1 0 1 1 0 4 0
4 1 0 1 1 0 4 0
5 1 0 2 1 1 5 1
6 1 0 2 1 1 5 1
7 1 0 2 1 1 5 1
8 1 0 2 1 1 5 1
9 1 0 2 1 1 5 1
10 0 1 2 1 1 5 1
I think you can do this with merge:
# Join the per-z summary back onto the original rows using z as the key.
merge(x = sample.data, y = z.group, by = "z")
# z x y max_x max_y sum_x sum_y
# 1 1 1 0 1 0 4 0
# 2 1 1 0 1 0 4 0
# 3 1 1 0 1 0 4 0
# 4 1 1 0 1 0 4 0
# 5 2 1 0 1 1 5 1
# 6 2 1 0 1 1 5 1
# 7 2 1 0 1 1 5 1
# 8 2 1 0 1 1 5 1
# 9 2 1 0 1 1 5 1
# 10 2 0 1 1 1 5 1
A data.table alternative:
# library() stops with an error when data.table is not installed;
# require() would only return FALSE and let the next lines fail instead.
library(data.table)
# Keyed on z, the grouping column.
dt <- data.table(sample.data, key = "z")
# Within each z, x and y are returned row by row while the scalar
# aggregates are recycled to the length of the group.
dt[,
  list(
    x = x, y = y,
    max_x = max(x), max_y = max(y),
    sum_x = sum(x), sum_y = sum(y)
  ),
  by = z
]
Even better/shorter solution (as @agstudy suggested should be possible):
# Add the four per-z aggregates to dt by reference, pairing each new
# column name with its value in the same order.
dt[,
  c("max_x", "max_y", "sum_x", "sum_y") :=
    list(max(x), max(y), sum(x), sum(y)),
  by = z
]
I have a large data set that looks something like this:
Conv. Rev. ID Order path_no
0 0 1 1 1
1 50 1 2 1
0 0 1 3 2
1 100 1 4 2
0 0 2 1 1
0 0 2 2 1
1 150 2 3 1
1 100 2 4 2
I want to make a new ID column that changes whenever a new path_no starts. I am hoping it will look something like this:
Conv. Rev. ID Order path_no
0 0 1 1 1
1 50 1 2 1
0 0 2 3 2
1 100 2 4 2
0 0 3 1 1
0 0 3 2 1
1 150 3 3 1
1 100 4 4 2
I think rleid from data.table should do the trick. Here's one solution that uses data.table and dplyr:
# Start a new ID whenever the original ID or path_no changes from one row
# to the next. Using path_no alone would merge two neighbouring original
# IDs whose adjacent rows share the same path_no value.
# NOTE(review): assumes path_no is numbered within each original ID, as
# in the sample data; confirm against the real data set.
dplyr::mutate(df, ID = data.table::rleid(ID, path_no))
Conv. Rev. ID Order path_no
1 0 0 1 1 1
2 1 50 1 2 1
3 0 0 2 3 2
4 1 100 2 4 2
5 0 0 3 1 1
6 0 0 3 2 1
7 1 150 3 3 1
8 1 100 4 4 2
Or with data.table only:
# setDT() and rleid() are used unqualified, so data.table must be attached.
library(data.table)
# setDT() converts df to a data.table by reference; dt and df then refer
# to the same object.
dt <- setDT(df)
# Start a new ID whenever the original ID or path_no changes between
# consecutive rows. Using path_no alone would merge neighbouring original
# IDs whose adjacent rows share a path_no. The trailing [] prints the
# result.
# NOTE(review): assumes path_no is numbered within each original ID, as
# in the sample data; confirm against the real data set.
dt[, ID := rleid(ID, path_no)][]
Conv. Rev. ID Order path_no
1: 0 0 1 1 1
2: 1 50 1 2 1
3: 0 0 2 3 2
4: 1 100 2 4 2
5: 0 0 3 1 1
6: 0 0 3 2 1
7: 1 150 3 3 1
8: 1 100 4 4 2
Data:
# Sample data from the question, one string per table row, joined with
# newlines into a single block of text.
text <- paste(
  "Conv. Rev. ID Order path_no",
  "0 0 1 1 1",
  "1 50 1 2 1",
  "0 0 1 3 2",
  "1 100 1 4 2",
  "0 0 2 1 1",
  "0 0 2 2 1",
  "1 150 2 3 1",
  "1 100 2 4 2",
  sep = "\n"
)
# The first line supplies the column names.
df <- read.table(header = TRUE, text = text, stringsAsFactors = FALSE)
You can also use a simple for loop:
# Assign a run index to each element of vals: the index starts at 1 and
# increases by one every time a value differs from the one before it.
vals <- c(1, 1, 1, 2, 2, 2, 1, 1, 2)
nobs <- length(vals)
idx <- rep(1, nobs)
# seq_len(nobs)[-1] is empty when nobs is 0 or 1, so the loop is skipped;
# 2:nobs would instead count down and index outside the vector.
for (i in seq_len(nobs)[-1]) {
  # The comparison is TRUE (1) at the start of a new run, FALSE (0)
  # otherwise, so adding it carries the previous index forward or bumps it.
  idx[i] <- idx[i - 1] + (vals[i] != vals[i - 1])
}