Changing the ID value based on another column - r

I have a large data set that looks something like this:
Conv. Rev. ID Order path_no
0 0 1 1 1
1 50 1 2 1
0 0 1 3 2
1 100 1 4 2
0 0 2 1 1
0 0 2 2 1
1 150 2 3 1
1 100 2 4 2
I want to make a new ID column that changes whenever path_no changes, i.e. each new run of path_no starts a new ID. So I am hoping it will look something like this:
Conv. Rev. ID Order path_no
0 0 1 1 1
1 50 1 2 1
0 0 2 3 2
1 100 2 4 2
0 0 3 1 1
0 0 3 2 1
1 150 3 3 1
1 100 4 4 2

I think rleid from data.table should do the trick. Here's one solution that uses data.table and dplyr:
dplyr::mutate(df, ID = data.table::rleid(path_no))
Conv. Rev. ID Order path_no
1 0 0 1 1 1
2 1 50 1 2 1
3 0 0 2 3 2
4 1 100 2 4 2
5 0 0 3 1 1
6 0 0 3 2 1
7 1 150 3 3 1
8 1 100 4 4 2
Or with data.table only:
library(data.table)
dt <- setDT(df)
dt[, ID := rleid(path_no)][]
Conv. Rev. ID Order path_no
1: 0 0 1 1 1
2: 1 50 1 2 1
3: 0 0 2 3 2
4: 1 100 2 4 2
5: 0 0 3 1 1
6: 0 0 3 2 1
7: 1 150 3 3 1
8: 1 100 4 4 2
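For completeness, here is a dplyr-only sketch of the same idea that avoids the data.table dependency (my addition, not part of the original answer; it counts the points where path_no changes):
library(dplyr)
df %>%
  mutate(ID = cumsum(path_no != lag(path_no, default = first(path_no))) + 1)
# same ID column as rleid(path_no): 1 1 2 2 3 3 3 4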
Data:
text <- "Conv. Rev. ID Order path_no
0 0 1 1 1
1 50 1 2 1
0 0 1 3 2
1 100 1 4 2
0 0 2 1 1
0 0 2 2 1
1 150 2 3 1
1 100 2 4 2"
df <- read.table(text = text, stringsAsFactors = FALSE, header = TRUE)

You can also go with a simple for loop:
vals <- c(1, 1, 1, 2, 2, 2, 1, 1, 2)
nobs <- length(vals)
idx <- rep(1, nobs)            # every observation starts with ID 1
for (i in 2:nobs) {
  if (vals[i] != vals[i - 1]) {
    idx[i] <- idx[i - 1] + 1   # value changed: start a new ID
  } else {
    idx[i] <- idx[i - 1]       # same value: carry the previous ID forward
  }
}
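For reference, the same run-length ID can be computed without the loop; a vectorized base R sketch (my addition), comparing each value with its predecessor:
vals <- c(1, 1, 1, 2, 2, 2, 1, 1, 2)
idx <- cumsum(c(TRUE, vals[-1] != vals[-length(vals)]))  # TRUE marks the start of each run
idx
# [1] 1 1 1 2 2 2 3 3 4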

Related

freq table for multiple variables in r

I would like to crosstab each of the item variables against cat as a frequency table.
df1 <- data.frame(cat = c(1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4),
item1 = c(0,0,1,0,1,1,0,0,0,1,0,1,0,0,1,0,0,1),
item2 = c(1,1,0,1,0,1,1,0,0,0,1,0,1,1,0,0,1,0),
item3 = c(0,0,1,0,1,0,0,0,1,0,1,1,1,0,0,1,0,1))
> table(df1$cat, df1$item1)
0 1
1 3 1
2 3 2
3 3 2
4 2 2
Is there a way to print the frequency tables for all of the item variables by cat together?
Thanks
Here is a quick solution in base R:
aggregate(.~ cat, df1, table)
cat item1.0 item1.1 item2.0 item2.1 item3.0 item3.1
1 1 3 1 1 3 3 1
2 2 3 2 3 2 3 2
3 3 3 2 2 3 2 3
4 4 2 2 3 1 2 2
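Note that aggregate() with FUN = table stores each item as a two-column matrix inside the result, which is why it prints as item1.0, item1.1, and so on. If you want ordinary columns instead, a small sketch (my addition):
res <- aggregate(. ~ cat, df1, table)
flat <- do.call(data.frame, res)  # splits the matrix columns into item1.0, item1.1, ...
flat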
You can use tally() to get the frequency for every combination of groups.
library(tidyverse)
df1 <- data.frame(cat = c(1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4),
item1 = c(0,0,1,0,1,1,0,0,0,1,0,1,0,0,1,0,0,1),
item2 = c(1,1,0,1,0,1,1,0,0,0,1,0,1,1,0,0,1,0),
item3 = c(0,0,1,0,1,0,0,0,1,0,1,1,1,0,0,1,0,1))
df1 %>%
  mutate_if(is.numeric, as.factor) %>%
  group_by(cat, item1, item2, item3, .drop = FALSE) %>%
  tally()
First convert your variables to factors; you can then use group_by(..., .drop = FALSE) %>% tally() to tally all of your variables, including the groupings with zero frequencies. Remove .drop = FALSE if you want to drop the zero-frequency combinations.
cat item1 item2 item3 n
1 1 0 0 0 0
2 1 0 0 1 0
3 1 0 1 0 3
4 1 0 1 1 0
5 1 1 0 0 0
6 1 1 0 1 1
7 1 1 1 0 0
8 1 1 1 1 0
9 2 0 0 0 1
10 2 0 0 1 1
11 2 0 1 0 1
12 2 0 1 1 0
13 2 1 0 0 0
14 2 1 0 1 1
15 2 1 1 0 1
16 2 1 1 1 0
17 3 0 0 0 0
18 3 0 0 1 0
19 3 0 1 0 1
20 3 0 1 1 2
21 3 1 0 0 1
22 3 1 0 1 1
23 3 1 1 0 0
24 3 1 1 1 0
25 4 0 0 0 0
26 4 0 0 1 1
27 4 0 1 0 1
28 4 0 1 1 0
29 4 1 0 0 1
30 4 1 0 1 1
31 4 1 1 0 0
32 4 1 1 1 0
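If you prefer one row per cat with the item counts side by side (closer to the aggregate() output above), a tidyverse sketch (my addition, assuming tidyr >= 1.0 for pivot_longer/pivot_wider):
library(tidyverse)
df1 %>%
  pivot_longer(-cat, names_to = "item", values_to = "value") %>%
  count(cat, item, value) %>%
  pivot_wider(names_from = c(item, value), values_from = n, values_fill = 0)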
Alternatively, if that is too unwieldy, you can also try table1() from library(table1).
library(tidyverse)
library(table1)
df1 <- data.frame(cat = c(1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4),
item1 = c(0,0,1,0,1,1,0,0,0,1,0,1,0,0,1,0,0,1),
item2 = c(1,1,0,1,0,1,1,0,0,0,1,0,1,1,0,0,1,0),
item3 = c(0,0,1,0,1,0,0,0,1,0,1,1,1,0,0,1,0,1))
df1 <- df1 %>% mutate_if(is.numeric, as.factor)
table1(~ item1 + item2 + item3 | cat, data=df1)
This gives a table of the frequencies and percentages, with your cat variable across the top row.
table1() is really great for generating HTML frequency tables; highly recommended. You can do lots of formatting and labelling to make the tables presentable. Here is a tutorial.
Here's another approach using ftable and stack from base R:
x <- ftable(cbind(cat = df1[, 1], stack(df1[-1])), row.vars = 1, col.vars = c(3, 2))
x
# ind item1 item2 item3
# values 0 1 0 1 0 1
# cat
# 1 3 1 1 3 3 1
# 2 3 2 3 2 3 2
# 3 3 2 2 3 2 3
# 4 2 2 3 1 2 2
One (debatable) downside of this approach is that the default data.table or data.frame methods for converting ftables to more usable objects will convert the output to a long format. But, you can grab SOfun and use ftable2dt if you want to keep the wide format.
library(SOfun)
ftable2dt(x)
# cat item1_0 item1_1 item2_0 item2_1 item3_0 item3_1
# 1: 1 3 1 1 3 3 1
# 2: 2 3 2 3 2 3 2
# 3: 3 3 2 2 3 2 3
# 4: 4 2 2 3 1 2 2
You can try this:
List <- list()
for (i in 2:dim(df1)[2]) {
  List[[i - 1]] <- table(df1$cat, df1[, i])
}
List
[[1]]
0 1
1 3 1
2 3 2
3 3 2
4 2 2
[[2]]
0 1
1 1 3
2 3 2
3 2 3
4 3 1
[[3]]
0 1
1 3 1
2 3 2
3 2 3
4 2 2
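The same list of tables can be built without the index bookkeeping; a sketch (my addition) using lapply over the item columns:
List <- lapply(df1[-1], function(x) table(df1$cat, x))
List  # named list: one cat-by-item table per item column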

Row-wise operation by group over time R

Problem:
I am trying to create a variable x2 that equals 1 for all rows within each ID group where x1 has switched from 1 to 0 over time; after the switch, x2 is set to 1 for every consecutive 0 in that run.
I tried to do this using library(dplyr), but could not work out how to look at previous records within the group.
Input Data:
ID<-c("1","1","1","1","1","2","2","2","2","3","3","3","4","4","5","5","5")
time<-c("1","2","3","4","5","1","2","3","4","1","2","3","1","2","1","2","3")
x1<-c("0","1","1","1","1","0","0","0","0","1","0","0","1","1","1","0","1")
df<-data.frame(ID,time,x1)
Required Output:
ID time x1 x2
1 1 0 0
1 2 1 0
1 3 1 0
1 4 1 0
1 5 1 0
2 1 0 0
2 2 0 0
2 3 0 0
2 4 0 0
3 1 1 0
3 2 0 1
3 3 0 1
4 1 1 0
4 2 1 0
5 1 1 0
5 2 0 1
5 3 1 0
It is better to have 'x1' as a numeric column:
library(data.table)
setDT(df)[, x2 := (cumsum(x1) < 2)*cumsum(c(FALSE, diff(x1) < 0)), ID]
df
# ID time x1 x2
# 1: 1 1 0 0
# 2: 1 2 1 0
# 3: 1 3 1 0
# 4: 1 4 1 0
# 5: 1 5 1 0
# 6: 2 1 0 0
# 7: 2 2 0 0
# 8: 2 3 0 0
# 9: 2 4 0 0
#10: 3 1 1 0
#11: 3 2 0 1
#12: 3 3 0 1
#13: 4 1 1 0
#14: 4 2 1 0
#15: 5 1 1 0
#16: 5 2 0 1
#17: 5 3 1 0
data
ID<-c("1","1","1","1","1","2","2","2","2","3","3","3","4","4","5","5","5")
time<-c("1","2","3","4","5","1","2","3","4","1","2","3","1","2","1","2","3")
x1<- as.integer(c("0","1","1","1","1","0","0","0","0","1","0","0","1","1","1","0","1"))
df<-data.frame(ID,time,x1)
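To see why the expression works, here is a small sketch (my addition) evaluating the two pieces for ID 5, where x1 is 1, 0, 1:
x1 <- c(1, 0, 1)
cumsum(c(FALSE, diff(x1) < 0))                     # 0 1 1 -> counts 1-to-0 switches so far
cumsum(x1) < 2                                     # TRUE TRUE FALSE -> turns off once x1 is 1 again
(cumsum(x1) < 2) * cumsum(c(FALSE, diff(x1) < 0))  # 0 1 0 -> the x2 column for ID 5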
If you want a dplyr answer, you can use #akrun's code inside mutate() after grouping by ID:
library(dplyr)
ID<-c("1","1","1","1","1","2","2","2","2","3","3","3","4","4","5","5","5")
time<-c("1","2","3","4","5","1","2","3","4","1","2","3","1","2","1","2","3")
x1<- as.integer(c("0","1","1","1","1","0","0","0","0","1","0","0","1","1","1","0","1"))
df<-data.frame(ID,time,x1)
df <- df %>%
  group_by(ID) %>%
  mutate(x2 = (cumsum(x1) < 2) * cumsum(c(FALSE, diff(x1) < 0)))
df
# ID time x1 x2
# 1 1 0 0
# 1 2 1 0
# 1 3 1 0
# 1 4 1 0
# 1 5 1 0
# 2 1 0 0
# 2 2 0 0
# 2 3 0 0
# 2 4 0 0
# 3 1 1 0
# 3 2 0 1
# 3 3 0 1
# 4 1 1 0
# 4 2 1 0
# 5 1 1 0
# 5 2 0 1
# 5 3 1 0

Creating a new variable by detecting max value for each id

My data set contains three variables:
id <- c(1,1,1,1,1,1,2,2,2,2,5,5,5,5,5,5)
ind <- c(0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1)
price <- c(1,2,3,4,5,6,1,2,3,4,1,2,3,4,5,6)
mdata <- data.frame(id,ind,price)
I need to create a new variable (ind2): if ind = 0, then ind2 = 0; if ind = 1, then ind2 = 0 unless price is the maximum price for that id, in which case ind2 = 1.
The new data looks like:
id ind ind2 price
1 0 0 1
1 0 0 2
1 0 0 3
1 0 0 4
1 0 0 5
1 0 0 6
2 1 0 1
2 1 0 2
2 1 0 3
2 1 1 4
5 1 0 1
5 1 0 2
5 1 0 3
5 1 0 4
5 1 0 5
5 1 1 6
library(dplyr)
mdata %>%
  group_by(id) %>%
  mutate(ind2 = +(ind == 1L & price == max(price)))
# id ind price ind2
# 1 1 0 1 0
# 2 1 0 2 0
# 3 1 0 3 0
# 4 1 0 4 0
# 5 1 0 5 0
# 6 1 0 6 0
# 7 2 1 1 0
# 8 2 1 2 0
# 9 2 1 3 0
# 10 2 1 4 1
# 11 5 1 1 0
# 12 5 1 2 0
# 13 5 1 3 0
# 14 5 1 4 0
# 15 5 1 5 0
# 16 5 1 6 1
Or if you prefer data.table:
library(data.table)
setDT(mdata)[, ind2 := +(ind == 1L & price == max(price)), by = id]
Or with base R:
mdata$ind2 <- unlist(lapply(split(mdata, mdata$id),
                            function(x) +(x$ind == 1L & x$price == max(x$price))))
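Another base R option is ave(), which returns the group-wise maximum aligned with the original rows; a sketch (my addition):
mdata$ind2 <- +(mdata$ind == 1L & mdata$price == ave(mdata$price, mdata$id, FUN = max))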

Convert continuous dataframe into binary dataframe in R

I have the following data frame:
i39<-c(5,3,5,4,4,3)
i38<-c(5,3,5,3,4,1)
i37<-c(5,3,5,3,4,3)
i36<-c(5,4,5,5,4,2)
ndat1<-as.data.frame(cbind(i39,i38,i37,i36))
> ndat1
i39 i38 i37 i36
1 5 5 5 5
2 3 3 3 4
3 5 5 5 5
4 4 3 3 5
5 4 4 4 4
6 3 1 3 2
My goal is to convert any value that is a 4 or a 5 into a 1, and anything else into a 0 to yield the following:
> ndat1
i39 i38 i37 i36
1 1 1 1 1
2 0 0 0 1
3 1 1 1 1
4 1 0 0 1
5 1 1 1 1
6 0 0 0 0
With your data set I would just do
ndat1[] <- +(ndat1 >= 4)
# i39 i38 i37 i36
# 1 1 1 1 1
# 2 0 0 0 1
# 3 1 1 1 1
# 4 1 0 0 1
# 5 1 1 1 1
# 6 0 0 0 0
Though a more general solution would be:
ndat1[] <- +(ndat1 == 4 | ndat1 == 5)
# i39 i38 i37 i36
# 1 1 1 1 1
# 2 0 0 0 1
# 3 1 1 1 1
# 4 1 0 0 1
# 5 1 1 1 1
# 6 0 0 0 0
A data.table alternative:
library(data.table)
setDT(ndat1)[, names(ndat1) := lapply(.SD, function(x) +(x %in% 4:5))]
And I'll let the dplyr guys have fun with mutate_each.
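For completeness, a dplyr sketch (my addition; mutate_each is superseded, so this uses across() from current dplyr):
library(dplyr)
ndat1 %>%
  mutate(across(everything(), ~ +(.x %in% 4:5)))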
I used the following to solve this issue:
recode<-function(ndat1){
ifelse((as.data.frame(ndat1)==4|as.data.frame(ndat1)==5),1,0)
}
sum_dc1<-as.data.frame(sapply(as.data.frame(ndat1),recode),drop=FALSE)
> sum_dc1
i39 i38 i37 i36
1 1 1 1 1
2 0 0 0 1
3 1 1 1 1
4 1 0 0 1
5 1 1 1 1
6 0 0 0 0
I was just wondering if anyone else had any thoughts, but overall I am satisfied with this way of solving the issue. Thank you.

Find the max and sum of the group and insert into related rows in R

I have a sample dataframe sample.data as follows:
x y z
1 0 1
1 0 1
1 0 1
1 0 1
1 0 2
1 0 2
1 0 2
1 0 2
1 0 2
0 1 2
I need to find the max and sum of x and y for each category of z (z is like 1,2,...600). I use ddply from plyr for this:
library(plyr)
z.group <- ddply(sample.data, .(z), summarize,
                 max_x = max(x), max_y = max(y), sum_x = sum(x), sum_y = sum(y))
z.group
z max_x max_y sum_x sum_y
1 1 0 4 0
2 1 1 5 1
Now I need to insert these sum_x, sum_y, max_x, and max_y as columns of sample.data on the related rows. For example, if max_x is 1 for z = 1, then max_x should be 1 for all rows with z = 1. The expected output is:
x y z max_x max_y sum_x sum_y
1 0 1 1 0 4 0
1 0 1 1 0 4 0
1 0 1 1 0 4 0
1 0 1 1 0 4 0
1 0 2 1 1 5 1
1 0 2 1 1 5 1
1 0 2 1 1 5 1
1 0 2 1 1 5 1
1 0 2 1 1 5 1
0 1 2 1 1 5 1
I wonder how do I get the expected output?
You can do it directly in one step, using transform:
z.group <- ddply(sample.data, .(z), transform,
                 max_x = max(x), max_y = max(y), sum_x = sum(x), sum_y = sum(y))
> z.group
x y z max_x max_y sum_x sum_y
1 1 0 1 1 0 4 0
2 1 0 1 1 0 4 0
3 1 0 1 1 0 4 0
4 1 0 1 1 0 4 0
5 1 0 2 1 1 5 1
6 1 0 2 1 1 5 1
7 1 0 2 1 1 5 1
8 1 0 2 1 1 5 1
9 1 0 2 1 1 5 1
10 0 1 2 1 1 5 1
I think you can do this with merge:
merge(sample.data, z.group, by="z")
# z x y max_x max_y sum_x sum_y
# 1 1 1 0 1 0 4 0
# 2 1 1 0 1 0 4 0
# 3 1 1 0 1 0 4 0
# 4 1 1 0 1 0 4 0
# 5 2 1 0 1 1 5 1
# 6 2 1 0 1 1 5 1
# 7 2 1 0 1 1 5 1
# 8 2 1 0 1 1 5 1
# 9 2 1 0 1 1 5 1
# 10 2 0 1 1 1 5 1
A data.table alternative:
require(data.table)
dt <- data.table(sample.data, key="z")
dt[, list(x=x, y=y, max_x=max(x), max_y=max(y), sum_x=sum(x), sum_y=sum(y)), by=z]
An even better/shorter solution (as #agstudy suggested) updates the columns by reference:
dt[, `:=`(max_x=max(x), max_y=max(y), sum_x=sum(x), sum_y=sum(y)), by=z]
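For reference, a dplyr sketch of the same grouped transform (my addition, not part of the original answers):
library(dplyr)
sample.data %>%
  group_by(z) %>%
  mutate(max_x = max(x), max_y = max(y), sum_x = sum(x), sum_y = sum(y)) %>%
  ungroup()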
