Delete repeated TIME points in a dataframe - r

I have a simple goal that I want to achieve in my data frame that looks like this:
ID TIME AMT
1 0 100
1 1 0
1 2 0
1 2 50
1 3 0
2 0 50
2 1 0
2 2 0
2 2 100
2 3 0
How do I subset the df for unique TIME (i.e. get rid of the repeated time point that has AMT=0? To make it clearer: I want to remove duplicate TIME rows that has AMT=0.

It is not entirely clear what you're asking. I think what you want is, for each unique ID value, eliminate duplicate TIME rows, and if a duplicate row has AMT=0, prefer to delete that row rather than another duplicate (with the same TIME value) that has AMT!=0.
The best way to do this is actually to call aggregate(), and group by both ID and TIME, taking the max() of all the AMT values in all the duplicates in a group (thus this will work for duplicate groups that have more than two rows, if such existed):
df <- data.frame(id=c(1,1,1,1,1,2,2,2,2,2), time=c(0,1,2,2,3,0,1,2,2,3), amt=c(100,0,0,50,0,50,0,0,100,0) );
df;
## id time amt
## 1 1 0 100
## 2 1 1 0
## 3 1 2 0
## 4 1 2 50
## 5 1 3 0
## 6 2 0 50
## 7 2 1 0
## 8 2 2 0
## 9 2 2 100
## 10 2 3 0
aggregate(amt~id+time, df, max );
## id time amt
## 1 1 0 100
## 2 2 0 50
## 3 1 1 0
## 4 2 1 0
## 5 1 2 50
## 6 2 2 100
## 7 1 3 0
## 8 2 3 0
As you can see, the order got a little messed up, but you could easily fix that with a call to order() afterward:
df2 <- aggregate(amt~id+time, df, max );
df2[order(df2$id,df2$time),];
## id time amt
## 1 1 0 100
## 3 1 1 0
## 5 1 2 50
## 7 1 3 0
## 2 2 0 50
## 4 2 1 0
## 6 2 2 100
## 8 2 3 0

It is not entirely clear from the description, how we want to remove the duplicated elements. Suppose if there are duplicates for 'TIME', 'ID', but the 'AMT' element is neither zero nor maximum value. If we need to remove only the '0' values per combination,
library(data.table)
res1 <- setDT(df1)[, if(all(AMT==0)) .SD[1L] else .SD[AMT!=0], list(TIME,ID)]
res1[order(TIME)]
# TIME ID AMT
#1: 0 1 100
#2: 0 2 50
#3: 1 1 0
#4: 1 2 0
#5: 2 1 50
#6: 2 2 100
#7: 3 1 0
#8: 3 2 0
or if the idea of removing the duplicates was as assumed by #bgoldst, an equivalent option using data.table is
res2 <- setDT(df1)[, list(amt=max(AMT)), list(TIME, ID)]
res2[order(TIME)]
# TIME ID amt
#1: 0 1 100
#2: 0 2 50
#3: 1 1 0
#4: 1 2 0
#5: 2 1 50
#6: 2 2 100
#7: 3 1 0
#8: 3 2 0
data
df1 <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L),
TIME = c(0L, 1L, 2L, 2L, 3L, 0L, 1L, 2L, 2L, 3L), AMT = c(100L,
0L, 0L, 50L, 0L, 50L, 0L, 0L, 100L, 0L)), .Names = c("ID",
"TIME", "AMT"), class = "data.frame", row.names = c(NA, -10L))

Related

How to custom arrange such that no group dimension contains the same index twice?

I have the following tibble containing all the permutations of some indexes:
bb <- as_tibble(expand.grid(v1=0:2, v2=0:2)) %>%
arrange(v1, v2)
bb
# A tibble: 9 x 2
v1 v2
<int> <int>
1 0 0
2 0 1
3 0 2
4 1 0
5 1 1
6 1 2
7 2 0
8 2 1
9 2 2
How can it be arranged in such a way that it generates this output instead:
v1 v2
<int> <int>
1 0 0
2 1 1
3 2 2
4 0 1
5 1 2
6 2 0
7 0 2
8 1 0
9 2 1
Where the output is three groups/sets such that within each set there is no repetition of the index within each variable. Note that there can be only so many rows per group/set fulfilling this criteria ...
Sorry that I am not very familiar with tibble, so I provide a solution with data.frame in base R:
shifter <- function(x, n) ifelse(n == 0, return(x), return(c(tail(x, -n), head(x, n))))
res <- `rownames<-`(Reduce(rbind,lapply(seq(length(dfs<-split(df,rep(0:2,3)))),
function(k) {
dfs[[k]][,2] <- shifter(dfs[[k]][,1],k-1)
dfs[[k]]})),seq(nrow(df)))
which gives:
> res
v1 v2
1 0 0
2 1 1
3 2 2
4 0 1
5 1 2
6 2 0
7 0 2
8 1 0
9 2 1
DATA
df <- structure(list(v1 = c(0L, 0L, 0L, 1L, 1L, 1L, 2L, 2L, 2L), v2 = c(0L,
1L, 2L, 0L, 1L, 2L, 0L, 1L, 2L)), class = "data.frame", row.names = c(NA,
-9L))
Update: a more efficient generator for all combinations with desired format is given as below:
genAllCombn <- function(n) {
v1 <- rep(0:(n-1),n)
v2 <- (v1 + rep(0:(n-1),1,each = n)) %% n
return(data.frame(v1,v2))
}
> genAllCombn(4)
v1 v2
1 0 0
2 1 1
3 2 2
4 3 3
5 0 1
6 1 2
7 2 3
8 3 0
9 0 2
10 1 3
11 2 0
12 3 1
13 0 3
14 1 0
15 2 1
16 3 2

applying sorting function in r for every four rows returns dataframe sorted but without extended selection, other columns are not sorted accordingly

I need every four rows to be sorted by the 4th column, separately from the next four rows, made a function :
for (i in seq(1,nrow(data_frame), by=4)) {
data_frame[i:(i+3),4] <- sort(data_frame[i:(i+3),4], decreasing=TRUE) }
problem is only the 4th column gets sorted but the corresponding rows are maintained.
from
x y z userID
-1 1 2 5 1
-2 1 1 2 2
-3 0 0 5 5
-6 1 2 5 3
-4 1 1 2 6
-5 0 0 5 4
-4 1 1 2 1
-5 0 0 5 5
to -
x y z userID
-1 1 2 5 5
-2 1 1 2 3
-3 0 0 5 2
-6 1 2 5 1
-4 1 1 2 6
-5 0 0 5 5
-4 1 1 2 4
-5 0 0 5 1
With tidyverse, we can use %/% to create a grouping column with %/% and use that to sort the 'userID'
library(tidyverse)
df1 %>%
group_by(grp = (row_number()-1) %/% 4 + 1) %>%
#or use
#group_by(grp = cumsum(rep(c(TRUE, FALSE, FALSE, FALSE), length.out = n()))) %>%
mutate(userID = sort(userID, decreasing = TRUE))
# A tibble: 8 x 5
# Groups: grp [2]
# x y z userID grp
# <int> <int> <int> <int> <dbl>
#1 1 2 5 5 1
#2 1 1 2 3 1
#3 0 0 5 2 1
#4 1 2 5 1 1
#5 1 1 2 6 2
#6 0 0 5 5 2
#7 1 1 2 4 2
#8 0 0 5 1 2
Or using base R with ave
with(df1, ave(userID, (seq_along(userID)-1) %/% 4 + 1,
FUN = function(x) sort(x, decreasing = TRUE)))
#[1] 5 3 2 1 6 5 4 1
data
df1 <- structure(list(x = c(1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L), y = c(2L,
1L, 0L, 2L, 1L, 0L, 1L, 0L), z = c(5L, 2L, 5L, 5L, 2L, 5L, 2L,
5L), userID = c(1L, 2L, 5L, 3L, 6L, 4L, 1L, 5L)), row.names = c(NA,
-8L), class = "data.frame")
In base R, we can split every 4 rows, order the fourth column and return the updated dataframe back.
df[] <- do.call(rbind, lapply(split(df, gl(nrow(df)/4, 4)),
function(p) p[order(p[[4]], decreasing = TRUE), ]))
df
# x y z userID
#1 0 0 5 5
#2 1 2 5 3
#3 1 1 2 2
#4 1 2 5 1
#5 1 1 2 6
#6 0 0 5 5
#7 0 0 5 4
#8 1 1 2 1
tidyverse approach using the same logic would be
library(tidyverse)
df %>%
group_split(gl(n()/4, 4), keep = FALSE) %>%
map_dfr(. %>% arrange(desc(userID)))

Longest consecutive count of the same value per group

I have a data.frame as below and I want to add a variable describing the longest consecutive count of 1 in the VALUE variable observed in the group (i.e. longest consecutive rows with 1 in VALUE per group).
GROUP_ID VALUE
1 0
1 1
1 1
1 1
1 1
1 0
2 1
2 1
2 0
2 1
2 1
2 1
3 1
3 0
3 1
3 0
So the output would look like this:
GROUP_ID VALUE CONSECUTIVE
1 0 4
1 1 4
1 1 4
1 1 4
1 1 4
1 0 4
2 1 3
2 1 3
2 0 3
2 1 3
2 1 3
2 1 3
3 1 1
3 0 1
3 1 1
3 0 1
Any help would be greatly appreciated!
Using dplyr:
library(dplyr)
dat %>%
group_by(GROUP_ID) %>%
mutate(CONSECUTIVE = {rl <- rle(VALUE); max(rl$lengths[rl$values == 1])})
which gives:
# A tibble: 16 x 3
# Groups: GROUP_ID [3]
GROUP_ID VALUE CONSECUTIVE
<int> <int> <int>
1 1 0 4
2 1 1 4
3 1 1 4
4 1 1 4
5 1 1 4
6 1 0 4
7 2 1 3
8 2 1 3
9 2 0 3
10 2 1 3
11 2 1 3
12 2 1 3
13 3 1 1
14 3 0 1
15 3 1 1
16 3 0 1
Or with data.table:
library(data.table)
setDT(dat) # convert to a 'data.table'
dat[, CONSECUTIVE := {rl <- rle(VALUE); max(rl$lengths[rl$values == 1])}
, by = GROUP_ID][]
We can use ave with rle and get maximum occurrence of consecutive 1's for each group. (GROUP_ID)
df$Consecutive <- ave(df$VALUE, df$GROUP_ID, FUN = function(x) {
y <- rle(x == 1)
max(y$lengths[y$values])
})
df
# GROUP_ID VALUE Consecutive
#1 1 0 4
#2 1 1 4
#3 1 1 4
#4 1 1 4
#5 1 1 4
#6 1 0 4
#7 2 1 3
#8 2 1 3
#9 2 0 3
#10 2 1 3
#11 2 1 3
#12 2 1 3
#13 3 1 1
#14 3 0 1
#15 3 1 1
#16 3 0 1
Here is another option with data.table
library(data.table)
library(dplyr)
setDT(df1)[, CONSECUTIVE := max(table(na_if(rleid(VALUE)*VALUE, 0))), .(GROUP_ID)]
df1
# GROUP_ID VALUE CONSECUTIVE
# 1: 1 0 4
# 2: 1 1 4
# 3: 1 1 4
# 4: 1 1 4
# 5: 1 1 4
# 6: 1 0 4
# 7: 2 1 3
# 8: 2 1 3
# 9: 2 0 3
#10: 2 1 3
#11: 2 1 3
#12: 2 1 3
#13: 3 1 1
#14: 3 0 1
#15: 3 1 1
#16: 3 0 1
data
df1 <- structure(list(GROUP_ID = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L), VALUE = c(0L, 1L, 1L, 1L, 1L, 0L,
1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-16L))

Panel data sequence adding for a particular value

I am really new in r and stackoverflow. Apologies in advance for this novice question.
I have a panel data set like the following table.
ID Choice
1 1
1 1
1 2
1 5
1 1
2 1
2 1
2 5
2 1
2 1
3 3
3 1
3 1
3 2
3 4
I want to add another column like the following table when choice is 1. This is basically, sequencing the choice 1 within ID.
ID Choice BUS
1 1 0 (The first 1 will be considered as 0)
1 1 1
1 2 1
1 5 1
1 1 2
2 1 0
2 1 1
2 5 1
2 1 2
2 1 3
3 3 0
3 1 0
3 1 1
3 2 1
3 4 1
with(df, ave(Choice == 1, ID, FUN = cumsum))
Almost gives you what you want but as you want to consider first 1 as 0 it needs some modification.
df$BUS <- with(df, ave(Choice == 1, ID, FUN = function(x) {
inds = cumsum(x)
ifelse(inds > 0, inds - 1, inds)
}))
df
# ID Choice BUS
#1 1 1 0
#2 1 1 1
#3 1 2 1
#4 1 5 1
#5 1 1 2
#6 2 1 0
#7 2 1 1
#8 2 5 1
#9 2 1 2
#10 2 1 3
#11 3 3 0
#12 3 1 0
#13 3 1 1
#14 3 2 1
#15 3 4 1
Here we subtract 1 from cumulative sum from the first 1.
Using the same logic in dplyr
library(dplyr)
df %>%
group_by(ID) %>%
mutate(inds = cumsum(Choice == 1),
BUS = ifelse(inds > 0, inds - 1, inds)) %>%
select(-inds)
We can also use data.table
library(data.table)
setDT(df1)[, BUS := pmax(0, cumsum(Choice == 1)-1), ID]
df1
# ID Choice BUS
# 1: 1 1 0
# 2: 1 1 1
# 3: 1 2 1
# 4: 1 5 1
# 5: 1 1 2
# 6: 2 1 0
# 7: 2 1 1
# 8: 2 5 1
# 9: 2 1 2
#10: 2 1 3
#11: 3 3 0
#12: 3 1 0
#13: 3 1 1
#14: 3 2 1
#15: 3 4 1
data
df1 <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L), Choice = c(1L, 1L, 2L, 5L, 1L, 1L, 1L, 5L,
1L, 1L, 3L, 1L, 1L, 2L, 4L)), class = "data.frame", row.names = c(NA,
-15L))

R: adding in rows of zero based on the values in multiple columns

I am trying to append rows to an R data.frame. Here is an example of a data.frame "foo":
A B C D
1 1 1 200
1 1 2 50
1 1 3 15
1 2 1 150
1 2 4 50
1 3 1 300
2 1 2 40
2 1 4 90
2 3 2 80
For every A, there are 3 possible values of B, and for every B, there are 4 possible values of C. However, the initial df only contains non-zero values of D. I'd like to manipulate the df so that zeros are included for both B and C. Thus, the df would show 0's in D for any value of B/C that was 0. I have seen questions that address this with one column, but couldn't find a question addressing it with multiple columns. The final df would look like this:
A B C D
1 1 1 200
1 1 2 50
1 1 3 15
1 1 4 0
1 2 1 150
1 2 2 0
1 2 3 0
1 2 4 50
1 3 1 300
1 3 2 0
1 3 3 0
1 3 4 0
2 1 1 0
2 1 2 40
2 1 3 0
2 1 4 90
2 2 1 0
2 2 2 0
2 2 3 0
2 2 4 0
2 3 1 0
2 3 2 80
2 3 3 0
2 3 4 0
I first tried creating a dummy data frame that then merged with the initial df, but something isn't working right. Here's the current code, which I know is wrong because this code only generates rows based on A. I think I want to make the dummy frame based on A and B but I don't know how - could an if/else function work here?:
# create dummy df
dummy <- as.data.frame(
cbind(
sort(rep(unique(foo$A), 12)),
rep(1:3,length(unique(foo$A)))))
colnames(dummy) <- c("A","B")
foo$A <- as.numeric(foo$A)
foo$B <- as.numeric(foo$C)
# merge with foo
mergedummy <- merge(dummy,foo,all.x=T)
Any insight is greatly appreciated - thanks!
A one liner:
merge(dat, data.frame(table(dat[1:3]))[-4],all.y=TRUE)
# A B C D
#1 1 1 1 200
#2 1 1 2 50
#3 1 1 3 15
#4 1 1 4 NA
#...
Or maybe less complicated:
out <- data.frame(xtabs(D ~ ., data=dat))
out[do.call(order,out[1:3]),]
# A B C Freq
#1 1 1 1 200
#7 1 1 2 50
#13 1 1 3 15
#19 1 1 4 0
#...
Where dat is:
dat <- structure(list(A = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), B = c(1L,
1L, 1L, 2L, 2L, 3L, 1L, 1L, 3L), C = c(1L, 2L, 3L, 1L, 4L, 1L,
2L, 4L, 2L), D = c(200L, 50L, 15L, 150L, 50L, 300L, 40L, 90L,
80L)), .Names = c("A", "B", "C", "D"), class = "data.frame", row.names = c(NA,
-9L))
I created a master data frame which includes all combinations of A, B, and C as you describe in the expected outcome. Then, I merge the master data frame and your data frame. Finally, I replaced NA with 0.
master <- data.frame(A = rep(1:2, each = 12),
B = rep(1:3, each = 4),
C = rep(1:4, times = 6))
library(dplyr)
master %>%
left_join(., mydf) %>%
mutate(D = ifelse(D %in% NA, 0, D))
# A B C D
#1 1 1 1 200
#2 1 1 2 50
#3 1 1 3 15
#4 1 1 4 0
#5 1 2 1 150
#6 1 2 2 0
#7 1 2 3 0
#8 1 2 4 50
#9 1 3 1 300
#10 1 3 2 0
#11 1 3 3 0
#12 1 3 4 0
#13 2 1 1 0
#14 2 1 2 40
#15 2 1 3 0
#16 2 1 4 90
#17 2 2 1 0
#18 2 2 2 0
#19 2 2 3 0
#20 2 2 4 0
#21 2 3 1 0
#22 2 3 2 80
#23 2 3 3 0
#24 2 3 4 0
Here is one solution:
foo <- merge(expand.grid(lapply(foo[,1:3], unique)), foo, all=TRUE, sort=TRUE)
foo[is.na(foo)] <- 0

Resources