Keep only the second observation per group in R

I have a data frame ordered by an id variable ("city"), and I want to keep the second observation for each city that has more than one observation (and the single observation for cities that have only one). Here's an example data set:
city <- c(1,1,2,3,3,4,5,6,7,7,8)
value <- c(3,5,7,8,2,5,4,2,3,2,3)
mydata <- data.frame(city, value)
Then we have:
city value
1 1 3
2 1 5
3 2 7
4 3 8
5 3 2
6 4 5
7 5 4
8 6 2
9 7 3
10 7 2
11 8 3
The ideal outcome would be:
city value
2 1 5
3 2 7
5 3 2
6 4 5
7 5 4
8 6 2
10 7 2
11 8 3
Any help is appreciated!

dplyr
library(dplyr)
mydata %>%
group_by(city) %>%
filter(n() == 1L | row_number() == 2L) %>%
ungroup()
# # A tibble: 8 x 2
# city value
# <dbl> <dbl>
# 1 1 5
# 2 2 7
# 3 3 2
# 4 4 5
# 5 5 4
# 6 6 2
# 7 7 2
# 8 8 3
Or, slightly differently:
mydata %>%
group_by(city) %>%
slice(min(n(), 2)) %>%
ungroup()
base R
ind <- ave(rep(TRUE, nrow(mydata)), mydata$city,
FUN = function(z) length(z) == 1L | seq_along(z) == 2L)
ind
# [1] FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE
mydata[ind,]
# city value
# 2 1 5
# 3 2 7
# 5 3 2
# 6 4 5
# 7 5 4
# 8 6 2
# 10 7 2
# 11 8 3
data.table
Since you mentioned your real data "is way bigger", you might consider data.table at some point for its speed and referential semantics. (And it doesn't hurt that this code is much more terse :-)
library(data.table)
DT <- as.data.table(mydata) # normally one might use setDT(mydata) instead ...
DT[, .SD[min(.N, 2),], by = city]
# city value
# <num> <num>
# 1: 1 5
# 2: 2 7
# 3: 3 2
# 4: 4 5
# 5: 5 4
# 6: 6 2
# 7: 7 2
# 8: 8 3
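As a side note (my own variation, not part of the answer above), a common data.table idiom avoids materialising .SD for every group by computing the row indices to keep first:
# indices of the row to keep per city (2nd row if present, otherwise the 1st),
# then subset the table once
idx <- DT[, .I[min(.N, 2)], by = city]$V1
DT[idx]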

Here is logic that uses pmin() to choose element 2 or element 1, depending on the length of each city's vector of value entries:
aggregate( value ~ city, mydata, function(x) x[ pmin(2, length(x))] )
city value
1 1 5
2 2 7
3 3 2
4 4 5
5 5 4
6 6 2
7 7 2
8 8 3
The aggregate function delivers to that anonymous function the vector of value entries for each city (i.e. value split by city).
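To see those per-city vectors, you can ask aggregate() for something simple such as their lengths (just an illustration):
aggregate( value ~ city, mydata, length )
# each city's value vector has length 1 or 2 in this example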

You may try
library(dplyr)
mydata %>%
group_by(city) %>%
filter(case_when(n()> 1 ~ row_number() == 2,
TRUE ~ row_number()== 1))
city value
<dbl> <dbl>
1 1 5
2 2 7
3 3 2
4 4 5
5 5 4
6 6 2
7 7 2
8 8 3

Another dplyr solution:
mydata %>% group_by(city) %>%
summarize(value=value[pmin(2, n())])
Or:
mydata %>% group_by(city) %>%
summarize(value=ifelse(n() >= 2, value[2], value[1]))
Both produce:
city value
<dbl> <dbl>
1 1 5
2 2 7
3 3 2
4 4 5
5 5 4
6 6 2
7 7 2
8 8 3

If base R is OK, try this:
EDIT (since performance really seems to be important):
Using `if` as a function instead of the vectorised ifelse() should give a speed-up, in some cases around 100-fold.
aggregate( value ~ city, mydata, function(x) `if`(!is.na(x[2]),x[2],x[1]) )
city value
1 1 5
2 2 7
3 3 2
4 4 5
5 5 4
6 6 2
7 7 2
8 8 3
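For reference, `if` called with backticks behaves like an ordinary function that returns the chosen branch, e.g.:
`if`(2 > 1, "yes", "no")
# [1] "yes"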
Benchmarks
Here are some benchmarks, because I was curious. I gathered all the solutions and ran them through microbenchmark.
Bottom line: `if`(cond, T, F) is fastest (about 22% faster than ifelse and 17 times faster than the slowest), followed by ifelse and aggregate with pmin. Keep in mind that the data.table solution ran on a single core here; much of that package's usual speed advantage comes from parallelization, so these numbers understate it. No real shocker, but interesting nonetheless.
library(microbenchmark)
lengths( mydata )   # the benchmarked data has 20,000 rows, not the 11-row example above
 city value
20000 20000
c( class(mydata$city), class(mydata$value) )
[1] "integer" "integer"
microbenchmark("aggr_if_function" = { res <- aggregate( value ~ city, mydata, function(x) `if`(!is.na(x[2]),x[2],x[1]) )},
"aggr_ifelse" = { res <- aggregate( value ~ city, mydata, function(x) ifelse(!is.na(x[2]),x[2],x[1]) ) },
"dplyr_filter" = { res <- mydata %>% group_by(city) %>% filter(n() == 1L | row_number() == 2L) %>% ungroup() },
"dplyr_slice" = { res <- mydata %>% group_by(city) %>% slice(min(n(), 2)) %>% ungroup() },
"data.table_single_core" = { res <- DT[, .SD[min(.N, 2),], by = city] },
"aggr_pmin" = { res <- aggregate( value ~ city, mydata, function(x) x[ pmin(2, length(x))] ) },
"dplyr_filter_case_when" = { res <- mydata %>% group_by(city) %>% filter(case_when(n()> 1 ~ row_number() == 2, TRUE ~ row_number()== 1)) },
"group_split_purrr" = { res <- group_split(mydata, city) %>% map_if(~nrow(.) > 1, ~.[2, ]) %>% bind_rows() }, times=50)
Unit: milliseconds
                   expr       min        lq      mean    median        uq       max neval cld
       aggr_if_function  175.5104  179.3273  184.5157  182.1778  186.8963  212.6006    50   a
            aggr_ifelse  214.5846  220.7074  229.2062  228.0688  234.1087  253.0433    50   a
           dplyr_filter  585.5275  607.7011  643.6320  632.0794  660.8184 1066.6018    50   c
            dplyr_slice  713.4047  762.9887  792.7491  780.8475  803.7191 1304.4045    50   d
 data.table_single_core 2080.3869 2164.3829 2240.8578 2229.5310 2298.9002 2702.4201    50   f
              aggr_pmin  321.5265  330.5491  343.2752  341.7866  352.2880  457.3435    50   b
 dplyr_filter_case_when 3171.4859 3337.1669 3492.6915 3500.7783 3608.1809 4195.0774    50   g
      group_split_purrr 1466.4527 1543.2597 1590.9994 1588.0186 1630.5590 1786.5310    50   e
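If you want to check the single-core caveat on your own machine, data.table exposes its thread settings (shown here only as a quick sketch):
library(data.table)
getDTthreads()   # how many threads data.table is currently using
setDTthreads(1)  # force single-threaded, as in the benchmark above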

Combining group_split and map_if:
library(tidyverse)
city <- c(1,1,2,3,3,4,5,6,7,7,8)
value <- c(3,5,7,8,2,5,4,2,3,2,3)
mydata <- data.frame(city, value)
group_split(mydata, city) %>%
map_if(~nrow(.) > 1, ~.[2, ]) %>% bind_rows()
#> # A tibble: 8 × 2
#> city value
#> <dbl> <dbl>
#> 1 1 5
#> 2 2 7
#> 3 3 2
#> 4 4 5
#> 5 5 4
#> 6 6 2
#> 7 7 2
#> 8 8 3
Created on 2021-11-30 by the reprex package (v2.0.1)

Related

How to replace repeating entries in a data frame with n-(number of times it's repeated) in R?

In my data I have repeating entries in a column. What I'm trying to do is this: if an entry n is repeated more than 2 times within a column, I want to replace each occurrence beyond the second with n - (number_of_times_it_has_repeated_so_far - 2). For example, if my data looks like this:
df <- data.frame(
A = c(1,2,2,4,5,7,7,7,7,2,8,8),
B = c(2,3,4,5,6,7,8,9,10,11,12,13)
)
> df
A B
1 2
2 3
2 4
4 5
5 6
7 7
7 8
7 9
7 10
2 11
8 12
8 13
we can see that in df$A 7 is repeated 4 times. If an entry is repeated more than 2 times, then I want to replace it. So in my example, the 1st and 2nd entries of the number 7 would remain unchanged. The 3rd instance of the number 7 would be replaced by 7 - (3-2). The 4th instance of the number 7 would be replaced by 7 - (4-2).
We can also see that in df$A the number 2 is repeated 3 times. Using the same method, the 3rd instance of the number 2 would be replaced with 2 - (3-2).
As there are no repeating values in df$B, that column would remain unchanged.
For clarity, my expected result would be:
dfNew <- data.frame(
A = c(1,2,2,4,5,7,7,6,5,1,8,8),
B = c(2,3,4,5,6,7,8,9,10,11,12,13)
)
> dfNew
A B
1 2
2 3
2 4
4 5
5 6
7 7
7 8
6 9
5 10
1 11
8 12
8 13
Here's how you can do it for one column -
library(dplyr)
df %>%
group_by(A) %>%
transmute(A = A - c(rep(0, 2), row_number())[row_number()]) %>%
ungroup
# A
# <dbl>
# 1 1
# 2 2
# 3 2
# 4 4
# 5 5
# 6 7
# 7 7
# 8 6
# 9 5
#10 1
#11 8
#12 8
To do it for all the columns you can use map_dfc -
purrr::map_dfc(names(df), ~{
df %>%
group_by(.data[[.x]]) %>%
transmute(!!.x := .data[[.x]] - c(rep(0, 2), row_number())[row_number()])%>%
ungroup
})
# A B
# <dbl> <dbl>
# 1 1 2
# 2 2 3
# 3 2 4
# 4 4 5
# 5 5 6
# 6 7 7
# 7 7 8
# 8 6 9
# 9 5 10
#10 1 11
#11 8 12
#12 8 13
The logic here is that for each number we subtract 0 from the first 2 values and then subtract 1, 2, and so on from the later ones.
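To see the offset vector in isolation, here is what that expression evaluates to for a group of four identical values (a minimal illustration, using seq_len() in place of row_number()):
n <- 4
c(rep(0, 2), seq_len(n))[seq_len(n)]
# [1] 0 0 1 2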
You can skip the ordering if you don't want it. Here is my approach; if your data still contains duplicates after the changes, this could be wrapped into a function and applied again.
my_df <- data.frame(A = c(1,2,2,4,5,7,7,7,7,2,8,8),
B = c(2,3,4,5,6,7,8,9,10,11,12,13),
stringsAsFactors = FALSE)
my_df <- my_df[order(my_df$A, my_df$B),]
my_df$Id <- seq.int(from = 1, to = nrow(my_df), by = 1)
my_temp <- my_df %>%
group_by(A) %>%
filter(n() > 2) %>%
mutate(Count = seq.int(from = 1, to = n(), by = 1)) %>%
filter(Count > 2) %>%
mutate(A = A - (Count - 2))
my_var <- which(my_df$Id %in% my_temp$Id)
if (length(my_var)) {
my_df <- my_df[-my_var,]
my_df <- rbind(my_df, my_temp[, c("A", "B", "Id")])
}
my_df <- my_df[order(my_df$A, my_df$B),]
A base R option using ave + pmax + seq_along
list2DF(
lapply(
df,
function(x) {
x - ave(x, x, FUN = function(v) pmax(seq_along(v) - 2, 0))
}
)
)
gives
A B
1 1 2
2 2 3
3 2 4
4 4 5
5 5 6
6 7 7
7 7 8
8 6 9
9 5 10
10 1 11
11 8 12
12 8 13
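The inner helper on its own, for a run of four repeated values, produces the offsets that are then subtracted (a quick illustration):
v <- c(7, 7, 7, 7)
pmax(seq_along(v) - 2, 0)
# [1] 0 0 1 2
# so 7 7 7 7 becomes 7 7 6 5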

Dense Rank by Multiple Columns in R

How can I get a dense rank of multiple columns in a dataframe? For example,
# I have:
df <- data.frame(x = c(1,1,1,1,2,2,2,3,3,3),
y = c(1,2,3,4,2,2,2,1,2,3))
# I want:
res <- data.frame(x = c(1,1,1,1,2,2,2,3,3,3),
y = c(1,2,3,4,2,2,2,1,2,3),
r = c(1,2,3,4,5,5,5,6,7,8))
res
x y r
1 1 1 1
2 1 2 2
3 1 3 3
4 1 4 4
5 2 2 5
6 2 2 5
7 2 2 5
8 3 1 6
9 3 2 7
10 3 3 8
My hack approach works for this particular dataset:
df %>%
arrange(x,y) %>%
mutate(r = if_else(y - lag(y,default=0) == 0, 0, 1)) %>%
mutate(r = cumsum(r))
But there must be a more general solution, maybe using functions like dense_rank() or row_number(); I'm struggling to find it, though.
dplyr solutions are ideal.
Right after posting, I think I found a solution here. In my case, it would be:
mutate(df, r = dense_rank(interaction(x,y,lex.order=T)))
But if you have a better solution, please share.
data.table
data.table has you covered with frank().
library(data.table)
frank(df, x, y, ties.method = 'dense')
[1] 1 2 3 4 5 5 5 6 7 8
You can use df$r <- frank(df, x, y, ties.method = 'dense') to add it as a new column.
tidyr/dplyr
Another option (though clunkier) is to use tidyr::unite to collapse your columns into a single helper column and then apply dplyr::dense_rank to it.
library(tidyverse)
df %>%
# add a single column with all the info
unite(xy, x, y) %>%
cbind(df) %>%
# dense rank on that
mutate(r = dense_rank(xy)) %>%
# now drop the helper col
select(-xy)
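As a small variation (my own addition, not part of the answer above), unite() has a remove argument, so the cbind() step can be skipped:
library(tidyverse)
df %>%
unite(xy, x, y, remove = FALSE) %>% # keep x and y next to the helper column
mutate(r = dense_rank(xy)) %>%
select(-xy)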
You can use cur_group_id:
library(dplyr)
df %>%
group_by(x, y) %>%
mutate(r = cur_group_id())
# x y r
# <dbl> <dbl> <int>
# 1 1 1 1
# 2 1 2 2
# 3 1 3 3
# 4 1 4 4
# 5 2 2 5
# 6 2 2 5
# 7 2 2 5
# 8 3 1 6
# 9 3 2 7
# 10 3 3 8

How to remove zero values until the first non-zero value occurs in an R dataframe?

The title says it all! I have grouped data where I'd like to remove rows up until the first non-zero value within each id group.
Example code:
problem <- data.frame(
id = c(1,1,1,1,2,2,2,2,3,3,3,3),
value = c(0,0,2,0,0,8,4,2,1,7,6,5)
)
solution <- data.frame(
id = c(1,1,2,2,2,3,3,3,3),
value = c(2,0,8,4,2,1,7,6,5)
)
Here is a dplyr solution:
library(dplyr)
problem %>%
group_by(id) %>%
mutate(first_match = min(row_number()[value != 0])) %>%
filter(row_number() >= first_match) %>%
select(-first_match) %>%
ungroup()
# A tibble: 9 x 2
id value
<dbl> <dbl>
1 1 2
2 1 0
3 2 8
4 2 4
5 2 2
6 3 1
7 3 7
8 3 6
9 3 5
Or more succinctly per Tjebo's comment:
problem %>%
group_by(id) %>%
filter(row_number() >= min(row_number()[value != 0])) %>%
ungroup()
You can do this in base R:
subset(problem,ave(value,id,FUN=cumsum)>0)
# id value
# 3 1 2
# 4 1 0
# 6 2 8
# 7 2 4
# 8 2 2
# 9 3 1
# 10 3 7
# 11 3 6
# 12 3 5
Use abs(value) if you have negative values in your real case.
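For example, with negative values the same idea would look like this (a sketch building on that suggestion):
# a negative value now also counts as "a non-zero value has occurred"
subset(problem, ave(abs(value), id, FUN = cumsum) > 0)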

Automatically rowwise a function for dataframe usage

I have a function**:
do_thing <- function(x) {
return(x + runif(1, 0, 100))
}
That I'd like to apply to my data:
df <- tibble(x = 1:10)
Preferably with mutate:
set.seed(1)
df %>%
mutate(y = do_thing(x))
The function, however, is not performing as expected:
# x y
# 1 1 27.55087
# 2 2 28.55087
# 3 3 29.55087
# 4 4 30.55087
# 5 5 31.55087
# 6 6 32.55087
# 7 7 33.55087
# 8 8 34.55087
# 9 9 35.55087
# 10 10 36.55087
I actually want the function to apply in a rowwise fashion:
df %>%
rowwise() %>%
mutate(y = do_thing(x))
# x y
# 1 1 38.21239
# 2 2 59.28534
# 3 3 93.82078
# 4 4 24.16819
# 5 5 94.83897
# 6 6 100.46753
# 7 7 73.07978
# 8 8 70.91140
# 9 9 15.17863
# 10 10 30.59746
Is there a way I might rewrite my function so that it is flexible and automatically defaults to rowwise behaviour while still working with a single input (i.e., do_thing(100))?
** actual function is a lot more complex
Instead of drawing runif for just 1 observation, we can set its n argument to the number of rows of the dataset with n():
set.seed(24)
df %>%
mutate(y = x + runif(n(), 0, 100))
# A tibble: 10 x 2
# x y
# <int> <dbl>
# 1 1 46.952549
# 2 2 61.939816
# 3 3 94.972191
# 4 4 102.282408
# 5 5 8.780258
# 6 6 63.793740
# 7 7 80.331417
# 8 8 32.874240
# 9 9 39.073652
#10 10 83.346670
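If you do want do_thing() itself to work both on a single value and inside mutate() without rowwise(), one option (my own sketch, not part of the answer above) is to make it length-aware:
do_thing <- function(x) {
# draw one uniform number per element of x instead of one shared draw
x + runif(length(x), 0, 100)
}
do_thing(100)                   # still works for a single value
df %>% mutate(y = do_thing(x))  # now varies by row without rowwise()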

R, dplyr: cumulative version of n_distinct

I have a dataframe as follows. It is ordered by column time.
Input -
df = data.frame(time = 1:20,
grp = sort(rep(1:5,4)),
var1 = rep(c('A','B'),10)
)
head(df,10)
time grp var1
1 1 1 A
2 2 1 B
3 3 1 A
4 4 1 B
5 5 2 A
6 6 2 B
7 7 2 A
8 8 2 B
9 9 3 A
10 10 3 B
I want to create another variable var2 which computes the number of distinct var1 values seen so far, i.e. up to that point in time, within each group grp. This is a little different from what I'd get if I were to use n_distinct.
Expected output -
time grp var1 var2
1 1 1 A 1
2 2 1 B 2
3 3 1 A 2
4 4 1 B 2
5 5 2 A 1
6 6 2 B 2
7 7 2 A 2
8 8 2 B 2
9 9 3 A 1
10 10 3 B 2
I want to create a function say cum_n_distinct for this and use it as -
d_out = df %>%
arrange(time) %>%
group_by(grp) %>%
mutate(var2 = cum_n_distinct(var1))
A dplyr solution inspired by @akrun's answer.
The logic is basically to set the 1st occurrence of each unique value of var1 to 1 and the rest to 0 within each group grp, and then apply cumsum to it:
df = df %>%
arrange(time) %>%
group_by(grp,var1) %>%
mutate(var_temp = ifelse(row_number()==1,1,0)) %>%
group_by(grp) %>%
mutate(var2 = cumsum(var_temp)) %>%
select(-var_temp)
head(df,10)
Source: local data frame [10 x 4]
Groups: grp
time grp var1 var2
1 1 1 A 1
2 2 1 B 2
3 3 1 A 2
4 4 1 B 2
5 5 2 A 1
6 6 2 B 2
7 7 2 A 2
8 8 2 B 2
9 9 3 A 1
10 10 3 B 2
Assuming the data is already ordered by time, first define a cumulative distinct function:
dist_cum <- function(var)
sapply(seq_along(var), function(x) length(unique(head(var, x))))
Then a base R solution that uses ave to split by group (note: this assumes var1 is a factor) and applies our function to each group:
transform(df, var2=ave(as.integer(var1), grp, FUN=dist_cum))
A data.table solution, basically doing the same thing:
library(data.table)
(data.table(df)[, var2:=dist_cum(var1), by=grp])
And dplyr, again, same thing:
library(dplyr)
df %>% group_by(grp) %>% mutate(var2=dist_cum(var1))
Update
With your new dataset, here's an approach in base R:
df$var2 <- unlist(lapply(split(df, df$grp),
function(x) {x$var2 <-0
indx <- match(unique(x$var1), x$var1)
x$var2[indx] <- 1
cumsum(x$var2) }))
head(df,7)
# time grp var1 var2
# 1 1 1 A 1
# 2 2 1 B 2
# 3 3 1 A 2
# 4 4 1 B 2
# 5 5 2 A 1
# 6 6 2 B 2
# 7 7 2 A 2
Here's another solution using data.table that's pretty quick.
Generic Function
cum_n_distinct <- function(x, na.include = TRUE){
# Given a vector x, returns a corresponding vector y
# where the ith element of y gives the number of unique
# elements observed up to and including index i
# if na.include = TRUE (default) NA is counted as an
# additional unique element, otherwise it's essentially ignored
temp <- data.table(x, idx = seq_along(x))
firsts <- temp[temp[, .I[1L], by = x]$V1]
if(na.include == FALSE) firsts <- firsts[!is.na(x)]
y <- rep(0, times = length(x))
y[firsts$idx] <- 1
y <- cumsum(y)
return(y)
}
Example Use
cum_n_distinct(c(5,10,10,15,5)) # 1 2 2 3 3
cum_n_distinct(c(5,NA,10,15,5)) # 1 2 3 4 4
cum_n_distinct(c(5,NA,10,15,5), na.include = FALSE) # 1 1 2 3 3
Solution To Your Question
d_out = df %>%
arrange(time) %>%
group_by(grp) %>%
mutate(var2 = cum_n_distinct(var1))
