remove the first row for each group - r

suppose I have a dataset like this
df <- data.frame(group = c(rep(1,3),rep(2,2), rep(3,2),rep(4,3),rep(5, 2)), score = c(30, 10, 22, 44, 6, 5, 20, 35, 2, 60, 14,5))
group score
1 1 30
2 1 10
3 1 22
4 2 44
5 2 6
6 3 5
7 3 20
8 4 35
9 4 2
10 4 60
11 5 14
12 5 5
I want to remove the first row for each group; the expected output should look like this:
group score
1 1 10
2 1 22
3 2 6
4 3 20
5 4 2
6 4 60
7 5 5
Is there a simple way to do this?

An option with dplyr is to select rows ignoring 1st row
library(dplyr)
df %>%
group_by(group) %>%
slice(2:n())
# group score
# <dbl> <dbl>
#1 1.00 10.0
#2 1.00 22.0
#3 2.00 6.00
#4 3.00 20.0
#5 4.00 2.00
#6 4.00 60.0
#7 5.00 5.00
Another way is shown by @Rich Scriven in a now-deleted answer
df %>%
group_by(group) %>%
slice(-1)

Quite simple with duplicated
df[duplicated(df$group),]
group score
2 1 10
3 1 22
5 2 6
7 3 20
9 4 2
10 4 60
12 5 5

Another base R option would be to check the adjacent elements
df[c(FALSE,df$group[-1]==df$group[-nrow(df)]),]
# group score
#2 1 10
#3 1 22
#5 2 6
#7 3 20
#9 4 2
#10 4 60
#12 5 5
Here I removed the first observation in 'group' (df$group[-1]) and compared it (==) with the vector in which the last observation is removed (df$group[-nrow(df)]). As the length of the comparison is one less than the nrow of the dataset, we pad with FALSE at the top and use this as a logical index to subset the dataset.

dplyr::filter(df, group == lag(group))
group score
1 1 10
2 1 22
3 2 6
4 3 20
5 4 2
6 4 60
7 5 5
See lead and lag of package dplyr for more information:
https://dplyr.tidyverse.org/reference/lead-lag.html

Related

Binning a discrete variable (preferably in dplyr)

I would like to "bin" a large discrete variable by combining two consecutive rows into one bin. I would also like to call the bin by the first row value.
As an example:
x<-data.frame(x=c(1,2,3,4,5,6,7,8,9,10,11,12),
y=c(1,1,3,3,5,5,7,7,9,9,11,11))
x
We may use gl to create the grouping bin
library(dplyr)
x %>%
mutate(grp = as.integer(gl(n(), 2, n())))
x y grp
1 1 1 1
2 2 1 1
3 3 3 2
4 4 3 2
5 5 5 3
6 6 5 3
7 7 7 4
8 8 7 4
9 9 9 5
10 10 9 5
11 11 11 6
12 12 11 6
Performing the steps as you exactly outlined them would be this:
library(dplyr)
# Assign every two consecutive rows to one bin, then keep the first row of
# each bin. NOTE: the bin count must be based on the number of rows
# (nrow(x)), not length(x), which for a data frame is the number of COLUMNS.
x %>%
  mutate(bins = rep(1:(nrow(x) / 2), each = 2)) %>%
  group_by(bins) %>%
  filter(row_number() == 1) %>%
  ungroup()
However this would give you the exact same result (without the bins column) in one line of code:
x[seq(1, nrow(x), by = 2), ]
Another way using seq and ceiling.
x$bin <- ceiling(seq(nrow(x))/2)
x
# x y bin
#1 1 1 1
#2 2 1 1
#3 3 3 2
#4 4 3 2
#5 5 5 3
#6 6 5 3
#7 7 7 4
#8 8 7 4
#9 9 9 5
#10 10 9 5
#11 11 11 6
#12 12 11 6

R: How to split a row in a dataframe into a number of rows, conditional on a value in a cell?

I have a data.frame which looks like the following:
id <- c("a","a","a","a","b","b","b","b")
age_from <- c(0,2,3,7,0,1,2,6)
age_to <- c(2,3,7,10,1,2,6,10)
y <- c(100,150,100,250,300,200,100,150)
df <- data.frame(id,age_from,age_to,y)
df$years <- df$age_to - df$age_from
Which gives a df that looks like:
id age_from age_to y years
1 a 0 2 100 2
2 a 2 3 150 1
3 a 3 7 100 4
4 a 7 10 250 3
5 b 0 1 300 1
6 b 1 2 200 1
7 b 2 6 100 4
8 b 6 10 150 4
Instead of having an unequal number of years per row, I would like to have 20 rows, 10 for each id, with each row accounting for one year. This would also involve averaging the y column across the number of years listed in the years column.
I believe this may have to be done using a loop 1:n with the n equaling a value in the years column. Although I am not sure how to start with this.
You can use rep to repeat the rows by the number of given years.
x <- df[rep(seq_len(nrow(df)), df$years),]
x
# id age_from age_to y years
#1 a 0 2 50.00000 2
#1.1 a 0 2 50.00000 2
#2 a 2 3 150.00000 1
#3 a 3 7 25.00000 4
#3.1 a 3 7 25.00000 4
#3.2 a 3 7 25.00000 4
#3.3 a 3 7 25.00000 4
#4 a 7 10 83.33333 3
#4.1 a 7 10 83.33333 3
#4.2 a 7 10 83.33333 3
#5 b 0 1 300.00000 1
#6 b 1 2 200.00000 1
#7 b 2 6 25.00000 4
#7.1 b 2 6 25.00000 4
#7.2 b 2 6 25.00000 4
#7.3 b 2 6 25.00000 4
#8 b 6 10 37.50000 4
#8.1 b 6 10 37.50000 4
#8.2 b 6 10 37.50000 4
#8.3 b 6 10 37.50000 4
If by "averaging the y column across the number of years" you mean dividing y by the number of years:
x$y <- x$y / x$years
In case age_from should go from 0 to 9 and age_to from 1 to 10 for each id:
x$age_from <- x$age_from + ave(x$age_from, x$id, x$age_from, FUN=seq_along) - 1
#x$age_from <- ave(x$age_from, x$id, FUN=seq_along) - 1 #Alternative
x$age_to <- x$age_from + 1
Here is a solution with tidyr and dplyr.
First of all we complete age_from from 0 to 9 as you wanted, by keeping only the existing ids.
You will have several NAs on age_to, y and years. So, we fill them by dragging down each value in order to complete the immediately following values that are NA.
Now you can divide y by years (I assumed you meant this by setting the average value so to leave the sum consistent).
At that point, you only need to recalculate age_to accordingly.
Remember to ungroup at the end!
library(tidyr)
library(dplyr)
df %>%
complete(id, age_from = 0:9) %>%
group_by(id) %>%
fill(y, years, age_to) %>%
mutate(y = y/years) %>%
mutate(age_to = age_from + 1) %>%
ungroup()
# A tibble: 20 x 5
id age_from age_to y years
<chr> <dbl> <dbl> <dbl> <dbl>
1 a 0 1 50 2
2 a 1 2 50 2
3 a 2 3 150 1
4 a 3 4 25 4
5 a 4 5 25 4
6 a 5 6 25 4
7 a 6 7 25 4
8 a 7 8 83.3 3
9 a 8 9 83.3 3
10 a 9 10 83.3 3
11 b 0 1 300 1
12 b 1 2 200 1
13 b 2 3 25 4
14 b 3 4 25 4
15 b 4 5 25 4
16 b 5 6 25 4
17 b 6 7 37.5 4
18 b 7 8 37.5 4
19 b 8 9 37.5 4
20 b 9 10 37.5 4
A tidyverse solution.
library(tidyverse)
df %>%
mutate(age_to = age_from + 1) %>%
group_by(id) %>%
complete(nesting(age_from = 0:9, age_to = 1:10)) %>%
fill(y, years) %>%
mutate(y = y / years)
# A tibble: 20 x 5
# Groups: id [2]
id age_from age_to y years
<chr> <dbl> <dbl> <dbl> <dbl>
1 a 0 1 50 2
2 a 1 2 50 2
3 a 2 3 150 1
4 a 3 4 25 4
5 a 4 5 25 4
6 a 5 6 25 4
7 a 6 7 25 4
8 a 7 8 83.3 3
9 a 8 9 83.3 3
10 a 9 10 83.3 3
11 b 0 1 300 1
12 b 1 2 200 1
13 b 2 3 25 4
14 b 3 4 25 4
15 b 4 5 25 4
16 b 5 6 25 4
17 b 6 7 37.5 4
18 b 7 8 37.5 4
19 b 8 9 37.5 4
20 b 9 10 37.5 4

How to remove a x quantity between several last rows?

I'm facing an issue: I want to subtract a quantity from the last rows of my data frame until that quantity reaches 0.
For instance, if my quantity to remove is 20, how can I remove it in my dataframe:
data.frame(Time = c(1,2,3,4,5,6,7,8,9,10), Quantity = c(5,9,2,17,23,101,15,21,7,3))
So, I have to subtract it to my last rows, to obtain:
data.frame(Time = c(1,2,3,4,5,6,7,8), Quantity = c(5,9,2,17,23,101,15,11))
Should I try a while loop?
Using base R you could do something like:
del_val <- function(val, data = df) {
  # Remove a total of `val` from the tail of `data`: rows at the end whose
  # Quantity is fully consumed are dropped, and the first partially-consumed
  # row has its Quantity reduced by the remainder.
  #
  # val:  non-negative quantity to subtract, must be < sum(data$Quantity).
  # data: a data frame whose 2nd column is a numeric `Quantity`
  #       (defaults to the global `df` so existing del_val(20) calls work).
  #
  # Returns the truncated data frame.
  rev_cum <- cumsum(rev(data$Quantity))
  # Index (counting from the bottom row) of the first row whose cumulative
  # quantity exceeds `val`; NA means `val` consumes the whole column.
  cut_at <- which(rev_cum > val)[1]
  if (is.na(cut_at)) {
    stop("`val` is at least the total Quantity; nothing would remain.",
         call. = FALSE)
  }
  keep <- nrow(data) - cut_at + 1
  # head(data, -cut_at + 1) in the original collapsed to head(data, 0) when
  # cut_at == 1 (val smaller than the last row's Quantity), wrongly dropping
  # every row; using the explicit positive count `keep` fixes that edge case.
  replace(head(data, keep), cbind(keep, 2), rev_cum[cut_at] - val)
}
del_val(20)
Time Quantity
1 1 5
2 2 9
3 3 2
4 4 17
5 5 23
6 6 101
7 7 15
8 8 11
del_val(9)
Time Quantity
1 1 5
2 2 9
3 3 2
4 4 17
5 5 23
6 6 101
7 7 15
8 8 21
9 9 1
We can write a function to remove rows
return_rows <- function(df, n) {
  # Remove a total of `n` from the tail of `df`: rows at the end whose
  # Quantity is fully consumed are dropped, and the first partially-consumed
  # row has its Quantity reduced by the remainder.
  #
  # df: a data frame with a numeric `Quantity` column.
  # n:  non-negative quantity to subtract from the end.
  #
  # Returns the truncated data frame.
  vals <- cumsum(rev(df$Quantity))
  consumed <- which(vals < n)
  if (length(consumed) == 0) {
    # `n` fits within the last row alone. The original code hit
    # max(integer(0)) == -Inf here (warning + indexing error); instead keep
    # every row and just reduce the last one.
    inds <- nrow(df)
    already_removed <- 0
  } else {
    inds <- nrow(df) - max(consumed)
    already_removed <- vals[nrow(df) - inds]
  }
  df$Quantity[inds] <- df$Quantity[inds] - (n - already_removed)
  df[seq_len(inds), ]
}
return_rows(df,20)
# Time Quantity
#1 1 5
#2 2 9
#3 3 2
#4 4 17
#5 5 23
#6 6 101
#7 7 15
#8 8 11
return_rows(df,40)
# Time Quantity
#1 1 5
#2 2 9
#3 3 2
#4 4 17
#5 5 23
#6 6 101
#7 7 6

Remove first occurance of column value from dataframe in r [duplicate]

suppose I have a dataset like this
df <- data.frame(group = c(rep(1,3),rep(2,2), rep(3,2),rep(4,3),rep(5, 2)), score = c(30, 10, 22, 44, 6, 5, 20, 35, 2, 60, 14,5))
group score
1 1 30
2 1 10
3 1 22
4 2 44
5 2 6
6 3 5
7 3 20
8 4 35
9 4 2
10 4 60
11 5 14
12 5 5
I want to remove the first row for each group; the expected output should look like this:
group score
1 1 10
2 1 22
3 2 6
4 3 20
5 4 2
6 4 60
7 5 5
Is there a simple way to do this?
An option with dplyr is to select rows ignoring 1st row
library(dplyr)
df %>%
group_by(group) %>%
slice(2:n())
# group score
# <dbl> <dbl>
#1 1.00 10.0
#2 1.00 22.0
#3 2.00 6.00
#4 3.00 20.0
#5 4.00 2.00
#6 4.00 60.0
#7 5.00 5.00
Another way is shown by @Rich Scriven in a now-deleted answer
df %>%
group_by(group) %>%
slice(-1)
Quite simple with duplicated
df[duplicated(df$group),]
group score
2 1 10
3 1 22
5 2 6
7 3 20
9 4 2
10 4 60
12 5 5
Another base R option would be to check the adjacent elements
df[c(FALSE,df$group[-1]==df$group[-nrow(df)]),]
# group score
#2 1 10
#3 1 22
#5 2 6
#7 3 20
#9 4 2
#10 4 60
#12 5 5
Here I removed the first observation in 'group' (df$group[-1]) and compared it (==) with the vector in which the last observation is removed (df$group[-nrow(df)]). As the length of the comparison is one less than the nrow of the dataset, we pad with FALSE at the top and use this as a logical index to subset the dataset.
dplyr::filter(df, group == lag(group))
group score
1 1 10
2 1 22
3 2 6
4 3 20
5 4 2
6 4 60
7 5 5
See lead and lag of package dplyr for more information:
https://dplyr.tidyverse.org/reference/lead-lag.html

How to combine common values of two column while averaging the corresponding values of other columns

I have a data frame
> df<- as.data.frame(cbind(a=c(rep(3, 5), rep(5, 5)), b=c(rep(1, 2), rep(2, 2), rep(3, 2), rep(4, 2), rep(5, 2)), c=(11:20)))
> df
a b c
1 3 1 11
2 3 1 12
3 3 2 13
4 3 2 14
5 3 3 15
6 5 3 16
7 5 4 17
8 5 4 18
9 5 5 19
10 5 5 20
I want to combine the common elements of column "a" and "b" like the count function in plyr package does:
> count(df, vars= c("a", "b"))
But I also want to keep my column "c" that will contain the mean values of all the combined rows. As shown below in column "m"
a b freq m
1 3 1 2 11.5
2 3 2 2 13.5
3 3 3 1 15.0
4 5 3 1 16.0
5 5 4 2 17.5
6 5 5 2 19.5
Any suggestions on how I can do that?
We can group by 'a', 'b', then summarise to create the number of rows as 'freq' and mean of 'c' as 'm'
library(dplyr)
df %>%
group_by(a, b) %>%
summarise(freq = n(), m= mean(c))
# a b freq m
# (dbl) (dbl) (int) (dbl)
#1 3 1 2 11.5
#2 3 2 2 13.5
#3 3 3 1 15.0
#4 5 3 1 16.0
#5 5 4 2 17.5
#6 5 5 2 19.5
Or use data.table:
setDT(df)
res <- df[,.(freq=.N, m=mean(c)),by=.(a,b)]
res
a b freq m
1: 3 1 2 11.5
2: 3 2 2 13.5
3: 3 3 1 15.0
4: 5 3 1 16.0
5: 5 4 2 17.5
6: 5 5 2 19.5

Resources