I have a series of rows in a single dataframe. I'm trying to aggregate the first two rows for each ID, i.e. I want to combine events 1 and 2 for ID 1 into a single row, events 1 and 2 for ID 2 into a single row, etc., but leave event 3 completely untouched.
library(dplyr)  # tibble() is re-exported by dplyr
id <- c(1,1,1,2,2,2,3,3,3,4,4,4,5,5,5)
event <- c(1,2,3,1,2,3,1,2,3,1,2,3,1,2,3)
score <- c(3,NA,1,3,NA,2,6,NA,1,8,NA,2,4,NA,1)
score2 <- c(NA,4,1,NA,5,2,NA,0,3,NA,5,6,NA,8,7)
df <- tibble(id, event, score, score2)
# A tibble: 15 x 4
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 NA
2 1 2 NA 4
3 1 3 1 1
4 2 1 3 NA
5 2 2 NA 5
6 2 3 2 2
7 3 1 6 NA
8 3 2 NA 0
9 3 3 1 3
10 4 1 8 NA
11 4 2 NA 5
12 4 3 2 6
13 5 1 4 NA
14 5 2 NA 8
15 5 3 1 7
I've tried:
df_merged <- df %>% group_by(id) %>% summarise_all(funs(min(as.character(.), na.rm = TRUE)))
which aggregates these nicely, but then I struggle to merge the result back into the original dataframe/tibble (there are really about 300 different "score" columns in the full dataset, so a right_join is a headache with score.x, score.y, score2.x, score2.y all over the place...)
Ideally, the solution needs to stay in dplyr, as the rest of my code relies on it!
EDIT:
Ideally, my expected output would be:
# A tibble: 10 x 4
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 4
3 1 3 1 1
4 2 1 3 5
6 2 3 2 2
7 3 1 6 0
9 3 3 1 3
10 4 1 8 5
12 4 3 2 6
13 5 1 4 8
15 5 3 1 7
We can reorder the NA elements within the first two rows using replace, so that the non-NA value of each score column moves into the first row:
library(dplyr)
df %>%
  group_by(id) %>%
  # within each id, reorder the first two values of every score column so
  # that the non-NA value comes first (order(is.na(x)) puts FALSE before TRUE)
  mutate(across(starts_with('score'),
                ~ replace(., 1:2, .[1:2][order(is.na(.[1:2]))]))) %>%
  ungroup() %>%
  # drop the rows (now the second row per id) that still contain NA
  filter(if_all(starts_with('score'), Negate(is.na)))
-output
# A tibble: 10 x 4
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 4
2 1 3 1 1
3 2 1 3 5
4 2 3 2 2
5 3 1 6 0
6 3 3 1 3
7 4 1 8 5
8 4 3 2 6
9 5 1 4 8
10 5 3 1 7
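To see why the replace() call works, note that order(is.na(x)) returns the positions of the non-NA values ahead of the NA positions, so indexing with it moves the observed value into the first slot. A minimal sketch:
x <- c(NA, 4)
order(is.na(x))     # 2 1 -- the non-NA position comes first
x[order(is.na(x))]  # 4 NA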
Here is an alternative way to achieve your task with fill from the tidyr package:
library(dplyr)
library(tidyr)
df %>%
  group_by(id) %>%
  # carry non-NA values down, then up, so rows 1 and 2 both become complete
  fill(everything(), .direction = "down") %>%
  fill(everything(), .direction = "up") %>%
  # keep the (now complete) first row and the untouched third row of each id
  slice(1, 3)
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 4
2 1 3 1 1
3 2 1 3 5
4 2 3 2 2
5 3 1 6 0
6 3 3 1 3
7 4 1 8 5
8 4 3 2 6
9 5 1 4 8
10 5 3 1 7
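With tidyr 1.0.0 or later, the two fill() calls can be collapsed into one using .direction = "downup":
df %>%
  group_by(id) %>%
  fill(everything(), .direction = "downup") %>%
  slice(1, 3)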
How about this?
library(dplyr)
df_e12 <- df %>%
  filter(event %in% c(1, 2)) %>%
  group_by(id) %>%
  # min() with na.rm = TRUE picks out the single non-NA value per id and column
  mutate(across(starts_with("score"), ~min(.x, na.rm = TRUE))) %>%
  ungroup() %>%
  distinct(id, .keep_all = TRUE)
df_e3 <- df %>%
filter(event == 3)
df <- bind_rows(df_e12, df_e3) %>%
arrange(id, event)
df
> df
# A tibble: 10 x 4
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 4
2 1 3 1 1
3 2 1 3 5
4 2 3 2 2
5 3 1 6 0
6 3 3 1 3
7 4 1 8 5
8 4 3 2 6
9 5 1 4 8
10 5 3 1 7
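One caveat with the min(na.rm = TRUE) approach: if both of the first two events are NA for some score column, min() returns Inf with a warning, which is worth checking for across 300 score columns:
min(c(NA, NA), na.rm = TRUE)
#> [1] Inf
#> Warning message:
#> no non-missing arguments to min; returning Inf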
I have the following dataset containing duplicated column names, and I would like to stack them in the way shown below. I can get the desired output with bind_rows, but I would like to try it with tidyr functions:
df <- tibble(
runs = c(1, 2, 3, 4),
col1 = c(3, 4, 5, 5),
col2 = c(5, 3, 1, 4),
col3 = c(6, 4, 9, 2),
col1 = c(0, 2, 2, 1),
col2 = c(2, 3, 1, 7),
col3 = c(2, 4, 9, 9),
col1 = c(3, 4, 5, 7),
col2 = c(3, 3, 1, 4),
col3 = c(3, 2, NA, NA), .name_repair = "minimal")
df %>%
select(runs, 2:4) %>%
bind_rows(df %>%
select(runs, 5:7)) %>%
bind_rows(df %>%
select(runs, 8:10))
This is my desired output, with the runs column repeating 1 to 4 in sequence:
# A tibble: 12 x 4
runs col1 col2 col3
<dbl> <dbl> <dbl> <dbl>
1 1 3 5 6
2 2 4 3 4
3 3 5 1 9
4 4 5 4 2
5 1 0 2 2
6 2 2 3 4
7 3 2 1 9
8 4 1 7 9
9 1 3 3 3
10 2 4 3 2
11 3 5 1 NA
12 4 7 4 NA
However, when I use tidyr, the runs column is arranged differently:
df %>%
pivot_longer(-runs) %>%
group_by(name) %>%
mutate(id = row_number()) %>%
pivot_wider(names_from = name, values_from = value) %>%
select(-id)
# A tibble: 12 x 4
runs col1 col2 col3
<dbl> <dbl> <dbl> <dbl>
1 1 3 5 6
2 1 0 2 2
3 1 3 3 3
4 2 4 3 4
5 2 2 3 4
6 2 4 3 2
7 3 5 1 9
8 3 2 1 9
9 3 5 1 NA
10 4 5 4 2
11 4 1 7 9
12 4 7 4 NA
I would be grateful if you could let me know how I could rearrange runs so that the numbers repeat sequentially (1, 2, 3, 4, 1, 2, ...) rather than appearing as three 1s in a row, and so on.
Thank you very much in advance.
There may be a more elegant way to do this, but could you not simply group by runs and use the row numbers to arrange?
df %>%
pivot_longer(cols = starts_with("col"),
names_to = c(".value")) %>%
group_by(runs) %>%
mutate(grp_n = row_number()) %>%
ungroup() %>%
arrange(grp_n, runs)
# A tibble: 12 x 5
runs col1 col2 col3 grp_n
<dbl> <dbl> <dbl> <dbl> <int>
1 1 3 5 6 1
2 2 4 3 4 1
3 3 5 1 9 1
4 4 5 4 2 1
5 1 0 2 2 2
6 2 2 3 4 2
7 3 2 1 9 2
8 4 1 7 9 2
9 1 3 3 3 3
10 2 4 3 2 3
11 3 5 1 NA 3
12 4 7 4 NA 3
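If the helper column should not appear in the final output, append select(-grp_n) to the pipeline:
... %>%
  arrange(grp_n, runs) %>%
  select(-grp_n)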
A base R option using split.default:
data.frame(runs = df$runs,
           sapply(split.default(df[-1], names(df)[-1]), unlist),
           row.names = NULL)
# runs col1 col2 col3
#1 1 3 5 6
#2 2 4 3 4
#3 3 5 1 9
#4 4 5 4 2
#5 1 0 2 2
#6 2 2 3 4
#7 3 2 1 9
#8 4 1 7 9
#9 1 3 3 3
#10 2 4 3 2
#11 3 5 1 NA
#12 4 7 4 NA
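To unpack this: split.default() splits the columns (not the rows) of df[-1] into a list keyed by column name, so the three col1 columns land in one element, and unlist() stacks each element into a single vector:
spl <- split.default(df[-1], names(df)[-1])
names(spl)           # "col1" "col2" "col3"
sapply(spl, unlist)  # a 12 x 3 matrix with each trio of columns stacked in order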
I have a data.frame that looks like this
data <- data.frame(group = c("A","B","C","A","B","C","A","B","C"),
                   time  = c(rep(1,3), rep(2,3), rep(3,3)),
                   value = c(0,1,1,0.1,10,20,10,20,30))
group time value
1 A 1 0.0
2 B 1 1.0
3 C 1 1.0
4 A 2 0.1
5 B 2 10.0
6 C 2 20.0
7 A 3 10.0
8 B 3 20.0
9 C 3 30.0
I would like to find an elegant way to drop a group when its values are smaller than 0.2 at two different time points. Those points do not have to be consecutive.
In this case, I would like to filter out group A because its values at time points 1 and 2 are smaller than 0.2.
group time value
1 B 1 1.0
2 C 1 1.0
3 B 2 10.0
4 C 2 20.0
5 B 3 20.0
6 C 3 30.0
With this solution you keep only the groups that have fewer than two observations with values under 0.2, as requested.
library(dplyr)
data %>%
group_by(group) %>%
filter(sum(value < 0.2) < 2) %>%
ungroup()
#> # A tibble: 6 x 3
#> group time value
#> <chr> <dbl> <dbl>
#> 1 B 1 1
#> 2 C 1 1
#> 3 B 2 10
#> 4 C 2 20
#> 5 B 3 20
#> 6 C 3 30
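The key idiom is that value < 0.2 yields a logical vector and sum() counts its TRUE values, so the condition counts low observations per group. For group A:
sum(c(0, 0.1, 10) < 0.2)  # 2 values below 0.2, so 2 < 2 is FALSE and A is dropped
#> [1] 2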
But if you are really a fan of base R:
data[ave(data$value<0.2, data$group, FUN = function(x) sum(x)<2), ]
#> group time value
#> 2 B 1 1
#> 3 C 1 1
#> 5 B 2 10
#> 6 C 2 20
#> 8 B 3 20
#> 9 C 3 30
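Here ave() evaluates the function within each group and recycles the length-one logical result back to every row of that group, which is what makes it usable directly as a row index:
ave(data$value < 0.2, data$group, FUN = function(x) sum(x) < 2)
#> [1] FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE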
Try this dplyr approach:
library(tidyverse)
#Code: flag groups with two or more values below 0.2, then drop them
data <- data %>% group_by(group) %>%
  mutate(Flag = sum(value < 0.2) >= 2) %>%
  filter(!Flag) %>% select(-Flag)
Output:
# A tibble: 6 x 3
# Groups: group [2]
group time value
<fct> <dbl> <dbl>
1 B 1 1
2 C 1 1
3 B 2 10
4 C 2 20
5 B 3 20
6 C 3 30
I am trying to calculate the mean of time while keeping all the variables in the final dataset, using the dplyr package.
Here is what my sample dataset looks like:
library(dplyr)
id <- c(1,1,1,1, 2,2,2,2, 3,3,3,3, 4,4,4,4)
gender <- c(1,1,1,1, 2,2,2,2, 2,2,2,2, 1,1,1,1)
item.id <-c(1,1,1,2, 1,1,2,2, 1,2,3,4, 1,2,2,3)
sequence<-c(1,2,3,1, 1,2,1,2, 1,1,1,1, 1,1,2,1)
time <- c(5,6,7,1, 2,3,4,9, 1,2,3,9, 5,6,7,8)
data <- data.frame(id, gender, item.id, sequence, time)
> data
id gender item.id sequence time
1 1 1 1 1 5
2 1 1 1 2 6
3 1 1 1 3 7
4 1 1 2 1 1
5 2 2 1 1 2
6 2 2 1 2 3
7 2 2 2 1 4
8 2 2 2 2 9
9 3 2 1 1 1
10 3 2 2 1 2
11 3 2 3 1 3
12 3 2 4 1 9
13 4 1 1 1 5
14 4 1 2 1 6
15 4 1 2 2 7
16 4 1 3 1 8
Here id is the student id, gender is the student's gender, item.id identifies the questions students answer, sequence is the attempt number for a question (students may return to a question and try again), and time is the time spent on each trial.
When calculating the mean of the time, I need to follow three steps:
(a) students have multiple trials for each question. I need to calculate the mean of the time for each item having multiple trials.
(b) then calculate the overall mean of the time for each id. For example, for id=1, I have two items, the first item has 3 trials and the second item has 1 trial. First I need to aggregate the time for the first item by (5+6+7)/3=6, so id=1 has item1 time 6 and item2 time 1. Second, taking 6 and 1 and calculating the mean for this student (6+1)/2=3.5.
(c) Lastly, I would like to keep all the variables in the dataset.
data <- data %>%
group_by(id) %>%
select(id, gender, item.id, sequence, time) %>%
summarize(mean.time = mean(time))
I got this, but it obviously aggregates the overall mean without first taking the within-item mean across trials, and it also does not keep all the variables:
> data
# A tibble: 4 x 2
id mean.time
<dbl> <dbl>
1 1 4.75
2 2 4.5
3 3 3.75
4 4 6.5
I thought select() was going to keep all variables.
The final dataset should look like this below:
> data
id gender item.id sequence time mean.time
1 1 1 1 1 5 3.5
2 1 1 1 2 6 3.5
3 1 1 1 3 7 3.5
4 1 1 2 1 1 3.5
5 2 2 1 1 2 4.5
6 2 2 1 2 3 4.5
7 2 2 2 1 4 4.5
8 2 2 2 2 9 4.5
9 3 2 1 1 1 3.75
10 3 2 2 1 2 3.75
11 3 2 3 1 3 3.75
12 3 2 4 1 9 3.75
13 4 1 1 1 5 6.5
14 4 1 2 1 6 6.5
15 4 1 2 2 7 6.5
16 4 1 3 1 8 6.5
I used dplyr but open any other solutions.
Thanks in advance!
We can use mutate instead of summarise, as summarise returns a summarised output of one row per group, while mutate creates a new column in the dataset:
...
%>%
mutate(mean.time = mean(time))
If we want to get the mean of means, first group by 'id' and 'item.id' and take the mean, then group by 'id' and take the mean of the unique elements:
data %>%
group_by(id, item.id) %>%
mutate(mean.time = mean(time)) %>%
group_by(id) %>%
mutate(mean.time = mean(unique(mean.time)))
# A tibble: 16 x 6
# Groups: id [4]
# id gender item.id sequence time mean.time
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 1 1 1 5 3.5
# 2 1 1 1 2 6 3.5
# 3 1 1 1 3 7 3.5
# 4 1 1 2 1 1 3.5
# 5 2 2 1 1 2 4.5
# 6 2 2 1 2 3 4.5
# 7 2 2 2 1 4 4.5
# 8 2 2 2 2 9 4.5
# 9 3 2 1 1 1 3.75
#10 3 2 2 1 2 3.75
#11 3 2 3 1 3 3.75
#12 3 2 4 1 9 3.75
#13 4 1 1 1 5 6.5
#14 4 1 2 1 6 6.5
#15 4 1 2 2 7 6.5
#16 4 1 3 1 8 6.5
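To see the difference the two-step grouping makes, compare the two calculations for id 1 (item 1 has trials 5, 6, 7; item 2 has a single trial of 1):
mean(c(5, 6, 7, 1))                # 4.75 -- plain mean, over-weights item 1's three trials
mean(c(mean(c(5, 6, 7)), mean(1))) # 3.5  -- per-item means first, then their mean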
Or, instead of creating a second group_by, we can use match to get the first position of each 'item.id', extract the corresponding 'mean.time', and take the mean:
data %>%
group_by(id, item.id) %>%
mutate(mean.time = mean(time),
mean.time = mean(mean.time[match(unique(item.id), item.id)]))
Or use summarise and then join the summary back to the original data:
data %>%
group_by(id, item.id) %>%
summarise(mean.time = mean(time)) %>%
group_by(id) %>%
summarise(mean.time = mean(mean.time)) %>%
right_join(data)
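Since no by argument is supplied, the join uses the only common column (id) and prints a Joining, by = "id" message; every row of the original data is kept, with the per-id mean.time attached to each.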
Similar to this question but I want to use tidy evaluation instead.
df = data.frame(group = c(1,1,1,2,2,2,3,3,3),
date = c(1,2,3,4,5,6,7,8,9),
speed = c(3,4,3,4,5,6,6,4,9))
> df
group date speed
1 1 1 3
2 1 2 4
3 1 3 3
4 2 4 4
5 2 5 5
6 2 6 6
7 3 7 6
8 3 8 4
9 3 9 9
The task is to create a new column (newValue) whose value equals the value of the date column (per group) where speed == 4. Example: group 1 has a newValue of 2 because date[speed == 4] is 2.
group date speed newValue
1 1 1 3 2
2 1 2 4 2
3 1 3 3 2
4 2 4 4 4
5 2 5 5 4
6 2 6 6 4
7 3 7 6 8
8 3 8 4 8
9 3 9 9 8
It works without tidy evaluation:
df %>%
group_by(group) %>%
mutate(newValue=date[speed==4L])
#> # A tibble: 9 x 4
#> # Groups: group [3]
#> group date speed newValue
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 3 2
#> 2 1 2 4 2
#> 3 1 3 3 2
#> 4 2 4 4 4
#> 5 2 5 5 4
#> 6 2 6 6 4
#> 7 3 7 6 8
#> 8 3 8 4 8
#> 9 3 9 9 8
But it throws an error with tidy evaluation:
my_fu <- function(df, filter_var){
filter_var <- sym(filter_var)
df <- df %>%
group_by(group) %>%
mutate(newValue=!!filter_var[speed==4L])
}
my_fu(df, "date")
#> Error in quos(..., .named = TRUE): object 'speed' not found
Thanks in advance.
We can place the evaluation within parentheses. Otherwise, because [ binds tighter than !!, it tries to evaluate the whole expression (filter_var[speed == 4L]) instead of unquoting filter_var alone:
library(rlang)
library(dplyr)
my_fu <- function(df, filter_var){
filter_var <- sym(filter_var)
df %>%
group_by(group) %>%
mutate(newValue=(!!filter_var)[speed==4L])
}
my_fu(df, "date")
# A tibble: 9 x 4
# Groups: group [3]
# group date speed newValue
# <dbl> <dbl> <dbl> <dbl>
#1 1 1 3 2
#2 1 2 4 2
#3 1 3 3 2
#4 2 4 4 4
#5 2 5 5 4
#6 2 6 6 4
#7 3 7 6 8
#8 3 8 4 8
#9 3 9 9 8
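As a side note, since filter_var arrives as a string, a sketch of the same function without sym()/!! uses the .data pronoun instead:
my_fu2 <- function(df, filter_var) {
  df %>%
    group_by(group) %>%
    # .data[[filter_var]] looks the column up by its name given as a string
    mutate(newValue = .data[[filter_var]][speed == 4L])
}
my_fu2(df, "date")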
Alternatively, you can use sqldf: join df with a subquery restricted to the rows where speed = 4 (the backticks escape column names such as group that clash with SQL keywords):
library(sqldf)
df <- data.frame(group = c(1,1,1,2,2,2,3,3,3),
                 date  = c(1,2,3,4,5,6,7,8,9),
                 speed = c(3,4,3,4,5,6,6,4,9))
sqldf("SELECT df_origin.*, df4.`date` AS new_value
       FROM df df_origin
       JOIN (SELECT `group`, `date` FROM df WHERE speed = 4) df4
         ON df_origin.`group` = df4.`group`")