HAVE = data.frame("WEEK"=c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2),
"STUDENT"=c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3),
"CLASS"=c('A', 'A', 'B', 'B', 'C', 'C', 'H', 'A', 'A', 'B', 'B', 'C', 'C', 'H', 'A', 'A', 'B', 'B', 'C', 'C', 'H', 'A', 'A', 'B', 'B', 'C', 'C', 'H', 'A', 'A', 'B', 'B', 'C', 'C', 'H', 'A', 'A', 'B', 'B', 'C', 'C', 'H'),
"TEST"=c(1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1),
"SCORE"=c(93, 97, 72, 68, 93, 51, 19, 88, 56, 53, 79, 69, 90, 61, 74, 50, 76, 97, 55, 63, 63, 59, 68, 77, 80, 52, 94, 74, 64, 74, 92, 98, 89, 84, 54, 51, 82, 86, 51, 90, 72, 86))
WANT = data.frame("WEEK"=c(1,1,1,2,2,2),
"STUDENT"=c(1,2,3,1,2,3),
"CLASS"=c('A','A','B','B','B','C'),
"H"=c(19,61,63,74,54,86),
"TEST1"=c(93,88,76,77,92,90),
"TEST2"=c(97,56,97,80,98,72))
I wish to group by WEEK and STUDENT and then for each combination of WEEK and STUDENT find the CLASS when SCORE equals to maximum(SCORE) where TEST equals to one. Then I wish to find the corresponding SCORE for TEST equals to 2 using that same CLASS. I wish to transform this into the data WANT from the data HAVE. And ALSO add the COLUMN H where it is just equals to the SCORE when CLASS equals to H
We can reshape to 'wide' with pivot_wider, then grouped by 'WEEK', 'STUDENT', create the 'H' column with 'TEST1' values were 'CLASS' is "H" and then slice the max row for 'TEST1'
library(dplyr)
library(tidyr)
HAVE %>%
pivot_wider(names_from = TEST, values_from = SCORE,
names_glue = "TEST{TEST}") %>%
group_by(WEEK, STUDENT) %>%
mutate(H = TEST1[CLASS == "H"], .before = 'TEST1') %>%
slice_max(n = 1, order_by = TEST1, with_ties = FALSE) %>%
ungroup
-output
# A tibble: 6 × 6
WEEK STUDENT CLASS H TEST1 TEST2
<dbl> <dbl> <chr> <dbl> <dbl> <dbl>
1 1 1 A 19 93 97
2 1 2 A 61 88 56
3 1 3 B 63 76 97
4 2 1 B 74 77 80
5 2 2 B 54 92 98
6 2 3 C 86 90 72
-checking with 'WANT'
> WANT
WEEK STUDENT CLASS H TEST1 TEST2
1 1 1 A 19 93 97
2 1 2 A 61 88 56
3 1 3 B 63 76 97
4 2 1 B 74 77 80
5 2 2 B 54 92 98
6 2 3 C 86 90 72
Related
For each combination of WEEK and GROUP I wish to find the day (1 or 2) when SCORE is lowest for TYPE == 'M' and then keep the TYPE == 'M' and TYPE == 'E' value on that same day.
HAVE = data.frame("WEEK" = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4),
"GROUP" = c('A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B'),
"SCORE" = c(29, 10, 19, 11, 28, 38, 23, 27, 32, 19, 32, 26, 35, 27, 39, 51, 43, 12, 35, 29, 14, 47, 12, 36, 17, 11, 24, 19, 19, 12, 33, 7),
"DAY" = c(1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2),
"TYPE" = c('M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E'))
WANT=data.frame("WEEK"=c(1, 1, 2, 2, 3, 3, 4, 4, 1, 1, 2, 2, 3, 3, 4, 4),
"GROUP"=c('A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B'),
"SCORE"=c(19, 11, 23, 27, 32, 19, 35, 27, 35, 29, 12, 36, 17, 11, 19, 12),
"DAY"=c(2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1),
"TYPE"=c('M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E'),
"SCORE.MIN"=c(1, NA, 1, NA, 1, NA, 1, NA, 1, NA, 1, NA, 1, NA, 1, NA))
I have 'HAVE' and wish to create 'WANT'
For each combination of WEEK and GROUP, select the DAY when TYPE = M and SCORE is lowest. This I am able to do by
WANT.ATTEMPT = HAVE %>%
group_by(WEEK, GROUP) %>%
mutate(SCORE.MIN = ifelse(TYPE == 'M', min(SCORE), NA))
HOWEVER what I wish for is the dataset WANT where as you see I group by WEEK and GROUP and find the DAY when SCORE is minimum for TYPE == 'M' but I also keep the SCORE for TYPE == 'E' for the same day
Update to prevent ties.
You can do:
HAVE %>%
group_by(WEEK, GROUP) %>%
filter(DAY == first(DAY[SCORE == min(SCORE[TYPE == 'M'])]) | TYPE == 'E') %>%
filter(DAY %in% DAY[TYPE == 'M']) %>%
ungroup()
# A tibble: 16 x 5
WEEK GROUP SCORE DAY TYPE
<dbl> <chr> <dbl> <dbl> <chr>
1 1 A 19 2 M
2 1 A 11 2 E
3 2 A 23 2 M
4 2 A 27 2 E
5 3 A 32 1 M
6 3 A 19 1 E
7 4 A 35 1 M
8 4 A 27 1 E
9 1 B 35 2 M
10 1 B 29 2 E
11 2 B 12 2 M
12 2 B 36 2 E
13 3 B 17 1 M
14 3 B 11 1 E
15 4 B 19 1 M
16 4 B 12 1 E
We could also arrange first before grouping and then do the filter
library(dplyr)
HAVE %>%
arrange(WEEK, GROUP, TYPE != 'M', SCORE) %>%
group_by(WEEK, GROUP) %>%
filter(DAY %in% DAY[TYPE == "M"][1]) %>%
ungroup
-output
# A tibble: 16 × 5
WEEK GROUP SCORE DAY TYPE
<dbl> <chr> <dbl> <dbl> <chr>
1 1 A 19 2 M
2 1 A 11 2 E
3 1 B 35 2 M
4 1 B 29 2 E
5 2 A 23 2 M
6 2 A 27 2 E
7 2 B 12 2 M
8 2 B 36 2 E
9 3 A 32 1 M
10 3 A 19 1 E
11 3 B 17 1 M
12 3 B 11 1 E
13 4 A 35 1 M
14 4 A 27 1 E
15 4 B 19 1 M
16 4 B 12 1 E
I have a df like this:
I want to transform the continuous Age variable into a discrete one, that is equal a if the original was between 1 and 2, and b if it was betweem 3 and 4. Thus needing to aggregate the values of Value 1 and Value 2 by summing the entries associated with Age=1 + Age=2 and Age=3 + Age=4. The output would be something like this:
The 146 is the sum of the Value1 entry for Age=1 (75) and Age=2 (71).
I thought on using aggregate:
`df2 = df %>% group_by(Sex, Race) %>%
summarise(across(starts_with("Value"), fun))
Where fun would be some function that checks the Age values and sum accordingly. But i'm not much familiar with these dplyr functions and couldn't get it to work. Thanks for the help!
Data:
df = structure(list(Sex = c(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
2, 2, 2), Race = c(1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2,
2, 2), Age = c(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4
), `Value 1` = c(75, 71, 52, 51, 24, 21, 70, 58, 67, 68, 36,
22, 91, 43, 33, 57), `Value 2` = c(22, 22, 49, 1, 20, 18, 34,
0, 27, 37, 31, 83, 29, 24, 10, 99)), row.names = c(NA, -16L), class = c("tbl_df",
"tbl", "data.frame"))
We can use case_when to do the recoding of 'Age' based on the values
library(dplyr)
df %>%
group_by(Sex, Race, Age = case_when(Age %in% 1:2 ~ 'a',
Age %in% 3:4 ~ 'b')) %>%
summarise(across(everything(), sum, na.rm = TRUE), .groups = 'drop')
-output
# A tibble: 8 x 5
# Sex Race Age `Value 1` `Value 2`
#* <dbl> <dbl> <chr> <dbl> <dbl>
#1 1 1 a 146 44
#2 1 1 b 103 50
#3 1 2 a 45 38
#4 1 2 b 128 34
#5 2 1 a 135 64
#6 2 1 b 58 114
#7 2 2 a 134 53
#8 2 2 b 90 109
Based on the OP's comment, if the original data have lots of categories, an easier option is cut or findInterval
df %>%
group_by(Sex, Race, Age = cut(Age, breaks = c(-Inf,
seq(0, 90, by = 5), Inf), labels = letters[1:20])) %>%
summarise(across(everything(), sum, na.rm = TRUE), .groups = 'drop')
Seems like a straightforward data manip problem, however we would like to avoid using a for loop that simply compares the values in each row. We have the following dataframe:
zed = data.frame(
a = c(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1),
b = c('a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd', 'd', 'd', 'd', 'e', 'e', 'a', 'a'),
c = c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 1, 1),
stringsAsFactors = FALSE
)
output = zed = data.frame(
a = c(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1),
b = c('a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd', 'd', 'd', 'd', 'e', 'e', 'a', 'a'),
c = c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 1, 1),
group = c(1, 1, 2, 2, 2, 3, 4, 5, 6, 6, 6, 7, 8, 8, 9, 9),
stringsAsFactors = FALSE
)
> output
a b c group
1 1 a 1 1
2 1 a 1 1
3 1 b 1 2
4 1 b 1 2
5 1 b 1 2
6 1 c 1 3
7 1 c 2 4
8 1 d 2 5
9 2 d 2 6
10 2 d 2 6
11 2 d 2 6
12 2 d 3 7
13 2 e 3 8
14 2 e 3 8
15 1 a 1 9
16 1 a 1 9
The dataframe begins with the columns a, b, c, and we need to add the group column to the dataframe. The group column starts at 1, and increases sequentially if any of the values in a, b, c are different from their value in the previous row.
This is not quite as simple as doing a group_by() on a, b, c, as the same row can appear later, but not sequentially, in the dataframe (e.g. rows 1,2 == rows 15,16, however they are not the same group because they did not appear sequentially in the dataframe).
We can use
library(data.table)
setDT(zed)[, group := .GRP, .(rleid(a, b, c))]
I have a dataframe that looks like this
> head(printing_id_map_unique_frames)
# A tibble: 6 x 5
# Groups: frame_number [6]
X1 X2 X3 row_in_frame frame_number
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2 3 15 1
2 1 2 3 15 2
3 1 2 3 15 3
4 1 2 3 15 4
5 1 2 3 15 5
6 1 2 3 15 6
As you can see, X1,X2,X3, row_in_frame is identical
However, eventually you get to a
X1 X2 X3 row_in_frame frame_number
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2 3 15 32
2 1 2 3 15 33
3 1 2 3 5 34**
4 1 4 5 15 35
5 1 4 5 15 36
What I would like to do is essentially compute a dataframe that looks like:
X1 X2 X3 row_in_frame num_duplicates
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2 3 15 33
2 1 2 3 5 1
...
Essentially, what I want is to "collapse" over identical first 4 columns and count how many rows of that type there are in the "num_duplicates" column.
Is there a nice way to do this in dplyr without a messy for loop that tracks a count and if there is a change.
Below please find a full data structure via dput:
> dput(printing_id_map_unique_frames)
structure(list(X1 = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), X2 = c(2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
), X3 = c(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5), row_in_frame = c(15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 5, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 5
), frame_number = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
62, 63, 64, 65, 66, 67, 68)), row.names = c(NA, -68L), class = c("tbl_df",
"tbl", "data.frame"))
Here is one option with count
library(dplyr) # 1.0.0
df1 %>%
count(!!! rlang::syms(names(.)[1:4]))
Or specify the unquoted column names
df1 %>%
count(X1, X2, X3, row_in_frame)
If we don't want to change the order, an option is to convert the first 4 columns to factor with levels specified as the unique values (which is the same as the order of occurrence of values) and then apply the count
df1 %>%
mutate(across(1:4, ~ factor(.x, levels = unique(.x)))) %>%
count(!!! rlang::syms(names(.)[1:4])) %>%
type.convert(as.is = TRUE)
# A tibble: 4 x 5
# X1 X2 X3 row_in_frame n
# <int> <int> <int> <int> <int>
#1 1 2 3 15 33
#2 1 2 3 5 1
#3 1 4 5 15 33
#4 1 4 5 5 1
Here is a simplified version of a problem that involves processing a large, complex table. Here is the input table:
library(tidyverse)
input <- tribble(
~group, ~score, ~label,
1, 10, 'A',
1, 20, 'B',
1, 30, 'C',
1, 40, 'D',
2, 11, 'A',
2, 21, 'B',
2, 31, 'C',
2, 41, 'D',
3, 12, 'A',
3, 22, 'B',
4, 13, 'A',
4, 23, 'B',
4, 33, 'C',
4, 43, 'D'
)
The table has 14 rows. The data are grouped in numbered groups (1:4), each group is supposed to have four scores labeled A, B, C, D.
The problem is group 3, which is missing the C and D rows.
I want R to do the following:
Find group 3 based on its lack of C and D rows.
Insert C and D rows for group 3, in proper alphabetical sequence.
Populate score in the new C and D rows with the value of of score (22) from group 3 row B.
Another way of describing the transformation is that I want two insert two copies of row 3B, changing the label
of those copied rows from B to C and D, respectively.
The desired output table has 16 rows and looks like this:
output <- tribble(
~group, ~score, ~label,
1, 10, 'A',
1, 20, 'B',
1, 30, 'C',
1, 40, 'D',
2, 11, 'A',
2, 21, 'B',
2, 31, 'C',
2, 41, 'D',
3, 12, 'A',
3, 22, 'B',
3, 22, 'C',
3, 22, 'D',
4, 13, 'A',
4, 23, 'B',
4, 33, 'C',
4, 43, 'D'
)
Thanks in advance for any help!
complete(input, group, label) %>%
fill(score)
# A tibble: 16 x 3
group label score
<dbl> <chr> <dbl>
1 1 A 10
2 1 B 20
3 1 C 30
4 1 D 40
5 2 A 11
6 2 B 21
7 2 C 31
8 2 D 41
9 3 A 12
10 3 B 22
11 3 C 22
12 3 D 22
13 4 A 13
14 4 B 23
15 4 C 33
16 4 D 43