R select values for each group - r

For each combination of WEEK and GROUP I wish to find the day (1 or 2) when SCORE is lowest for TYPE == 'M' and then keep the TYPE == 'M' and TYPE == 'E' value on that same day.
# Example input: one SCORE for every WEEK x GROUP x DAY x TYPE combination
# (4 weeks, groups A/B, days 1/2, types M/E -> 32 rows).
HAVE <- data.frame(
  WEEK = rep(rep(c(1, 2, 3, 4), each = 4), times = 2),
  GROUP = rep(c("A", "B"), each = 16),
  SCORE = c(
    29, 10, 19, 11, 28, 38, 23, 27, 32, 19, 32, 26, 35, 27, 39, 51,
    43, 12, 35, 29, 14, 47, 12, 36, 17, 11, 24, 19, 19, 12, 33, 7
  ),
  DAY = rep(rep(c(1, 2), each = 2), times = 8),
  TYPE = rep(c("M", "E"), times = 16)
)
# Desired result: for every WEEK/GROUP pair, the 'M' and 'E' rows of the DAY
# on which the 'M' score is lowest.
WANT <- data.frame(
  WEEK = rep(rep(c(1, 2, 3, 4), each = 2), times = 2),
  GROUP = rep(c("A", "B"), each = 8),
  SCORE = c(19, 11, 23, 27, 32, 19, 35, 27, 35, 29, 12, 36, 17, 11, 19, 12),
  DAY = rep(rep(c(2, 1), each = 4), times = 2),
  TYPE = rep(c("M", "E"), times = 8),
  SCORE.MIN = rep(c(1, NA), times = 8)
)
I have 'HAVE' and wish to create 'WANT'
For each combination of WEEK and GROUP, select the DAY when TYPE = M and SCORE is lowest. This I am able to do by
# Attempt: within each WEEK/GROUP pair, add a SCORE.MIN column that holds the
# group's minimum SCORE on the TYPE == 'M' rows and NA on every other row.
# NOTE(review): min(SCORE) is taken over all rows of the group (both 'M' and
# 'E'), and no rows are dropped, so this does not yet pick a DAY.
WANT.ATTEMPT = HAVE %>%
group_by(WEEK, GROUP) %>%
mutate(SCORE.MIN = ifelse(TYPE == 'M', min(SCORE), NA))
HOWEVER what I wish for is the dataset WANT where as you see I group by WEEK and GROUP and find the DAY when SCORE is minimum for TYPE == 'M' but I also keep the SCORE for TYPE == 'E' for the same day

Update to prevent ties.
You can do:
library(dplyr)

# For every WEEK/GROUP pair, keep the rows (both 'M' and 'E') recorded on the
# DAY whose TYPE == 'M' score is the lowest.
HAVE %>%
  group_by(WEEK, GROUP) %>%
  # Search the 'M' rows only: an 'E' row that happens to share the minimum 'M'
  # score must not decide the day. which.min() returns the first minimum, so
  # ties resolve to the earliest row. %in% makes a group without any 'M' row
  # drop out instead of raising an error.
  filter(DAY %in% DAY[TYPE == "M"][which.min(SCORE[TYPE == "M"])]) %>%
  ungroup()
# A tibble: 16 x 5
WEEK GROUP SCORE DAY TYPE
<dbl> <chr> <dbl> <dbl> <chr>
1 1 A 19 2 M
2 1 A 11 2 E
3 2 A 23 2 M
4 2 A 27 2 E
5 3 A 32 1 M
6 3 A 19 1 E
7 4 A 35 1 M
8 4 A 27 1 E
9 1 B 35 2 M
10 1 B 29 2 E
11 2 B 12 2 M
12 2 B 36 2 E
13 3 B 17 1 M
14 3 B 11 1 E
15 4 B 19 1 M
16 4 B 12 1 E

We could also arrange first before grouping and then do the filter
library(dplyr)

# Sort so that, inside each WEEK/GROUP pair, the 'M' rows come first in
# ascending SCORE order; the first 'M' row then marks the DAY to keep.
HAVE %>%
  arrange(WEEK, GROUP, desc(TYPE == "M"), SCORE) %>%
  group_by(WEEK, GROUP) %>%
  filter(DAY %in% DAY[TYPE == "M"][1]) %>%
  ungroup()
-output
# A tibble: 16 × 5
WEEK GROUP SCORE DAY TYPE
<dbl> <chr> <dbl> <dbl> <chr>
1 1 A 19 2 M
2 1 A 11 2 E
3 1 B 35 2 M
4 1 B 29 2 E
5 2 A 23 2 M
6 2 A 27 2 E
7 2 B 12 2 M
8 2 B 36 2 E
9 3 A 32 1 M
10 3 A 19 1 E
11 3 B 17 1 M
12 3 B 11 1 E
13 4 A 35 1 M
14 4 A 27 1 E
15 4 B 19 1 M
16 4 B 12 1 E

Related

R Reshape and Select Max

# Example input: per WEEK and STUDENT, classes A-C have two tests each and
# class H has a single test (7 rows per student, 42 rows in total).
HAVE <- data.frame(
  WEEK = rep(c(1, 2), each = 21),
  STUDENT = rep(rep(c(1, 2, 3), each = 7), times = 2),
  CLASS = rep(c("A", "A", "B", "B", "C", "C", "H"), times = 6),
  TEST = rep(c(1, 2, 1, 2, 1, 2, 1), times = 6),
  SCORE = c(
    93, 97, 72, 68, 93, 51, 19, 88, 56, 53, 79, 69, 90, 61,
    74, 50, 76, 97, 55, 63, 63, 59, 68, 77, 80, 52, 94, 74,
    64, 74, 92, 98, 89, 84, 54, 51, 82, 86, 51, 90, 72, 86
  )
)
# Desired result: one row per WEEK/STUDENT with the best TEST 1 class, its two
# test scores, and the student's class-H score in column H.
WANT <- data.frame(
  WEEK = rep(c(1, 2), each = 3),
  STUDENT = rep(c(1, 2, 3), times = 2),
  CLASS = c("A", "A", "B", "B", "B", "C"),
  H = c(19, 61, 63, 74, 54, 86),
  TEST1 = c(93, 88, 76, 77, 92, 90),
  TEST2 = c(97, 56, 97, 80, 98, 72)
)
I wish to group by WEEK and STUDENT and then, for each combination of WEEK and STUDENT, find the CLASS where SCORE equals the maximum SCORE among the rows where TEST equals 1. Then I wish to find the corresponding SCORE for TEST equal to 2 using that same CLASS. I wish to transform the data HAVE into the data WANT, and also add the column H, which is simply the SCORE where CLASS equals 'H'.
We can reshape to 'wide' with pivot_wider, then, grouped by 'WEEK' and 'STUDENT', create the 'H' column from the 'TEST1' value where 'CLASS' is "H", and then slice the row with the max 'TEST1'
library(dplyr)
library(tidyr)

HAVE %>%
  # One row per WEEK/STUDENT/CLASS with the two tests side by side.
  pivot_wider(names_from = TEST, values_from = SCORE,
              names_glue = "TEST{TEST}") %>%
  group_by(WEEK, STUDENT) %>%
  # Copy the class-H score onto every row of the group ...
  mutate(H = TEST1[CLASS == "H"], .before = "TEST1") %>%
  # ... then drop the H row itself: H is reported in its own column and has no
  # TEST 2, so it must not be able to win the maximum below.
  filter(CLASS != "H") %>%
  slice_max(n = 1, order_by = TEST1, with_ties = FALSE) %>%
  ungroup()
-output
# A tibble: 6 × 6
WEEK STUDENT CLASS H TEST1 TEST2
<dbl> <dbl> <chr> <dbl> <dbl> <dbl>
1 1 1 A 19 93 97
2 1 2 A 61 88 56
3 1 3 B 63 76 97
4 2 1 B 74 77 80
5 2 2 B 54 92 98
6 2 3 C 86 90 72
-checking with 'WANT'
> WANT
WEEK STUDENT CLASS H TEST1 TEST2
1 1 1 A 19 93 97
2 1 2 A 61 88 56
3 1 3 B 63 76 97
4 2 1 B 74 77 80
5 2 2 B 54 92 98
6 2 3 C 86 90 72

Extract certain values out of a correlation matrix

Is there a way to extract the correlation coefficients from a correlation matrix?
Let's say I have a dataset with 3 variables (a, b, c) and I want to calculate the correlations among themselves.
with
# Name the columns with `=`. Using `<-` inside data.frame() would instead
# create stray variables a, b, c, d in the calling environment and leave the
# columns with mangled, auto-generated names.
df <- data.frame(
  a = c(2, 3, 3, 5, 6, 9, 14, 15, 19, 21, 22, 23),
  b = c(23, 24, 24, 23, 17, 28, 38, 34, 35, 39, 41, 43),
  c = c(13, 14, 14, 14, 15, 17, 18, 19, 22, 20, 24, 26),
  d = c(6, 6, 7, 8, 8, 8, 7, 6, 5, 3, 3, 2)
)
and
# Pairwise correlations between columns a, b and c.
cor(df[c("a", "b", "c")])
I'll get a correlation matrix:
a b c
a 1.0000000 0.9279869 0.9604329
b 0.9279869 1.0000000 0.8942139
c 0.9604329 0.8942139 1.0000000
Is there a way to show the results in a manner like this:
Correlation between a and b is: 0.9279869.
Correlation between a and c is: 0.9604329.
Correlation between b and c is: 0.8942139.
?
My correlation matrix is obviously bigger (~300 entries), and I need a way to extract only the values that are important to me.
Thanks.
Using reshape2 and melt
df <- data.frame(
  a = c(2, 3, 3, 5, 6, 9, 14, 15, 19, 21, 22, 23),
  b = c(23, 24, 24, 23, 17, 28, 38, 34, 35, 39, 41, 43),
  c = c(13, 14, 14, 14, 15, 17, 18, 19, 22, 20, 24, 26),
  d = c(6, 6, 7, 8, 8, 8, 7, 6, 5, 3, 3, 2)
)
# Blank out the diagonal and everything below it, so that only the strict
# upper triangle (each pair of variables exactly once) remains.
tmp <- cor(df[c("a", "b", "c")])
tmp[lower.tri(tmp, diag = TRUE)] <- NA
# melt() reshapes the matrix `tmp` into long form (one row per cell);
# na.omit() then drops the rows whose cell is NA.
library(reshape2)
na.omit(melt(tmp))
resulting in
Var1 Var2 value
4 a b 0.9279869
7 a c 0.9604329
8 b c 0.8942139
You can do,
# Flatten the correlation matrix into one row per (Var1, Var2) pair and round
# the coefficient to two decimals.
df1 <- as.data.frame(as.table(cor(df[, c("a", "b", "c")])))
df1$Freq <- round(df1$Freq, 2)
# Drop the diagonal, i.e. a variable paired with itself.
df2 <- df1[as.character(df1$Var1) != as.character(df1$Var2), ]
# Human-readable sentence for every remaining pair.
df2$res <- paste("Correlation between", df2$Var1, "and", df2$Var2, "is", df2$Freq)
Var1 Var2 Freq res
2 b a 0.93 Correlation between b and a is 0.93
3 c a 0.96 Correlation between c and a is 0.96
4 a b 0.93 Correlation between a and b is 0.93
6 c b 0.89 Correlation between c and b is 0.89
7 a c 0.96 Correlation between a and c is 0.96
8 b c 0.89 Correlation between b and c is 0.89
Here is another idea with reshaping to long format, i.e.
# Move the row names into a column `rn`, then stack every other column into
# (name, value) pairs: one row per cell of the correlation matrix.
tidyr::pivot_longer(
  tibble::rownames_to_column(
    as.data.frame(cor(df[, c("a", "b", "c")])),
    var = "rn"
  ),
  cols = -1
)
# A tibble: 9 x 3
rn name value
<chr> <chr> <dbl>
1 a a 1
2 a b 0.928
3 a c 0.960
4 b a 0.928
5 b b 1
6 b c 0.894
7 c a 0.960
8 c b 0.894
9 c c 1
Maybe you can try as.table + as.data.frame
> as.data.frame(as.table(cor(df[, c("a", "b", "c")])))
Var1 Var2 Freq
1 a a 1.0000000
2 b a 0.9279869
3 c a 0.9604329
4 a b 0.9279869
5 b b 1.0000000
6 c b 0.8942139
7 a c 0.9604329
8 b c 0.8942139
9 c c 1.0000000

Sum by logical condition, in a dataset with groups

I have a df like this:
I want to transform the continuous Age variable into a discrete one that equals a if the original value was between 1 and 2, and b if it was between 3 and 4. This requires aggregating Value 1 and Value 2 by summing the entries for Age=1 + Age=2 and for Age=3 + Age=4. The output would be something like this:
The 146 is the sum of the Value1 entry for Age=1 (75) and Age=2 (71).
I thought on using aggregate:
# Sketch of the intended approach. `fun` is a placeholder for the summary
# function still to be written. (The stray leading backtick, which made the
# snippet unparseable, is removed.)
df2 <- df %>%
  group_by(Sex, Race) %>%
  summarise(across(starts_with("Value"), fun))
Where fun would be some function that checks the Age values and sum accordingly. But i'm not much familiar with these dplyr functions and couldn't get it to work. Thanks for the help!
Data:
# Example data: every Sex x Race x Age combination once (16 rows) with two
# value columns.
df <- structure(
  list(
    Sex = rep(c(1, 2), each = 8),
    Race = rep(rep(c(1, 2), each = 4), times = 2),
    Age = rep(c(1, 2, 3, 4), times = 4),
    `Value 1` = c(75, 71, 52, 51, 24, 21, 70, 58, 67, 68, 36, 22, 91, 43, 33, 57),
    `Value 2` = c(22, 22, 49, 1, 20, 18, 34, 0, 27, 37, 31, 83, 29, 24, 10, 99)
  ),
  row.names = c(NA, -16L),
  class = c("tbl_df", "tbl", "data.frame")
)
We can use case_when to do the recoding of 'Age' based on the values
library(dplyr)

df %>%
  # Recode Age while grouping: 1-2 -> 'a', 3-4 -> 'b' (any other value -> NA).
  group_by(Sex, Race, Age = case_when(Age %in% 1:2 ~ "a",
                                      Age %in% 3:4 ~ "b")) %>%
  # na.rm goes through a lambda: handing extra arguments to the function via
  # across(...) is deprecated since dplyr 1.1.0.
  summarise(across(everything(), ~ sum(.x, na.rm = TRUE)), .groups = "drop")
-output
# A tibble: 8 x 5
# Sex Race Age `Value 1` `Value 2`
#* <dbl> <dbl> <chr> <dbl> <dbl>
#1 1 1 a 146 44
#2 1 1 b 103 50
#3 1 2 a 45 38
#4 1 2 b 128 34
#5 2 1 a 135 64
#6 2 1 b 58 114
#7 2 2 a 134 53
#8 2 2 b 90 109
Based on the OP's comment, if the original data have lots of categories, an easier option is cut or findInterval
df %>%
  # 21 break points give 20 bins, (-Inf, 0], (0, 5], ..., (90, Inf], which are
  # labelled 'a' to 't'.
  group_by(Sex, Race, Age = cut(Age, breaks = c(-Inf, seq(0, 90, by = 5), Inf),
                                labels = letters[1:20])) %>%
  # na.rm goes through a lambda: handing extra arguments to the function via
  # across(...) is deprecated since dplyr 1.1.0.
  summarise(across(everything(), ~ sum(.x, na.rm = TRUE)), .groups = "drop")

In R, create sequential 1 to N column based on values in other columns

Seems like a straightforward data manip problem, however we would like to avoid using a for loop that simply compares the values in each row. We have the following dataframe:
# Input: runs of identical (a, b, c) rows; note that the first and the last
# run hold the same values.
zed <- data.frame(
  a = rep(c(1, 2, 1), times = c(8, 6, 2)),
  b = rep(c("a", "b", "c", "d", "e", "a"), times = c(2, 3, 2, 5, 2, 2)),
  c = rep(c(1, 2, 3, 1), times = c(6, 5, 3, 2)),
  stringsAsFactors = FALSE
)
# Expected result. It is assigned to `output` only: the previous chained
# assignment (`output = zed = ...`) also replaced the input `zed` with this
# already-solved table.
output <- data.frame(
  a = c(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1),
  b = c("a", "a", "b", "b", "b", "c", "c", "d", "d", "d", "d", "d", "e", "e", "a", "a"),
  c = c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 1, 1),
  group = c(1, 1, 2, 2, 2, 3, 4, 5, 6, 6, 6, 7, 8, 8, 9, 9),
  stringsAsFactors = FALSE
)
> output
a b c group
1 1 a 1 1
2 1 a 1 1
3 1 b 1 2
4 1 b 1 2
5 1 b 1 2
6 1 c 1 3
7 1 c 2 4
8 1 d 2 5
9 2 d 2 6
10 2 d 2 6
11 2 d 2 6
12 2 d 3 7
13 2 e 3 8
14 2 e 3 8
15 1 a 1 9
16 1 a 1 9
The dataframe begins with the columns a, b, c, and we need to add the group column to the dataframe. The group column starts at 1, and increases sequentially if any of the values in a, b, c are different from their value in the previous row.
This is not quite as simple as doing a group_by() on a, b, c, as the same row can appear later, but not sequentially, in the dataframe (e.g. rows 1,2 == rows 15,16, however they are not the same group because they did not appear sequentially in the dataframe).
We can use
library(data.table)

# rleid() numbers consecutive runs of identical (a, b, c) rows 1, 2, 3, ...,
# which is the sequential group id; it is added to `zed` by reference.
setDT(zed)[, group := rleid(a, b, c)]

R find group with missing rows, create new rows using existing data

Here is a simplified version of a problem that involves processing a large, complex table. Here is the input table:
library(tidyverse)

# Groups 1, 2 and 4 carry all four labels; group 3 only has A and B.
input <- tibble(
  group = rep(c(1, 2, 3, 4), times = c(4, 4, 2, 4)),
  score = c(10, 20, 30, 40, 11, 21, 31, 41, 12, 22, 13, 23, 33, 43),
  label = c("A", "B", "C", "D", "A", "B", "C", "D", "A", "B", "A", "B", "C", "D")
)
The table has 14 rows. The data are grouped in numbered groups (1:4), each group is supposed to have four scores labeled A, B, C, D.
The problem is group 3, which is missing the C and D rows.
I want R to do the following:
Find group 3 based on its lack of C and D rows.
Insert C and D rows for group 3, in proper alphabetical sequence.
Populate score in the new C and D rows with the value of score (22) from group 3 row B.
Another way of describing the transformation is that I want to insert two copies of row 3B, changing the label
of those copied rows from B to C and D, respectively.
The desired output table has 16 rows and looks like this:
# Desired result: every group has labels A-D; the two rows added to group 3
# repeat the score of its B row (22).
output <- tibble(
  group = rep(c(1, 2, 3, 4), each = 4),
  score = c(10, 20, 30, 40, 11, 21, 31, 41, 12, 22, 22, 22, 13, 23, 33, 43),
  label = rep(c("A", "B", "C", "D"), times = 4)
)
Thanks in advance for any help!
# Add a row for every missing group/label combination (score = NA) ...
complete(input, group, label) %>%
  # ... and fill the gaps downwards within each group only, so a group whose
  # first label is missing can never inherit a score from the previous group.
  group_by(group) %>%
  fill(score) %>%
  ungroup()
# A tibble: 16 x 3
group label score
<dbl> <chr> <dbl>
1 1 A 10
2 1 B 20
3 1 C 30
4 1 D 40
5 2 A 11
6 2 B 21
7 2 C 31
8 2 D 41
9 3 A 12
10 3 B 22
11 3 C 22
12 3 D 22
13 4 A 13
14 4 B 23
15 4 C 33
16 4 D 43

Resources