R find group with missing rows, create new rows using existing data

Here is a simplified version of a problem that involves processing a large, complex table. The input table:
library(tidyverse)
input <- tribble(
~group, ~score, ~label,
1, 10, 'A',
1, 20, 'B',
1, 30, 'C',
1, 40, 'D',
2, 11, 'A',
2, 21, 'B',
2, 31, 'C',
2, 41, 'D',
3, 12, 'A',
3, 22, 'B',
4, 13, 'A',
4, 23, 'B',
4, 33, 'C',
4, 43, 'D'
)
The table has 14 rows. The data are in numbered groups (1:4), and each group is supposed to have four scores labeled A, B, C, D.
The problem is group 3, which is missing the C and D rows.
I want R to do the following:
Find group 3 based on its lack of C and D rows.
Insert C and D rows for group 3, in proper alphabetical sequence.
Populate score in the new C and D rows with the value of score (22) from group 3's B row.
Another way of describing the transformation: I want to insert two copies of group 3's row B, changing the label of those copies from B to C and D, respectively.
The desired output table has 16 rows and looks like this:
output <- tribble(
~group, ~score, ~label,
1, 10, 'A',
1, 20, 'B',
1, 30, 'C',
1, 40, 'D',
2, 11, 'A',
2, 21, 'B',
2, 31, 'C',
2, 41, 'D',
3, 12, 'A',
3, 22, 'B',
3, 22, 'C',
3, 22, 'D',
4, 13, 'A',
4, 23, 'B',
4, 33, 'C',
4, 43, 'D'
)
Thanks in advance for any help!

# complete() adds the missing group/label combinations (score becomes NA),
# and fill() carries the last non-missing score down into those new rows
complete(input, group, label) %>%
  fill(score)
# A tibble: 16 x 3
group label score
<dbl> <chr> <dbl>
1 1 A 10
2 1 B 20
3 1 C 30
4 1 D 40
5 2 A 11
6 2 B 21
7 2 C 31
8 2 D 41
9 3 A 12
10 3 B 22
11 3 C 22
12 3 D 22
13 4 A 13
14 4 B 23
15 4 C 33
16 4 D 43
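
Note that fill() defaults to .direction = "down", which happens to work here because the missing rows sort after the B row they should copy. If a group could instead be missing its A row, a grouped, two-direction fill is safer. A minimal sketch of that variation (my own addition, not part of the answer above):
library(tidyverse)
input %>%
  complete(group, label) %>%
  group_by(group) %>%
  # fill down first, then up, so a missing leading row also gets a value
  fill(score, .direction = "downup") %>%
  ungroup()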


How to cut off values in a dataframe?

From the following dataframe:
df1 <- data.frame(id=c(1, 2, 3, 4, 5, 6, 7),
revenue=c(34, 1000, 40, 49, 43, 55, 99))
df2 <- data.frame(id=c(1, 2, 3, 4, 5, 6, 7),
expenses=c(22, 26, 31, 40, 20, 2000, 22))
df3 <- data.frame(id=c(1, 2, 3, 4, 5, 6, 7),
profit=c(12, 10, 14, 12, 9, 15, 16))
df_list <- list(df1, df2, df3)
test <- Reduce(function(x, y) merge(x, y, all=TRUE), df_list)  # merge all three frames by id
rownames(test) <- test[,1]  # use id as row names
test[,1] <- NULL            # and drop the id column
test
I would like to eliminate extreme values (e.g. 1000 and 2000): I need to cut off everything that is greater than 100. When I check test < 100 I see the TRUE and FALSE positions, but I would like to replace the offending values with NA or zeroes.
To replace all values in a dataframe df that are greater than 100 with 0, simply use: df[df > 100] <- 0
We can use replace()
replace(test, test>100, NA)
revenue expenses profit
1 34 22 12
2 NA 26 10
3 40 31 14
4 49 40 12
5 43 20 9
6 55 NA 15
7 99 22 16
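If you prefer a tidyverse-style version, here is a minimal sketch applying the same threshold column-wise with across() and replace() (my own addition, assuming the test data frame built above; note that dplyr verbs may not preserve row names on plain data frames):
library(dplyr)
test %>%
  # replace every value above the cutoff with NA, column by column
  mutate(across(everything(), ~ replace(.x, .x > 100, NA)))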

R select values for each group

For each combination of WEEK and GROUP, I wish to find the day (1 or 2) on which SCORE is lowest for TYPE == 'M', and then keep both the TYPE == 'M' and TYPE == 'E' rows from that same day.
HAVE = data.frame("WEEK" = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4),
"GROUP" = c('A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B'),
"SCORE" = c(29, 10, 19, 11, 28, 38, 23, 27, 32, 19, 32, 26, 35, 27, 39, 51, 43, 12, 35, 29, 14, 47, 12, 36, 17, 11, 24, 19, 19, 12, 33, 7),
"DAY" = c(1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2),
"TYPE" = c('M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E'))
WANT=data.frame("WEEK"=c(1, 1, 2, 2, 3, 3, 4, 4, 1, 1, 2, 2, 3, 3, 4, 4),
"GROUP"=c('A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B'),
"SCORE"=c(19, 11, 23, 27, 32, 19, 35, 27, 35, 29, 12, 36, 17, 11, 19, 12),
"DAY"=c(2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1),
"TYPE"=c('M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E', 'M', 'E'),
"SCORE.MIN"=c(1, NA, 1, NA, 1, NA, 1, NA, 1, NA, 1, NA, 1, NA, 1, NA))
I have 'HAVE' and wish to create 'WANT'.
For each combination of WEEK and GROUP, select the DAY when TYPE == 'M' and SCORE is lowest. This I am able to do with:
WANT.ATTEMPT = HAVE %>%
  group_by(WEEK, GROUP) %>%
  mutate(SCORE.MIN = ifelse(TYPE == 'M', min(SCORE), NA))
HOWEVER, what I wish for is the dataset WANT, where, as you see, I group by WEEK and GROUP and find the DAY when SCORE is minimum for TYPE == 'M', but I also keep the SCORE for TYPE == 'E' from that same day.
Updated to prevent ties.
You can do:
HAVE %>%
  group_by(WEEK, GROUP) %>%
  # keep rows on the day of the lowest M score (first() guards against ties),
  # plus all E rows for now
  filter(DAY == first(DAY[SCORE == min(SCORE[TYPE == 'M'])]) | TYPE == 'E') %>%
  # then keep only the rows whose day also has a surviving M row
  filter(DAY %in% DAY[TYPE == 'M']) %>%
  ungroup()
# A tibble: 16 x 5
WEEK GROUP SCORE DAY TYPE
<dbl> <chr> <dbl> <dbl> <chr>
1 1 A 19 2 M
2 1 A 11 2 E
3 2 A 23 2 M
4 2 A 27 2 E
5 3 A 32 1 M
6 3 A 19 1 E
7 4 A 35 1 M
8 4 A 27 1 E
9 1 B 35 2 M
10 1 B 29 2 E
11 2 B 12 2 M
12 2 B 36 2 E
13 3 B 17 1 M
14 3 B 11 1 E
15 4 B 19 1 M
16 4 B 12 1 E
We could also arrange before grouping and then do the filter:
library(dplyr)
HAVE %>%
  # sort so each group's lowest 'M' score comes first
  arrange(WEEK, GROUP, TYPE != 'M', SCORE) %>%
  group_by(WEEK, GROUP) %>%
  # keep both rows from the day of that first (lowest) 'M' score
  filter(DAY %in% DAY[TYPE == "M"][1]) %>%
  ungroup()
Output:
# A tibble: 16 × 5
WEEK GROUP SCORE DAY TYPE
<dbl> <chr> <dbl> <dbl> <chr>
1 1 A 19 2 M
2 1 A 11 2 E
3 1 B 35 2 M
4 1 B 29 2 E
5 2 A 23 2 M
6 2 A 27 2 E
7 2 B 12 2 M
8 2 B 36 2 E
9 3 A 32 1 M
10 3 A 19 1 E
11 3 B 17 1 M
12 3 B 11 1 E
13 4 A 35 1 M
14 4 A 27 1 E
15 4 B 19 1 M
16 4 B 12 1 E
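A tie-safe variant of the same idea, assuming you always want the first minimum, is to locate the lowest 'M' score directly with which.min(). This is my own sketch, not part of the answers above:
library(dplyr)
HAVE %>%
  group_by(WEEK, GROUP) %>%
  # which.min() returns the first index of the minimum, so ties resolve
  # to the earlier day automatically
  filter(DAY == DAY[TYPE == 'M'][which.min(SCORE[TYPE == 'M'])]) %>%
  ungroup()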

R Reshape and Select Max

HAVE = data.frame("WEEK"=c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2),
"STUDENT"=c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3),
"CLASS"=c('A', 'A', 'B', 'B', 'C', 'C', 'H', 'A', 'A', 'B', 'B', 'C', 'C', 'H', 'A', 'A', 'B', 'B', 'C', 'C', 'H', 'A', 'A', 'B', 'B', 'C', 'C', 'H', 'A', 'A', 'B', 'B', 'C', 'C', 'H', 'A', 'A', 'B', 'B', 'C', 'C', 'H'),
"TEST"=c(1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1),
"SCORE"=c(93, 97, 72, 68, 93, 51, 19, 88, 56, 53, 79, 69, 90, 61, 74, 50, 76, 97, 55, 63, 63, 59, 68, 77, 80, 52, 94, 74, 64, 74, 92, 98, 89, 84, 54, 51, 82, 86, 51, 90, 72, 86))
WANT = data.frame("WEEK"=c(1,1,1,2,2,2),
"STUDENT"=c(1,2,3,1,2,3),
"CLASS"=c('A','A','B','B','B','C'),
"H"=c(19,61,63,74,54,86),
"TEST1"=c(93,88,76,77,92,90),
"TEST2"=c(97,56,97,80,98,72))
I wish to group by WEEK and STUDENT, and for each combination of WEEK and STUDENT find the CLASS whose SCORE equals max(SCORE) where TEST equals 1. Then I wish to find the corresponding SCORE for TEST equals 2 in that same CLASS. I wish to transform the data HAVE into the data WANT, and ALSO add the column H, which simply equals the SCORE where CLASS equals 'H'.
We can reshape to 'wide' with pivot_wider, then, grouped by 'WEEK' and 'STUDENT', create the 'H' column from the 'TEST1' value where 'CLASS' is "H", and then slice the row with the maximum 'TEST1':
library(dplyr)
library(tidyr)
HAVE %>%
  pivot_wider(names_from = TEST, values_from = SCORE,
              names_glue = "TEST{TEST}") %>%
  group_by(WEEK, STUDENT) %>%
  mutate(H = TEST1[CLASS == "H"], .before = 'TEST1') %>%
  slice_max(n = 1, order_by = TEST1, with_ties = FALSE) %>%
  ungroup()
Output:
# A tibble: 6 × 6
WEEK STUDENT CLASS H TEST1 TEST2
<dbl> <dbl> <chr> <dbl> <dbl> <dbl>
1 1 1 A 19 93 97
2 1 2 A 61 88 56
3 1 3 B 63 76 97
4 2 1 B 74 77 80
5 2 2 B 54 92 98
6 2 3 C 86 90 72
Checking against 'WANT':
> WANT
WEEK STUDENT CLASS H TEST1 TEST2
1 1 1 A 19 93 97
2 1 2 A 61 88 56
3 1 3 B 63 76 97
4 2 1 B 74 77 80
5 2 2 B 54 92 98
6 2 3 C 86 90 72
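One caveat (my own note, not part of the original answer): if the 'H' row could ever hold a group's highest TEST1 score, slice_max() would select that row rather than a real class. Dropping the H row once the H column is built avoids this:
library(dplyr)
library(tidyr)
HAVE %>%
  pivot_wider(names_from = TEST, values_from = SCORE,
              names_glue = "TEST{TEST}") %>%
  group_by(WEEK, STUDENT) %>%
  mutate(H = TEST1[CLASS == "H"], .before = 'TEST1') %>%
  filter(CLASS != "H") %>%   # exclude the H row itself from the max search
  slice_max(n = 1, order_by = TEST1, with_ties = FALSE) %>%
  ungroup()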

Extract certain values out of a correlation matrix

Is there a way to extract the correlation coefficients out of a correlation matrix?
Let's say I have a dataset with 3 variables (a, b, c) and I want to calculate the correlations among them.
with
df <- data.frame(a = c(2, 3, 3, 5, 6, 9, 14, 15, 19, 21, 22, 23),
                 b = c(23, 24, 24, 23, 17, 28, 38, 34, 35, 39, 41, 43),
                 c = c(13, 14, 14, 14, 15, 17, 18, 19, 22, 20, 24, 26),
                 d = c(6, 6, 7, 8, 8, 8, 7, 6, 5, 3, 3, 2))
and
cor(df[, c('a', 'b', 'c')])
I'll get a correlation matrix:
a b c
a 1.0000000 0.9279869 0.9604329
b 0.9279869 1.0000000 0.8942139
c 0.9604329 0.8942139 1.0000000
Is there a way to show the results in a manner like this?
Correlation between a and b is: 0.9279869.
Correlation between a and c is: 0.9604329.
Correlation between b and c is: 0.8942139.
My actual correlation matrix is obviously much bigger (~300 entries) and I need a way to extract only the values that are important to me.
Thanks.
Using reshape2 and melt():
df <- data.frame("a" = c(2, 3, 3, 5, 6, 9, 14, 15, 19, 21, 22, 23),
"b" = c(23, 24, 24, 23, 17, 28, 38, 34, 35, 39, 41, 43),
"c" = c(13, 14, 14, 14, 15, 17, 18, 19, 22, 20, 24, 26),
"d" = c(6, 6, 7, 8, 8, 8, 7, 6, 5, 3, 3, 2))
tmp <- cor(df[, c('a', 'b', 'c')])
tmp[lower.tri(tmp)] <- NA  # blank out the duplicated lower triangle
diag(tmp) <- NA            # and the trivial self-correlations
library(reshape2)
na.omit(melt(tmp))
resulting in
Var1 Var2 value
4 a b 0.9279869
7 a c 0.9604329
8 b c 0.8942139
You can do:
df1 <- cor(df[, c('a', 'b', 'c')])
df1 <- as.data.frame(as.table(df1))
df1$Freq <- round(df1$Freq, 2)
df2 <- subset(df1, as.character(df1$Var1) != as.character(df1$Var2))
df2$res <- paste('Correlation between', df2$Var1, 'and', df2$Var2, 'is', df2$Freq)
Var1 Var2 Freq res
2 b a 0.93 Correlation between b and a is 0.93
3 c a 0.96 Correlation between c and a is 0.96
4 a b 0.93 Correlation between a and b is 0.93
6 c b 0.89 Correlation between c and b is 0.89
7 a c 0.96 Correlation between a and c is 0.96
8 b c 0.89 Correlation between b and c is 0.89
Here is another idea with reshaping to long format, i.e.
tidyr::pivot_longer(
  tibble::rownames_to_column(
    as.data.frame(cor(df[, c('a', 'b', 'c')])),
    var = 'rn'
  ),
  -1
)
# A tibble: 9 x 3
rn name value
<chr> <chr> <dbl>
1 a a 1
2 a b 0.928
3 a c 0.960
4 b a 0.928
5 b b 1
6 b c 0.894
7 c a 0.960
8 c b 0.894
9 c c 1
Maybe you can try as.table + as.data.frame
> as.data.frame(as.table(cor(df[, c("a", "b", "c")])))
Var1 Var2 Freq
1 a a 1.0000000
2 b a 0.9279869
3 c a 0.9604329
4 a b 0.9279869
5 b b 1.0000000
6 c b 0.8942139
7 a c 0.9604329
8 b c 0.8942139
9 c c 1.0000000
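To keep each pair only once without any reshaping package, here is a minimal base-R sketch using upper.tri() (my own addition, not from the answers above):
cc <- cor(df[, c('a', 'b', 'c')])
ut <- upper.tri(cc)  # TRUE for one copy of each off-diagonal pair
data.frame(var1 = rownames(cc)[row(cc)[ut]],
           var2 = colnames(cc)[col(cc)[ut]],
           cor  = cc[ut])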

In R, create sequential 1 to N column based on values in other columns

Seems like a straightforward data manipulation problem; however, we would like to avoid a for loop that simply compares the values in each row. We have the following dataframe:
zed = data.frame(
a = c(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1),
b = c('a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd', 'd', 'd', 'd', 'e', 'e', 'a', 'a'),
c = c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 1, 1),
stringsAsFactors = FALSE
)
output = data.frame(
a = c(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1),
b = c('a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd', 'd', 'd', 'd', 'e', 'e', 'a', 'a'),
c = c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 1, 1),
group = c(1, 1, 2, 2, 2, 3, 4, 5, 6, 6, 6, 7, 8, 8, 9, 9),
stringsAsFactors = FALSE
)
> output
a b c group
1 1 a 1 1
2 1 a 1 1
3 1 b 1 2
4 1 b 1 2
5 1 b 1 2
6 1 c 1 3
7 1 c 2 4
8 1 d 2 5
9 2 d 2 6
10 2 d 2 6
11 2 d 2 6
12 2 d 3 7
13 2 e 3 8
14 2 e 3 8
15 1 a 1 9
16 1 a 1 9
The dataframe begins with the columns a, b, c, and we need to add the group column to it. The group column starts at 1 and increases sequentially whenever any of the values in a, b, c differs from its value in the previous row.
This is not quite as simple as doing a group_by() on a, b, c, as the same row can appear again later, but not sequentially, in the dataframe (e.g. rows 1,2 == rows 15,16, yet they are not the same group because they did not appear consecutively).
We can use rleid() from data.table; it assigns a new run id whenever any of a, b, c changes from the previous row, and .GRP numbers those runs sequentially:
library(data.table)
setDT(zed)[, group := .GRP, .(rleid(a, b, c))]
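A tidyverse sketch of the same run-length idea (my own alternative; consecutive_id() requires dplyr 1.1.0 or later):
library(dplyr)
zed %>%
  # consecutive_id() increments whenever any of a, b, c changes between rows
  mutate(group = consecutive_id(a, b, c))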
