Take Symmetrical Mean of a tibble (ignoring the NAs) - r

I have a tibble where the rows and columns are the same IDs and I would like to take the mean (ignoring the NAs) to make the df symmetrical. I am struggling to see how.
data <- tibble(group = LETTERS[1:4],
A = c(NA, 10, 20, NA),
B = c(15, NA, 25, 30),
C = c(20, NA, NA, 10),
D = c(10, 12, 15, NA)
)
I would normally do
A <- as.matrix(data[-1])
(A + t(A))/2
But this does not work because of the NAs.
Edit: below is the expected output.
output <- tibble(group = LETTERS[1:4],
A = c(NA, 12.5, 20, 10),
B = c(12.5, NA, 25, 21),
C = c(20, 25, NA, 12.5),
D = c(10, 21, 12.5, NA))

Here is a suggestion using tidyverse code.
library(tidyverse)
data <- tibble(group = LETTERS[1:4],
A = c(NA, 10, 20, NA),
B = c(15, NA, 25, 30),
C = c(20, NA, NA, 10),
D = c(10, 12, 15, NA)
)
A <- data %>%
pivot_longer(-group, values_to = "x")
B <- t(data) %>%
as.data.frame() %>%
setNames(LETTERS[1:4]) %>%
rownames_to_column("group") %>%
pivot_longer(-group, values_to = "y") %>%
left_join(A, by = c("group", "name")) %>%
mutate(
mean = if_else(!(is.na(x) | is.na(y)), (x + y)/2, x),
mean = if_else(is.na(mean) & !is.na(y), y, mean)
) %>%
select(-x, -y) %>%
pivot_wider(names_from = name, values_from = mean)
B
## A tibble: 4 x 5
# group A B C D
# <chr> <dbl> <dbl> <dbl> <dbl>
#1 A NA 12.5 20 10
#2 B 12.5 NA 25 21
#3 C 20 25 NA 12.5
#4 D 10 21 12.5 NA

Okay so this is how I ended up doing this. I would have preferred if I didnt use a for loop because the actual data I have is much bigger but beggars cant be choosers!
A <- as.matrix(data[-1])
for (i in 1:nrow(A)){
for (j in 1:ncol(A)){
if(is.na(A[i,j])){
A[i,j] <- A[j, i]
}
}
}
output <- (A + t(A))/2
output %>%
as_tibble() %>%
mutate(group = data$group) %>%
select(group, everything())
# A tibble: 4 x 5
group A B C D
<chr> <dbl> <dbl> <dbl> <dbl>
1 A NA 12.5 20 10
2 B 12.5 NA 25 21
3 C 20 25 NA 12.5
4 D 10 21 12.5 NA

Related

Performing pivot_longer() over multiple sets of columns

I am stuck in performing pivot_longer() over multiple sets of columns. Here is the sample dataset
df <- data.frame(
id = c(1, 2),
uid = c("m1", "m2"),
germ_kg = c(23, 24),
mineral_kg = c(12, 17),
perc_germ = c(45, 34),
perc_mineral = c(78, 10))
I need the output dataframe to look like this
out <- df <- data.frame(
id = c(1, 1, 2, 2),
uid = c("m1", "m1", "m2", "m2"),
crop = c("germ", "germ", "mineral", "mineral"),
kg = c(23, 12, 24, 17),
perc = c(45, 78, 34, 10))
df %>%
rename_with(~str_replace(.x,'(.*)_kg', 'kg_\\1')) %>%
pivot_longer(-c(id, uid), names_to = c('.value', 'crop'), names_sep = '_')
# A tibble: 4 x 5
id uid crop kg perc
<dbl> <chr> <chr> <dbl> <dbl>
1 1 m1 germ 23 45
2 1 m1 mineral 12 78
3 2 m2 germ 24 34
4 2 m2 mineral 17 10
If you were to use data.table:
library(data.table)
melt(setDT(df), c('id', 'uid'), patterns(kg = 'kg', perc = 'perc'))
id uid variable kg perc
1: 1 m1 1 23 45
2: 2 m2 1 24 34
3: 1 m1 2 12 78
4: 2 m2 2 17 10
I suspect there might be a simpler way using pivot_long_spec, but one tricky thing here is that your column names don't have a consistent ordering of their semantic components. #Onyambu's answer deals with this nicely by fixing it upsteam.
library(tidyverse)
df %>%
pivot_longer(-c(id, uid)) %>%
separate(name, c("col1", "col2")) %>% # only needed
mutate(crop = if_else(col2 == "kg", col1, col2), # because name
meas = if_else(col2 == "kg", col2, col1)) %>% # structure
select(id, uid, crop, meas, value) %>% # is
pivot_wider(names_from = meas, values_from = value) # inconsistent
# A tibble: 4 x 5
id uid crop kg perc
<dbl> <chr> <chr> <dbl> <dbl>
1 1 m1 germ 23 45
2 1 m1 mineral 12 78
3 2 m2 germ 24 34
4 2 m2 mineral 17 10

Wide to long with pivot_longer and mix of numeric and character data

help <- data.frame(
id = c(100, 100, 101, 102, 102),
q1 = c(NA, 1, NA, NA, 3),
q2 = c(1, NA, 2, NA, NA),
q3 = c(NA, 1, NA, 4, NA),
q4 = c(NA, NA, 4, NA, 5),
group = c("a", "b", "c", "a", "c"))
help$group <- as.character(help$group)
I am trying to pivot longer so dataset looks like this:
id score group
100 NA a
100 1 b
100 NA c
...
But I get an error with the numeric values of q1-q4 and the character string group.
pivot_longer(help, !id, names_to = "score",
values_to = "group", values_ptypes = list(group = 'character'))
Error: Can't convert <double> to <character>.
How can I pivot longer but also preserve the group variable (where there is several missing data for the q1-4 there is a match for every id and group)?
library(tidyr)
output <- pivot_longer(help, -c(id, group), names_to = "question",
values_to = "score") %>%
dplyr::select(-question) %>%
dplyr::arrange(id, group)
Output
head(output)
# A tibble: 6 × 3
id group score
<dbl> <chr> <dbl>
1 100 a NA
2 100 a 1
3 100 a NA
4 100 a NA
5 100 b 1
6 100 b NA

How to join 3 tables with incomplete values in R

I want to join df1 and df2 to get df_me. As I couldn't get the result, I tried to use also df_p as a star scheme, but I couldn't get the result that I want.
library(tidyverse)
df1 <- tibble(c = c('b','c','d'),
x = c(1, 2, 3),
z = c(10, 11, 12))
df2 <- tibble(c = c('a','b','d'),
y = c(4,5,6),
z = c(20, 10, 12))
df_p <- tibble(c = c('a','b','c','d'),
z = c(20, 10, 11, 12))
# This is the result that I want
df_me <- tibble(c = c('a','b','c','d'),
x = c(NA, 1, 2, 3),
y = c(4, 5, NA, 6),
z = c(20, 10, 11, 12))
# This is (part of) what I tried without success
df_left2 <- left_join(df_p, df1, by = 'c')
df_left3 <- left_join(df_p, df2, by = 'c')
df_left4 <- left_join(df_left2, df_left3, by = 'c')
df_left4 %>% arrange(c)
#> # A tibble: 4 x 7
#> c z.x.x x z.y.x z.x.y y z.y.y
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 a 20 NA NA 20 4 20
#> 2 b 10 1 10 10 5 10
#> 3 c 11 2 11 11 NA NA
#> 4 d 12 3 12 12 6 12
Created on 2021-06-11 by the reprex package (v2.0.0)
why not?
merge(df1, df2, by = c('c', 'z'), all = T)
c z x y
1 a 20 NA 4
2 b 10 1 5
3 c 11 2 NA
4 d 12 3 6
Or in dplyr?
df1 %>% full_join(df2, by = c('c', 'z'))

What's the easiest way for the multiple if-else conditions?

I have a data set something like this:
df_1 <- tribble(
~A, ~B, ~C,
10, 10, NA,
NA, 34, 15,
40, 23, NA,
4, 12, 18,
)
Now, I just want to compare A, B, C for each row, and add a new column that shows us the minimum number. Let's see how desired data looks like:
df_2 <- tribble(
~A, ~B, ~C, ~Winner,
10, 10, NA, "Same",
NA, 34, 15, "C",
40, 23, NA, "B",
4, 12, 18, "A",
)
There are four outputs: Same, A-Win, B-Win, C-Win.
How would you code to get this result?
Thanks in advance.
Here is something:
foo <- function(x) {
rmin <- which(x == min(x, na.rm = TRUE))
if (length(rmin) > 1) "same" else names(rmin)
}
apply(df_1, 1, foo)
[1] "same" "C" "B" "A"
You can add this as a column to your data.frame with:
df_1$winner <- apply(df_1, 1, foo)
# A tibble: 4 x 4
A B C winner
<dbl> <dbl> <dbl> <chr>
1 10 10 NA same
2 NA 34 15 C
3 40 23 NA B
4 4 12 18 A
If you have more variables and only want to use some you can use a character vector:
vars <- c("A", "B", "C")
apply(df_1[vars], 1, foo)
df_1 <- tribble(
~A, ~B, ~C,
10, 10, NA,
NA, 34, 15,
40, 23, NA,
4, 12, 18,
)
df_1 %>%
mutate(
winner = colnames(df_1)[apply(df_1,1,which.min)],
winner = if_else(A == B | B == C | A == C, 'same', winner, missing = winner))
# A tibble: 4 x 4
A B C winner
<dbl> <dbl> <dbl> <chr>
1 10 10 NA same
2 NA 34 15 C
3 40 23 NA B
4 4 12 18 A

Generate list of dataframes and apply function

I want to generate a list of dataframes and apply the same functions to each of them. I do not know how to do this elegantly without a very large number of lines of code.
From a dataframe df,
id <- c('a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd', 'e')
x <- rnorm(n = 10, mean = 25, sd = 3)
y <- rnorm(n = 10, mean = 45, sd = 4.5)
z <- rnorm(n = 10, mean = 70000, sd = 10)
type <- c(rep("gold", 2),
rep("silver", 4),
rep("bronze", 4))
df <- data.frame(id, x, y, z, type)
I create a bunch of other datasets with a simple threshold rule based on one variable
df_25 <- df[df$x < 25,]
df_20 <- df[df$x < 20,]
# and so on
I then apply functions to each dataset; I can do this to each dataset individually, or to a list of datasets
# individually
df <- df_18 %>%
dplyr::group_by(id) %>%
dplyr::mutate(nb1= sum(x),
nb2 = sum(x != 25))
# to a list
ls1 <- list(df_25, df_20)
func_1 <- function(x) {
x <- x %>%
dplyr::group_by(id) %>%
dplyr::mutate(nb1= sum(x),
nb2 = sum(x != 25))
}
ls1 <- lapply(ls1, function(x) {x[c("id","x")]
<- lapply(x[c("id","x")], func_1)
x})
df_25 <- ls1[[1]]
df_20 <- ls1[[2]]
In any case this takes both a lot of lines and time as I am dealing with very large datasets. How could I simplify and fasten both the generation of datasets with proper recognisable names and the creation of the new variables through the functions defined above?
I did not find proper answer to this dual question yet and would welcome your help!
You could define a threshold vector and lapply your aggregation. In base R this could look like this:
threshold <- c(22, 24, 26)
res <- setNames(lapply(threshold, function(s) {
sst <- df[df$x < s, ]
merge(sst,
with(sst, aggregate(list(nb1=x, nb2=x != 25),
by=list(id=id), sum), by="id"))
}), threshold)
res
# $`22`
# id x y z type nb1 nb2
# 1 a 20.92786 37.61272 69976.23 gold 20.92786 1
# 2 b 20.64275 38.02056 69997.25 silver 20.64275 1
# 3 c 18.58916 46.08353 69985.98 silver 18.58916 1
#
# $`24`
# id x y z type nb1 nb2
# 1 a 22.73948 44.29524 70002.81 gold 43.66734 2
# 2 a 20.92786 37.61272 69976.23 gold 43.66734 2
# 3 b 20.64275 38.02056 69997.25 silver 20.64275 1
# 4 c 18.58916 46.08353 69985.98 silver 18.58916 1
#
# $`26`
# id x y z type nb1 nb2
# 1 a 22.73948 44.29524 70002.81 gold 43.66734 2
# 2 a 20.92786 37.61272 69976.23 gold 43.66734 2
# 3 b 20.64275 38.02056 69997.25 silver 20.64275 1
# 4 c 18.58916 46.08353 69985.98 silver 44.24036 2
# 5 c 25.65120 44.85778 70008.81 bronze 44.24036 2
# 6 d 24.84056 49.22505 69993.87 bronze 24.84056 1
Data
df <- structure(list(id = structure(c(1L, 1L, 2L, 2L, 2L, 3L, 3L, 4L,
4L, 5L), .Label = c("a", "b", "c", "d", "e"), class = "factor"),
x = c(22.7394803492982, 20.927856140076, 30.2395154764033,
26.6955462205898, 20.6427460111819, 18.589158456851, 25.6511987559726,
24.8405634272769, 28.8534602413068, 26.5376546472448), y = c(44.2952365501829,
37.6127198429065, 45.2842176546081, 40.3835729432985, 38.0205610647157,
46.083525703352, 44.8577760657779, 49.2250487481642, 40.2699166395278,
49.3740993403725), z = c(70002.8091832317, 69976.2314543058,
70000.9974233725, 70011.435897774, 69997.249180665, 69985.9786882474,
70008.8088326676, 69993.8665395223, 69998.7334115052, 70001.2935411788
), type = structure(c(2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L,
1L), .Label = c("bronze", "gold", "silver"), class = "factor")), class = "data.frame", row.names = c(NA,
-10L))
Using purrr::map to loop over the vector of thresholds
library(dplyr)
library(purrr)
map(c(18,20,25) %>%set_names() , ~ df %>% filter(x<.x) %>%
group_by(id) %>%
mutate(nb1= sum(x),
nb2 = sum(x != 25)))
Or using map_if to apply the calculation for df subset with nrow()>1.
map_if(c(18,20,25) %>%set_names(), ~df %>% filter(x<.x) %>% nrow()>1,
~df %>% filter(x<.x) %>% group_by(id) %>%
mutate(nb1= sum(x),
nb2 = sum(x != 25)), .else = ~NA)
Using tidyverse we can combine all this operations in one chain.
library(tidyverse)
df %>%
group_split(x > 25, keep = FALSE) %>%
map(. %>% group_by(id) %>% mutate(nb1= sum(x),nb2 = sum(x != 25)))
#[[1]]
# A tibble: 6 x 7
# Groups: id [5]
# id x y z type nb1 nb2
# <fct> <dbl> <dbl> <dbl> <fct> <dbl> <int>
#1 a 21.4 42.9 70001. gold 21.4 1
#2 b 18.0 45.3 70005. silver 18.0 1
#3 c 23.3 42.7 70006. bronze 23.3 1
#4 d 23.4 40.9 69990. bronze 46.7 2
#5 d 23.3 41.2 70000. bronze 46.7 2
#6 e 22.3 55.9 69991. bronze 22.3 1
#[[2]]
# A tibble: 4 x 7
# Groups: id [3]
# id x y z type nb1 nb2
# <fct> <dbl> <dbl> <dbl> <fct> <dbl> <int>
#1 a 25.8 40.5 69995. gold 25.8 1
#2 b 28.3 41.5 69996. silver 54.5 2
#3 b 26.3 49.3 69993. silver 54.5 2
#4 c 26.5 44.5 69986. silver 26.5 1
Here, I have split the data into two groups based on value of x,first group is values below 25 and second group is above 25. You might change the logic based on your requirement.
This gives you list of dataframes as output which you can access individually.
data
set.seed(1234)
id <- c('a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd', 'e')
x <- rnorm(n = 10, mean = 25, sd = 3)
y <- rnorm(n = 10, mean = 45, sd = 4.5)
z <- rnorm(n = 10, mean = 70000, sd = 10)
type <- c(rep("gold", 2),rep("silver", 4),rep("bronze", 4))
df <- data.frame(id, x, y, z, type)

Resources