How to join 3 tables with incomplete values in R - r

I want to join df1 and df2 to get df_me. Since I couldn't get that result directly, I also tried using df_p as the hub of a star schema, but I still couldn't get the result that I want.
library(tidyverse)

# Input tables: df1 and df2 both carry the key `c` and the column `z`.
df1 <- tibble(
  c = c("b", "c", "d"),
  x = c(1, 2, 3),
  z = c(10, 11, 12)
)
df2 <- tibble(
  c = c("a", "b", "d"),
  y = c(4, 5, 6),
  z = c(20, 10, 12)
)
# Lookup table listing every key together with its `z` value.
df_p <- tibble(
  c = c("a", "b", "c", "d"),
  z = c(20, 10, 11, 12)
)

# This is the result that I want
df_me <- tibble(
  c = c("a", "b", "c", "d"),
  x = c(NA, 1, 2, 3),
  y = c(4, 5, NA, 6),
  z = c(20, 10, 11, 12)
)

# This is (part of) what I tried without success: joining only on `c`
# leaves `z` duplicated with .x/.y suffixes after every join.
df_left2 <- df_p %>% left_join(df1, by = "c")
df_left3 <- df_p %>% left_join(df2, by = "c")
df_left4 <- df_left2 %>% left_join(df_left3, by = "c")
arrange(df_left4, c)
#> # A tibble: 4 x 7
#> c z.x.x x z.y.x z.x.y y z.y.y
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 a 20 NA NA 20 4 20
#> 2 b 10 1 10 10 5 10
#> 3 c 11 2 11 11 NA NA
#> 4 d 12 3 12 12 6 12
Created on 2021-06-11 by the reprex package (v2.0.0)

Why not simply merge on both shared columns (c and z)?
# Full outer join on both shared columns, so `z` is matched rather than
# duplicated; all = TRUE keeps rows that appear in only one of the tables.
merge(df1, df2, by = c("c", "z"), all = TRUE)
c z x y
1 a 20 NA 4
2 b 10 1 5
3 c 11 2 NA
4 d 12 3 6
Or in dplyr?
# dplyr equivalent: full join keyed on both shared columns.
full_join(df1, df2, by = c("c", "z"))

Related

Take Symmetrical Mean of a tibble (ignoring the NAs)

I have a tibble where the rows and columns are the same IDs and I would like to take the mean (ignoring the NAs) to make the df symmetrical. I am struggling to see how.
data <- tibble(group = LETTERS[1:4],
A = c(NA, 10, 20, NA),
B = c(15, NA, 25, 30),
C = c(20, NA, NA, 10),
D = c(10, 12, 15, NA)
)
I would normally do
A <- as.matrix(data[-1])
(A + t(A))/2
But this does not work because of the NAs.
Edit: below is the expected output.
output <- tibble(group = LETTERS[1:4],
A = c(NA, 12.5, 20, 10),
B = c(12.5, NA, 25, 21),
C = c(20, 25, NA, 12.5),
D = c(10, 21, 12.5, NA))
Here is a suggestion using tidyverse code.
library(tidyverse)
data <- tibble(group = LETTERS[1:4],
A = c(NA, 10, 20, NA),
B = c(15, NA, 25, 30),
C = c(20, NA, NA, 10),
D = c(10, 12, 15, NA)
)
# Long form of the original table: one row per (group, name) cell, value in x.
A <- data %>%
  pivot_longer(-group, values_to = "x")

# Long form of the transposed table, value in y, joined cell-by-cell to A.
# The character `group` column is dropped before t(): transposing it as well
# coerces the whole matrix to character and adds a spurious "group" row, so
# the arithmetic below would fail.
B <- t(data[-1]) %>%
  as.data.frame() %>%
  setNames(data$group) %>%
  rownames_to_column("group") %>%
  pivot_longer(-group, values_to = "y") %>%
  left_join(A, by = c("group", "name")) %>%
  mutate(
    # Average when both cells are present; otherwise fall back to whichever
    # one is not NA (the result stays NA when both are missing).
    mean = coalesce((x + y) / 2, x, y)
  ) %>%
  select(-x, -y) %>%
  pivot_wider(names_from = name, values_from = mean)
B
## A tibble: 4 x 5
# group A B C D
# <chr> <dbl> <dbl> <dbl> <dbl>
#1 A NA 12.5 20 10
#2 B 12.5 NA 25 21
#3 C 20 25 NA 12.5
#4 D 10 21 12.5 NA
Okay, so this is how I ended up doing it. I would have preferred not to use a for loop, because my actual data is much bigger, but beggars can't be choosers!
# Symmetrise the matrix: fill each NA cell from its mirror cell, then average
# the matrix with its transpose. Cells missing on both sides stay NA.
A <- as.matrix(data[-1])
missing_cells <- is.na(A)
# Vectorised replacement for the nested i/j loop: t(A)[i, j] is A[j, i], so
# this copies the mirrored value into every NA position in one step.
A[missing_cells] <- t(A)[missing_cells]
output <- (A + t(A)) / 2
output %>%
  as_tibble() %>%
  mutate(group = data$group) %>%
  select(group, everything())
# A tibble: 4 x 5
group A B C D
<chr> <dbl> <dbl> <dbl> <dbl>
1 A NA 12.5 20 10
2 B 12.5 NA 25 21
3 C 20 25 NA 12.5
4 D 10 21 12.5 NA

Convert R to wide format with arrangement depending on value in column

I have got a table containing data from various samples ("sample1" etc.) on which several types of measurement (A to C) were made. Every measurement gave 3 values: concentration, maximum and minimum.
my.sample <- c("sample1", "sample1", "sample2", "sample2", "sample3")
type <- c("A", "B", "A", "C", "C")
concentration <- c(12, 5, 7, 10, 14)
max <- c(13, 6, 7, 11, 15)
min <- c(11, 4, 6, 9, 13)
mydata <- data.frame(my.sample, type, concentration, max, min)
> mydata
my.sample type concentration max min
1 sample1 A 12 13 11
2 sample1 B 5 6 4
3 sample2 A 7 7 6
4 sample2 C 10 11 9
5 sample3 C 14 15 13
I'd like to convert this data to a new table where I only have one row per sample. This means creating 3 columns (concentration, max, min) for every measurement type, with the type of measurement indicated in the column. Missing values should be defined as NA. Here's an example of the result I'd like to obtain:
# Expected wide result: one row per sample, three columns per measurement
# type, NA where a sample has no measurement of that type.
# The sample identifier column was used below but never defined.
my.sample.new <- c("sample1", "sample2", "sample3")
A_concentration <- c(12, 7, NA)
A_max <- c(13, 7, NA)
A_min <- c(11, 6, NA)
B_concentration <- c(5, NA, NA)
B_max <- c(6, NA, NA)
B_min <- c(4, NA, NA)
C_concentration <- c(NA, 10, 14)
C_max <- c(NA, 11, 15)
C_min <- c(NA, 9, 13)
mydata.new <- data.frame(
  my.sample.new,
  A_concentration, A_max, A_min,
  B_concentration, B_max, B_min,
  C_concentration, C_max, C_min
)
> mydata.new
my.sample.new A_concentration A_max A_min B_concentration B_max B_min
1 sample1 12 13 11 5 6 4
2 sample2 7 7 6 NA NA NA
3 sample3 NA NA NA NA NA NA
C_concentration C_max C_min
1 NA NA NA
2 10 11 9
3 14 15 13
Is there a method to widen data based on a condition and include a value (here: from type ) in the column name? I have got many more types in my real dataset, so it should ideally be generalisable.
This works:
library(dplyr)
library(tidyr) # pivot_wider() is provided by tidyr, not dplyr

mydata %>%
  # One row per sample; each value column is spread across the measurement
  # types and named "<type>_<value column>", with NA for absent combinations.
  pivot_wider(
    id_cols = my.sample,
    names_from = type,
    values_from = c(concentration, max, min),
    names_glue = "{type}_{.value}"
  ) %>%
  # Group the columns by measurement type (A_*, then B_*, then C_*).
  select(my.sample, starts_with("A_"), starts_with("B_"), starts_with("C_"))
This gives us:
# A tibble: 3 x 10
my.sample A_concentration A_max A_min B_concentration B_max B_min C_concentration C_max C_min
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 sample1 12 13 11 5 6 4 NA NA NA
2 sample2 7 7 6 NA NA NA 10 11 9
3 sample3 NA NA NA NA NA NA 14 15 13

Reshaping data by appending rows from different groups to the same row

I have data as follows:
DT <- structure(list(Area = c("A", "A", "A", "A", "B", "B", "B", "B"
), Year = c(1, 1, 2, 2, 1, 1, 2, 2), Group = c(1, 2, 1, 2, 1,
2, 1, 2), Population_Count = c(10, 12, 10, 12, 10, 13, 10, 11
), Male_Count = c(5, 7, 5, 4, 5, 8, 5, 6), Female_Count = c(5,
5, 5, 8, 5, 5, 5, 5)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
# A tibble: 8 x 6
Area Year Group Population_Count Male_Count Female_Count
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 1 10 5 5
2 A 1 2 12 7 5
3 A 2 1 10 5 5
4 A 2 2 12 4 8
5 B 1 1 10 5 5
6 B 1 2 13 8 5
7 B 2 1 10 5 5
8 B 2 2 11 6 5
I would like to keep one observation per Area-Year, without losing any information. I tried to do
DTcast <- dcast(DT, Area + Year ~ Group + Population_Count + Male_Count + Female_Count)
But that results in a lot of rubbish:
Area Year 1_10_5_5 2_11_6_5 2_12_4_8 2_12_7_5 2_13_8_5
1 A 1 5 NA NA 5 NA
2 A 2 5 NA 8 NA NA
3 B 1 5 NA NA NA 5
4 B 2 5 5 NA NA NA
In addition, when I apply it to the actual data, I get:
Using 'H_FEMALE' as value column. Use 'value.var' to override
Error in CJ(1:72284, 1:1333365) :
Cross product of elements provided to CJ() would result in 96380955660 rows which exceeds .Machine$integer.max == 2147483647
So I think I am doing something wrong. I think it maybe has to do with the value.var which I do not know how to select.
Desired result:
# A tibble: 4 x 9
Area Year Group `Population_Count_ Group_1` `Male_Count_ Group_1` `Female_Count_ Group_1` `Population_Count_ Group_2` `Male_Count_ Group_2` `Female_Count_ Group_2`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 1 10 5 5 12 7 5
2 A 2 1 10 5 5 12 4 8
3 B 1 1 10 5 5 13 8 5
4 B 2 1 10 5 5 11 6 5
library(tidyverse)
DT %>% pivot_wider(id_cols = c("Area", "Year"), names_from = "Group", values_from = 4:6)
> DT %>% pivot_wider(id_cols = c("Area", "Year"), names_from = "Group", values_from = 4:6)
# A tibble: 4 x 8
Area Year Population_Count_1 Population_Count_2 Male_Count_1 Male_Count_2 Female_Count_1 Female_Count_2
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 10 12 5 7 5 5
2 A 2 10 12 5 4 5 8
3 B 1 10 13 5 8 5 5
4 B 2 10 11 5 6 5 5
This will name your columns as desired
# Widen to one row per Area-Year; "_Group_" is placed between each value
# column name and its Group level.
DT %>%
  pivot_wider(
    id_cols = c(Area, Year),
    names_from = Group,
    values_from = c(Population_Count, Male_Count, Female_Count),
    names_sep = "_Group_"
  )
Alternatively, use data.table:
library(data.table)
dt <- structure(list(Area = c("A", "A", "A", "A", "B", "B", "B", "B"
), Year = c(1, 1, 2, 2, 1, 1, 2, 2), Group = c(1, 2, 1, 2, 1,
2, 1, 2), Population_Count = c(10, 12, 10, 12, 10, 13, 10, 11
), Male_Count = c(5, 7, 5, 4, 5, 8, 5, 6), Female_Count = c(5,
5, 5, 8, 5, 5, 5, 5)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
setDT(dt) # turn dt into a data.table so data.table's dcast() applies
dcast(
  dt,
  formula = Area + Year ~ Group,
  # Cast every column whose name contains "_Count". Spell out TRUE: `T` is an
  # ordinary variable that can be reassigned.
  value.var = grep("_Count", names(dt), value = TRUE)
)
#> Area Year Population_Count_1 Population_Count_2 Male_Count_1 Male_Count_2
#> 1: A 1 10 12 5 7
#> 2: A 2 10 12 5 4
#> 3: B 1 10 13 5 8
#> 4: B 2 10 11 5 6
#> Female_Count_1 Female_Count_2
#> 1: 5 5
#> 2: 5 8
#> 3: 5 5
#> 4: 5 5
Created on 2020-12-18 by the reprex package (v0.3.0)

Looking for suggestions to split a row in R df into multiple rows

I have a dataframe with the structure below.
df <- data.frame(x = c(1,9), y = c(2,9), z = c(4,9), id_1 = c(5,2), id_2 = c(6,3), id_3 = c(8,7), d1 = c(7,44), d2 = c(8,55), d3 = c(0,76), d4 = c(8,0))
This is my final expected output
df_out <- data.frame(x = c(1,1,1,9,9,9), y = c(2,2,2,9,9,9), z = c(4,4,4,9,9,9), id = c(5,6,8,2,3,7), d1 = c(7,7,7,44,44,44), d2 = c(8,8,8,55,55,55), d3 = c(0,0,0,76,76,76), d4 = c(8,8,8,0,0,0))
Does this work:
> library(dplyr)
> library(tidyr)
> df %>% pivot_longer(cols = starts_with("id_"), values_to = "id") %>% select(-name) %>% relocate(id, .after = z)
# A tibble: 6 x 8
x y z id d1 d2 d3 d4
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2 4 5 7 8 0 8
2 1 2 4 6 7 8 0 8
3 1 2 4 8 7 8 0 8
4 9 9 9 2 44 55 76 0
5 9 9 9 3 44 55 76 0
6 9 9 9 7 44 55 76 0
>
Data used:
> dput(df)
structure(list(x = c(1, 9), y = c(2, 9), z = c(4, 9), id_1 = c(5,
2), id_2 = c(6, 3), id_3 = c(8, 7), d1 = c(7, 44), d2 = c(8,
55), d3 = c(0, 76), d4 = c(8, 0)), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame"))

filter infinite values and NAs in same call using dplyr::c_across and filter_if

I'm looking to filter dataframe rows with Inf and NA in the same call using filter with c_across and deprecated filter_if:
library(dplyr)
df <- tibble(a = c(1, 2, 3, NA, 1), b = c(5, Inf, 8, 8, 3), c = c(9, 10, Inf, 11, 12), d = c('a', 'b', 'c', 'd', 'e'), e = c(1, 2, 3, 4, -Inf))
# # A tibble: 5 x 5
# a b c d e
# <dbl> <dbl> <dbl> <chr> <dbl>
# 1 1 5 9 a 1
# 2 2 Inf 10 b 2
# 3 3 8 Inf c 3
# 4 NA 8 11 d 4
# 5 1 3 12 e -Inf
I could do this in two calls using either c_across or filter_if:
df %>%
rowwise %>%
filter(!any(is.infinite(c_across(where(is.numeric))))) %>%
filter(!any(is.na(c_across(where(is.numeric)))))
# # A tibble: 1 x 5
# # Rowwise:
# a b c d e
# <dbl> <dbl> <dbl> <chr> <dbl>
# 1 1 5 9 a 1
#OR filter_if:
df %>%
filter_if(~is.numeric(.), all_vars(!is.infinite(.))) %>%
filter_if(~is.numeric(.), all_vars(!is.na(.)))
# # A tibble: 1 x 5
# a b c d e
# <dbl> <dbl> <dbl> <chr> <dbl>
# 1 1 5 9 a 1
How would I do both approaches in one call to filter (and filter_if)? There may be an across approach too?
thanks
Try this. Use the where to identify your numeric columns.
df %>%
  # across() inside filter() is deprecated; if_all() keeps a row only when the
  # predicate holds in every selected column. is.finite() is FALSE for NA,
  # NaN, Inf and -Inf alike, so one predicate covers both conditions.
  filter(if_all(where(is.numeric), is.finite))
I would suggest an approach with across() from dplyr:
library(dplyr)
#Data
df <- tibble(a = c(1, 2, 3, NA, 1),
b = c(5, Inf, 8, 8, 3),
c = c(9, 10, Inf, 11, 12),
d = c('a', 'b', 'c', 'd', 'e'),
e = c(1, 2, 3, 4, -Inf))
# Filter: keep only rows whose numeric columns contain neither NA nor +/-Inf.
# if_all() replaces the deprecated use of across() inside filter().
df %>% filter(if_all(where(is.numeric), ~ !is.na(.x) & !is.infinite(.x)))
Output:
# A tibble: 1 x 5
a b c d e
<dbl> <dbl> <dbl> <chr> <dbl>
1 1 5 9 a 1

Resources