R: how to combine rows using pivot_wider

I have a table like the following:
A, B, C
1, Yes, 3
1, No, 2
2, Yes, 4
2, No, 6
etc
I want to convert it to:
A, Yes, No
1, 3, 2
2, 4, 6
I have tried using:
dat <- dat %>%
  spread(B, C) %>%
  group_by(A)
However, now I have a bunch of NA values. Is it possible to use pivot_longer to do this instead?

We can use pivot_wider
library(tidyr)
pivot_wider(dat, names_from = B, values_from = C)
-output
# A tibble: 2 x 3
# A Yes No
# <dbl> <dbl> <dbl>
#1 1 3 2
#2 2 4 6
If there are duplicate rows, an option is to first create a row sequence per value of B (with data.table::rowid()), pivot, and then drop the helper column:
library(data.table)
library(dplyr)
dat1 <- bind_rows(dat, dat) # example with duplicates
dat1 %>%
  mutate(rn = rowid(B)) %>%
  pivot_wider(names_from = B, values_from = C) %>%
  select(-rn)
-output
# A tibble: 4 x 3
# A Yes No
# <dbl> <dbl> <dbl>
#1 1 3 2
#2 2 4 6
#3 1 3 2
#4 2 4 6
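If the duplicates should instead be collapsed into a single row per A, pivot_wider() also has a values_fn argument for aggregating them; a small sketch (summing here purely for illustration):
pivot_wider(dat1, names_from = B, values_from = C, values_fn = sum)
# A tibble: 2 x 3
#     A   Yes    No
# <dbl> <dbl> <dbl>
#1    1     6     4
#2    2     8    12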
data
dat <- structure(list(A = c(1, 1, 2, 2), B = c("Yes", "No", "Yes", "No"
), C = c(3, 2, 4, 6)), class = "data.frame", row.names = c(NA,
-4L))

Related

Subtract one group of values from all groups using group_by

How can I subtract one group of values from all values using group_by in a tibble?
Below is an example with the expected result. I wish to subtract the values of category "A" from all values:
d <- tibble(categories = c(rep("A", 3), rep("B", 3), rep("C", 3)),
            values = 1:9)
# expected outcome
d <- tibble(categories = c(rep("A", 3), rep("B", 3), rep("C", 3)),
            values = c(0, 0, 0, 3, 3, 3, 6, 6, 6))
If all the categories have the same number of rows, we could do
library(dplyr)
d %>%
  mutate(values = values - d$values[d$categories == "A"])
-output
# A tibble: 9 × 2
categories values
<chr> <int>
1 A 0
2 A 0
3 A 0
4 B 3
5 B 3
6 B 3
7 C 6
8 C 6
9 C 6
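If the groups are not guaranteed to have the same size, one option is to join the "A" values back on the within-group row position; a sketch (the helper names id and base are made up here):
library(dplyr)
base_a <- d %>%
  filter(categories == "A") %>%
  mutate(id = row_number()) %>%
  select(id, base = values)
d %>%
  group_by(categories) %>%
  mutate(id = row_number()) %>%
  ungroup() %>%
  left_join(base_a, by = "id") %>%
  mutate(values = values - base) %>%
  select(categories, values)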
You can do:
library(tidyverse)
d %>%
  group_by(categories) %>%
  mutate(id = row_number()) %>%
  ungroup() %>%
  pivot_wider(names_from = 'categories',
              values_from = 'values') %>%
  mutate(across(-id, ~ . - A)) %>%
  pivot_longer(cols = -id,
               names_to = 'categories',
               values_to = 'values',
               cols_vary = 'slowest') %>%
  select(-id)
Alternatively:
d %>%
  group_by(categories) %>%
  mutate(id = row_number()) %>%
  ungroup() %>%
  mutate(values = values - values[categories == 'A'][id]) %>%
  select(-id)
# A tibble: 9 x 2
categories values
<chr> <int>
1 A 0
2 A 0
3 A 0
4 B 3
5 B 3
6 B 3
7 C 6
8 C 6
9 C 6

How to combine two rows of a dataframe into one row

I have a dataframe which looks like this.
Name info.1 info.2
ab a 1
123 a 1
de c 4
456 c 4
fg d 5
789 d 5
The two rows that need to be combined are identical aside from the Name column and appear next to each other in the dataframe. I want the new dataframe to look like this:
Name ID info.1 info.2
ab 123 a 1
de 456 c 4
fg 789 d 5
I have no clue how to do this and google search hasn't been helpful so far
In base R you could do:
data.frame(Name = df[seq(nrow(df)) %% 2 == 1, 1],
           ID   = df[seq(nrow(df)) %% 2 == 0, 1],
           df[seq(nrow(df)) %% 2 == 1, 2:3])
#>   Name  ID info.1 info.2
#> 1   ab 123      a      1
#> 3   de 456      c      4
#> 5   fg 789      d      5
A possible solution:
library(tidyverse)
df %>%
  group_by(info.1) %>%
  summarise(Name = str_c(Name, collapse = "_"), info.2 = first(info.2)) %>%
  separate(Name, into = c("Name", "ID"), convert = TRUE) %>%
  relocate(info.1, .before = info.2)
#> # A tibble: 3 × 4
#> Name ID info.1 info.2
#> <chr> <int> <chr> <int>
#> 1 ab 123 a 1
#> 2 de 456 c 4
#> 3 fg 789 d 5
Assuming the Name column is consistently ordered Name-ID-Name-ID, then:
library(tidyverse)
data <- tibble(Name = c('ab', 123, 'de', 456, 'fg', 789),
               info.1 = c('a', 'a', 'c', 'c', 'd', 'd'),
               info.2 = c(1, 1, 4, 4, 5, 5))
# remove the troublesome column and make a tibble
# with the unique combos of info.1 and info.2
data_2 <- data %>% select(info.1, info.2) %>% distinct()
# add columns for Name and ID by taking every other row of the
# original tibble
data_2$Name <- data$Name[seq(from = 1, to = nrow(data), by = 2)]
data_2$ID <- data$Name[seq(from = 2, to = nrow(data), by = 2)]
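For reference, data_2 should then contain one row per Name/ID pair, roughly:
data_2
#   info.1 info.2 Name  ID
# 1 a           1 ab    123
# 2 c           4 de    456
# 3 d           5 fg    789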
We could also use summarise(), extracting the first value per group as the name and the last as the ID:
data |>
  group_by(info.1, info.2) |>
  summarise(name = first(Name), ID = last(Name)) |>
  ungroup()
  # optionally add |> relocate(3:4, 1:2) to put name and ID first
Output:
# A tibble: 3 × 4
info.1 info.2 name ID
<chr> <dbl> <chr> <chr>
1 a 1 ab 123
2 c 4 de 456
3 d 5 fg 789
We could also use
library(dplyr)
library(stringr)
data %>%
  group_by(across(starts_with('info'))) %>%
  mutate(ID = str_subset(Name, "^\\d+$"), .before = 2) %>%
  ungroup %>%
  filter(str_detect(Name, '^\\d+$', negate = TRUE))
-output
# A tibble: 3 × 4
Name ID info.1 info.2
<chr> <chr> <chr> <dbl>
1 ab 123 a 1
2 de 456 c 4
3 fg 789 d 5
data
data <- structure(list(Name = c("ab", "123", "de", "456", "fg", "789"
), info.1 = c("a", "a", "c", "c", "d", "d"), info.2 = c(1, 1,
4, 4, 5, 5)), row.names = c(NA, -6L), class = "data.frame")

Iteratively dplyr::coalesce()

I have a dataset on which I need to use dplyr::coalesce(), but I need to do this many times and am not sure what the most efficient way is (e.g. a loop, apply, etc.).
To give you a toy example, say my dataset is:
df = data.frame(
  a = c(1, NA, NA),
  a.1 = c(NA, 1, NA),
  a.2 = c(NA, NA, 1),
  b = c(2, NA, NA),
  b.1 = c(NA, 2, NA),
  b.2 = c(NA, NA, 2),
  c = c(3, NA, NA),
  c.1 = c(NA, 3, NA),
  c.2 = c(NA, NA, 3)
)
And I could do this:
new_df = df |>
  dplyr::mutate(
    a = dplyr::coalesce(a, a.1, a.2),
    b = dplyr::coalesce(b, b.1, b.2),
    c = dplyr::coalesce(c, c.1, c.2)
  ) |>
  dplyr::select(a, b, c)
Which would give me:
new_df
a b c
1 1 2 3
2 1 2 3
3 1 2 3
First, how could I do this efficiently without having to write coalesce() n times? This is just a toy example; with the real dataset I'd need to do this forty times.
Also, is there a way to do it as I have here, where I just keep the names a, b, and c rather than ending up with names like a.1?
If the columns follow a something / something.etc naming pattern, you may try
library(dplyr)
library(purrr)
library(stringr)
df %>%
  split.default(str_remove(names(.), "\\..*")) %>%
  map_dfc(~ coalesce(!!! .x))
a b c
<dbl> <dbl> <dbl>
1 1 2 3
2 1 2 3
3 1 2 3
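If the splicing syntax (!!!) is unfamiliar, the same idea can be written out more explicitly; a sketch (prefixes, cols and out are just illustrative names):
library(dplyr)
prefixes <- unique(sub("\\..*", "", names(df)))
out <- as.data.frame(lapply(setNames(prefixes, prefixes), function(p) {
  cols <- df[sub("\\..*", "", names(df)) == p]  # all columns sharing this prefix
  do.call(coalesce, cols)                       # coalesce them left to right
}))
out
#   a b c
# 1 1 2 3
# 2 1 2 3
# 3 1 2 3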
Here is an alternative with pivoting:
library(dplyr)
library(tidyr)
df %>%
  pivot_longer(everything()) %>%
  mutate(name = sub("\\..*", "", name)) %>%
  drop_na %>%
  pivot_wider(names_from = name, values_from = value, values_fn = list) %>%
  unnest(cols = c(a, b, c))
a b c
<dbl> <dbl> <dbl>
1 1 2 3
2 1 2 3
3 1 2 3

Sum While melting columns in R

Is there a way to melt 2 columns and take their sums as the values? For example:
df <- data.frame(A = c("x", "y", "z"), B = c(1, 2, 3), Cat1 = c(1, 4, 3), New2 = c(4, 4, 4))
Expected output
New_Col Sum
Cat1 8
New2 12
Using base R, we can take colSums after selecting the columns of interest and then convert the named vector to a data.frame with stack:
stack(colSums(df[c("Cat1", "New2")]))[2:1]
ind values
1 Cat1 8
2 New2 12
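The intermediate named vector from colSums() is what stack() then turns into the two-column data.frame (values and ind):
colSums(df[c("Cat1", "New2")])
# Cat1 New2
#    8   12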
Or with dplyr and tidyr, summing the columns of interest and then pivoting to long format:
library(dplyr)
library(tidyr)
df %>%
  summarise(across(c(Cat1, New2), sum)) %>%
  pivot_longer(everything(), names_to = 'New_Col', values_to = 'Sum')
# A tibble: 2 × 2
  New_Col   Sum
  <chr>   <dbl>
1 Cat1        8
2 New2       12
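Following the literal "melt first, then sum" order of operations from the question title, the same result can also be obtained with pivot_longer() followed by a grouped summarise; a sketch:
df %>%
  pivot_longer(c(Cat1, New2), names_to = 'New_Col', values_to = 'Sum') %>%
  group_by(New_Col) %>%
  summarise(Sum = sum(Sum))
# A tibble: 2 × 2
  New_Col   Sum
  <chr>   <dbl>
1 Cat1        8
2 New2       12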

How do I select column based on value in another column with dplyr?

My data frame looks like this:
id A T C G ref var
1 1 10 15 7 0 A C
2 2 11 9 2 3 A G
3 3 2 31 1 12 T C
I'd like to create two new columns, ref_count and var_count, which will have the following values:
Value from A column and value from C column, since ref is A and var is C
Value from A column and value from G column, since ref is A and var is G
etc.
So I'd like to select a column based on the value in another column for each row.
Thanks!
We can use pivot_longer to reshape into 'long' format, filter the rows and then reshape it to 'wide' format with pivot_wider
library(dplyr)
library(tidyr)
df1 %>%
  pivot_longer(cols = A:G) %>%
  group_by(id) %>%
  filter(name == ref | name == var) %>%
  mutate(nm1 = c('ref_count', 'var_count')) %>%
  ungroup %>%
  select(id, value, nm1) %>%
  pivot_wider(names_from = nm1, values_from = value) %>%
  left_join(df1, .)
# A tibble: 3 x 9
# id A T C G ref var ref_count var_count
#* <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
#1 1 10 15 7 0 A C 10 7
#2 2 11 9 2 3 A G 11 3
#3 3 2 31 1 12 T C 31 1
Or, in base R, we can make use of vectorized row/column (matrix) indexing, where cbind(row, column) selects one cell per row:
df1$ref_count <- as.matrix(df1[2:5])[cbind(seq_len(nrow(df1)), match(df1$ref, names(df1)[2:5]))]
df1$var_count <- as.matrix(df1[2:5])[cbind(seq_len(nrow(df1)), match(df1$var, names(df1)[2:5]))]
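To illustrate the cbind() indexing used above on a small made-up matrix:
m <- matrix(1:6, nrow = 3)   # column 1 is 1,2,3; column 2 is 4,5,6
m[cbind(1:3, c(2, 1, 2))]    # picks cells (1,2), (2,1), (3,2)
# [1] 4 2 6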
data
df1 <- structure(list(id = 1:3, A = c(10, 11, 2), T = c(15, 9, 31),
C = c(7, 2, 1), G = c(0, 3, 12), ref = c("A", "A", "T"),
var = c("C", "G", "C")), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))
The following is a tidyverse alternative without creating a long dataframe that needs filtering. It essentially uses tidyr::nest() to nest the dataframe by rows, after which the correct column can be selected for each row.
library(purrr)
df1 %>%
  nest(data = -id) %>%
  mutate(
    data = map(
      data,
      ~ mutate(., refcount = .[[ref]], var_count = .[[var]])
    )
  ) %>%
  unnest(data)
#> # A tibble: 3 × 9
#> id A T C G ref var refcount var_count
#> <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
#> 1 1 10 15 7 0 A C 10 7
#> 2 2 11 9 2 3 A G 11 3
#> 3 3 2 31 1 12 T C 31 1
A variant of this does not need the (assumed row-specific) id column but defines the nested groups from the unique values of ref and var directly:
df1 %>%
  nest(data = -c(ref, var)) %>%
  mutate(
    data = pmap(
      list(data, ref, var),
      function(df, ref, var) {
        mutate(df, refcount = df[[ref]], var_count = df[[var]])
      }
    )
  ) %>%
  unnest(data)
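For single-value lookups like this, a rowwise() variant is another sketch worth trying; it assumes get() resolves the column named by ref/var inside the data mask, so verify it on your data before relying on it:
library(dplyr)
df1 %>%
  rowwise() %>%
  mutate(ref_count = get(ref),   # get() looks up the column whose name is stored in ref for the current row
         var_count = get(var)) %>%
  ungroup()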
The data were specified by akrun:
df1 <- structure(list(id = 1:3, A = c(10, 11, 2), T = c(15, 9, 31),
C = c(7, 2, 1), G = c(0, 3, 12), ref = c("A", "A", "T"),
var = c("C", "G", "C")), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))
