Sum While melting columns in R

Sum While melting columns in R - r

Is there a way to melt 2 columns and take there sums as value . For example
df <- data.frame(A = c("x", "y", "z"), B = c(1, 2, 3), Cat1 = c(1, 4, 3), New2 = c(4, 4, 4))
Expected output
New_Col Sum
Cat1 8
New2 12

Or using base R with colSums after selecting the columns of interest and then convert the named vector to data.frame with stack
stack(colSums(df[c("Cat1", "New2")]))[2:1]
ind values
1 Cat1 8
2 New2 12

Of course
df %>%
summarise(across(starts_with('Cat'), sum)) %>%
pivot_longer(everything(), names_to = 'New_Col', values_to = 'Sum')
# A tibble: 2 × 2
New_Col Sum
<chr> <dbl>
1 Cat1 8
2 Cat2 12

Related

Converting a matrix into a tibble in R

How can I convert this matrix:
> matrix(1:3, nrow = 3, dimnames = list(c("X","Y","Z"), c("A")))
A
X 1
Y 2
Z 3
into this tibble:
> tibble::tribble(~group1, ~group2, ~value, "X", "A", 1, "Y", "A", 2, "Z", "A", 3)
# A tibble: 3 × 3
group1 group2 value
<chr> <chr> <dbl>
1 X A 1
2 Y A 2
3 Z A 3
Thank you

as.tibble can convert the matrix's rownames to a column, and then you can use gather() to create the group2 column:
library(tidyverse)
m <- matrix(1:3, nrow = 3, dimnames = list(c("X","Y","Z"), c("A")))
newtib <- m %>%
as.tibble(rownames = "group1") %>%
gather('A', key = "group2", value = "value")
> newtib
# A tibble: 3 × 3
group1 group2 value
<chr> <chr> <int>
1 X A 1
2 Y A 2
3 Z A 3
> tibble::tribble(~group1, ~group2, ~value, "X", "A", 1, "Y", "A", 2, "Z", "A", 3)
# A tibble: 3 × 3
group1 group2 value
<chr> <chr> <dbl>
1 X A 1
2 Y A 2
3 Z A 3

Easier with base R, if we convert to table and coerce with as.data.frame (if we need to convert to tibble - use as_tibble as wrapper over the as.data.frame
as.data.frame(as.table(m1))
Var1 Var2 Freq
1 X A 1
2 Y A 2
3 Z A 3
data
m1 <- matrix(1:3, nrow = 3, dimnames = list(c("X","Y","Z"), c("A")))

Transform your matrix into a dataframe
bring your rownames to column group1
mutate group2
data.frame(matrix) %>%
rownames_to_column("group1") %>%
mutate(group2 = colnames(matrix)) %>%
dplyr::select(group1, group2, value=A)
group1 group2 value
1 X A 1
2 Y A 2
3 Z A 3

You can use -
library(tidyverse)
mat <- matrix(1:3, nrow = 3, dimnames = list(c("X","Y","Z"), c("A")))
mat %>%
as.data.frame() %>%
rownames_to_column(var = 'group1') %>%
pivot_longer(cols = -group1, names_to = 'group2')
# group1 group2 value
# <chr> <chr> <dbl>
#1 X A 1
#2 Y A 2
#3 Z A 3

Merge data frames and divide rows by group

I would like to divide the values from df1 over the values from df2. In this reproducible example, I am able to sum these values. What about the division? Thanks in advance!
df1 <- data.frame(country = c("a", "b", "c"), year1 = c(1, 2, 3), year2 = c(1, 2, 3))
df2 <- data.frame(country = c("a", "b", "d"), year1 = c(1, 2, NA), year2 = c(1, 2, 3))
df3 <- bind_rows(df1, df2) %>%
mutate_if(is.numeric, tidyr::replace_na, 0) %>%
group_by(country) %>%
summarise_all(., sum, na.rm = TRUE) %>%
na_if(., 0)
Expected result is:
# A tibble: 4 x 3
country year1 year2
<chr> <dbl> <dbl>
1 a 1 1
2 b 1 1
3 c NA NA
4 d NA NA

As there are groups with 2 rows and some with 1, use an if/else condition within summarise/across to divide the first element by the last if there are two elements or else return NA
library(dplyr) # version 1.0.4
library(tidyr)
bind_rows(df1, df2) %>%
mutate(across(where(is.numeric), replace_na, 0)) %>%
group_by(country) %>%
summarise(across(everything(), ~ if(n() == 2) first(.)/last(.)
else NA_real_))
-output
# A tibble: 4 x 3
# country year1 year2
#* <chr> <dbl> <dbl>
#1 a 1 1
#2 b 1 1
#3 c NA NA
#4 d NA NA

Here is a base R option using merge + split.default
df <- merge(df1, df2, by = "country", all = TRUE)
cbind(
df[1],
list2DF(lapply(
split.default(df[-1], gsub("\\.(x|y)", "", names(df)[-1])),
function(v) do.call("/", v)
))
)
which gives
country year1 year2
1 a 1 1
2 b 1 1
3 c NA NA
4 d NA NA

R: how to combine rows using Pivot_wider

I have a table like the following:
A, B, C
1, Yes, 3
1, No, 2
2, Yes, 4
2, No, 6
etc
I want to convert it to:
A, Yes, No
1, 3, 2
2, 4, 6
I have tried using:
dat <- dat %>%
spread(B, C) %>%
group_by(A)
However, now I have a bunch of NA values. Is it possible to use pivot_longer to do this instead?

We can use pivot_wider
library(tidyr)
pivot_wider(dat, names_from = B, values_from = C)
-output
# A tibble: 2 x 3
# A Yes No
# <dbl> <dbl> <dbl>
#1 1 3 2
#2 2 4 6
If there are duplicate rows, then an option is to create a sequence by that column
library(data.table)
library(dplyr)
dat1 <- bind_rows(dat, dat) # // example with duplicates
dat1 %>%
mutate(rn = rowid(B)) %>%
pivot_wider(names_from = B, values_from = C) %>%
select(-rn)
-output
# A tibble: 4 x 3
# A Yes No
# <dbl> <dbl> <dbl>
#1 1 3 2
#2 2 4 6
#3 1 3 2
#4 2 4 6
data
dat <- structure(list(A = c(1, 1, 2, 2), B = c("Yes", "No", "Yes", "No"
), C = c(3, 2, 4, 6)), class = "data.frame", row.names = c(NA,
-4L))

R group by column, count the combinations observed

I imagine this is already solved in many places, but I lack the right wordage to use to search for a solution. In R I have example data in long format like this:
A = tibble( c(1,2,3,1,2,4,5,5), c('a','b','c','a','f','-','b', 'f'))
and what I want returned is sort of a grouped result (something like a spread?) where I first collect the set of letters that match each number to get something like this.
1: 'a', 'a'
2: 'b', 'f'
3: 'c', 'c'
4: '_'
5: 'b', 'f'
and the actual final result I am looking for is the count of how many times each letter combination, when is observed:
'a','a': 1
'b','f': 2
'c','c': 1
'-': 1
I can do the last step with group_by() but I mention it here in case there is some magic sauce that does the whole thing.

We can do a group by 'a', then paste the second column while taking the number of distinct elements in 'b' and get the distinct rows
library(dplyr)
library(stringr)
A %>%
group_by(a) %>%
summarise(out = str_c(b, collapse=","), n = n_distinct(b))%>%
distinct(out, n)
# A tibble: 4 x 2
# out n
# <chr> <int>
#1 a,a 1
#2 b,f 2
#3 c 1
#4 - 1
data
A <- structure(list(a = c(1, 2, 3, 1, 2, 4, 5, 5), b = c("a", "b",
"c", "a", "f", "-", "b", "f")), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))

This is close to what you are looking for:
library(tidyverse)
#Data
A <- structure(list(v1 = c(1, 2, 3, 1, 2, 4, 5, 5), v2 = c("a", "b",
"c", "a", "f", "-", "b", "f")), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
#Code
A %>% group_by(v1) %>% summarise(chain=paste0(v2,collapse = ',')) %>% ungroup() %>%
group_by(chain) %>% summarise(N=n())
# A tibble: 4 x 2
chain N
<chr> <int>
1 - 1
2 a,a 1
3 b,f 2
4 c 1

Here is a base R option using nested aggregate
aggregate(.~y,aggregate(y~.,A,toString),length)
which gives
> aggregate(.~y,aggregate(y~.,A,toString),length)
y x
1 - 1
2 a, a 1
3 b, f 2
4 c 1
Data
A = tibble(x = c(1,2,3,1,2,4,5,5), y = c('a','b','c','a','f','-','b', 'f'))

Maybe you want to cast the data in wide format and then count the combinations. Try :
library(dplyr)
library(tidyr)
A %>%
group_by(v1) %>%
mutate(row = row_number()) %>%
pivot_wider(names_from = row, values_from = v2, names_prefix = 'col_') %>%
ungroup %>%
count(col_1, col_2)
# col_1 col_2 n
# <chr> <chr> <int>
#1 - NA 1
#2 a a 1
#3 b f 2
#4 c NA 1

How do I select column based on value in another column with dplyr?

My data frame looks like this:
id A T C G ref var
1 1 10 15 7 0 A C
2 2 11 9 2 3 A G
3 3 2 31 1 12 T C
I'd like to create two new columns: ref_count and var_count which will have following values:
Value from A column and value from C column, since ref is A and var is C
Value from A column and value from G column, since ref is A and var is G
etc.
So I'd like to select a column based on the value in another column for each row.
Thanks!

We can use pivot_longer to reshape into 'long' format, filter the rows and then reshape it to 'wide' format with pivot_wider
library(dplyr)
library(tidyr)
df1 %>%
pivot_longer(cols = A:G) %>%
group_by(id) %>%
filter(name == ref|name == var) %>%
mutate(nm1 = c('ref_count', 'var_count')) %>%
ungroup %>%
select(id, value, nm1) %>%
pivot_wider(names_from = nm1, values_from = value) %>%
left_join(df1, .)
# A tibble: 3 x 9
# id A T C G ref var ref_count var_count
#* <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
#1 1 10 15 7 0 A C 10 7
#2 2 11 9 2 3 A G 11 3
#3 3 2 31 1 12 T C 31 1
Or in base R, we can also make use of the vectorized row/column indexing
df1$refcount <- as.matrix(df1[2:5])[cbind(seq_len(nrow(df1)), match(df1$ref, names(df1)[2:5]))]
df1$var_count <- as.matrix(df1[2:5])[cbind(seq_len(nrow(df1)), match(df1$var, names(df1)[2:5]))]
data
df1 <- structure(list(id = 1:3, A = c(10, 11, 2), T = c(15, 9, 31),
C = c(7, 2, 1), G = c(0, 3, 12), ref = c("A", "A", "T"),
var = c("C", "G", "C")), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))

The following is a tidyverse alternative without creating a long dataframe that needs filtering. It essentially uses tidyr::nest() to nest the dataframe by rows, after which the correct column can be selected for each row.
df1 %>%
nest(data = -id) %>%
mutate(
data = map(
data,
~mutate(., refcount = .[[ref]], var_count = .[[var]])
)
) %>%
unnest(data)
#> # A tibble: 3 × 9
#> id A T C G ref var refcount var_count
#> <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
#> 1 1 10 15 7 0 A C 10 7
#> 2 2 11 9 2 3 A G 11 3
#> 3 3 2 31 1 12 T C 31 1
A variant of this does not need the (assumed row-specific) id column but defines the nested groups from the unique values of ref and var directly:
df1 %>%
nest(data = -c(ref, var)) %>%
mutate(
data = pmap(
list(data, ref, var),
function(df, ref, var) {
mutate(df, refcount = df[[ref]], var_count = df[[var]])
}
)
) %>%
unnest(data)
The data were specified by akrun:
df1 <- structure(list(id = 1:3, A = c(10, 11, 2), T = c(15, 9, 31),
C = c(7, 2, 1), G = c(0, 3, 12), ref = c("A", "A", "T"),
var = c("C", "G", "C")), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))