Spread multiple columns [tidyr] - r

I would like to spread data over multiple columns using tidyr.
dat <- data.frame(ID = rep(1,10),
col1 = LETTERS[seq(1,10)],
col2 = c(letters[seq(1,8)],NA,NA),
col3 = c(rep(NA,8),"5",NA),
col4 = c(rep(NA,8),NA,"value"))
The expected outcome is:
Out <- data.frame(t(c(1,letters[seq(1,8)],"5","value")),row.names=NULL)
colnames(Out) <- c("ID",LETTERS[seq(1,10)])
I came up with:
a <- dat %>% gather(variable, value, -(ID:col1)) %>%
unite(temp, col1, variable) %>%
spread(temp, value)
a[,-which(is.na(a))]
which is clumsy and also changes the column names. Is there a better solution for this?

We can use na.rm = TRUE in gather to drop the missing values, remove the 'variable' column with select, and then use spread.
# Reshape to long form (dropping NA values), discard the helper column that
# records which source column each value came from, then spread back to wide
# form with one column per col1 value.
library(dplyr)
library(tidyr)
gather(dat, variable, val, -(ID:col1), na.rm=TRUE) %>%
select(-variable) %>%
spread(col1, val)
# ID A B C D E F G H I J
#1 1 a b c d e f g h 5 value
Update
With the devel version of tidyr (tidyr_0.8.3.9000), we can use pivot_wider when there are multiple value columns to be considered
# Pivot every value column (col2..col4) wide by col1, then drop the all-NA
# columns produced for (value column, col1) pairs that have no data.
# paste0() builds the column names with base R, so stringr does not have to
# be attached (str_c() is not provided by dplyr or tidyr).
dat %>%
  pivot_wider(names_from = col1, values_from = paste0("col", 2:4)) %>%
  select_if(~ any(!is.na(.)))
# A tibble: 1 x 11
# ID col2_A col2_B col2_C col2_D col2_E col2_F col2_G col2_H col3_I col4_J
# <dbl> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
#1 1 a b c d e f g h 5 value
If we are using reshape2, a similar option is
library(reshape2)
# Melt the value columns (positions 3:5) to long form, dropping NA values,
# then cast back to wide form with one column per col1 value.
# The argument is spelled out as measure.vars rather than relying on
# partial argument matching.
dcast(
  melt(dat, measure.vars = 3:5, na.rm = TRUE),
  ID ~ col1,
  value.var = "value"
)

Related

Replace a value in a data frame from other dataframe in r

Hi, I have two data frames. Based on matching id, I want to replace the values in table a with those from table b.
A sample dataset is here:
a = tibble(id = c(1, 2,3),
type = c("a", "x", "y"))
b= tibble(id = c(1,3),
type =c("d", "n"))
I'm expecting an output like the following:
c= tibble(id = c(1,2,3),
type = c("d", "x", "n"))
In dplyr v1.0.0, the rows_update() function was introduced for this purpose:
rows_update(a, b)
# Matching, by = "id"
# # A tibble: 3 x 2
# id type
# <dbl> <chr>
# 1 1 d
# 2 2 x
# 3 3 n
Here is an option using dplyr::left_join and dplyr::coalesce
library(dplyr)
a %>%
# keep a's original values under a different name so the join does not
# produce duplicate 'type' columns
rename(old = type) %>%
left_join(b, by = "id") %>%
# prefer the value from b; fall back to a's value where b has no match
mutate(type = coalesce(type, old)) %>%
select(-old)
# A tibble: 3 × 2
# id type
# <dbl> <chr>
#1 1 d
#2 2 x
#3 3 n
The idea is to join a with b on column id; then replace missing values in type from b with values from a (column old is the old type column from a, avoiding duplicate column names).

Convert multiple binary columns into factors with names based on column name in R elegantly with dplyr

I'm trying to convert a dataframe of binary variables into factors, based on the column name with specific column ordering (for facet plotting, modelling, etc later). I've got a crude, but working function which I'm now trying to convert into a dplyr pipe.
Working function:
library(dplyr)
library(tidyr)
df <- tribble(
~id, ~A, ~B, ~C,
"X", 1, 0, 0,
"Y", 0, 0, 1,
"Z", 1, 1, 1
)
df1 = df
for (name in c("A", "B", "C")) {
df1[[name]] = factor(df1[[name]], levels = c(0, 1), labels = paste0(c("not ", ""), name))
}
df1$A
#> [1] A not A A
#> Levels: not A A
First attempt with dplyr. Almost, but I wanted Levels: not A A
df2 <- df %>%
pivot_longer(-id) %>%
mutate(value = factor(if_else(value == 1, name, paste("not", name)))) %>%
pivot_wider(names_from = name, values_from = value) %>%
droplevels()
df2$A
#> [1] A not A A
#> Levels: A not A # Levels incorrect
Second attempt with dplyr. The columns become factors, but without the custom labels:
df %>%
mutate(across(A:C, function(x) factor(x, levels = c(0, 1))))
#> # A tibble: 3 x 4
#> id A B C
#> <chr> <fct> <fct> <fct>
#> 1 X 1 0 0
#> 2 Y 0 0 1
#> 3 Z 1 1 1
# Unable to set custom factor labels
Is there a way to elegantly achieve what I'm after using dplyr/tidyverse?
We can use cur_column() to get name of the column.
library(dplyr) #dplyr > 1.0.0
df1 <- df %>%
mutate(across(A:C, function(x)
factor(x, c(0, 1), paste0(c("not ", ""), cur_column()))))
df1
# id A B C
# <chr> <fct> <fct> <fct>
#1 X A not B not C
#2 Y not A not B C
#3 Z A B C
df1$A
#[1] A not A A
#Levels: not A A

Using the value in one column to specify from which row to retrieve a value for a new column

I'm looking for an automated way of converting this:
dat = tribble(
~a, ~b, ~c
, 'x', 1, 'y'
, 'y', 2, NA
, 'q', 4, NA
, 'z', 3, 'q'
)
to:
tribble(
~a, ~b, ~d
, 'x', 1, 2
, 'z', 3, 4
)
So, the column c in dat encodes which row in dat to look at to grab a value for a new column d, and if c is NA, toss that row from the output. Any tips?
We can join dat with itself using c and a columns.
library(dplyr)
dat %>%
inner_join(dat %>% select(-c) %>% rename(d = 'b'),
by = c('c' = 'a'))
# A tibble: 2 x 4
# a b c d
# <chr> <dbl> <chr> <dbl>
#1 x 1 y 2
#2 z 3 q 4
In base R, we can do this with merge :
merge(dat, dat[-3], by.x = 'c', by.y = 'a')
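We create 'd' as the lead of 'b', filter out the rows where 'c' is NA, and remove the 'c' column with select. Note that lead(b) takes the value from the next row, so this only gives the expected result when the row referenced by 'c' immediately follows the current row; in the dat shown in the question, 'z' refers to 'q' in the preceding row, so 'd' would be NA for 'z'.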
# NOTE(review): lead(b) reads b from the *next* row, so d is only correct
# when the row named in c directly follows the current row. With dat as
# defined in the question, 'z' refers to 'q' in the preceding row, so d
# would be NA for 'z'; the output shown below assumes the adjacent layout.
library(dplyr)
dat %>%
mutate(d = lead(b)) %>%
filter(!is.na(c)) %>%
select(-c)
# A tibble: 2 x 3
# a b d
# <chr> <dbl> <dbl>
#1 x 1 2
#2 z 3 4
Or more compactly
dat %>%
mutate(d = replace(lead(b), is.na(c), NA), c = NULL) %>%
na.omit
Or with fill
library(tidyr)
dat %>%
mutate(c1 = c) %>%
fill(c1) %>%
group_by(c1) %>%
mutate(d = lead(b)) %>%
ungroup %>%
filter(!is.na(c)) %>%
select(-c, -c1)
Or in data.table
library(data.table)
setDT(dat)[, d := shift(b, type = 'lead')][!is.na(c)][, c := NULL][]
# a b d
#1: x 1 2
#2: z 3 4
NOTE: Both solutions are simple and do not require any joins. Besides, they give the expected output in the OP's post when each referenced row directly follows the row that refers to it.
Or using match from base R
cbind(na.omit(dat), d = with(dat, b[match(c, a, nomatch = 0)]))[, -3]
# a b d
#1 x 1 2
#2 z 3 4

Remove duplicates, keeping most frequent row

I would like to deduplicate my data, keeping the row that has the most frequent appearances. If there is a tie in rows, I don't care which gets returned—the first in alphabetical or numeric order is fine. I would like to do this by group of id and var.
MRE:
df <- data.frame(
id = rep("a", 8),
var = c(rep("b", 4), rep("c", 4)),
val = c("d", "d", "d", "e", "f", "f", "g", "g")
)
> df
id var val
1 a b d
2 a b d
3 a b d
4 a b e
5 a c f
6 a c f
7 a c g
8 a c g
Should be:
id var val
1 a b d
2 a c f
I'm working with large datasets and tidyverse pipe chains, so a dplyr solution would be preferable.
Use table and which.max to extract the mode:
df %>%
group_by(id, var) %>%
summarise(val = {t <- table(val); names(t)[which.max(t)] })
# A tibble: 2 x 3
# Groups: id [?]
# id var val
# <fct> <fct> <chr>
#1 a b d
#2 a c f
Another way to do this in base R: Create a three way contingency table directly, and then find the max column along the third axis:
apply(table(df), c(1, 2), function(v) names(v)[which.max(v)])
# var
#id b c
# a "d" "f"
Convert this to a data frame:
as.data.frame.table(
apply(table(df), c(1, 2), function(v) names(v)[which.max(v)])
)
# id var Freq
#1 a b d
#2 a c f
Using dplyr:
library(dplyr)
df %>%
group_by(id, var, val) %>%
summarise(n = n()) %>%
group_by(id, var) %>%
arrange(-n) %>%
slice(1) %>%
ungroup() %>%
select(-n)
# # A tibble: 2 x 3
# id var val
# <fct> <fct> <fct>
# 1 a b d
# 2 a c f
One option could be using table and max as:
library(dplyr)
df %>%
  group_by(id, var) %>%
  # Keep the rows whose value is (one of) the most frequent in the group.
  # table(val) has one entry per distinct value, not per row, so it must be
  # mapped back onto the rows via the value names instead of being used
  # directly as the filter condition.
  filter({
    counts <- table(val)
    val %in% names(counts)[counts == max(counts)]
  }) %>%
  # Several rows (and, on ties, several values) can remain; keep the first.
  slice(1)
# # A tibble: 2 x 3
# # Groups: id, var [2]
# id var val
# <fctr> <fctr> <fctr>
# 1 a b d
# 2 a c f
# NOTE: a c f is a case of a tie (f and g both occur twice). Per OP any
# record can be returned in case of a tie.
I doubt this is any faster, but another option is
# Keep, per (id, var) group, the last row of the longest run of equal values.
# %>% binds tighter than ==, so the right-hand side is evaluated first: the
# run lengths are summed up to and including the longest run, which is the
# row index of that run's final element.
# rle() counts runs of *consecutive* equal values, so repeats that are not
# adjacent within a group are counted as separate runs.
df %>%
group_by(id, var) %>%
filter(row_number() == rle(as.character(val))$lengths %>%
{sum(.[1:which.max(.)])})
A dplyr solution using count:
library(dplyr)
# Count each (id, var, val) combination, most frequent first, then keep the
# first (most frequent) val within every (id, var) group.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
df %>%
  count(id, var, val, sort = TRUE) %>%
  group_by(id, var) %>%
  summarize_at("val", head, 1)
# # A tibble: 2 x 3
# id var val
# <fctr> <fctr> <fctr>
# 1 a b d
# 2 a c f
or maybe more idiomatic but longer:
# Count each (id, var, val) combination, most frequent first, then keep the
# top row of every (id, var) group and drop the helper count column.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
df %>%
  count(id, var, val, sort = TRUE) %>%
  group_by(id, var) %>%
  slice(1) %>%
  select(-n) %>%
  ungroup()
Or with tally for same output with slightly different syntax:
# tally() counts rows per (id, var, val) and peels off the last grouping
# level, so the result is still grouped by (id, var); slice(1) then keeps
# the most frequent val of each group.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
df %>%
  group_by(id, var, val) %>%
  tally(sort = TRUE) %>%
  slice(1) %>%
  select(-n) %>%
  ungroup()
and a base solution :
df2 <- aggregate(x ~ .,cbind(df,x=1),sum)
aggregate(val ~ id+var, df2[order(-df2$x),],head,1)
# id var val
# 1 a b d
# 2 a c f
Here is my try:
library(dplyr)
# Attach to every row the frequency of its (id, var, val) combination, sort
# so the most frequent combinations come first, then keep the first row of
# each (id, var) group.
df %>%
  group_by(id, var, val) %>%
  mutate(n = n()) %>%
  arrange(desc(n)) %>%
  group_by(id, var) %>%
  filter(row_number() == 1) %>%
  # drop the grouping so later steps are not silently applied per group
  ungroup() %>%
  select(-n)

Reshape2: multiple observations for variable

I have the following sample data:
d <- data.frame(id=c(1,1,1,2,2), time=c(1,1,1,1,1), var=runif(5))
id time var
1 1 1 0.373448545
2 1 1 0.007007124
3 1 1 0.840572603
4 2 1 0.684893481
5 2 1 0.822581501
I want to reshape this data.frame to wide format using dcast such that the output is the following:
id var.1 var.2 var.3
1 1 0.3734485 0.007007124 0.8405726
2 2 0.6848935 0.822581501 NA
Does anyone have any ideas?
Create a sequence column, seq, by id and then use dcast:
library(reshape2)
set.seed(123)
d <- data.frame(id=c(1,1,1,2,2), time=c(1,1,1,1,1), var=runif(5))
d2 <- transform(d, seq = ave(id, id, FUN = seq_along))
dcast(d2, id ~ seq, value.var = "var")
giving:
id 1 2 3
1 1 0.28758 0.78831 0.40898
2 2 0.88302 0.94047 NaN
A dplyr/tidyr option with spread would be
library(dplyr)
library(tidyr)
d %>%
group_by(id) %>%
mutate(n1= paste0("var.",row_number())) %>%
spread(n1, var) %>%
select(-time)
# id var.1 var.2 var.3
# (int) (dbl) (dbl) (dbl)
#1 1 0.3734485 0.007007124 0.8405726
#2 2 0.6848935 0.822581501 NA
Ok - here's a working solution. The key is to add a counting variable. My solution for this is a bit complicated - maybe you can come up with something better.
library(dplyr)
library(magrittr)
library(reshape2)
d <- data.frame(
  id = c(1, 1, 1, 2, 2, 3, 3, 3, 3),
  time = c(1, 1, 1, 1, 1, 1, 1, 1, 1),
  var = runif(9)
)
# Number of rows per id.
count <- d %>%
  group_by(id) %>%
  summarise(n = n()) %>%
  data.frame()
# Counting variable: 1..n within each id. sequence() concatenates
# seq_len(n) for every n, replacing the loop that grew the vector one id at
# a time (and that misbehaved on zero rows because of 1:nrow(count)).
# As before, this relies on the rows of d already being ordered by id.
f <- sequence(count$n)
d <- data.frame(d, f)
dcast(d, id ~ f, value.var = "var")

Resources