One-hot-encoding a R list of characters - r

I have the following R dataframe :
id color
001 blue
001 yellow
001 red
002 blue
003 blue
003 yellow
What's the general method to one-hot-encode such a dataframe into the following :
id blue yellow red
001 1 1 1
002 1 0 0
003 1 0 1
Thank you very much.

Try this. You can create a variable for those observations present in data equals to one and then use pivot_wider() to reshape the values. As you will get NA for classes not present in data, you can replace it with zero using replace(). Here the code using tidyverse functions:
library(dplyr)
library(tidyr)
#Code
dfnew <- df %>% mutate(val=1) %>%
pivot_wider(names_from = color,values_from=val) %>%
replace(is.na(.),0)
Output:
# A tibble: 3 x 4
id blue yellow red
<int> <dbl> <dbl> <dbl>
1 1 1 1 1
2 2 1 0 0
3 3 1 1 0
Some data used:
#Data
df <- structure(list(id = c(1L, 1L, 1L, 2L, 3L, 3L), color = c("blue",
"yellow", "red", "blue", "blue", "yellow")), class = "data.frame", row.names = c(NA,-6L))

There are many ways to do this in R. It depends on what packages you are using. Most of the modeling packages such as caret and tidymodels have functions to do this for you.
However, if you aren't using a modeling package the tidyverse has an easy way to do this.
library(dplyr)
library(tidyr)
df <- tribble(
~id, ~color,
'001', 'blue',
'001', 'yellow',
'001', 'red',
'002', 'blue',
'003', 'blue',
'003', 'yellow')
df_onehot <- df %>%
mutate(value = 1) %>%
pivot_wider(names_from = color,values_from = value,values_fill = 0)
# A tibble: 3 x 4
# id blue yellow red
# <chr> <dbl> <dbl> <dbl>
# 1 001 1 1 1
# 2 002 1 0 0
# 3 003 1 1 0

With data.table:
library(data.table)
dcast(setDT(df), id ~ color, fun.aggregate = length)
# id blue red yellow
# 1: 001 1 1 1
# 2: 002 1 0 0
# 3: 003 1 0 1
Same logic with tidyr:
library(tidyr)
pivot_wider(df, names_from=color, values_from=color, values_fn=length, values_fill=0)
# id blue yellow red
# <chr> <int> <int> <int>
# 1 001 1 1 1
# 2 002 1 0 0
# 3 003 1 1 0
Base R:
out <- as.data.frame.matrix(pmin(with(df, table(id, color)), 1))
out$id <- rownames(out)
out
# blue red yellow id
# 001 1 1 1 001
# 002 1 0 0 002
# 003 1 0 1 003
Reproducible data
df <- data.frame(
id = c("001", "001", "001", "002", "003", "003"),
color = c("blue", "yellow", "red", "blue", "blue", "yellow")
)

Related

Transform long data into wide using frequency

I would like to know a practical way to transform dat in the table below
dat <- data.frame('city' = c('A','A','B','C','A','B','B','C','C','C'),
'color' = c('red', 'green', 'blue', 'red', 'green', 'blue', 'green', 'blue', 'red', 'red'),
'sex' = c('M','F','F','M','F','F','F','M','F','M'))
city red green blue F M
A 1 2 0 2 1
B 0 1 2 3 0
C 3 0 1 1 3
With tidyr, you can pivot_longer and then pivot_wider:
library(tidyr)
dat %>%
pivot_longer(c(color, sex)) %>%
pivot_wider(id_cols = city, names_from = c(name, value), names_sort = TRUE,
values_from = value, values_fn = length, values_fill = 0)
# # A tibble: 3 × 6
# city color_blue color_green color_red sex_F sex_M
# <chr> <int> <int> <int> <int> <int>
# 1 A 0 2 1 2 1
# 2 B 2 1 0 3 0
# 3 C 1 0 3 1 3
With sapply, create multiple tables and bind them:
sapply(dat[2:3], \(x) as.data.frame.matrix(table(dat$city, x))) |>
do.call(what = 'cbind.data.frame')
color.blue color.green color.red sex.F sex.M
A 0 2 1 2 1
B 2 1 0 3 0
C 1 0 3 1 3
You could first convert to longer format and then count values per group and convert back to wider format using pivot_wider like this:
library(dplyr)
library(tidyr)
dat %>%
pivot_longer(cols = c(color, sex)) %>%
group_by(city) %>%
add_count(value) %>%
distinct() %>%
select(-name) %>%
pivot_wider(names_from = value, values_from = n, values_fill = 0)
#> # A tibble: 3 × 6
#> # Groups: city [3]
#> city red M green F blue
#> <chr> <int> <int> <int> <int> <int>
#> 1 A 1 1 2 2 0
#> 2 B 0 0 1 3 2
#> 3 C 3 3 0 1 1
Created on 2023-01-31 with reprex v2.0.2

Mutate multiple columns if grouping variable has all observations missing

I'm trying to mutate the columns "a" and "b" only if the grouping variable "group" has all observations missing. The attempted solution changes the group "blue", in which not all observations are missing. Thanks in advance for your valuable time!
Code below:
library(tidyverse)
# sample data
a <- c(NA,NA,1,1,NA,1)
b <- c(1,1,NA,NA,1,NA)
c <- letters[1:6]
group <- c("yellow","yellow","black","black", "blue", "blue")
(data <- as_tibble(data.frame(a,b,c,group)))
# a b c group
# <dbl> <dbl> <fct> <fct>
# 1 NA 1 a yellow
# 2 NA 1 b yellow
# 3 1 NA c black
# 4 1 NA d black
# 5 NA 1 e blue
# 6 1 NA f blue
# failed attempt: observations from group "blue" change
(data %>%
dplyr::group_by(group) %>%
dplyr::mutate(across(1:2, ~ ifelse(all(is.na(.x)), 99999,.x))))
# a b c group
# <dbl> <dbl> <fct> <fct>
# 1 99999 1 a yellow
# 2 99999 1 b yellow
# 3 1 99999 c black
# 4 1 99999 d black
# 5 NA 1 e blue
# 6 NA 1 f blue
# desired output - observations from blue remain the same
a2 <- c(99999,99999,1,1,NA,1)
b2 <- c(1,1,99999,99999,1,NA)
c2 <- letters[1:6]
group2 <- c("yellow","yellow","black","black", "blue", "blue")
(data_desired <- as_tibble(data.frame(a2,b2,c2,group2)))
# a2 b2 c2 group2
# <dbl> <dbl> <fct> <fct>
# 1 99999 1 a yellow
# 2 99999 1 b yellow
# 3 1 99999 c black
# 4 1 99999 d black
# 5 NA 1 e blue
# 6 1 NA f blue
You could try this:
library(tidyverse)
# sample data
a <- c(NA,NA,1,1,NA,1)
b <- c(1,1,NA,NA,1,NA)
c <- letters[1:6]
group <- c("yellow","yellow","black","black", "blue", "blue")
(data <- as_tibble(data.frame(a,b,c,group)))
(data %>%
dplyr::group_by(group) %>%
dplyr::mutate(across(1:2, ~ ifelse(is.na(.x), 99999,.x))))
# A tibble: 6 x 4
# Groups: group [3]
a b c group
<dbl> <dbl> <fct> <fct>
1 99999 1 a yellow
2 99999 1 b yellow
3 1 99999 c black
4 1 99999 d black
5 99999 1 e blue
6 1 99999 f blue
Not the best solution but you can deal with it...
data <- data %>%
group_by(group) %>%
mutate(new = paste0(a, "_", b),
new1 = if_else(new == lag(new), str_replace(new, "NA", "99999"), new),
new2 = if_else(new == lead(new), str_replace(new, "NA", "99999"), new)
) %>%
separate(col = new1, into = c("a_new1", "b_new1"), sep = "_", extra = "drop") %>%
separate(col = new2, into = c("a_new2", "b_new2"), sep = "_", extra = "drop") %>%
mutate(a2 = if_else(is.na(a_new1), replace_na(a_new2), a_new1),
b2 = if_else(is.na(b_new1), replace_na(b_new2), b_new1)
) %>%
select(a, b, c, group, a2, b2) %>%
type_convert()
data
# A tibble: 6 x 6
# Groups: group [3]
a b c group a2 b2
<dbl> <dbl> <fct> <fct> <dbl> <dbl>
1 NA 1 a yellow 99999 1
2 NA 1 b yellow 99999 1
3 1 NA c black 1 99999
4 1 NA d black 1 99999
5 NA 1 e blue NA 1
6 1 NA f blue 1 NA
Thanks all for the input!
Finally, this is how I resolved this with lists and purrr.
library(tidyverse)
library(purrr)
# sample data
a <- c(NA,NA,1,1,NA,1)
b <- c(1,1,NA,NA,1,NA)
c <- letters[1:6]
group <- c("yellow","yellow","black","black", "blue", "blue")
(data <- as_tibble(data.frame(a,b,c,group)))
# list with groups in which all cases are NA
list1 <- data %>%
split(.,.$group) %>%
map(~select(.x,as.vector(which(colSums(is.na(.)) == nrow(.))))) %>%
map(~mutate_all(.x, replace_na, 99999))
# list with groups in which there is at least one valid observation
list2 <- data %>%
split(.,.$group) %>%
map(~select(.x, as.vector(which(colSums(is.na(.)) != nrow(.)))))
# putting the groups together into a dataframe
list3 <- mapply(cbind, list1, list2, SIMPLIFY=FALSE)
(desired_output <- do.call(rbind.data.frame, list3))

R variable number of string concatenations within group_by

Let's say I have the following table of houses (or anything) and their colors:
I'm trying to:
group_by(Group)
count rows (I assume with length(unique(ID)),
mutate or summarize into a new row with a count of each color in group, as a string.
Result should be:
So I know step 3 could be done by manually entering every possible combination with something like
df <- df %>%
group_by(Group) %>%
mutate(
Summary = case_when(
all(
sum(count_green) > 0
) ~ paste(length(unique(ID)), " houses, ", count_green, " green")
)
)
but what if I have hundreds of possible combinations? Is there a way to paste into a string and append for each new color/count?
Here is one approach where we count the frequency of 'Group', 'Color' with add_count, unite that with 'Color', then grouped by 'Group', create the 'Summary' column by concatenating the unique elements of 'nColor' with the frequency (n())
library(dplyr)
library(tidyr)
library(stringr)
df %>%
add_count(Group, Color) %>%
unite(nColor, n, Color, sep= ' ', remove = FALSE) %>%
group_by(Group) %>%
mutate(
Summary = str_c(n(), ' houses, ', toString(unique(nColor)))) %>%
select(-nColor)
# Groups: Group [2]
# ID Group Color n Summary
# <int> <chr> <chr> <int> <chr>
#1 1 a Green 2 3 houses, 2 Green, 1 Orange
#2 2 a Green 2 3 houses, 2 Green, 1 Orange
#3 3 a Orange 1 3 houses, 2 Green, 1 Orange
#4 4 b Blue 2 3 houses, 2 Blue, 1 Yellow
#5 5 b Yellow 1 3 houses, 2 Blue, 1 Yellow
#6 6 b Blue 2 3 houses, 2 Blue, 1 Yellow
data
df <- structure(list(ID = 1:6, Group = c("a", "a", "a", "b", "b", "b"
), Color = c("Green", "Green", "Orange", "Blue", "Yellow", "Blue"
)), class = "data.frame", row.names = c(NA, -6L))
Here's an approach with map_chr from purrr and a lot of pasting.
library(dplyr)
library(purrr)
df %>%
group_by(Group) %>%
mutate(Summary = paste(n(),"houses,",
paste(map_chr(unique(as.character(Color)),
~paste(sum(Color == .x),.x)),
collapse = ", ")))
## A tibble: 6 x 4
## Groups: Group [2]
# ID Group Color Summary
# <int> <fct> <fct> <chr>
#1 1 a Green 3 houses, 2 Green, 1 Orange
#2 2 a Green 3 houses, 2 Green, 1 Orange
#3 3 a Orange 3 houses, 2 Green, 1 Orange
#4 4 b Blue 3 houses, 2 Blue, 1 Yellow
#5 5 b Yellow 3 houses, 2 Blue, 1 Yellow
#6 6 b Blue 3 houses, 2 Blue, 1 Yellow

Group by, summarize, spread in R not working

I have a data frame that looks like the following:
ID Code Desc
1 0A Red
1 NA Red
2 1A Blue
3 2B Green
I want to first create a new column where I concatenate the values in the Code column where the IDs are the same. So:
ID Combined_Code Desc
1 0A | NA Red
2 1A Blue
3 2B Green
Then I want to take the original Code column and spread it. The values in this case would be a count of how many times each Code shows up for a given ID. So:
ID Combined_Code 0A NA 1A 2B Desc
1 0A | NA 1 1 0 0 Red
2 1A 0 0 1 0 Blue
3 2B 0 0 0 1 Green
I've tried:
sample_data %>%
group_by(ID) %>%
summarise(Combined_Code = paste(unique(Combined_Code), collapse ='|'))
This works for creating the concatenation. However, I can't get this to work in tandem with spread:
sample_data %>%
group_by(ID) %>%
summarise(Combined_Code = paste(unique(Combined_Code), collapse ='|'))
sample_data <- spread(count(sample_data, ID, Combined_Code, Desc., Code), Code, n, fill = 0)
Doing this spreads, but drops the concatenation. I've also tried this with filter instead of summarise, which gives the same result. This results in:
ID Combined_Code 0A NA 1A 2B Desc
1 0A 1 0 0 0 Red
1 NA 0 1 0 0 Red
2 1A 0 0 1 0 Blue
3 2B 0 0 0 1 Green
Finally, I've tried piping spread through the summarise function:
sample_data %>%
group_by(ID) %>%
summarise(Combined_Code = paste(unique(Combined_Code), collapse ='|')) %>%
spread(count(sample_data, ID, Combined_Code, Desc., Code), Code, n, fill = 0)
This results in the error:
Error: `var` must evaluate to a single number or a column name, not a list
Run `rlang::last_error()` to see where the error occurred.
What can I do to solve these problems?
We can do a group by paste
library(dplyr)
library(stringr)
df1 %>%
group_by(ID, Desc) %>%
summarise(Combined_Code = str_c(Code, collapse="|"))
# A tibble: 3 x 3
# Groups: ID [3]
# ID Desc Combined_Code
# <int> <chr> <chr>
#1 1 Red 0A|0B
#2 2 Blue 1A
#3 3 Green 2B
For the second case, after creating a 'val' column of 1s, paste the 'Code' elements afte grouping by 'ID', 'Desc', then use pivot_wider from tidyr to reshape from 'long' to 'wide format.
library(tidyr)
df1 %>%
mutate(val = 1) %>%
group_by(ID, Desc) %>%
mutate(Combined_Code = str_c(Code, collapse="|")) %>%
pivot_wider(names_from = Code, values_from = val, values_fill = list(val = 0))
# A tibble: 3 x 7
# Groups: ID, Desc [3]
# ID Desc Combined_Code `0A` `0B` `1A` `2B`
# <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#1 1 Red 0A|0B 1 1 0 0
#2 2 Blue 1A 0 0 1 0
#3 3 Green 2B 0 0 0 1
The OP's expected output is
ID Combined_Code 0A 0B 1A 2B Desc
1 0A | 0B 1 1 0 0 Red
2 1A 0 0 1 0 Blue
3 2B 0 0 0 1 Green
Update
For the updated dataset, there are NA elements in the 'Code', and by default str_c returns NA if there any NA as one of the elements, while paste still returns the NA along with the other elements. Here, we replace the str_c with paste
df2 %>%
mutate(val = 1) %>%
group_by(ID, Desc) %>%
mutate(Combined_Code = paste(Code, collapse="|")) %>%
pivot_wider(names_from = Code, values_from = val, values_fill = list(val = 0))
# A tibble: 3 x 7
# Groups: ID, Desc [3]
# ID Desc Combined_Code `0A` `NA` `1A` `2B`
# <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#1 1 Red 0A|NA 1 1 0 0
#2 2 Blue 1A 0 0 1 0
#3 3 Green 2B 0 0 0 1
data
df1 <- structure(list(ID = c(1L, 1L, 2L, 3L), Code = c("0A", "0B", "1A",
"2B"), Desc = c("Red", "Red", "Blue", "Green")),
class = "data.frame", row.names = c(NA,
-4L))
df2 <- structure(list(ID = c(1L, 1L, 2L, 3L), Code = c("0A", NA, "1A",
"2B"), Desc = c("Red", "Red", "Blue", "Green")), class = "data.frame",
row.names = c(NA,
-4L))

pivot_wider when there's no value column

I'm trying to reshape a dataset from long to wide. The following code works, but I'm curious if there's a way not to provide a value column and still use pivot_wider. In the following example, I have to create a temporary column "val" to use pivot_wider, but is there a way I can do it without it?
a <- data.frame(name = c("sam", "rob", "tom"),
type = c("a", "b", "c"))
a
name type
1 sam a
2 rob b
3 tom c
I want to convert it as the following.
name a b c
1 sam 1 0 0
2 rob 0 1 0
3 tom 0 0 1
This can be done by the following code, but can I do it without creating "val" column (and still using tidyverse language)?
a <- data.frame(name = c("sam", "rob", "tom"),
type = c("a", "b", "c"),
val = rep(1, 3)) %>%
pivot_wider(names_from = type, values_from = val, values_fill = list(val = 0))
You can use the values_fn argument to assign 1 and values_fill to assign 0:
library(tidyr)
pivot_wider(a, names_from = type, values_from = type, values_fn = ~1, values_fill = 0)
# A tibble: 3 x 4
name a b c
<fct> <dbl> <dbl> <dbl>
1 sam 1 0 0
2 rob 0 1 0
3 tom 0 0 1
We can mutate with a column of 1s and use that in pivot_wider
library(dplyr)
library(tidyr)
a %>%
mutate(n = 1) %>%
pivot_wider(names_from = type, values_from = n, values_fill = list(n = 0))
# A tibble: 3 x 4
# name a b c
# <fct> <dbl> <dbl> <dbl>
#1 sam 1 0 0
#2 rob 0 1 0
#3 tom 0 0 1
In base R, it would be easier..
table(a)
Going older school, reshape2::dcast, or the thriving data.table::dcast, let you do this by specifying an aggregate function:
reshape2::dcast(a, name ~ type, fun.aggregate = length)
# name a b c
# 1 rob 0 1 0
# 2 sam 1 0 0
# 3 tom 0 0 1
data.table::dcast(setDT(a), name ~ type, fun.aggregate = length)
# name a b c
# 1: rob 0 1 0
# 2: sam 1 0 0
# 3: tom 0 0 1

Resources