separate_columns for tidyr - r

Let's say I had a survey question that read:
What did you eat?
[ ] apple
[ ] pear
[x] banana
[x] grapes
Now, I have the endorsed options as comma-separated strings in one variable.
I wrote myself a little helper to turn this comma-separated list of answers into boolean dummies showing whether each box was checked.
df <- data.frame(
x = 1:5,
ate = c("apple", "apple, pear, banana", "banana, grapes", NA_character_, ""),
stringsAsFactors = FALSE
)
separate_columns <- function(df, col, convert = TRUE, sep = ", ") {
colname <- deparse(substitute(col))
# sorry about this ugly non-rlang approach, hoping not to reuse this
df$.splitcol <- df %>% pull(colname)
separate_rows(df, .splitcol, convert = convert, sep = sep) %>%
mutate(.splitcol = stringr::str_c(colname, "_", .splitcol), value = 1) %>%
mutate(.splitcol = if_else(is.na(.splitcol), stringr::str_c(colname, "_nonresponse"), .splitcol)) %>%
spread(.splitcol, value, fill = 0) %>%
select(-colname)
}
separate_columns(df, ate)
Gets me to this:
x ate_apple ate_banana ate_grapes ate_nonresponse ate_pear
1 1 0 0 0 0
2 1 1 0 0 1
3 0 1 1 0 0
4 0 0 0 1 0
5 0 0 0 1 0
Writing the helper felt clunky, and I feel like I'm missing a more tidyverse way of accomplishing the same transformation (despite lots of searching).
Also, I found no easy way for missings to propagate using this method (I'd prefer if all dummies would be missing if the response was NA, but 0 if it was an empty string). So, I'd rather get this
x ate_apple ate_banana ate_grapes ate_pear
1 1 0 0 0
2 1 1 0 1
3 0 1 1 0
4 NA NA NA NA
5 0 0 0 0
Is there a nicer tidyverse way?

After changing into 'long' format by splitting the 'ate' column by the delimiter ,, create a column of 1 and spread from 'long' to 'wide'
library(tidyverse)
df %>%
separate_rows(ate, sep=", ", convert = TRUE) %>%
mutate(ate = replace(ate, is.na(ate), "NA"),
n = paste(NA ^ (ate == "NA")),
ate = paste0("ate_", replace(ate, ate == "", "nonresponse" ))) %>%
spread(ate, n, fill = "0") %>%
mutate_at(vars(-x, -ate_NA),
funs(replace(as.integer(.), ate_NA=="NA", NA_integer_))) %>%
select(-ate_NA)
# x ate_apple ate_banana ate_grapes ate_nonresponse ate_pear
#1 1 1 0 0 0 0
#2 2 1 1 0 0 1
#3 3 0 1 1 0 0
#4 4 NA NA NA NA NA
#5 5 0 0 0 1 0

I take a different approach, by first extracting want was there to eat and then matching it in the data:
total_eat_list <- map(df$ate, str_split, patter = ",") %>%
unlist() %>%
str_trim() %>%
na.exclude() %>%
unique()
Remove empty strings:
total_eat_list <- total_eat_list[total_eat_list != ""]
total_eat_list
# [1] "apple" "pear" "banana" "grapes"
Now lets map everything in the original data:
map_df(total_eat_list, ~
df %>%
mutate(ate_what = str_c("ate_", .x),
ind = case_when(str_detect(string = df$ate, .x) ~ 1,
!str_detect(string = df$ate, .x) ~ 0,
TRUE ~ NA_real_))) %>%
spread(ate_what, ind) %>%
select(-ate)
# A tibble: 5 x 5
# x ate_apple ate_banana ate_grapes ate_pear
# <int> <dbl> <dbl> <dbl> <dbl>
# 1 1 1 0 0 0
# 2 2 1 1 0 1
# 3 3 0 1 1 0
# 4 4 NA NA NA NA
# 5 5 0 0 0 0
The nice thing is that NAs are infectious for the str_-functions.
As function:
who_ate_what <- function(data, col) {
col <- enquo(col)
col_name <- quo_name(col)
match_list <- data %>%
select(!!col) %>%
map(str_split, patter = ",") %>%
unlist() %>%
str_trim() %>%
na.exclude() %>%
unique()
match_list <- match_list[match_list != ""]
map_df(match_list, ~
data %>%
mutate(matches = str_c(!!col_name, "_", .x),
ind = case_when(str_detect(string = !!col, .x) ~ 1,
!str_detect(string = !!col, .x) ~ 0,
TRUE ~ NA_real_)
)) %>%
spread(matches, ind) %>%
select(-!!col)
}

This is way too verbose I'm sure, but I guess its a start.
library(tidyverse)
df <- data.frame(
x = 1:5,
ate = c("apple", "apple, pear, banana", "banana, grapes", NA_character_, ""),
stringsAsFactors = FALSE
)
df %>%
nest(-x) %>%
mutate(data = map(data, ~str_split(.x$ate, ",") %>% unlist())) %>%
unnest() %>%
group_by(x, data) %>%
summarise(n = n()) %>%
ungroup() %>%
spread(data, n, fill = NA) %>%
select(-`<NA>`) %>%
mutate(rs = rowSums(.[2:ncol(.)],na.rm = TRUE)) %>%
gather(nm, val, -x, -rs) %>%
mutate(val = case_when(
is.na(val) & rs > 0 ~ "0",
is.na(val) & rs == 0 ~ "NA",
!is.na(val) ~ as.character(val)
), val = as.numeric(val)) %>%
spread(nm, val, fill = NA) %>%
select(-rs, -V1)
#> # A tibble: 5 x 6
#> x ` banana` ` grapes` ` pear` apple banana
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0 0 0 1 0
#> 2 2 1 0 1 1 0
#> 3 3 0 1 0 0 1
#> 4 4 NA NA NA NA NA
#> 5 5 0 0 0 0 0
EDIT
Lets wrap this into a function and take care of the name issue. I adopted the splitting from your original function to make the use of quosures easier.
my_sep_fun <- function(data, col){
col <- enquo(col)
col_name <- quo_name(col)
data %>%
separate_rows(!!col, sep =', ', convert = TRUE) %>%
group_by(x, !!col) %>%
summarise(n = n()) %>%
ungroup() %>%
spread(!!col, n, fill = NA) %>%
select(-`<NA>`) %>%
mutate(rs = rowSums(.[2:ncol(.)],na.rm = TRUE)) %>%
gather(nm, val, -x, -rs) %>%
mutate(val = case_when(
is.na(val) & rs > 0 ~ "0",
is.na(val) & rs == 0 ~ "NA",
!is.na(val) ~ as.character(val)
), val = as.numeric(val)) %>%
spread(nm, val, fill = NA) %>%
select(-rs, -V1) %>%
rename_at(vars(2:ncol(.)), funs(paste0(!!col_name,"_", .)))
}
my_sep_fun(df, ate)
#> # A tibble: 5 x 5
#> x ate_apple ate_banana ate_grapes ate_pear
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 0 0 0
#> 2 2 1 1 0 1
#> 3 3 0 1 1 0
#> 4 4 NA NA NA NA
#> 5 5 0 0 0 0
Created on 2018-08-20 by the reprex
package (v0.2.0).

One solution, much less verbose, in just three lines. Once you have the dataframe:
First, separate the values in each cell:
df <- separate_rows_(df, 'ate')
Second, dummify every answer using the function dummify from DataExplorer:
df <- DataExplorer::dummify(df, 'ate')
Third, aggregate the redundant rows like that:
df <- aggregate(df[,2:6], by=df$x, FUN= sum)
(you could also apply a max function here since you want to capture all 1's in the columns).
Done!

Related

Extract separate columns from data frame

I am trying to extract the first names of the titles of the columns such as pack_10, pack_18 and pack_20 and group all of them with sum. Below you can see my data
df<-data.frame(
packs_10_value5=c(100,0,0,0,0),
packs_18_value9=c(200,0,0,0,0),
packs_20_value13=c(300,0,0,0,0),
packs_10_value15=c(100,0,0,0,0),
packs_18_value17=c(200,0,0,0,0),
packs_20_value18=c(300,0,0,0,0)
)
df
So can anybody help me with how to solve this?
You can split the columns and apply rowSums by group:
library(purrr)
split.default(df, f = gsub("_value.*", "", names(df))) %>%
map_dfc(rowSums)
# A tibble: 5 × 3
packs_10 packs_18 packs_20
<dbl> <dbl> <dbl>
1 200 400 600
2 0 0 0
3 0 0 0
4 0 0 0
5 0 0 0
A bit more convoluted/less elegant, but you could also create a row_number column, pivot_longer by all other columns, do the regex, aggregate by the clean column name and each row and pivot_wider back:
library(dplyr)
library(tidyr)
df %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -rn) %>%
group_by(name = sub('_value\\d+', '', name), rn) %>%
summarise(value = sum(value, na.rm = TRUE)) %>%
pivot_wider(names_from = 'name', values_from = 'value') %>%
select(-rn)
Output:
# A tibble: 5 x 3
packs_10 packs_18 packs_20
<dbl> <dbl> <dbl>
1 200 400 600
2 0 0 0
3 0 0 0
4 0 0 0
5 0 0 0

Automatize repetitive process using a function applied for different variables and names in R

I have a dataset like original
id <- c(1,1,1,2,3,3,3,4,4)
period <- c(1,1,2,2,1,2,3,1,3)
iso <- c("USA", "USA", "CHN", "ESP", "UK", "FRA", "KOR", "KOR", "ITA")
via <- c(1, 1, 2, 7, 5, 4, 4, 3, 2)
region <- c(4, 4, 4, 1, 27, 35, 9, 35, 35)
original <- data.frame(id, period, iso, via, region)
and I want to generate a 1-row (by period) dataset with dummies for each main variable (iso, via, region -> in my dataset I have way more main variables). So the dummy-dataset for destination (iso) looks like dest.dummies:
dest.dummies <- original %>%
select(id, period, iso) %>%
filter(., iso != "") %>%
distinct() %>%
mutate(iso_=1)
dest.dummies <- reshape(dest.dummies, idvar=c("id","period"), timevar="iso", direction="wide")
dest.dummies[is.na(dest.dummies)==T] <- 0
dest.dummies <- dest.dummies %>%
dplyr::rename_all(
funs(stringr::str_replace_all(., "iso_.", "iso_")) #change pattern in names "iso_." by "iso_"
)
My intention is to create a function such that I can substitute in the code above (dest.dummies) the name "iso" by each of the core variables defined (via, region, etc).
I have tried to create a function() imputing the name of the variable in each case:
dummyfier <- function(data, var){
df <- data %>%
select(id, period, {{var}}) %>%
filter(., {{var}} != "") %>%
distinct() %>%
mutate('{{var}}_' :=1)
df <- reshape(df, idvar=c("id","period"), timevar={{var}}, direction="wide")
df[is.na(df)==T] <- 0
df <- df %>%
dplyr::rename_all(
funs(stringr::str_replace_all(., "var_.", "var_")) #change pattern in names "var_." by "var_"
)
}
dest.dummies <- dummyfier(original, "iso")
via.dummies <- dummyfier(original, "via")
region.dummies <- dummyfier(original, "region")
but it does not make the trick.
Any idea on how to avoid this repetitive coding?
Thank you?
You can simplify your original attempt to :
library(dplyr)
library(tidyr)
original %>%
select(id, period, iso) %>%
filter(iso != "") %>%
distinct() %>%
pivot_wider(names_from = iso, values_from = iso,
values_fn = length, values_fill = 0, names_prefix = 'iso_')
# id period iso_USA iso_CHN iso_ESP iso_UK iso_FRA iso_KOR iso_ITA
# <dbl> <dbl> <int> <int> <int> <int> <int> <int> <int>
#1 1 1 1 0 0 0 0 0 0
#2 1 2 0 1 0 0 0 0 0
#3 2 2 0 0 1 0 0 0 0
#4 3 1 0 0 0 1 0 0 0
#5 3 2 0 0 0 0 1 0 0
#6 3 3 0 0 0 0 0 1 0
#7 4 1 0 0 0 0 0 1 0
#8 4 3 0 0 0 0 0 0 1
Now put that in a function :
dummyfier <- function(data, var) {
col <- deparse(substitute(var))
original %>%
select(id, period, {{var}}) %>%
filter({{var}} != "") %>%
distinct() %>%
pivot_wider(names_from = {{var}}, values_from = {{var}},
values_fn = length, values_fill = 0,
names_prefix = paste0(col, '_'))
}
dest.dummies <- dummyfier(original, iso)
via.dummies <- dummyfier(original, via)
region.dummies <- dummyfier(original, region)

Selecting all columns that have some specific values

I have a data.frame with more than 50 columns and 10,000 rows I want select those columns that are haveing 0 or 1 in them excluding other values in those columna
sample data.frame is as below:
dummy_df <- data.frame(
id=1:4,
gender=c(4,1,0,1),
height=seq(150, 180,by = 10),
smoking=c(3,0,1,0)
)
I want to select all those columns with 0 or 1 value and exclude other values like 4 in gender and 3 in smoking and as below
gender smoking
1 0
0 1
1 0
but I have 50 columns in actual data frame and I don't know which of them are having 0 or 1
What I'm trying is:
dummy_df %>% select_if(~ all( . %in% 0:1))
Is this useful for you?
dummy_df %>%
select(- c(id, height)) %>%
rowwise() %>%
filter(any(c_across() == 0)|any(c_across() == 1))
# A tibble: 3 x 2
# Rowwise:
gender smoking
<dbl> <dbl>
1 1 0
2 0 1
3 1 0
EDIT:
If you don't know in advance which cols contain 0 and/or 1, you can determine that in base R:
temp <- dummy_df[sapply(dummy_df, function(x) any(x == 0|x == 1))]
Now you can filter for rows with 0and/or 1:
temp %>%
rowwise() %>%
filter(any(c_across() == 0)|any(c_across() == 1))
I think it's more like a case of filter than select:
library(dplyr)
dummy_df %>%
filter(if_all(c(gender, smoking), ~ .x %in% c(0, 1)))
id gender height smoking
1 2 1 160 0
2 3 0 170 1
3 4 1 180 0

Rename and create indicator variables?

I have a data frame in which one of the columns ('subject') has a bit of an odd format. I would like to eliminate all observations where the first digit is greater than one. Additionally, I would like to create indicator variables for the remaining observations, and eliminate the number from the beginning.
So I want this:
Subject
1; HMB 2 (HB)
1; HRB 4 (HB-R)
2; HRB 1 (HB-L); HRB4
1; HRB 2 (HB-L)
To become this:
HMB 2 (HB) HRB 4 (HB-R) HRB 2 (HB-L)
1 0 0
0 1 0
0 0 1
You can use separate to get data in different columns, keep observations which are less than equal to 1 and get data in wide format.
library(dplyr)
library(tidyr)
df %>%
separate(Subject, c('col1', 'col2'),
sep = ';', extra = 'drop', convert = TRUE) %>%
filter(col1 <= 1) %>%
mutate(col1 = 1,
row = row_number()) %>%
pivot_wider(names_from = col2, values_from = col1, values_fill = 0) %>%
select(-row)
# ` HMB 2 (HB)` ` HRB 4 (HB-R)` ` HRB 2 (HB-L)`
# <dbl> <dbl> <dbl>
#1 1 0 0
#2 0 1 0
#3 0 0 1
data
df <- structure(list(Subject = c("1; HMB 2 (HB)", "1; HRB 4 (HB-R)",
"2; HRB 1 (HB-L); HRB4", "1; HRB 2 (HB-L)")),
class = "data.frame", row.names = c(NA, -4L))
Here is a more generic dplyr approach. You can separate the values into rows and then filter by group. In this way, you can avoid specifying the columns to be created for each Subject.
library(dplyr)
library(tidyr)
df %>%
mutate(id = row_number(), value = 1L) %>%
separate_rows(Subject, sep = ";\\s*") %>%
group_by(id) %>%
filter(row_number() > 1L & as.integer(Subject[[1L]]) < 2L) %>%
pivot_wider(names_from = "Subject", values_fill = 0L)
Output
# A tibble: 3 x 4
# Groups: id [3]
id `HMB 2 (HB)` `HRB 4 (HB-R)` `HRB 2 (HB-L)`
<int> <int> <int> <int>
1 1 1 0 0
2 2 0 1 0
3 4 0 0 1
Does this work:
library(tidyr)
library(dplyr)
df %>% separate(col = Subject, into = c('count','Subject', 'Subject2'), sep = ';') %>%
filter(!count >1) %>% select(1,2) %>% type.convert(as.is = T) %>%
mutate(ID = row_number()) %>% pivot_wider(id_cols = ID, names_from = Subject, values_from = count, values_fill = 0) %>%
select(-ID)
# A tibble: 3 x 3
` HMB 2 (HB)` ` HRB 4 (HB-R)` ` HRB 2 (HB-L)`
<int> <int> <int>
1 1 0 0
2 0 1 0
3 0 0 1
>

Add multiple new columns to dataframe based on condition in R

Suppose I have a data set which looks like:
library(tidyverse)
df_raw <- data.frame(id = paste0('id', sample(c(1:13), replace = TRUE)), startTime = as.Date(rbeta(13, 0.7, 10) * 100, origin = "2016-01-01"), Channel = paste0('c', sample(c(1:3), 13, replace = TRUE, prob = c(0.2, 0.12, 0.3))) ) %>%
group_by(id) %>%
mutate(totals_transactions = sample(c(0, 1), n(), prob = c(0.9, 0.1), replace = TRUE)) %>%
ungroup() %>%
arrange(id, startTime)
Now I would like to summarize the same id's together and add columns to this new dataframe which indicates whether or not a certain channel is used by that id. I have done it like this:
seq_summaries <- df_raw %>%
group_by(id) %>%
summarize(
c1_touches = max(ifelse(Channel == "c1",1,0)),
c2_touches = max(ifelse(Channel == "c2",1,0)),
c3_touches = max(ifelse(Channel == "c3",1,0)),
conversions = sum(totals_transactions)
) %>% ungroup()
However, I'm searching for a way in which I don't have to manually create columns for every channel, as the number of channels could be much more than three which results in a lot of work.
Here is one idea. Notice that you have no any c2 in your data frame. To use the complete function, you still need to provide a complete list of c (c1 to c3).
library(tidyverse)
df2 <- df_raw %>%
group_by(id, Channel) %>%
summarize(
touches = 1L,
conversions = as.integer(sum(totals_transactions))
) %>%
ungroup() %>%
complete(Channel = paste0("c", 1:3)) %>%
spread(Channel, touches, fill = 0L) %>%
drop_na(id) %>%
select(id, paste0("c", 1:3), conversions)
df2
# # A tibble: 8 x 5
# id c1 c2 c3 conversions
# <fct> <int> <int> <int> <int>
# 1 id10 1 0 0 0
# 2 id11 0 0 1 0
# 3 id12 0 0 1 1
# 4 id2 0 0 1 0
# 5 id3 0 0 1 0
# 6 id6 1 0 0 0
# 7 id8 1 0 0 1
# 8 id9 0 0 1 0

Resources