Suppose I have a data set which looks like:
library(tidyverse)
df_raw <- data.frame(
  id = paste0('id', sample(c(1:13), replace = TRUE)),
  startTime = as.Date(rbeta(13, 0.7, 10) * 100, origin = "2016-01-01"),
  Channel = paste0('c', sample(c(1:3), 13, replace = TRUE, prob = c(0.2, 0.12, 0.3)))
) %>%
group_by(id) %>%
mutate(totals_transactions = sample(c(0, 1), n(), prob = c(0.9, 0.1), replace = TRUE)) %>%
ungroup() %>%
arrange(id, startTime)
Now I would like to summarize rows with the same id together and add columns to this new data frame indicating whether or not a certain channel is used by that id. I have done it like this:
seq_summaries <- df_raw %>%
group_by(id) %>%
summarize(
c1_touches = max(ifelse(Channel == "c1",1,0)),
c2_touches = max(ifelse(Channel == "c2",1,0)),
c3_touches = max(ifelse(Channel == "c3",1,0)),
conversions = sum(totals_transactions)
) %>% ungroup()
However, I'm searching for a way in which I don't have to manually create a column for every channel, as the number of channels could be much larger than three, which results in a lot of work.
Here is one idea. Notice that there is no c2 at all in your data frame (at least in the sample drawn here). To use the complete() function, you still need to provide the full list of channels (c1 to c3).
library(tidyverse)
df2 <- df_raw %>%
group_by(id, Channel) %>%
summarize(
touches = 1L,
conversions = as.integer(sum(totals_transactions))
) %>%
ungroup() %>%
complete(Channel = paste0("c", 1:3)) %>%
spread(Channel, touches, fill = 0L) %>%
drop_na(id) %>%
select(id, paste0("c", 1:3), conversions)
df2
# # A tibble: 8 x 5
# id c1 c2 c3 conversions
# <fct> <int> <int> <int> <int>
# 1 id10 1 0 0 0
# 2 id11 0 0 1 0
# 3 id12 0 0 1 1
# 4 id2 0 0 1 0
# 5 id3 0 0 1 0
# 6 id6 1 0 0 0
# 7 id8 1 0 0 1
# 8 id9 0 0 1 0
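For what it's worth, here is a hedged sketch of the same idea with pivot_wider() (tidyr >= 1.0) instead of complete() + spread(); names_expand = TRUE (tidyr >= 1.2) builds a column for every factor level of Channel, so absent channels such as c2 still show up without a separate complete() step. Note that conversions is summed per id here, matching the question's summarize(), and touch is just an intermediate helper column of this sketch.
library(tidyverse)
df_raw %>%
  mutate(Channel = factor(Channel, levels = paste0("c", 1:3))) %>%
  group_by(id) %>%
  mutate(conversions = as.integer(sum(totals_transactions))) %>%
  ungroup() %>%
  distinct(id, Channel, conversions) %>%
  mutate(touch = 1L) %>%
  pivot_wider(names_from = Channel, values_from = touch,
              values_fill = 0L, names_expand = TRUE) %>%
  select(id, paste0("c", 1:3), conversions)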
I have a data frame like this:
df = data.frame(Ref = c("1", "2", "3", "4"),
                start_date = c("01/01/20", "02/04/21", NA, NA),
                text = c("foo", NA, "bar", "foo"),
                value = c(1000, 7000, 500, 200))
I want a data frame that counts the number of NA or blank values in each column and totals the value column for those rows.
So far, I have the following code:
library(reshape2)  # melt() comes from reshape2

naDF = colSums(is.na(df) | df == '')
naDF = data.frame(as.list(naDF))
naDF = melt(naDF)
Which produces this:
    variable value
1        Ref     0
2 start_date     2
3       text     1
4      value     0
But I want another column which totals the value column for those counts, e.g.:
    variable value total_value
1        Ref     0           0
2 start_date     2         700
3       text     1        7000
4      value     0           0
Any advice? Thank you
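For reference, here is a hedged sketch (not one of the answers that follow) of a dplyr >= 1.0 route that computes both the count of NA/blank cells and the corresponding value total in a single summarise(across()); the .names / names_sep pairing is only there so the column names can be split back apart.
library(dplyr)
library(tidyr)
df %>%
  summarise(across(everything(),
                   list(value       = ~ sum(is.na(.x) | .x == ""),
                        total_value = ~ sum(value[is.na(.x) | .x == ""])),
                   .names = "{.col}.{.fn}")) %>%
  pivot_longer(everything(), names_to = c("name", ".value"), names_sep = "\\.")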
Or with base R
# NA^!is.na(df) is 1 where a cell is NA and NA otherwise, so m1 holds each
# row's `value` only in the columns where that row is missing
m1 <- df$value * NA^!is.na(df)
data.frame(total_value = colSums(m1, na.rm = TRUE),
           value = colSums(!is.na(m1)))
-output
total_value value
Ref 0 0
start_date 700 2
text 7000 1
value 0 0
a <- df$value * is.na(df)   # each row's `value` where the cell is NA, 0 otherwise
data.frame(value = colSums(a > 0), total_value = colSums(a))
value total_value
Ref 0 0
start_date 2 700
text 1 7000
value 0 0
library(tidyverse)
df %>%
mutate(value1 = value) %>%
pivot_longer(-value1, values_to = 'res',
values_transform = as.character)%>%
group_by(name) %>%
summarise(value = sum(is.na(res)),
total_value = sum(is.na(res)*value1))
# A tibble: 4 × 3
name value total_value
<chr> <int> <dbl>
1 Ref 0 0
2 start_date 2 700
3 text 1 7000
4 value 0 0
We may use
library(dplyr)
library(tidyr)
df %>%
  # for each column, place the total of `value` over that column's NA rows into
  # its non-NA cells (and NA into its NA cells)
  mutate(across(everything(), ~ sum(value[is.na(.x)]) * NA^is.na(.x))) %>%
  pivot_longer(everything(), names_to = 'variable', values_to = 'total_value') %>%
  group_by(variable) %>%
  summarise(value = sum(is.na(total_value)), total_value = first(total_value))
-output
# A tibble: 4 × 3
variable value total_value
<chr> <int> <dbl>
1 Ref 0 0
2 start_date 2 700
3 text 1 7000
4 value 0 0
Here is another tidyverse approach using the purrr package to count the NAs in each column:
library(purrr)
library(tidyr)
library(dplyr)
df %>%
purrr::map_df(~sum(is.na(.))) %>%
pivot_longer(everything()) %>%
bind_cols(total_value = df$value)
name value total_value
<chr> <int> <dbl>
1 Ref 0 1000
2 start_date 2 7000
3 text 1 500
4 value 0 200
I need to summarize one variable/column of a long table after aggregating (group_by()) by another variable/column, and I need the summarized value broken down by all values of the other variables/columns.
Here is test data:
library(tidyverse)
set.seed(123)
Site <- str_c("S", 1:5)
Species <- str_c("Sps", 1:6)
print(Species_tbl <- bind_cols(Species = Species,
Exotic = rbinom(length(Species), 1, .3),
Migrant = rbinom(length(Species), 2, .3)))
Data_tbl <- expand.grid(Site = Site,
Species = Species) %>%
left_join(Species_tbl)
Data_tbl$Presence <- rbinom(nrow(Data_tbl), 1, .5)
And here is my best effort:
print(Data_tbl %>%
group_by(Site) %>%
summarise(N_sp = sum(Presence),
N_sp_Exo = sum(Presence[Exotic == 1]),
N_sp_Nat = sum(Presence[Exotic == 0]),
N_sp_M0 = sum(Presence[Migrant == 0]),
N_sp_M1 = sum(Presence[Migrant == 1]),
N_sp_M2 = sum(Presence[Migrant == 2])))
You can get the data in long format for your columns of interest, c(Exotic, Migrant), and take the sum of Presence for each unique column name and its values. This can then be merged with the per-Site sum.
library(dplyr)
library(tidyr)
data1 <- Data_tbl %>%
group_by(Site) %>%
summarise(N_sp = sum(Presence))
data2 <- Data_tbl %>%
pivot_longer(cols = c(Exotic, Migrant)) %>%
group_by(Site, name, value) %>%
summarise(result = sum(Presence), .groups = "drop") %>%
pivot_wider(names_from = c(name, value), values_from = result)
inner_join(data1, data2, by = 'Site')
# Site N_sp Exotic_0 Exotic_1 Migrant_0 Migrant_1 Migrant_2
# <fct> <int> <int> <int> <int> <int> <int>
#1 S1 4 2 2 1 2 1
#2 S2 3 2 1 0 2 1
#3 S3 2 1 1 0 2 0
#4 S4 4 2 2 1 3 0
#5 S5 4 1 3 1 2 1
The answer has been divided into two steps for readability. If you would like to do this in a single chain without creating temporary variables, that can be done as well; a sketch follows.
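Here is a hedged sketch of that single chain (the same logic as above, just piped together; relocate() needs dplyr >= 1.0):
library(dplyr)
library(tidyr)
Data_tbl %>%
  pivot_longer(cols = c(Exotic, Migrant)) %>%
  group_by(Site, name, value) %>%
  summarise(result = sum(Presence), .groups = "drop") %>%
  pivot_wider(names_from = c(name, value), values_from = result) %>%
  left_join(Data_tbl %>%
              group_by(Site) %>%
              summarise(N_sp = sum(Presence)),
            by = "Site") %>%
  relocate(N_sp, .after = Site)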
I have the following vectors:
bid = c(1,5,10,20,30,40,50)
n = c(31,29,27,25,23,21,19)
yes = c(0,3,6,7,9,13,17)
no = n - yes
I have two questions and haven't found solutions for them; I would appreciate it if someone could help me.
Q1: I want to write R code to create a two-column data frame df. Column 1 has bid, where each bid is repeated n times; Column 2 has c(rep(1, yes), rep(0, no)) at each bid.
Q2: Then, given the data frame df, I want to write R code to regenerate (from df) the vectors bid, n, yes, and no.
It is a bit unclear what you actually want; it is easier if you provide the desired result. Would this fit your Q1?
library(tidyverse)
bid = c(1,5,10,20,30,40,50)
n = c(31,29,27,25,23,21,19)
yes = c(0,3,6,7,9,13,17)
no = n - yes
df <- tibble(bid, yes, n, no = n - yes) %>%
  dplyr::select(-n) %>%
  pivot_longer(cols = c(yes, no)) %>%
  uncount(value) %>%
  mutate(yesno = ifelse(name == "yes", 1, 0)) %>%
  dplyr::select(-name)
df2 <- df %>%
  group_by(bid) %>%
  table() %>%
  as.data.frame() %>%
  pivot_wider(id_cols = bid, names_from = yesno, values_from = Freq) %>%
  rename(no = `0`, yes = `1`) %>%   # rename before using yes/no in mutate()
  mutate(n = yes + no)
bid <- df2$bid
n <- df2$n
yes <- df2$yes
I don't know what you mean for Q2, but for Q1 you could do this:
library(tidyverse)
pmap_dfr(list(bid, n, yes, no),
\(V1, V2, V3, V4) tibble(col1 = rep(V1, V2),
col2 = c(rep(1,V3),rep(0,V4))))
#> # A tibble: 175 x 2
#> col1 col2
#> <dbl> <dbl>
#> 1 1 0
#> 2 1 0
#> 3 1 0
#> 4 1 0
#> 5 1 0
#> 6 1 0
#> 7 1 0
#> 8 1 0
#> 9 1 0
#> 10 1 0
#> # ... with 165 more rows
EDIT:
For Q2, you can follow this:
library(tidyverse)
df <- pmap_dfr(list(bid, n, yes, no),
\(V1, V2, V3, V4) tibble(col1 = rep(V1, V2),
col2 = c(rep(1,V3),rep(0,V4))))
df2 <- df |>
count(col1, col2) |>
group_by(col1) |>
summarise(yes = sum(n[col2==1]),
n = sum(n))
bid2 <- df2$col1
n2 <- df2$n
yes2 <- df2$yes
no2 <- n2 - yes2
all.equal(c(bid, n, yes, no), c(bid2, n2, yes2, no2))
#> [1] TRUE
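Here is a hedged base R sketch of both questions (an alternative to the answers above, not taken from them); df_base and the *_v names are just illustrative:
# Q1: each bid repeated n times, paired with `yes` ones followed by `no` zeros
df_base <- data.frame(
  bid   = rep(bid, n),
  yesno = unlist(Map(function(y, k) rep(c(1, 0), c(y, k)), yes, no))
)

# Q2: recover the vectors from df_base
bid_v <- sort(unique(df_base$bid))
n_v   <- as.vector(table(df_base$bid))                       # rows per bid
yes_v <- as.vector(tapply(df_base$yesno, df_base$bid, sum))  # 1s per bid
no_v  <- n_v - yes_v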
I have a data frame in which one of the columns ('Subject') has a bit of an odd format. I would like to eliminate all observations where the first digit is greater than one. Additionally, I would like to create indicator variables for the remaining observations and eliminate the number from the beginning.
So I want this:
Subject
1; HMB 2 (HB)
1; HRB 4 (HB-R)
2; HRB 1 (HB-L); HRB4
1; HRB 2 (HB-L)
To become this:
HMB 2 (HB) HRB 4 (HB-R) HRB 2 (HB-L)
1 0 0
0 1 0
0 0 1
You can use separate to get the data in different columns, keep observations whose first value is less than or equal to 1, and get the data in wide format.
library(dplyr)
library(tidyr)
df %>%
separate(Subject, c('col1', 'col2'),
sep = ';', extra = 'drop', convert = TRUE) %>%
filter(col1 <= 1) %>%
mutate(col1 = 1,
row = row_number()) %>%
pivot_wider(names_from = col2, values_from = col1, values_fill = 0) %>%
select(-row)
# ` HMB 2 (HB)` ` HRB 4 (HB-R)` ` HRB 2 (HB-L)`
# <dbl> <dbl> <dbl>
#1 1 0 0
#2 0 1 0
#3 0 0 1
data
df <- structure(list(Subject = c("1; HMB 2 (HB)", "1; HRB 4 (HB-R)",
"2; HRB 1 (HB-L); HRB4", "1; HRB 2 (HB-L)")),
class = "data.frame", row.names = c(NA, -4L))
Here is a more generic dplyr approach. You can separate the values into rows and then filter by group. In this way, you can avoid specifying the columns to be created for each Subject.
library(dplyr)
library(tidyr)
df %>%
mutate(id = row_number(), value = 1L) %>%
separate_rows(Subject, sep = ";\\s*") %>%
group_by(id) %>%
filter(row_number() > 1L & as.integer(Subject[[1L]]) < 2L) %>%
pivot_wider(names_from = "Subject", values_fill = 0L)
Output
# A tibble: 3 x 4
# Groups: id [3]
id `HMB 2 (HB)` `HRB 4 (HB-R)` `HRB 2 (HB-L)`
<int> <int> <int> <int>
1 1 1 0 0
2 2 0 1 0
3 4 0 0 1
Does this work:
library(tidyr)
library(dplyr)
df %>%
  separate(col = Subject, into = c('count', 'Subject', 'Subject2'), sep = ';') %>%
  filter(!count > 1) %>%
  select(1, 2) %>%
  type.convert(as.is = T) %>%
  mutate(ID = row_number()) %>%
  pivot_wider(id_cols = ID, names_from = Subject, values_from = count, values_fill = 0) %>%
  select(-ID)
# A tibble: 3 x 3
` HMB 2 (HB)` ` HRB 4 (HB-R)` ` HRB 2 (HB-L)`
<int> <int> <int>
1 1 0 0
2 0 1 0
3 0 0 1
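A hedged base R sketch in the same spirit (not one of the answers above): keep rows whose leading count is at most 1, strip the prefix, and build the indicator matrix with table(). For this data each kept row holds a single subject; rows listing several would need splitting first.
keep <- as.integer(sub(";.*", "", df$Subject)) <= 1   # leading count <= 1
subj <- sub("^\\d+;\\s*", "", df$Subject[keep])       # drop the "1; " prefix
as.data.frame.matrix(table(seq_along(subj), subj))    # one indicator row per kept observation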
Let's say I had a survey question that read:
What did you eat?
[ ] apple
[ ] pear
[x] banana
[x] grapes
Now, I have the endorsed options as comma-separated strings in one variable.
I wrote myself a little helper to turn this comma-separated list of answers into boolean dummies showing whether each box was checked.
library(tidyverse)

df <- data.frame(
x = 1:5,
ate = c("apple", "apple, pear, banana", "banana, grapes", NA_character_, ""),
stringsAsFactors = FALSE
)
separate_columns <- function(df, col, convert = TRUE, sep = ", ") {
colname <- deparse(substitute(col))
# sorry about this ugly non-rlang approach, hoping not to reuse this
df$.splitcol <- df %>% pull(colname)
separate_rows(df, .splitcol, convert = convert, sep = sep) %>%
mutate(.splitcol = stringr::str_c(colname, "_", .splitcol), value = 1) %>%
mutate(.splitcol = if_else(is.na(.splitcol), stringr::str_c(colname, "_nonresponse"), .splitcol)) %>%
spread(.splitcol, value, fill = 0) %>%
select(-colname)
}
separate_columns(df, ate)
Gets me to this:
x ate_apple ate_banana ate_grapes ate_nonresponse ate_pear
1 1 0 0 0 0
2 1 1 0 0 1
3 0 1 1 0 0
4 0 0 0 1 0
5 0 0 0 1 0
Writing the helper felt clunky, and I feel like I'm missing a more tidyverse way of accomplishing the same transformation (despite lots of searching).
Also, I found no easy way for missings to propagate using this method (I'd prefer if all dummies would be missing if the response was NA, but 0 if it was an empty string). So, I'd rather get this
x ate_apple ate_banana ate_grapes ate_pear
1 1 0 0 0
2 1 1 0 1
3 0 1 1 0
4 NA NA NA NA
5 0 0 0 0
Is there a nicer tidyverse way?
After changing into 'long' format by splitting the 'ate' column on the delimiter ", ", create a column of 1s and spread from 'long' to 'wide'.
library(tidyverse)
df %>%
separate_rows(ate, sep=", ", convert = TRUE) %>%
mutate(ate = replace(ate, is.na(ate), "NA"),
n = paste(NA ^ (ate == "NA")),
ate = paste0("ate_", replace(ate, ate == "", "nonresponse" ))) %>%
spread(ate, n, fill = "0") %>%
mutate_at(vars(-x, -ate_NA),
funs(replace(as.integer(.), ate_NA=="NA", NA_integer_))) %>%
select(-ate_NA)
# x ate_apple ate_banana ate_grapes ate_nonresponse ate_pear
#1 1 1 0 0 0 0
#2 2 1 1 0 0 1
#3 3 0 1 1 0 0
#4 4 NA NA NA NA NA
#5 5 0 0 0 1 0
I take a different approach, by first extracting what there was to eat and then matching it in the data:
total_eat_list <- map(df$ate, str_split, pattern = ",") %>%
unlist() %>%
str_trim() %>%
na.exclude() %>%
unique()
Remove empty strings:
total_eat_list <- total_eat_list[total_eat_list != ""]
total_eat_list
# [1] "apple" "pear" "banana" "grapes"
Now lets map everything in the original data:
map_df(total_eat_list, ~
df %>%
mutate(ate_what = str_c("ate_", .x),
ind = case_when(str_detect(string = df$ate, .x) ~ 1,
!str_detect(string = df$ate, .x) ~ 0,
TRUE ~ NA_real_))) %>%
spread(ate_what, ind) %>%
select(-ate)
# A tibble: 5 x 5
# x ate_apple ate_banana ate_grapes ate_pear
# <int> <dbl> <dbl> <dbl> <dbl>
# 1 1 1 0 0 0
# 2 2 1 1 0 1
# 3 3 0 1 1 0
# 4 4 NA NA NA NA
# 5 5 0 0 0 0
The nice thing is that NAs are infectious for the str_-functions.
As function:
who_ate_what <- function(data, col) {
col <- enquo(col)
col_name <- quo_name(col)
match_list <- data %>%
select(!!col) %>%
map(str_split, pattern = ",") %>%
unlist() %>%
str_trim() %>%
na.exclude() %>%
unique()
match_list <- match_list[match_list != ""]
map_df(match_list, ~
data %>%
mutate(matches = str_c(!!col_name, "_", .x),
ind = case_when(str_detect(string = !!col, .x) ~ 1,
!str_detect(string = !!col, .x) ~ 0,
TRUE ~ NA_real_)
)) %>%
spread(matches, ind) %>%
select(-!!col)
}
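It can then be called like the original helper, e.g.:
who_ate_what(df, ate)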
This is way too verbose, I'm sure, but I guess it's a start.
library(tidyverse)
df <- data.frame(
x = 1:5,
ate = c("apple", "apple, pear, banana", "banana, grapes", NA_character_, ""),
stringsAsFactors = FALSE
)
df %>%
nest(-x) %>%
mutate(data = map(data, ~str_split(.x$ate, ",") %>% unlist())) %>%
unnest() %>%
group_by(x, data) %>%
summarise(n = n()) %>%
ungroup() %>%
spread(data, n, fill = NA) %>%
select(-`<NA>`) %>%
mutate(rs = rowSums(.[2:ncol(.)],na.rm = TRUE)) %>%
gather(nm, val, -x, -rs) %>%
mutate(val = case_when(
is.na(val) & rs > 0 ~ "0",
is.na(val) & rs == 0 ~ "NA",
!is.na(val) ~ as.character(val)
), val = as.numeric(val)) %>%
spread(nm, val, fill = NA) %>%
select(-rs, -V1)
#> # A tibble: 5 x 6
#> x ` banana` ` grapes` ` pear` apple banana
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0 0 0 1 0
#> 2 2 1 0 1 1 0
#> 3 3 0 1 0 0 1
#> 4 4 NA NA NA NA NA
#> 5 5 0 0 0 0 0
EDIT
Let's wrap this into a function and take care of the name issue. I adopted the splitting from your original function to make the use of quosures easier.
my_sep_fun <- function(data, col){
col <- enquo(col)
col_name <- quo_name(col)
data %>%
separate_rows(!!col, sep =', ', convert = TRUE) %>%
group_by(x, !!col) %>%
summarise(n = n()) %>%
ungroup() %>%
spread(!!col, n, fill = NA) %>%
select(-`<NA>`) %>%
mutate(rs = rowSums(.[2:ncol(.)],na.rm = TRUE)) %>%
gather(nm, val, -x, -rs) %>%
mutate(val = case_when(
is.na(val) & rs > 0 ~ "0",
is.na(val) & rs == 0 ~ "NA",
!is.na(val) ~ as.character(val)
), val = as.numeric(val)) %>%
spread(nm, val, fill = NA) %>%
select(-rs, -V1) %>%
rename_at(vars(2:ncol(.)), funs(paste0(!!col_name,"_", .)))
}
my_sep_fun(df, ate)
#> # A tibble: 5 x 5
#> x ate_apple ate_banana ate_grapes ate_pear
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 0 0 0
#> 2 2 1 1 0 1
#> 3 3 0 1 1 0
#> 4 4 NA NA NA NA
#> 5 5 0 0 0 0
Created on 2018-08-20 by the reprex package (v0.2.0).
One solution, much less verbose, in just three lines. Once you have the dataframe:
First, separate the values in each cell:
df <- separate_rows_(df, 'ate')
Second, dummify every answer using the function dummify from DataExplorer:
df <- DataExplorer::dummify(df, 'ate')
Third, aggregate the redundant rows like this:
df <- aggregate(df[, 2:6], by = list(x = df$x), FUN = sum)  # `by` must be a list
(you could also apply a max function here since you want to capture all 1's in the columns).
Done!
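Finally, a hedged modern sketch (tidyr >= 1.0, not one of the answers above) that also gives the asked-for NA behaviour; the ate_NA column that pivot_wider() creates for the NA response and the ate_nonresponse helper are assumptions of this sketch and are dropped at the end, and the column order may differ from the desired output:
library(tidyverse)

df %>%
  mutate(ate = if_else(ate == "", "nonresponse", ate)) %>%  # keep blanks distinct from NA
  separate_rows(ate, sep = ", ") %>%
  mutate(flag = 1) %>%
  pivot_wider(names_from = ate, values_from = flag,
              values_fill = 0, names_prefix = "ate_") %>%
  mutate(across(starts_with("ate_"),
                ~ if_else(ate_NA == 1, NA_real_, .x))) %>%  # NA rows become all-NA
  select(-ate_NA, -ate_nonresponse)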