convert R list to tibble() - purrr or better option? - r

library(tidyverse)
library(purrr)
x <- c(20, 30, 58)
n <- 100
mylist <- data_frame(x = c(0, x), n) %>%
distinct() %>%
filter(x >= 0 & x < n) %>%
arrange(x) %>%
bind_rows(data_frame(x = n)) %>%
mutate(lag_x = lag(x)) %>%
mutate(y = x - lag_x) %>%
filter(!is.na(y)) %>%
summarise(n = list(rep(row_number(), y))) %>%
pull(n)
What's the best way to convert the list above into a tibble? purrr maybe? I am actually going to use this list inside of a mutate call, to add said list as a column to another tibble.
# A tibble: 100 x 1
grp
<dbl>
1 1
2 1
3 1
4 1
etc...

unnest() & rename()
library(tidyverse)
x <- c(20, 30, 58)
n <- 100
data_frame(x = c(0, x), n) %>%
distinct() %>%
filter(x >= 0 & x < n) %>%
arrange(x) %>%
bind_rows(data_frame(x = n)) %>%
mutate(lag_x = lag(x)) %>%
mutate(y = x - lag_x) %>%
filter(!is.na(y)) %>%
summarise(n = list(rep(row_number(), y))) %>%
unnest(n) %>%
rename(grp = n)
## # A tibble: 100 x 1
## grp
## <int>
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
## 7 1
## 8 1
## 9 1
## 10 1
## # ... with 90 more rows

I would use a combination of tibble and unlist. This way:
new_tibble <- tibble(grp = unlist(mylist))
##if you want to add it as column to a data frame, here is how I'd do it
mock_df <- tibble(x = rnorm(100),
y = rnorm(100))
mock_df %>% mutate(grp = unlist(mylist))

Related

how to get the output of proc tabulate (SAS) in R

Is there any R function which could give me directly the same output of proc tabulate ??
var1<-c(rep("A",4),rep("B",4))
var2<-c(rep("C",4),rep("D",4))
var3<-c(rep("E",2),rep("F",4),rep("G",2))
dataset<-data.frame(var1,var2,var3)
proc tabulate data=dataset;
class var1 var2 var3;
table var1*var2 ,var3 all (n rowpctn);
run;
The output that I want is like this:
Here is a way with R -
Create a column of 1s - n
Expand the data to fill the missing combinations - complete
Reshape to 'wide' format - pivot_wider
Create the 'Total' column by getting the row wise sum - rowSums
Add the percentage by looping across the 'var3' columns
library(dplyr)
library(tidyr)
library(stringr)
dataset %>%
mutate(n = 1, var3 = str_c('var3_', var3)) %>%
complete(var1, var2, var3, fill = list(n = 0)) %>%
pivot_wider(names_from = var3, values_from = n, values_fn = sum) %>%
mutate(Total = rowSums(across(where(is.numeric)))) %>%
group_by(var1) %>%
mutate(across(starts_with('var3'),
~ case_when(. == 0 ~ '0(0%)',
TRUE ~ sprintf('%d(%d%%)', ., 100 * mean(. != 0))))) %>%
ungroup
-output
# A tibble: 4 × 6
var1 var2 var3_E var3_F var3_G Total
<chr> <chr> <chr> <chr> <chr> <dbl>
1 A C 2(50%) 2(50%) 0(0%) 4
2 A D 0(0%) 0(0%) 0(0%) 0
3 B C 0(0%) 0(0%) 0(0%) 0
4 B D 0(0%) 2(50%) 2(50%) 4
Update
Based on the comments by #IceCreamToucan, there was a bug, which is corrected in the below code
dataset %>%
mutate(n = 1, var3 = str_c('var3_', var3)) %>%
complete(var1, var2, var3, fill = list(n = 0)) %>%
pivot_wider(names_from = var3, values_from = n, values_fn = sum) %>%
mutate(Total = rowSums(across(where(is.numeric))),
100 * across(starts_with('var3'), ~ . != 0,
.names = "{.col}_perc")/rowSums(across(starts_with('var3'), ~ .!= 0)),
across(matches('var3_[A-Z]$'), ~ case_when(. == 0 ~ '0(0%)',
TRUE ~ sprintf('%d(%.f%%)', ., get(str_c(cur_column(), '_perc')))))) %>%
select(-ends_with('perc'))
Here's a more generic version, where I define a function.
var1<-c(rep("A",4),rep("B",4))
var2<-c(rep("C",4),rep("D",4))
var3<-c(rep("E",2),rep("F",4),rep("G",2))
df<-data.frame(var1,var2,var3)
df_tabulate(df, id_cols = c(var1, var2), names_from = var3)
#> # A tibble: 4 × 6
#> var1 var2 var3_E var3_F var3_G Total
#> <chr> <chr> <chr> <chr> <chr> <dbl>
#> 1 A C 2(50.0%) 2(50.0%) 0(0.0%) 4
#> 2 A D 0(0%) 0(0%) 0(0%) 0
#> 3 B C 0(0%) 0(0%) 0(0%) 0
#> 4 B D 0(0.0%) 2(50.0%) 2(50.0%) 4
You can define the function using janitor
library(janitor, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(rlang)
library(tidyr)
df_tabulate <- function(df, id_cols, names_from){
id_cols <- enquo(id_cols)
if (quo_is_call(id_cols, 'c'))
id_cols <- call_args(id_cols)
else
id_cols <- ensym(id_cols)
names_from_chr <- as_label(enquo(names_from))
counts <- df %>%
mutate(g = eval(call2(paste, !!!id_cols, sep = ',')),
col = paste0(names_from_chr, '_', {{ names_from }})) %>%
tabyl(g, col) %>%
adorn_totals('col')
percs <- adorn_percentages(counts) %>%
adorn_pct_formatting()
rbind(counts, percs) %>%
group_by(g) %>%
summarise(across(-Total, ~ paste0(first(.), '(', last(.), ')')),
Total = as.numeric(first(Total))) %>%
separate(g, into = as.character(id_cols)) %>%
complete(!!!id_cols) %>%
mutate(across(starts_with(names_from_chr), ~ coalesce(., '0(0%)')),
across(Total, ~ coalesce(., 0)))
}
Here it is as a single pipeline with discrete simple steps. Long, to be sure, but if you wanted many tables like this you could store it as a function.
library(tidyverse)
library(janitor)
dataset %>%
mutate(across(var1:var2, as.factor)) %>%
count(var1, var2, var3, .drop = FALSE) %>%
unite(vars, var1, var2) %>%
pivot_wider(names_from = var3, values_from = n) %>%
select(-`NA`) %>%
replace(is.na(.), 0) %>%
adorn_totals("col") %>%
adorn_percentages(,,,,-c(vars, Total)) %>%
adorn_pct_formatting(digits = 0,,,,-c(vars, Total)) %>%
adorn_ns(position = "front",,,-c(vars, Total)) %>%
separate(vars, into = c("var1", "var2"))
#> # A tibble: 4 x 6
#> var1 var2 E F G Total
#> <chr> <chr> <chr> <chr> <chr> <dbl>
#> 1 A C 2 (50%) 2 (50%) 0 (0%) 4
#> 2 A D 0 (-) 0 (-) 0 (-) 0
#> 3 B C 0 (-) 0 (-) 0 (-) 0
#> 4 B D 0 (0%) 2 (50%) 2 (50%) 4
This replaces the questionable 0/0 = 0% with simply - for a cleaner(IMO) result.

dplyr concatenate column by variable value

I can concatenate one column of data.frame, following the code as below if the column name is available.
However, How about the "column" name saved in the variable?
Further question is, how can I specify the columns by the value of a variable? (!!sym() ?)
Here are test code:
> library(dplyr)
> packageVersion("dplyr")
[1] ‘1.0.7’
> df <- data.frame(x = 1:3, y = c("A", "B", "A"))
> df %>%
group_by(y) %>%
summarise(z = paste(x, collapse = ","))
# A tibble: 2 x 2
y z
<chr> <chr>
1 A 1,3
2 B 2
I have a variable a, with the value x, How can I do above summarize?
> a <- "x"
> df %>%
group_by(y) %>%
summarise(z = paste(a, collapse = ","))
# A tibble: 2 x 2
y z
<chr> <chr>
1 A x
2 B x
Solution-1: use !!sym()
> a <- "x"
> df %>%
group_by(y) %>%
summarise(z = paste(!!sym(a), collapse = ","))
# A tibble: 2 x 2
y z
<chr> <chr>
1 A 1,3
2 B 2
Solution-2: Assign the column to new variable
> df %>%
group_by(y) %>%
rename(new_col = a) %>%
summarise(z = paste(new_col, collapse = ","))
# A tibble: 2 x 2
y z
<chr> <chr>
1 A 1,3
2 B 2
Are there any other ways to do the job?
similar questions could be found: https://stackoverflow.com/a/15935166/2530783 ,https://stackoverflow.com/a/50537209/2530783,
Here are some other options -
Use .data -
library(dplyr)
a <- "x"
df %>% group_by(y) %>% summarise(z = toString(.data[[a]]))
# y z
# <chr> <chr>
#1 A 1, 3
#2 B 2
get
df %>% group_by(y) %>% summarise(z = toString(get(a)))
as.name
df %>% group_by(y) %>% summarise(z = toString(!!as.name(a)))
paste(..., collapse = ',') is equivalent to toString.

turn pivot_wider() into spread()

I love the new tidyr pivot_wider function but since it hasn't been officially added to the CRAN package I was wondering how to convert the following code into the older spread() function (I do not have access to the server to DL tidyr from github)
test <- data.frame(x = c(1,1,2,2,2,2,3,3,3,4),
y = c(rep("a", 5), rep("b", 5)))
test %>%
count(x, y) %>%
group_by(x) %>%
mutate(prop = prop.table(n)) %>%
mutate(v1 = paste0(n, ' (', round(prop, 2), ')')) %>%
pivot_wider(id_cols = x, names_from = y, values_from = v1)
Desired Output:
# A tibble: 4 x 3
# Groups: x [4]
x a b
<dbl> <chr> <chr>
1 1 2 (1) NA
2 2 3 (0.75) 1 (0.25)
3 3 NA 3 (1)
4 4 NA 1 (1)
I tried (but is not quite right):
test %>%
count(x, y) %>%
group_by(x) %>%
mutate(prop = prop.table(n)) %>%
mutate(v1 = paste0(n, ' (', round(prop, 2), ')')) %>%
spread(y, v1) %>%
select(-n, -prop)
Any help appreciated!
One option is to remove the columns 'n', 'prop' before the spread statement as including them would create unique rows with that column values as well
library(dplyr)
library(tidyr)
test %>%
count(x, y) %>%
group_by(x) %>%
mutate(prop = prop.table(n)) %>%
mutate(v1 = paste0(n, ' (', round(prop, 2), ')')) %>%
select(-n, -prop) %>%
spread(y, v1)
# A tibble: 4 x 3
# Groups: x [4]
# x a b
# <dbl> <chr> <chr>
#1 1 2 (1) <NA>
#2 2 3 (0.75) 1 (0.25)
#3 3 <NA> 3 (1)
#4 4 <NA> 1 (1)
Or using base R
tbl <- table(test)
tbl[] <- paste0(tbl, "(", prop.table(tbl, 1), ")")
You can use data.table package:
> library(data.table)
> setDT(test)[,.(n=.N),by=.(x,y)][,.(y=y,n=n,final=gsub('\\(1\\)','',paste0(n,'(',round(prop.table(n),2), ')'))),by=x]
x y n final
1: 1 a 2 2
2: 2 a 3 3(0.75)
3: 2 b 1 1(0.25)
4: 3 b 3 3
5: 4 b 1 1

Identify subsets containing only repeats of an expression

I have a dataset like so:
df<-data.frame(x=c("A","A","A","A", "B","B","B","B","B",
"C","C","C","C","C","D","D","D","D","D"),
y= as.factor(c(rep("Eoissp2",4),rep("Eoissp1",5),"Eoissp1","Eoisp4","Automerissp1","Automerissp2","Acharias",rep("Eoissp2",3),rep("Eoissp1",2))))
I want to identify, for each subset of x, the corresponding levels in y that are entirely duplicates containing the expression Eois. Therefore, A , B, and D will be returned in a vector because every level of A , B, and D contains the expression Eois , while level C consists of various unique levels (e.g. Eois, Automeris and Acharias). For this example the output would be:
output<- c("A", "B", "D")
Using new df:
> df %>% filter(str_detect(y,"Eois")) %>% group_by(x) %>% distinct(y) %>%
count() %>% filter(n==1) %>% select(x)
# A tibble: 2 x 1
# Groups: x [2]
x
<fct>
1 A
2 B
(Answer below uses the original df posted by the question author.)
Using the pipe function in magrittr & functions from dplyr:
> df %>% group_by(x) %>% distinct(y)
# A tibble: 7 x 2
# Groups: x [3]
x y
<fct> <fct>
1 A plant1a
2 B plant1b
3 C plant1a
4 C plant2a
5 C plant3a
6 C plant4a
7 C plant5a
Then you can roll up the results like this:
> results <- df %>% group_by(x) %>% distinct(y) %>%
count() %>% filter(n==1) %>% select(x)
> results
# A tibble: 2 x 1
# Groups: x [2]
x
<fct>
1 A
2 B
If you know your original data frame is always going to come with the x's in order, you can drop the group_by part.
A dplyr based solution could be as:
library(dplyr)
df %>% group_by(x) %>%
filter(grepl("Eoiss", y)) %>%
mutate(y = sub("\\d+", "", y)) %>%
filter(n() >1 & length(unique(y)) == 1) %>%
select(x) %>% unique(.)
# A tibble: 3 x 1
# Groups: x [3]
# x
# <fctr>
#1 A
#2 B
#3 D
Data
df<-data.frame(x=c("A","A","A","A", "B","B","B","B","B",
"C","C","C","C","C","D","D","D","D","D"),
y= as.factor(c(rep("Eoissp2",4),
rep("Eoissp1",5),"Eoissp1","Eoisp4","Automerissp1","Automerissp2",
"Acharias",rep("Eoissp2",3),rep("Eoissp1",2))))

Are there any disadvantages for doing a mutate + filter vs summarise on a grouped data frame?

In dplyr 0.5.0, calling summarise on a grouped data frame does not guarantee any resultant row order (Currently, it reorders the rows by group, not sure how it handles duplicate grouping levels).
To get around this, I would like to replace all summarise(x = ...) operations with mutate(x = ...) %>% filter(row_number() == 1). Are there any disadvantages or drawbacks to doing this?
Example of the two operations.
tmp_df <-
data.frame(group = rep(c(2L, 1L), each = 5), b = rep(c(-1, 1), each = 5)) %>%
group_by(group)
tmp_df %>%
summarise(b = sum(b))
tmp_df %>%
mutate(b = sum(b)) %>%
filter(row_number() == 1)
producing:
> tmp_df %>%
+ summarise(b = sum(b))
# A tibble: 2 × 2
group b
<int> <dbl>
1 1 5
2 2 -5
> tmp_df %>%
+ mutate(b = sum(b)) %>%
+ filter(row_number() == 1)
Source: local data frame [2 x 2]
Groups: group [2]
group b
<int> <dbl>
1 2 -5
2 1 5
EDIT: In response to a comment, for readability I can define the function:
summarise_o <- function (.data, ...) {
# order preserving summarise
mutate_(.data, .dots = lazyeval::lazy_dots(...)) %>%
filter(row_number() == 1) %>%
return
}
and simply call:
tmp_df %>%
summarise_o(b = sum(b))
One option is to create the 'group' as a factor
tmp_df <- data.frame(group = rep(c(2L, 1L), each = 5), b = rep(c(-1, 1), each = 5)) %>%
group_by(group = factor(group, levels = unique(group)))
tmp_df %>%
summarise(b = sum(b))
# A tibble: 2 x 2
# group b
# <fctr> <dbl>
#1 2 -5
#2 1 5

Resources