Loop to create crosstabs of columns using tidyr - r

I would like to use a loop to create crosstabs of one column with every other column in a df. I started with this code (substituting in the iris df), which works nicely for two variables:
iris <- iris
tbl <- iris %>%
tabyl(Species, Sepal.Length, show_missing_levels = FALSE, show_na = FALSE) %>%
adorn_percentages("row") %>%
adorn_pct_formatting(digits = 0) %>%
adorn_ns() %>%
adorn_title("combined") %>%
knitr::kable()
print(tbl)
My df contains ~200 columns. I thought I would write a for loop to print a crosstab for one variable with each of the other variables. Here's what I tried:
cols <- c('Sepal.Length', 'Sepal.Width')
for (c in cols){
tbl <- iris %>%
tabyl(Species, c, show_missing_levels = FALSE, show_na = FALSE) %>%
adorn_percentages("row") %>%
adorn_pct_formatting(digits = 0) %>%
adorn_ns() %>%
adorn_title("combined") %>%
knitr::kable()
print(tbl)
}
This returns Column `c` is not found.
This seems like it should be simple, but I can't figure it out. Thanks for any help.

Change the c in your code to !!sym(c). I can't explain this non-standard tidyverse evaluation thingy, but in layman's terms, you want to access an object (i.e. "c") outside of your pipe (iris). That's why you need !!sym.

You can use the .data pronoun when passing columns names as strings.
cols <- c('Sepal.Length', 'Sepal.Width')
for (col in cols){
tbl <- iris %>%
tabyl(Species, .data[[col]],show_missing_levels = FALSE,show_na = FALSE) %>%
adorn_percentages("row") %>%
adorn_pct_formatting(digits = 0) %>%
adorn_ns() %>%
adorn_title("combined") %>%
knitr::kable()
print(tbl)
}

Related

Create R function from multiple scripts?

Not too good with functions. Is there a way to write the below script as a function? I have a list of dataframes that I want to apply the below scripts to.
head(iris)
iris1 <- iris %>%
group_by(Species) %>%
mutate_at(vars(Petal.Length), ~replace_na(., 0)) %>%
summarise(Petal.Length = sum(Petal.Length))
iris2 <- iris %>%
group_by(Species) %>%
tally()
iris3 <- iris2 %>%
inner_join(iris1)
iris3$average <- iris3$Petal.Length/iris3$n
Yes, its quite easy.
Let me know if this helps you:
my_function_name <- function(df){
table1 <- df %>%
group_by(org) %>%
tally()
table2 <- df %>%
group_by(org) %>%
mutate_at(vars(hours), ~replace_na(., 0)) %>%
summarise(hours = sum(hours))
table3 <- table1 %>%
inner_join(table2)
table3$average <- table3$hours/table3$n
return(list(table1,table2,table3))
}
# Calling the function
results <- my_function_name(df)
results$table1
results$table2
results$table3
In this case I used the function to retrieve all the tables. If you only want the final number table3$hours/table3$n what we can do is change the return of the function:
my_function_name <- function(df){
table1 <- df %>%
group_by(org) %>%
tally()
table2 <- df %>%
group_by(org) %>%
mutate_at(vars(hours), ~replace_na(., 0)) %>%
summarise(hours = sum(hours))
table3 <- table1 %>%
inner_join(table2)
table3$average <- table3$hours/table3$n
return(table3$average)
}
# Calling the function
results <- my_function_name(df)
results

Use dplyr to get index of first column with certain value per group or row

I have the following script. Option 1 uses a long format and group_by to identify the first step of many where the status equals 0.
Another option (2) is to use apply to calculate this value for each row, and then transform the data to a long format.
The firs option does not scale well. The second does, but I was unable to get it into a dplyr pipe. I tried to solve this with purrr but did not succeeed.
Questions:
Why does the first option not scale well?
How can I transform the second option in a dplyr pipe?
require(dplyr)
require(tidyr)
require(ggplot2)
set.seed(314)
# example data
dat <- as.data.frame(matrix(sample(c(0,1),
size = 9000000,
replace = TRUE,
prob = c(5,95)),
ncol = 9))
names(dat) <- paste("step",1:9, sep="_")
steps <- dat %>% select(starts_with("step_")) %>% names()
# option 1 is slow
dat.cum <- dat %>%
mutate(id = row_number()) %>%
gather(step, status,-id) %>%
group_by(id) %>%
mutate(drop = min(if_else(status==0,match(step, steps),99L))) %>%
mutate(status = if_else(match(step, steps)>=drop,0,1))
ggplot(dat.cum, aes(x = step, fill = factor(status))) +
geom_bar()
# option 2 is faster
dat$drop <- apply(dat,1,function(x) min(which(x==0),99))
dat.cum <- dat %>%
gather(step,status,-drop) %>%
mutate(status = if_else(match(step,steps)>=drop,0,1))
ggplot(dat.cum, aes(x = step, fill = factor(status))) +
geom_bar()
If you would like to map along rows you could do:
dat %>%
mutate(drop2 = map_int(seq_len(nrow(dat)), ~ min(which(dat[.x, ] == 0L), 99L)))
It could be that "gathering and grouping" is faster than Looping:
dat %>%
as_tibble() %>%
select(starts_with("step_")) %>%
mutate(row_nr = row_number()) %>%
gather(key = "col", value = "value", -row_nr) %>%
arrange(row_nr, col) %>%
group_by(row_nr) %>%
mutate(col_index = row_number()) %>%
filter(value == 0) %>%
summarise(drop3 = min(col_index)) %>%
ungroup() %>%
right_join(dat %>%
mutate(row_nr = row_number()),
by = "row_nr") %>%
mutate(drop3 = if_else(is.na(drop3), 99, drop3))

Adding a list-column to a tibble in which each element is the first element of another list-column

I know how to do something analogousfor non-list-columns. For example:
library(tidyverse)
mtcars %>% mutate(first_cyl = first(cyl))
I expected that for list-columns, this will work:
mtcars %>%
group_by(gear) %>%
nest(.key = "x") %>%
mutate(first_x = first(x))
but produces an error.
I think I was missing the list function. Now, this works:
mtcars %>%
group_by(gear) %>%
nest(.key = "x") %>%
mutate(first_x = list(first(x)))

r: dplyr function to make long format with multiple groups

I'm trying to take this code and turn it into a function:
mtcars %>% group_by(gear) %>% select(hp, disp) %>%
summarise_all(funs(n=sum(!is.na(.)), mean=mean(.,na.rm=T))) %>%
gather(variable, value, -gear) %>%
arrange(gear, sub('_.*', '', variable), sub('.*_', '', variable)) %>%
separate(variable, into = c('var', 'metric'), '_')
What it does is create a dataframe that I can easily import into Excel and create pivots with. I've tried the following, but it doesn't seem to work:
mean_func <- function(vars,groups) {
results <- test %>% group_by_at(vars(one_of(groups))) %>%
summarise_at(vars(starts_with(vars)), funs(n=sum(!is.na(.)), mean=mean(.,na.rm=T))) %>%
gather_(variable, value, -groups) %>%
arrange_(groups) %>%
separate_(variable, into = c('var', 'metric'), '_'); View(results)
}
Seems like the problem is somewhere along the gather statement, but I'm not sure what could be wrong here. Any thoughts?
Use a bit of tidy cleanup to change the grouping variable and use group_by_ for programming.
library(tidyverse)
mean_func <- function(vars, groups) {
groups = enquo(groups)
vars %>%
group_by_(groups) %>%
dplyr::select(hp, disp) %>%
summarise_all(funs(n=sum(!is.na(.)), mean=mean(.,na.rm=T))) %>%
gather(variable, value, -!!groups) %>%
arrange(!!groups, sub('_.*', '', variable), sub('.*_', '', variable)) %>%
separate(variable, into = c('var', 'metric'), '_')
}
mean_func(mtcars, gear)

Combine list of data frames with one column of characters

I am learning to get, cleaning and combining data. I am confused why in a loop rbind command result in returning 10 data instead of expected 30 data as when I combine it manually (i by i).
library(XML)
mergeal <- NULL
tabnums <- 3
for (i in 1:length(tabnums)) {
bnn <- paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
tabnums[i], "&&coming=22-Oct-2015&coming=22-Oct-2015")
tem <- readHTMLTable(bnn, header=T, stringsAsFactors=F)
#data cleaning
ff <- tem[8] #wanted data
ff1 <- as.data.frame(ff)
ff2 <- ff1[ , 1] #get 1st col data only
ff3 <- unique(ff2)
ff4 <- ff3[c(2,5:13)] #wanted list only
#merging dataset
mergeal <- rbind(mergeal, ff4)
}
I've tried using list rbind list of data frames with one column of characters and numerics but still have the same result as above. Appreciate any help on what I missed, thanks.
I cleaned up the data cause I was bored.
library(plyr)
library(XML)
library(dplyr)
library(magrittr)
library(stringi)
library(tidyr)
library(lubridate)
answer =
data_frame(tabnums = 1:3) %>%
group_by(tabnums) %>%
do(.$tabnums %>%
paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
., "&&coming=22-Oct-2015&coming=22-Oct-2015") %>%
readHTMLTable(header = T, stringsAsFactors = F) %>%
extract2(8)) %>%
ungroup %>%
select(V1) %>%
distinct %>%
mutate(V1 =
V1 %>%
stri_replace_all_fixed("Â", "\n") %>%
stri_replace_all_fixed("Type:", "\nType:") %>%
stri_replace_all_fixed("Time:", "\nTime:") %>%
stri_replace_all_fixed("Area:", "\nArea:") %>%
stri_split_fixed("\n")) %>%
unnest(V1) %>%
mutate(V1 = V1 %>% stri_trim) %>%
filter(V1 %>% stri_detect_regex("^There are currently") %>% `!`) %>%
filter(V1 != "") %>%
separate(V1, c("variable", "value"), sep = ":", fill = "left") %>%
mutate(variable = variable %>% mapvalues(NA, "Description"),
ID = variable %>% `==`("Description") %>% cumsum) %>%
spread(variable, value) %>%
mutate(Area = Area %>% extract_numeric,
Price = Price %>% extract_numeric,
Datetime =
Time %>%
stri_replace_all_fixed("a.m.", "am") %>%
stri_replace_all_fixed("p.m.", "pm") %>%
paste(Date, .) %>%
dmy_hm) %>%
select(-Date, -Time)

Resources