Create R function from multiple scripts? - r

Not too good with functions. Is there a way to write the below script as a function? I have a list of dataframes that I want to apply the below scripts to.
head(iris)
iris1 <- iris %>%
group_by(Species) %>%
mutate_at(vars(Petal.Length), ~replace_na(., 0)) %>%
summarise(Petal.Length = sum(Petal.Length))
iris2 <- iris %>%
group_by(Species) %>%
tally()
iris3 <- iris2 %>%
inner_join(iris1)
iris3$average <- iris3$Petal.Length/iris3$n

Yes, its quite easy.
Let me know if this helps you:
my_function_name <- function(df){
table1 <- df %>%
group_by(org) %>%
tally()
table2 <- df %>%
group_by(org) %>%
mutate_at(vars(hours), ~replace_na(., 0)) %>%
summarise(hours = sum(hours))
table3 <- table1 %>%
inner_join(table2)
table3$average <- table3$hours/table3$n
return(list(table1,table2,table3))
}
# Calling the function
results <- my_function_name(df)
results$table1
results$table2
results$table3
In this case I used the function to retrieve all the tables. If you only want the final number table3$hours/table3$n what we can do is change the return of the function:
my_function_name <- function(df){
table1 <- df %>%
group_by(org) %>%
tally()
table2 <- df %>%
group_by(org) %>%
mutate_at(vars(hours), ~replace_na(., 0)) %>%
summarise(hours = sum(hours))
table3 <- table1 %>%
inner_join(table2)
table3$average <- table3$hours/table3$n
return(table3$average)
}
# Calling the function
results <- my_function_name(df)
results

Related

Error when exporting R table to excel using group_by

I am writing a function that will create a table from a dataframe using the kable() function, then export it to excel using the writexl library.
However I get an error with summarise() and group_by() function:
show_data <- function(df,col1,col2){
myTable <- df %>%
filter(!is.na(col1),!is.na(col2)) %>%
group_by(col2, col1) %>%
summarise(subjects = n()) %>%
mutate(percent = round(subjects / sum(subjects, na.rm = TRUE)* 100, 1)) %>% select(-subjects) %>%
spread(col2,percent)
names(myTable) <- c(col1,"col",col2)
MyTable %>% a
rrange(desc(col2)) %>%
kable(caption = paste(col1,"_",col2))
}
Error messages:
Error: Can't extract columns that don't exist.
x Column foo doesn't exist.
and
***Error: Must group by variables found in .data.
Column columnTwo is not found.
Column columnOne is not found.***
Also, I would like to export a resulting table (via kable()) from R to Excel without having to repeat the same lines of code. Is this possible?
show_data <- function(df,col1,col2){
myTable <- df %>%
filter(!is.na(col1),!is.na(col2)) %>%
group_by(col2, col1) %>%
summarise(subjects = n()) %>%
mutate(percent = round(subjects / sum(subjects, na.rm = TRUE)* 100, 1)) %>% select(-subjects) %>%
spread(col2,percent)
names(myTable) <- c(col1,"col",col2)
MyTable %>% a
rrange(desc(col2)) %>%
kable(caption = paste(col1,"_",col2))
myTable <- df %>%
filter(!is.na(col1),!is.na(col2)) %>%
group_by(col2, col1) %>%
summarise(subjects = n()) %>%
mutate(percent = round(subjects / sum(subjects, na.rm = TRUE)* 100, 1)) %>% select(-subjects) %>%
spread(col2,percent)
names(myTable) <- c(col1,"col",col2)
MyTable %>%
arrange(desc(col2)) %>%
write_xlsx(results.xlsx")
}
show_data(my_data,"column_one","column_two")

Loop to create crosstabs of columns using tidyr

I would like to use a loop to create crosstabs of one column with every other column in a df. I started with this code (substituting in the iris df), which works nicely for two variables:
iris <- iris
tbl <- iris %>%
tabyl(Species, Sepal.Length, show_missing_levels = FALSE, show_na = FALSE) %>%
adorn_percentages("row") %>%
adorn_pct_formatting(digits = 0) %>%
adorn_ns() %>%
adorn_title("combined") %>%
knitr::kable()
print(tbl)
My df contains ~200 columns. I thought I would write a for loop to print a crosstab for one variable with each of the other variables. Here's what I tried:
cols <- c('Sepal.Length', 'Sepal.Width')
for (c in cols){
tbl <- iris %>%
tabyl(Species, c, show_missing_levels = FALSE, show_na = FALSE) %>%
adorn_percentages("row") %>%
adorn_pct_formatting(digits = 0) %>%
adorn_ns() %>%
adorn_title("combined") %>%
knitr::kable()
print(tbl)
}
This returns Column `c` is not found.
This seems like it should be simple, but I can't figure it out. Thanks for any help.
Change the c in your code to !!sym(c). I can't explain this non-standard tidyverse evaluation thingy, but in layman's terms, you want to access an object (i.e. "c") outside of your pipe (iris). That's why you need !!sym.
You can use the .data pronoun when passing columns names as strings.
cols <- c('Sepal.Length', 'Sepal.Width')
for (col in cols){
tbl <- iris %>%
tabyl(Species, .data[[col]],show_missing_levels = FALSE,show_na = FALSE) %>%
adorn_percentages("row") %>%
adorn_pct_formatting(digits = 0) %>%
adorn_ns() %>%
adorn_title("combined") %>%
knitr::kable()
print(tbl)
}

add summary `n` from one dataframe to another dataframe (tidyverse)

I was wondering if there might be a way to replace the column fpc in DATA2 with corresponding fpc obtained from DATA1?
library(tidyverse)
dat <- read.csv('https://raw.githubusercontent.com/rnorouzian/d/master/su.csv')
## 10000 rows ################
DATA1 <- dat %>%
group_by(across(all_of(c("gender", "pre")))) %>%
summarise(n = n(), .groups = 'drop') %>%
mutate(fpc = n/sum(n)) %>%
right_join(dat)
dat2 <- read.csv('https://raw.githubusercontent.com/rnorouzian/d/master/out.csv')
## 200 rows #################
DATA2 <- dat2 %>%
group_by(across(all_of(c("gender", "pre")))) %>%
summarise(n = n(), .groups = 'drop') %>%
mutate(fpc = n/sum(n)) %>%
right_join(dat2)
You can join the dataframe and use coalesce to select fpc from DATA2.
library(dplyr)
result <- DATA2 %>%
left_join(DATA1 %>% distinct(gender, pre, fpc),
by = c('gender', 'pre')) %>%
mutate(fpc = coalesce(fpc.y, fpc.x)) %>%
select(names(DATA2))
nrow(result)
#[1] 200
It would be more efficient to do this in data.table
library(data.table)
setDT(DATA2)[as.data.table(unique(DATA1[c('gender', 'pre', 'fpc')])),
fpc := i.fpc, on = .(gender, pre)]

How do I parameterize the column names in dplyr function calls

Suppose we have the following setup
library(dplyr)
set.seed(10101)
id <- sample(3,20,replace = TRUE)
x <- sample(2,20,replace = TRUE)
df <- data.frame(id,x)
How do I parameterize the following:
df %>% group_by(id) %>% arrange(id) %>% mutate(x.lag=lag(x,1,default=0))
cl <- "x"
cl.lag <- "x.lag.1"
my naive attempt does not seem to work:
df %>% group_by(id) %>% arrange(id) %>% mutate(cl.lag=lag(cl,1,default=0))

Combine list of data frames with one column of characters

I am learning to get, cleaning and combining data. I am confused why in a loop rbind command result in returning 10 data instead of expected 30 data as when I combine it manually (i by i).
library(XML)
mergeal <- NULL
tabnums <- 3
for (i in 1:length(tabnums)) {
bnn <- paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
tabnums[i], "&&coming=22-Oct-2015&coming=22-Oct-2015")
tem <- readHTMLTable(bnn, header=T, stringsAsFactors=F)
#data cleaning
ff <- tem[8] #wanted data
ff1 <- as.data.frame(ff)
ff2 <- ff1[ , 1] #get 1st col data only
ff3 <- unique(ff2)
ff4 <- ff3[c(2,5:13)] #wanted list only
#merging dataset
mergeal <- rbind(mergeal, ff4)
}
I've tried using list rbind list of data frames with one column of characters and numerics but still have the same result as above. Appreciate any help on what I missed, thanks.
I cleaned up the data cause I was bored.
library(plyr)
library(XML)
library(dplyr)
library(magrittr)
library(stringi)
library(tidyr)
library(lubridate)
answer =
data_frame(tabnums = 1:3) %>%
group_by(tabnums) %>%
do(.$tabnums %>%
paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
., "&&coming=22-Oct-2015&coming=22-Oct-2015") %>%
readHTMLTable(header = T, stringsAsFactors = F) %>%
extract2(8)) %>%
ungroup %>%
select(V1) %>%
distinct %>%
mutate(V1 =
V1 %>%
stri_replace_all_fixed("Â", "\n") %>%
stri_replace_all_fixed("Type:", "\nType:") %>%
stri_replace_all_fixed("Time:", "\nTime:") %>%
stri_replace_all_fixed("Area:", "\nArea:") %>%
stri_split_fixed("\n")) %>%
unnest(V1) %>%
mutate(V1 = V1 %>% stri_trim) %>%
filter(V1 %>% stri_detect_regex("^There are currently") %>% `!`) %>%
filter(V1 != "") %>%
separate(V1, c("variable", "value"), sep = ":", fill = "left") %>%
mutate(variable = variable %>% mapvalues(NA, "Description"),
ID = variable %>% `==`("Description") %>% cumsum) %>%
spread(variable, value) %>%
mutate(Area = Area %>% extract_numeric,
Price = Price %>% extract_numeric,
Datetime =
Time %>%
stri_replace_all_fixed("a.m.", "am") %>%
stri_replace_all_fixed("p.m.", "pm") %>%
paste(Date, .) %>%
dmy_hm) %>%
select(-Date, -Time)

Resources