I am using this script to produce a table. In the second line, for collectionName, I am using "Organization X". I have many different organizations (Org Y, Org Z, .....) to create this table for. Is there a way to automate this? And to automate the naming of the object (currently "orgx" below)?
orgx <- df %>%
filter(str_detect(collectionName, c("Organization X"))) %>%
filter(str_detect(Year, paste(years, collapse = "|"))) %>%
corpus(text_field = "text") %>%
tokens(remove_punct = TRUE) %>%
tokens_select(stopwords('english'),selection='remove') %>%
tokens_tolower(keep_acronyms = FALSE) %>%
tokens_lookup(dictionary = dict, nomatch = TRUE) %>%
dfm() %>%
dfm_group(groups = "Title") %>%
dfm_weight(scheme = "prop") %>%
as.data.frame() %>%
mutate_at(vars(keyterms, true), funs(round(., 4)))
Get the column names specific to that organizations as vector, use that as pattern in str_detect by looping over the vector in map and return the output in a list
library(dplyr)
library(purrr)
library(stringr)
vec <- c("Organization X", "Organization Y")
out <- map(vec, ~
df %>%
filter(str_detect(collectionName, .x)) %>%
filter(str_detect(Year, paste(years, collapse = "|"))) %>%
corpus(text_field = "text") %>%
tokens(remove_punct = TRUE) %>%
tokens_select(stopwords('english'),selection='remove') %>%
tokens_tolower(keep_acronyms = FALSE) %>%
tokens_lookup(dictionary = dict, nomatch = TRUE) %>%
dfm() %>%
dfm_group(groups = "Title") %>%
dfm_weight(scheme = "prop") %>%
as.data.frame() %>%
mutate_at(vars(keyterms, true), funs(round(., 4)))
)
names(out) <- sub("^(...).*\\s+(\\S)$", "\\1\\2", vec)
It may be better to keep the output in a list. But, if we need to assign it to different objects, it can be done with list2env or assign
list2env(out, .GlobalEnv)
Related
I would like to use a loop to create crosstabs of one column with every other column in a df. I started with this code (substituting in the iris df), which works nicely for two variables:
iris <- iris
tbl <- iris %>%
tabyl(Species, Sepal.Length, show_missing_levels = FALSE, show_na = FALSE) %>%
adorn_percentages("row") %>%
adorn_pct_formatting(digits = 0) %>%
adorn_ns() %>%
adorn_title("combined") %>%
knitr::kable()
print(tbl)
My df contains ~200 columns. I thought I would write a for loop to print a crosstab for one variable with each of the other variables. Here's what I tried:
cols <- c('Sepal.Length', 'Sepal.Width')
for (c in cols){
tbl <- iris %>%
tabyl(Species, c, show_missing_levels = FALSE, show_na = FALSE) %>%
adorn_percentages("row") %>%
adorn_pct_formatting(digits = 0) %>%
adorn_ns() %>%
adorn_title("combined") %>%
knitr::kable()
print(tbl)
}
This returns Column `c` is not found.
This seems like it should be simple, but I can't figure it out. Thanks for any help.
Change the c in your code to !!sym(c). I can't explain this non-standard tidyverse evaluation thingy, but in layman's terms, you want to access an object (i.e. "c") outside of your pipe (iris). That's why you need !!sym.
You can use the .data pronoun when passing columns names as strings.
cols <- c('Sepal.Length', 'Sepal.Width')
for (col in cols){
tbl <- iris %>%
tabyl(Species, .data[[col]],show_missing_levels = FALSE,show_na = FALSE) %>%
adorn_percentages("row") %>%
adorn_pct_formatting(digits = 0) %>%
adorn_ns() %>%
adorn_title("combined") %>%
knitr::kable()
print(tbl)
}
I have a data.frame with the following structure:
What I need is that in case that a value in the first column occures more than once, all corresponding entries in column V18 are concluded in one cell.
I applied the folling code.
p <- function(v) {
Reduce(f=paste0, x = v)
}
Data %>%
group_by(V1) %>%
summarise(test = p(as.character(V18))) %>%
merge(., M_TEST, by = 'V1') %>%
select(V1, V18, test)
It gives:
What I need is that instead of 4344, it is {43,44}.
How can I do this?
Thank you really much for your help!
Sincerely
Try This:
Data %>%
group_by(V1) %>%
summarise(test = p(as.character(V18))) %>%
merge(., M_TEST, by = 'V1') %>%
select(V1, V18, test) %>%
mutate(test = str_remove_all(test, pattern = "NA")) %>%
mutate(test = formatC(as.numeric(test), big.mark=",", big.interval = 2L)) %>%
mutate(test = paste0("{", test, "}"))
Edit: For Multiple Columns, this should work:
Data %>%
group_by(V1) %>%
summarise_at(vars(V2:V18), paste0, collapse="") %>%
mutate_at(vars(V2:V18), str_remove_all, pattern = "NA") %>%
mutate_at(vars(V2:V18), as.numeric) %>%
mutate_at(vars(V2:V18), formatC, big.mark=",", big.interval = 2L)
I want to create a summary table for some dichotomous variables using the expss package. Since the variables are dichotomous, one of the two levels would the sufficient to "show the picture".
I tried to use the function tab_net_cell, but was not able to get the right results. Here is some example code with BrCa (Breast cancer) with 1 or 0. I only want to show the number of patients with but not without breast cancer.
df <- data.frame(BrCa = c(1,1,1,0,0,0,NA,NA,0,0))
df$group <- c(1,2,1,2,1,2,1,2,1,2)
df %>%
expss::tab_cols(group) %>%
expss::tab_cells(BrCa) %>%
expss::tab_stat_cpct(total_row_position = "none",label = "%") %>%
expss::tab_stat_cases(total_row_position = "none",label = "N") %>%
expss::tab_pivot(stat_position = "inside_rows")
df %>%
expss::tab_cols(group) %>%
expss::tab_cells(BrCa) %>%
expss::tab_net_cells("BrCa" = eq(1)) %>%
expss::tab_stat_cpct(total_row_position = "none",label = "%") %>%
expss::tab_stat_cases(total_row_position = "none",label = "N") %>%
expss::tab_pivot(stat_position = "inside_rows")
The simplest way is to filter resulted table:
df <- data.frame(BrCa = c(1,1,1,0,0,0,NA,NA,0,0))
df$group <- c(1,2,1,2,1,2,1,2,1,2)
df %>%
expss::tab_cols(group) %>%
expss::tab_cells(BrCa) %>%
expss::tab_stat_cpct(total_row_position = "none",label = "%") %>%
expss::tab_stat_cases(total_row_position = "none",label = "N") %>%
expss::tab_pivot(stat_position = "inside_rows") %>%
expss::where(grepl(1, row_labels))
Another way is to use mean and sum instead of cpct and cases:
df %>%
expss::tab_cols(group) %>%
expss::tab_cells(BrCa*100) %>%
expss::tab_stat_mean(label = "%") %>%
expss::tab_stat_sum(label = "N") %>%
expss::tab_pivot(stat_position = "inside_rows")
I have the following script. Option 1 uses a long format and group_by to identify the first step of many where the status equals 0.
Another option (2) is to use apply to calculate this value for each row, and then transform the data to a long format.
The firs option does not scale well. The second does, but I was unable to get it into a dplyr pipe. I tried to solve this with purrr but did not succeeed.
Questions:
Why does the first option not scale well?
How can I transform the second option in a dplyr pipe?
require(dplyr)
require(tidyr)
require(ggplot2)
set.seed(314)
# example data
dat <- as.data.frame(matrix(sample(c(0,1),
size = 9000000,
replace = TRUE,
prob = c(5,95)),
ncol = 9))
names(dat) <- paste("step",1:9, sep="_")
steps <- dat %>% select(starts_with("step_")) %>% names()
# option 1 is slow
dat.cum <- dat %>%
mutate(id = row_number()) %>%
gather(step, status,-id) %>%
group_by(id) %>%
mutate(drop = min(if_else(status==0,match(step, steps),99L))) %>%
mutate(status = if_else(match(step, steps)>=drop,0,1))
ggplot(dat.cum, aes(x = step, fill = factor(status))) +
geom_bar()
# option 2 is faster
dat$drop <- apply(dat,1,function(x) min(which(x==0),99))
dat.cum <- dat %>%
gather(step,status,-drop) %>%
mutate(status = if_else(match(step,steps)>=drop,0,1))
ggplot(dat.cum, aes(x = step, fill = factor(status))) +
geom_bar()
If you would like to map along rows you could do:
dat %>%
mutate(drop2 = map_int(seq_len(nrow(dat)), ~ min(which(dat[.x, ] == 0L), 99L)))
It could be that "gathering and grouping" is faster than Looping:
dat %>%
as_tibble() %>%
select(starts_with("step_")) %>%
mutate(row_nr = row_number()) %>%
gather(key = "col", value = "value", -row_nr) %>%
arrange(row_nr, col) %>%
group_by(row_nr) %>%
mutate(col_index = row_number()) %>%
filter(value == 0) %>%
summarise(drop3 = min(col_index)) %>%
ungroup() %>%
right_join(dat %>%
mutate(row_nr = row_number()),
by = "row_nr") %>%
mutate(drop3 = if_else(is.na(drop3), 99, drop3))
I am learning to get, cleaning and combining data. I am confused why in a loop rbind command result in returning 10 data instead of expected 30 data as when I combine it manually (i by i).
library(XML)
mergeal <- NULL
tabnums <- 3
for (i in 1:length(tabnums)) {
bnn <- paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
tabnums[i], "&&coming=22-Oct-2015&coming=22-Oct-2015")
tem <- readHTMLTable(bnn, header=T, stringsAsFactors=F)
#data cleaning
ff <- tem[8] #wanted data
ff1 <- as.data.frame(ff)
ff2 <- ff1[ , 1] #get 1st col data only
ff3 <- unique(ff2)
ff4 <- ff3[c(2,5:13)] #wanted list only
#merging dataset
mergeal <- rbind(mergeal, ff4)
}
I've tried using list rbind list of data frames with one column of characters and numerics but still have the same result as above. Appreciate any help on what I missed, thanks.
I cleaned up the data cause I was bored.
library(plyr)
library(XML)
library(dplyr)
library(magrittr)
library(stringi)
library(tidyr)
library(lubridate)
answer =
data_frame(tabnums = 1:3) %>%
group_by(tabnums) %>%
do(.$tabnums %>%
paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
., "&&coming=22-Oct-2015&coming=22-Oct-2015") %>%
readHTMLTable(header = T, stringsAsFactors = F) %>%
extract2(8)) %>%
ungroup %>%
select(V1) %>%
distinct %>%
mutate(V1 =
V1 %>%
stri_replace_all_fixed("Â", "\n") %>%
stri_replace_all_fixed("Type:", "\nType:") %>%
stri_replace_all_fixed("Time:", "\nTime:") %>%
stri_replace_all_fixed("Area:", "\nArea:") %>%
stri_split_fixed("\n")) %>%
unnest(V1) %>%
mutate(V1 = V1 %>% stri_trim) %>%
filter(V1 %>% stri_detect_regex("^There are currently") %>% `!`) %>%
filter(V1 != "") %>%
separate(V1, c("variable", "value"), sep = ":", fill = "left") %>%
mutate(variable = variable %>% mapvalues(NA, "Description"),
ID = variable %>% `==`("Description") %>% cumsum) %>%
spread(variable, value) %>%
mutate(Area = Area %>% extract_numeric,
Price = Price %>% extract_numeric,
Datetime =
Time %>%
stri_replace_all_fixed("a.m.", "am") %>%
stri_replace_all_fixed("p.m.", "pm") %>%
paste(Date, .) %>%
dmy_hm) %>%
select(-Date, -Time)