My code...
library(expss)
library(haven)
X4707 <- read_sav("/home/cfmc/4707/data/4707.sav")
X4707 %>%
tab_cells("By phone"=qpd4_1==1,"By email"=qpd4_2==1,"Utility website"=qpd4_3==1,"Roseville Electric notification"=qpd4_4==1,"Social media"=qpd4_5==1,"Text"=qpd4_6==1,"Not sure"=qpd4_8==1) %>%
tab_cols(total(), qf5) %>%
tab_stat_cpct() %>%
tab_last_sig_cpct() %>%
tab_pivot()
My output looks like this...
I would like for the output to simply contain the text of the code going down the stub (By phone, By email, etc.) without the TRUE, FALSE, etc.
You need to designate that you want multiple response. You have multiple choice with positional coding so you need mdset (m(ultiple) d(ichotomy) set) function:
library(expss)
library(haven)
X4707 <- read_sav("/home/cfmc/4707/data/4707.sav")
X4707 %>%
tab_cells(mdset("By phone"=qpd4_1==1,"By email"=qpd4_2==1,"Utility website"=qpd4_3==1,"Roseville Electric notification"=qpd4_4==1,"Social media"=qpd4_5==1,"Text"=qpd4_6==1,"Not sure"=qpd4_8==1)) %>%
tab_cols(total(), qf5) %>%
tab_stat_cpct() %>%
tab_last_sig_cpct() %>%
tab_pivot()
Related
I am using the gt() table package in R and so far I love it. However for some reason when I publish the below in quarto I get an awkward table header that says "?caption" in bold. However when I run the table separately, I don't get anything.
Any thoughts?
Ignore the titles and columns names, I know it doesn't make sense with the diamonds package
library(tidyverse)
library(gt)
business_segment_summary <- diamonds %>%
group_by(cut) %>%
summarise(n=n(),
sum=sum(price),
sum_od=sum(price,na.rm=TRUE),
prop_25=quantile(price,.25,na.rm=TRUE),
prop_50=quantile(price,.5,na.rm=TRUE),
prop_75=quantile(price,.75,na.rm=TRUE),
mean=mean(price,na.rm=TRUE),
mean_aging=mean(table),
mean_rank=mean(depth),
prop_od=mean(carat),
sd=sd(price,na.rm=TRUE),
mad=mad(price,na.rm=TRUE),
.groups="drop"
) %>%
mutate(tar_prop=sum/sum(sum),
n_prop=n/sum(n))
business_segment_summary %>%
select(1,n,tar_prop,n_prop,prop_od,prop_25,prop_50,prop_75) %>%
gt::gt()
gt::cols_label(cut="Business Segment",
n="Customer #",
tar_prop="% of TAR",
n_prop="% of Customers",
prop_od=gt::html("% of Customers<br>with overdue"),
prop_25="25%",
prop_50="50%",
prop_75="75%") %>%
gt::tab_spanner(label="Customer Account Percentile ($k)",columns = c(prop_25,prop_50,prop_75)) %>%
gt::fmt_number(c(prop_25,prop_50,prop_75),decimals = 0,scale_by = 1/1e3) %>%
gt::fmt_number(n,decimals = 0) %>%
gt::fmt_percent(c(3:5),decimals = 0) %>%
gt::opt_stylize(style=1,color="red") %>%
gt::tab_header(title="Summary of TAR by business segments") %>%
gt::cols_align(align="left",columns = 1)
I created a function to scrape the fbref.com website. Sometimes in this website or in others that I'm trying to scrape I receive a timeout error. I read about it and it is suggested to include a sys.sleep between the requisitions or a purrr:slowly. I tried to include inside the map but I could not. How can I include a 10 seconds gap between each requisition inside the map (It would be 7 requisitions and 6 intervals of 10 seconds).
Thanks in advance and if I do not include something please inform me!
#packages
library(rvest)
library(stringr)
library(dplyr)
library(tidyr)
library(purrr)
library(lubridate)
library(tm)
#function
funcao_extract <- function(link1,link2){
fbref <- "https://fbref.com/pt/equipes/"
dados <- fbref %>%
map2_chr(rep(link1,7),paste0) %>%
map2_chr(seq(from=2014,to=2020),paste0) %>%
map2_chr(rep(link2,7),paste0) %>%
map(. %>%
read_html() %>%
html_table() %>%
.[[1]] %>% #select first table
dplyr::bind_rows()%>%
janitor::clean_names() %>%
slice(-1) %>%
select(-21) %>%
rename(nome=1,nacionalidade=2,posicao=3,idade=4,jogos=5,inicios=6,minutos=7,minutos_90=8,gol=9,assistencia=10,
gol_normal=11,gol_penalti=12,penalti_batido=13,amarelo=14,vermelho=15,gol_90=16,assistencia_90=17,
gol_assistencia_90=18,gol_normal_90=19,gol_assistencia_penalti_90=20) %>%
as.data.frame() %>%
format(scientific=FALSE) %>%
mutate_at(.,c(5:15),as.numeric) %>%
mutate(nacionalidade = str_extract(nacionalidade, "[A-Z]+")) #only capital letters
) %>% #renomear as colunas
setNames(paste0(rep("Gremio_",7),seq(from=2014,to=2020))) #name lists
}
#test with GrĂªmio
gremio <- funcao_extract("d5ae3703/","/Gremio-Estatisticas")
Just a small example
library(tidyverse)
example <- list(data.frame(a=1:10), data.frame(a=11:20))
example %>%
map_df(~ {Sys.sleep(10) ;
message(Sys.time());.x} %>%
summarise(a = sum(a)))
Attempting to call Expss from within a function. However it returns an empty table.
s1_a<-c("a","b","b")
s1_b<-c("a","a","b")
df<-data.frame(s1_a,s1_b)
multi<-function(v) {
df %>%
tab_cells(mrset_p("v")) %>%
tab_stat_cpct() %>%
tab_sort_desc() %>%
tab_pivot()
}
multi("s1_")
In your case you don't need quotes in the mrset_p:
library(expss)
s1_a<-c("a","b","b")
s1_b<-c("a","a","b")
df<-data.frame(s1_a,s1_b)
multi<-function(v) {
df %>%
tab_cells(mrset_p(v)) %>% # no quotes
tab_stat_cpct() %>%
tab_sort_desc() %>%
tab_pivot()
}
multi("s1_")
Error in unnest_tokens.data.frame(., entity, text, token = tokenize_scispacy_entities, :
Expected output of tokenizing function to be a list of length 100
The unnest_tokens() works well for a sample of few observations but fails on the entire dataset.
https://github.com/dgrtwo/cord19
Reproducible example:
library(dplyr)
library(cord19)
library(tidyverse)
library(tidytext)
library(spacyr)
Install the model from here - https://github.com/allenai/scispacy
spacy_initialize("en_core_sci_sm")
tokenize_scispacy_entities <- function(text) {
spacy_extract_entity(text) %>%
group_by(doc_id) %>%
nest() %>%
pull(data) %>%
map("text") %>%
map(str_to_lower)
}
paragraph_entities <- cord19_paragraphs %>%
select(paper_id, text) %>%
sample_n(10) %>%
unnest_tokens(entity, text, token = tokenize_scispacy_entities)
I face the same problem. I don't know the reason why, after I filter out empty abstract and shorter abstract string, everything seems work just fine.
abstract_entities <- article_data %>%
filter(nchar(abstract) > 30) %>%
select(paper_id, title, abstract) %>%
sample_n(1000) %>%
unnest_tokens(entity, abstract, token = tokenize_scispacy_entities)
I'm trying to write a function that does a split-apply-combine for which the split variable(s) are parameters, and - importantly - a null split is acceptable. For example, running statistics either on subsets of data or on the entire dataset.
somedata=expand.grid(a=1:3,b=1:3)
somefun=function(df_in,grpvars=NULL){
df_in %>% group_by_(.dots=grpvars) %>% nest() %>%
mutate(X2.Resid=map(data,~with(.x,chisq.test(b)$residuals))) %>%
unnest(data,X2.Resid) %>% return()
}
somefun(somedata,"a") # This works
somefun(somedata) # This fails
The null condition fails because nest() seems to need a variable to nest by, rather than nesting the entire df into a 1x1 data.frame. I can get around this as follows:
somefun2=function(df_in,grpvars="Dummy"){
df_in$Dummy=1
df_in %>% group_by_(.dots=grpvars) %>% nest() %>%
mutate(X2.Resid=map(data,~with(.x,chisq.test(b)$residuals))) %>%
unnest(data,X2.Resid) %>%
select(-Dummy) %>% return()
}
somefun2(somedata) # This works
However, I'm wondering if there is a more elegant way to fix this, without needing the dummy variabe?
Hmm, that behavior is a little surprising to me. A fix is easy though: you just have to make sure you nest everything():
somefun3 <- function(df_in, grpvars = NULL) {
df_in %>%
group_by_(.dots = grpvars) %>%
nest(everything()) %>%
mutate(X2.Resid = map(data, ~with(.x, chisq.test(b)$residuals))) %>%
unnest()
}
somefun3(somedata, "a")
somefun3(somedata)
Both work.