If I run the following code, everything runs fine:
library(dplyr)
library(tidyr)
library(shiny)
id <- 1:100
gender <- sample(c('M','F'), 100, replace=TRUE)
age <- sample(18:22, 100, replace=TRUE)
ethnicity <- sample(c('W','B','H','A','O'), 100, replace = TRUE)
grade <- sample(LETTERS[1:4], 100, replace=TRUE)
df <- cbind(id,gender,age,ethnicity,grade) %>% as.data.frame()
list1 <- list("id"="id","gender"="gender","age"="age","ethnicity"="ethnicity","grade"="grade")
list2 <- list("id"="id","gender"="gender","age"="age","ethnicity"="ethnicity","grade"="grade")
ui <-fluidPage(
selectInput("picker1", "PICKER 1", choices = list1, selected = "gender"),
selectInput("picker2", "PICKER 2", choices = list2, selected = "grade"),
tableOutput("crosstabs")
)
server <- function(input,output,session){
output$crosstabs <- renderTable({
t<-df %>% select_all() %>% select(-id) %>%
pivot_longer(cols = input$picker1) %>%
count(name,value, grade) %>% pivot_wider(names_from = grade, values_from = n)
t
})
}
shinyApp(ui,server)
But when I update output$crosstabs to include input$picker2, it returns an error. I'm rather confused, as I'm using the same structure, but simply with grade swapped out for input$picker2:
output$crosstabs <- renderTable({
t<-df %>% select_all() %>% select(-id) %>%
pivot_longer(cols = input$picker1) %>%
count(name,value, input$picker2) %>% pivot_wider(names_from = input$picker2, values_from = n)
t
})
What exactly is happening here?
It is a string and the count expects a unquoted column name, we can convert it to symbol and evaluate (!!)
server <- function(input,output,session){
output$crosstabs <- renderTable({
df %>%
select_all() %>%
select(-id) %>%
pivot_longer(cols = input$picker1) %>%
count(name,value, !!rlang::sym(input$picker2)) %>%
pivot_wider(names_from = input$picker2, values_from = n)
})
}
-testing
Related
I am trying to create an app that creates word frequency plot from text. The data I used is here: https://www.kaggle.com/datasets/bharadwaj6/kindle-reviews#:~:text=Never-,kindle_reviews,-.csv(701.88
Following is my shiny code:
library(shiny)
library(tidyverse)
library(tidytext)
# Source functions
source("wrd_freq_df.R")
source("bigrm_df.R")
# UI design ----
ui <- fluidPage(fluidRow(
column(
4,
fileInput("upload", NULL, accept = c(".csv", ".tsv")),
actionButton(
inputId = "submit",
label = "RUN",
class = "btn-primary"
),fluidRow(
column(4, numericInput(
"n",
"Top n words",
value = 5,
min = 1,
step = 1
)),
column(4, selectInput(
"myselect",
label = "Choose Words"
, choices = NULL
, multiple = TRUE
# , options = list(create = TRUE)
, selected = NULL
))),
plotOutput("wrd_frq_plot"),
tableOutput("head")
)
))
# SERVER ----
options(shiny.maxRequestSize = 60 * 1024 ^ 2)
server <- function(input, output, session) {
#upload csv data
data <- reactive({
req(input$upload)
ext <- tools::file_ext(input$upload$name)
switch(
ext,
csv = vroom::vroom(input$upload$datapath, delim = ","),
tsv = vroom::vroom(input$upload$datapath, delim = "\t"),
validate("Invalid file; Please upload a .csv or .tsv file")
)
})
#create different data subset to be used for plotting word_freq and bigrams
rv <- reactiveValues()
#run analysis on pressing run button
observeEvent(input$submit, {
rv$data_head <- data() %>%
mutate(ID = as.character(round(ID))) %>%
mutate(group = if_else(rating >= 4, "High", if_else(rating > 2, "Avg", "Low"))) %>%
select(ID, group, reviewText)
#get base analysis dataset
rv$word_freq <- wrd_freq_df(rv$data_head)
rv$bigrm_freq <- bigrm_freq_df(rv$data_head)
rv$bigrm_tf_idf <- bigrm_freq_tf_idf(rv$bigrm_freq)
#get all unique words from the document
rv$choices <- rv$word_freq %>%
select(word) %>%
unique() %>% unlist()
output$wrd_frq_plot <- renderPlot({
#ensure that if selectinput has no value then all words are used for analysis
if (is.null(input$myselect)) {
selected_choices = rv$choices
}
else{
selected_choices = input$myselect
}
updateSelectInput(session, "myselect", selected = selected_choices)
#get relevant document ID which contains chosen word for analysis
word <- input$myselect
word_df_chose <- data.frame(word)
relvant_ID <- word_df_chose %>%
#THIS IS PROBABLY WHERE THINGS FAILS
inner_join(rv$word_freq) %>%
select(ID) %>%
unique() %>% unlist()
rv$word_freq %>%
filter(ID %in% relevant_ID) %>%
left_join(rv$data_head %>%
select(ID, group) %>%
unique()) %>%
#remove low tf_IDF words
filter(tf_idf > quantile(tf_idf, 0.25)) %>%
group_by(group, word) %>%
summarise(n_tot = sum(n)) %>% ungroup() %>%
group_by(group) %>%
slice_max(n_tot, n = input$n) %>%
ungroup() %>%
mutate(group = as.factor(group),
word = reorder_within(word, n_tot, group)) %>%
ggplot(aes(x = word, y = n_tot, fill = group)) + geom_col() + facet_wrap(group ~
., scales = "free") +
coord_flip() + scale_x_reordered()
})
})
}
#executes app
shinyApp(ui, server)
The functions sourced are as follows:
wrd_freq_df <- function(df){
df %>%
# mutate(ID = as.character(round(ID))) %>%
# mutate(group = if_else(rating >= 4, "High", if_else(rating > 2, "Avg", "Low"))) %>%
# select(ID, group, reviewText) %>%
#get word freq
unnest_tokens(output = word, input = reviewText) %>% ungroup() %>%
anti_join(stop_words) %>%
count(ID, word) %>% ungroup() %>%
group_by(ID) %>%
mutate(total = sum(n)) %>% ungroup() %>%
bind_tf_idf(word, ID, n) }
bigrm_freq_df <- function(df) {
df %>%
# mutate(ID = as.character(round(ID))) %>%
# mutate(group = if_else(rating >= 4, "High", if_else(rating > 2, "Avg", "Low"))) %>%
# select(ID, group, reviewText) %>%
unnest_tokens(bigram, reviewText, token = "ngrams", n = 2) %>%
separate(bigram,
c("word1", "word2"),
sep = " ",
remove = FALSE) %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
}
bigrm_freq_tf_idf <- function(df) {
df %>%
select(ID, bigram) %>%
#get tf_idf
count(ID, bigram) %>%
bind_tf_idf(bigram, ID, n) %>%
arrange(desc(tf_idf))
}
i get following error message:
Joining, by = "word"
Warning: Error in inner_join: by must be supplied when x and y have no common variables.
ℹ use by = character()` to perform a cross-join.
181:
Input to asJSON(keep_vec_names=TRUE) is a named vector. In a future version of jsonlite, this option will not be supported, and named vectors will be translated into arrays instead of objects. If you want JSON object output, please use a named list instead. See ?toJSON.
I assume that the error must be arising when
I am trying to make a sankey diagram including 6 levels in total in r-studio using the googlevis package. With the help of How to make a googleVis multiple Sankey from a data.frame? I was successful with three levels with the code presented there. Here it is:
'''
source <- sample(c("NorthSrc", "SouthSrc", "EastSrc", "WestSrc"), 100, replace=T)
mid <- sample(c("NorthMid", "SouthMid", "EastMid", "WestMid"), 100, replace=T)
destination <- sample(c("NorthDes", "SouthDes", "EastDes", "WestDes"), 100, replace=T)
dummy <- rep(1,100) # For aggregation
dat <- data.frame(source,mid,destination,dummy)
aggdat <- aggregate(dummy~source+mid+destination,dat,sum)
library(dplyr)
datSM <- dat %>%
group_by(source, mid) %>%
summarise(toMid = sum(dummy) ) %>%
ungroup()
datMD <- dat %>%
group_by(mid, destination) %>%
summarise(toDes = sum(dummy) ) %>%
ungroup()
colnames(datSM) <- colnames(datMD) <- c("From", "To", "Dummy")
datVis <- rbind(datSM, datMD)
p <- gvisSankey(datVis, from="From", to="To", weight="dummy")
plot(p)
'''
This results in:
enter image description here
My question now is, how can I insert additional levels and how do I have to adapt the code that multiple mid-levels are accepted? Here is the example dataset:
'''
source <- sample(c("NorthSrc", "SouthSrc", "EastSrc", "WestSrc"), 100, replace=T)
mid_one <- sample(c("North", "South", "East", "West"), 100, replace=T)
mid_two <-sample(c("WestNorth", "WestSouth", "SouthEast", "NorthWest"), 100, replace=T)
mid_three <- sample(c("NorthMid", "SouthMid", "EastMid", "WestMid"), 100, replace=T)
mid_four <- sample(c("West", "East", "NorthCis", "SouthCis"), 100, replace=T)
destination <- sample(c("NorthDes", "SouthDes", "EastDes", "WestDes"), 100, replace=T)
dummy <- rep(1,100) # For aggregation
dat <- data.frame(source,mid_one, mid_two,mid_three, mid_four,destination,dummy)
aggdat <- aggregate(dummy~source+mid_one+mid_two+mid_three+mid_four+destination,dat,sum)
'''
just found the solution myself (if someone is looking for a similar problem):
datSM1 <- dat %>%
group_by(source, mid_one) %>%
summarise(toMid1 = sum(dummy) ) %>%
ungroup()
datSM2 <- dat %>%
group_by(mid_one, mid_two) %>%
summarise(toMid2 = sum(dummy) ) %>%
ungroup()
datSM3 <- dat %>%
group_by(mid_two, mid_three) %>%
summarise(toMid3 = sum(dummy) ) %>%
ungroup()
datSM4 <- dat %>%
group_by(mid_three, mid_four) %>%
summarise(toMid4 = sum(dummy) ) %>%
ungroup()
datMD <- dat %>%
group_by(mid_four, destination) %>%
summarise(toDes = sum(dummy) ) %>%
ungroup()
colnames(datSM1)<- colnames(datSM2) <- colnames(datSM3)<- colnames(datSM4)<-
colnames(datMD) <- c("From", "To", "Dummy")
datVis <- rbind(datSM1,datSM2, datSM3,datSM4,datMD)
then plot via gvisSankey
I am using this script to produce a table. In the second line, for collectionName, I am using "Organization X". I have many different organizations (Org Y, Org Z, .....) to create this table for. Is there a way to automate this? And to automate the naming of the object (currently "orgx" below)?
orgx <- df %>%
filter(str_detect(collectionName, c("Organization X"))) %>%
filter(str_detect(Year, paste(years, collapse = "|"))) %>%
corpus(text_field = "text") %>%
tokens(remove_punct = TRUE) %>%
tokens_select(stopwords('english'),selection='remove') %>%
tokens_tolower(keep_acronyms = FALSE) %>%
tokens_lookup(dictionary = dict, nomatch = TRUE) %>%
dfm() %>%
dfm_group(groups = "Title") %>%
dfm_weight(scheme = "prop") %>%
as.data.frame() %>%
mutate_at(vars(keyterms, true), funs(round(., 4)))
Get the column names specific to that organizations as vector, use that as pattern in str_detect by looping over the vector in map and return the output in a list
library(dplyr)
library(purrr)
library(stringr)
vec <- c("Organization X", "Organization Y")
out <- map(vec, ~
df %>%
filter(str_detect(collectionName, .x)) %>%
filter(str_detect(Year, paste(years, collapse = "|"))) %>%
corpus(text_field = "text") %>%
tokens(remove_punct = TRUE) %>%
tokens_select(stopwords('english'),selection='remove') %>%
tokens_tolower(keep_acronyms = FALSE) %>%
tokens_lookup(dictionary = dict, nomatch = TRUE) %>%
dfm() %>%
dfm_group(groups = "Title") %>%
dfm_weight(scheme = "prop") %>%
as.data.frame() %>%
mutate_at(vars(keyterms, true), funs(round(., 4)))
)
names(out) <- sub("^(...).*\\s+(\\S)$", "\\1\\2", vec)
It may be better to keep the output in a list. But, if we need to assign it to different objects, it can be done with list2env or assign
list2env(out, .GlobalEnv)
I am trying to create a user-defined function which carries out some data transformations.
Mock data:
library(tidyverse)
set.seed(1)
sampledata_a <- data.frame(
patientid = sample(1:100),
servicetype = sample(c("service1", "service2", "service3", "service4", "service5"), 100, replace=TRUE),
date = sample(seq(as.Date('1999/01/01'), as.Date('2000/01/01'), by="day"), 100)
)
sampledata_b <- data.frame(
patientid = sample(1:100),
servicetype = sample(c("service6", "service7", "service8", "service9", "service10"), 100, replace=TRUE),
date = sample(seq(as.Date('1999/01/01'), as.Date('2000/01/01'), by="day"), 100)
)
sampledata1 <- rbind(sampledata_a, sampledata_b)
User-defined function:
get_most_recent_and_unique <- function(inputdata, groupbyvar, uniquevar, datevar) {
# first selects the most recent observation for each unique variable combination
outputdata <- inputdata %>%
distinct() %>%
arrange(groupbyvar, uniquevar, desc(datevar)) %>%
mutate(orderkey = paste0(groupbyvar, uniquevar, sep = "")) %>%
group_by(orderkey) %>%
do(head(., n=1)) %>%
ungroup() %>%
arrange(groupbyvar, desc(datevar), uniquevar)
# then tranpose from long to wide, and unite variables other than first variable into one
outputdata <- outputdata %>%
select(groupbyvar, uniquevar) %>%
group_by(groupbyvar) %>%
mutate(pos=1:n()) %>%
spread(pos, uniquevar) %>%
unite(uniquevar, -groupbyvar, sep=" / ")
return(outputdata)
}
When running the function as below:
outputdata <- get_most_recent_and_unique(sampledata1, "patientid", "servicetype", "date")
Following error message:
Error in arrange_impl(.data, dots) :
incorrect size (1) at position 1, expecting : 100
However, the code works fine when outside the user-defined function. I wonder if anyone can tell me what is wrong?
testoutputdata <- sampledata1 %>%
distinct() %>%
arrange(patientid, servicetype, desc(date)) %>%
mutate(orderkey = paste0(patientid, servicetype, sep = "")) %>%
group_by(orderkey) %>%
do(head(., n=1)) %>%
ungroup() %>%
arrange(patientid, desc(date), servicetype)
testoutputdata <- testoutputdata %>%
select(patientid, servicetype) %>%
group_by(patientid) %>%
mutate(pos=1:n()) %>%
spread(pos, servicetype) %>%
unite(servicetype, -patientid, sep=" / ")
Try this:
get_most_recent_and_unique <- function(inputdata, groupbyvar, uniquevar, datevar) {
groupbyvar <- enquo(groupbyvar)
uniquevar <- enquo(uniquevar)
datevar <- enquo(datevar)
# first selects the most recent observation for each unique variable combination
outputdata <- inputdata %>%
distinct() %>%
arrange(!! groupbyvar, !! uniquevar, desc(!! datevar)) %>%
mutate(orderkey := paste0(!! groupbyvar, !! uniquevar, sep = "")) %>%
group_by(orderkey) %>%
do(head(., n=1)) %>%
ungroup() %>%
arrange(!! groupbyvar, desc(!! datevar), !! uniquevar)
# then tranpose from long to wide, and unite variables other than first variable into one
outputdata <- outputdata %>%
select(!! groupbyvar, !! uniquevar) %>%
group_by(!! groupbyvar) %>%
mutate(pos=1:n()) %>%
spread(pos, !! uniquevar) %>%
unite(!! uniquevar, -!! groupbyvar, sep=" / ")
return(outputdata)
}
outputdata <- get_most_recent_and_unique(sampledata1, patientid, servicetype, date) # No quotation with arguments!
Here is the output:
patientid servicetype
<int> <chr>
1 1 service7 / service3
2 2 service10 / service1
3 3 service4 / service9
4 4 service8 / service3
5 5 service6 / service1
It seems to match your expectations when I compare them:
all.equal(outputdata, testoutputdata)
[1] TRUE
Note that you shouldn't quote the arguments when specifying the function, i.e. outputdata <- get_most_recent_and_unique(sampledata1, patientid, servicetype, date) will work while outputdata <- get_most_recent_and_unique(sampledata1, "patientid", "servicetype", "date") won't.
I am learning to get, cleaning and combining data. I am confused why in a loop rbind command result in returning 10 data instead of expected 30 data as when I combine it manually (i by i).
library(XML)
mergeal <- NULL
tabnums <- 3
for (i in 1:length(tabnums)) {
bnn <- paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
tabnums[i], "&&coming=22-Oct-2015&coming=22-Oct-2015")
tem <- readHTMLTable(bnn, header=T, stringsAsFactors=F)
#data cleaning
ff <- tem[8] #wanted data
ff1 <- as.data.frame(ff)
ff2 <- ff1[ , 1] #get 1st col data only
ff3 <- unique(ff2)
ff4 <- ff3[c(2,5:13)] #wanted list only
#merging dataset
mergeal <- rbind(mergeal, ff4)
}
I've tried using list rbind list of data frames with one column of characters and numerics but still have the same result as above. Appreciate any help on what I missed, thanks.
I cleaned up the data cause I was bored.
library(plyr)
library(XML)
library(dplyr)
library(magrittr)
library(stringi)
library(tidyr)
library(lubridate)
answer =
data_frame(tabnums = 1:3) %>%
group_by(tabnums) %>%
do(.$tabnums %>%
paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
., "&&coming=22-Oct-2015&coming=22-Oct-2015") %>%
readHTMLTable(header = T, stringsAsFactors = F) %>%
extract2(8)) %>%
ungroup %>%
select(V1) %>%
distinct %>%
mutate(V1 =
V1 %>%
stri_replace_all_fixed("Â", "\n") %>%
stri_replace_all_fixed("Type:", "\nType:") %>%
stri_replace_all_fixed("Time:", "\nTime:") %>%
stri_replace_all_fixed("Area:", "\nArea:") %>%
stri_split_fixed("\n")) %>%
unnest(V1) %>%
mutate(V1 = V1 %>% stri_trim) %>%
filter(V1 %>% stri_detect_regex("^There are currently") %>% `!`) %>%
filter(V1 != "") %>%
separate(V1, c("variable", "value"), sep = ":", fill = "left") %>%
mutate(variable = variable %>% mapvalues(NA, "Description"),
ID = variable %>% `==`("Description") %>% cumsum) %>%
spread(variable, value) %>%
mutate(Area = Area %>% extract_numeric,
Price = Price %>% extract_numeric,
Datetime =
Time %>%
stri_replace_all_fixed("a.m.", "am") %>%
stri_replace_all_fixed("p.m.", "pm") %>%
paste(Date, .) %>%
dmy_hm) %>%
select(-Date, -Time)