I am trying to create an app that creates word frequency plot from text. The data I used is here: https://www.kaggle.com/datasets/bharadwaj6/kindle-reviews#:~:text=Never-,kindle_reviews,-.csv(701.88
Following is my shiny code:
library(shiny)
library(tidyverse)
library(tidytext)
# Source functions
source("wrd_freq_df.R")
source("bigrm_df.R")
# UI design ----
ui <- fluidPage(fluidRow(
column(
4,
fileInput("upload", NULL, accept = c(".csv", ".tsv")),
actionButton(
inputId = "submit",
label = "RUN",
class = "btn-primary"
),fluidRow(
column(4, numericInput(
"n",
"Top n words",
value = 5,
min = 1,
step = 1
)),
column(4, selectInput(
"myselect",
label = "Choose Words"
, choices = NULL
, multiple = TRUE
# , options = list(create = TRUE)
, selected = NULL
))),
plotOutput("wrd_frq_plot"),
tableOutput("head")
)
))
# SERVER ----
options(shiny.maxRequestSize = 60 * 1024 ^ 2)
server <- function(input, output, session) {
#upload csv data
data <- reactive({
req(input$upload)
ext <- tools::file_ext(input$upload$name)
switch(
ext,
csv = vroom::vroom(input$upload$datapath, delim = ","),
tsv = vroom::vroom(input$upload$datapath, delim = "\t"),
validate("Invalid file; Please upload a .csv or .tsv file")
)
})
#create different data subset to be used for plotting word_freq and bigrams
rv <- reactiveValues()
#run analysis on pressing run button
observeEvent(input$submit, {
rv$data_head <- data() %>%
mutate(ID = as.character(round(ID))) %>%
mutate(group = if_else(rating >= 4, "High", if_else(rating > 2, "Avg", "Low"))) %>%
select(ID, group, reviewText)
#get base analysis dataset
rv$word_freq <- wrd_freq_df(rv$data_head)
rv$bigrm_freq <- bigrm_freq_df(rv$data_head)
rv$bigrm_tf_idf <- bigrm_freq_tf_idf(rv$bigrm_freq)
#get all unique words from the document
rv$choices <- rv$word_freq %>%
select(word) %>%
unique() %>% unlist()
output$wrd_frq_plot <- renderPlot({
#ensure that if selectinput has no value then all words are used for analysis
if (is.null(input$myselect)) {
selected_choices = rv$choices
}
else{
selected_choices = input$myselect
}
updateSelectInput(session, "myselect", selected = selected_choices)
#get relevant document ID which contains chosen word for analysis
word <- input$myselect
word_df_chose <- data.frame(word)
relvant_ID <- word_df_chose %>%
#THIS IS PROBABLY WHERE THINGS FAILS
inner_join(rv$word_freq) %>%
select(ID) %>%
unique() %>% unlist()
rv$word_freq %>%
filter(ID %in% relevant_ID) %>%
left_join(rv$data_head %>%
select(ID, group) %>%
unique()) %>%
#remove low tf_IDF words
filter(tf_idf > quantile(tf_idf, 0.25)) %>%
group_by(group, word) %>%
summarise(n_tot = sum(n)) %>% ungroup() %>%
group_by(group) %>%
slice_max(n_tot, n = input$n) %>%
ungroup() %>%
mutate(group = as.factor(group),
word = reorder_within(word, n_tot, group)) %>%
ggplot(aes(x = word, y = n_tot, fill = group)) + geom_col() + facet_wrap(group ~
., scales = "free") +
coord_flip() + scale_x_reordered()
})
})
}
#executes app
shinyApp(ui, server)
The functions sourced are as follows:
wrd_freq_df <- function(df){
df %>%
# mutate(ID = as.character(round(ID))) %>%
# mutate(group = if_else(rating >= 4, "High", if_else(rating > 2, "Avg", "Low"))) %>%
# select(ID, group, reviewText) %>%
#get word freq
unnest_tokens(output = word, input = reviewText) %>% ungroup() %>%
anti_join(stop_words) %>%
count(ID, word) %>% ungroup() %>%
group_by(ID) %>%
mutate(total = sum(n)) %>% ungroup() %>%
bind_tf_idf(word, ID, n) }
bigrm_freq_df <- function(df) {
df %>%
# mutate(ID = as.character(round(ID))) %>%
# mutate(group = if_else(rating >= 4, "High", if_else(rating > 2, "Avg", "Low"))) %>%
# select(ID, group, reviewText) %>%
unnest_tokens(bigram, reviewText, token = "ngrams", n = 2) %>%
separate(bigram,
c("word1", "word2"),
sep = " ",
remove = FALSE) %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
}
bigrm_freq_tf_idf <- function(df) {
df %>%
select(ID, bigram) %>%
#get tf_idf
count(ID, bigram) %>%
bind_tf_idf(bigram, ID, n) %>%
arrange(desc(tf_idf))
}
i get following error message:
Joining, by = "word"
Warning: Error in inner_join: by must be supplied when x and y have no common variables.
ℹ use by = character()` to perform a cross-join.
181:
Input to asJSON(keep_vec_names=TRUE) is a named vector. In a future version of jsonlite, this option will not be supported, and named vectors will be translated into arrays instead of objects. If you want JSON object output, please use a named list instead. See ?toJSON.
I assume that the error must be arising when
Related
I have the following function with which I want to create summary statistics (for two data sets simultaneously):
mean.k <-function(x){round(mean(x, na.rm=TRUE), digits = 3)}
sd.k <-function(x){round(sd(x, na.rm=TRUE), digits = 3)}
sumstats<-function(x, y) { sumtable <- cbind(as.matrix(colSums(!is.na(x))),sapply(x,mean.k), paste("(",sapply(x,sd.k),")", sep = ""), as.matrix(colSums(!is.na(y))),sapply(y,mean.k), paste("(",sapply(y,sd.k),")", sep = ""))
sumtable=as.data.frame(sumtable)
names(sumtable)=c("Obs","Mean","Std.Dev", "Obs","Mean","Std.Dev");
sumtable}
On some data, the result looks like:
data(iris)
libary(dplyr)
iris_1 <- iris %>% filter(Species == "setosa") %>% select(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) # data set 1
iris_2 <- iris %>% filter(Species == "versicolor") %>% select(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) # data set 2
iris_desk_stats <- sumstats(iris_1, iris_2)
However, for me the optimal result would look like this:
So the two changes I need are the following:
Each standard deviation should be placed under the respective mean instead of in a separate column next to the means.
Obs should not be a proper column, but a row at the top in order to avoid redundancy.
Has anyone an idea? The first of the two desired changes would be the most important one.
I thought that I'd try to approach this "directly"; And here's what I've come up with:
iris %>%
as_tibble() %>%
summarise(
across(!where(is.factor), list(
mean = . %>% mean(na.rm = TRUE),
sd = . %>% sd(na.rm = TRUE)
)),
Obs = n()
) %>%
pivot_longer(
# c(contains(c("mean", "sd")), "Obs"),
everything(),
names_to = c("variable", "metric"),
values_to = "stats",
names_sep = "_") %>%
tidyr::replace_na(list(metric = "identity")) %>%
pivot_wider(names_from = metric,
values_from = stats) %>%
select(variable, everything()) %>%
mutate(entries = glue::glue("{mean}<br>({sd})",
mean = round(mean, 3),
sd = round(sd, 3)),
entries = if_else(!is.na(identity),
glue::glue("{identity}"),
entries)) %>%
select(-c("mean", "sd", "identity")) %>%
arrange(variable) %>%
gt::gt() %>%
gt::fmt_markdown(entries) %>%
identity()
In the below reproducible example Code 1, the user stratifies data and selects which variable to stratify the data by, Value_1 or Value_2; and also selects to view the stratification as a table or as a plot. Code 1 works as intended, and the stratification range label is a static "Range" (as shown in the table left-most column header and in the plot x-axis label, and as shown in the images at the bottom).
I am making the range label dynamic (reactive), because in the fuller App this example derives from the user has many variables to choose from for stratification.
In Code 2 below showing an amplified custom function stratData(), I have succeeded in making the table left-most column header for stratification ranges reactive, but I haven't yet figured out how to make the plot x-axis label for stratification ranges similarly reactive. Just replace the stratData() in Code 1 with the stratData() in Code 2 (and comment-out the plot code under renderPlot() to avoid seeing the error) to see how this is working (or not).
How could the plot x-axis label reflect the same reactivity as the table left-most column header?
Code 1:
library(ggplot2)
library(shiny)
library(tidyverse)
library(shinyWidgets)
ui <-
fluidPage(
uiOutput("strat_values"),
radioButtons(
inputId = 'stratsView',
label = NULL,
choices = list("Table view" = 1,"Plot view" = 2),
selected = 1,
inline = TRUE
),
conditionalPanel(condition = "input.stratsView == 1",h5(strong("Stratified data:")), tableOutput("stratData")),
conditionalPanel(condition = "input.stratsView == 2",h5(strong("Stratified data:")), plotOutput("stratPlot"))
)
server <- function(input, output, session) {
dat <- reactive({
data.frame(
ID = c(1,1,2,2,2,2,3,3,3,3),
Period = c("2020-03", "2020-04", "2020-01", "2020-02", "2020-03", "2020-04", "2020-01", "2020-02", "2020-03", "2020-04"),
Values_1 = c(-6, 26, 36, 46, 56, 86, 100, 10, 20, 30),
Values_2 = c(-3, 13, 18, 23, 28, 43, 50, 5, 10, 15)
)
})
output$strat_values <- renderUI({
selectInput("strat_values",
"Variable to range-spread (col 1):",
choices = c("Values_1","Values_2"),
selected = c("Values_1"))
})
stratData <- function(){
req(input$strat_values)
filter_exp1 <- parse(text=paste0("Period", "==", "'","2020-04", "'"))
dat_1 <- reactive({dat() %>% filter(eval(filter_exp1))})
breaks <- seq(min(dat_1()[[input$strat_values]]), max(dat_1()[[input$strat_values]]), length.out = 6)
tmp <- dat() %>%
filter(eval(filter_exp1)) %>%
mutate(Range = cut(!!sym(input$strat_values), breaks=breaks, include.lowest=TRUE, right = TRUE, dig.lab = 5)) %>%
group_by(Range)
tmp <- tmp %>%
summarise(Count = n(),Values = sum(!!sym("Values_2"))) %>%
complete(Range, fill = list(Count = 0,Values = 0)) %>%
ungroup %>%
mutate(Count_pct = Count/sum(Count)*100, Values_pct = Values/sum(Values)*100) %>%
dplyr::select(everything(), Count, Count_pct, Values, Values_pct) %>%
bind_rows(summarise_all(., ~(if(is.numeric(.)) sum(.) else "Total")))
Count <- tmp %>% pull(Count)
Range <- tmp %>% pull(Range)
list(data = tmp,
Range = Range[-length(Range)],
Count = as.data.frame(Count[-length(Count)]))
}
output$stratData <- renderTable({stratData()$data})
output$stratPlot <- renderPlot({
x <- factor(stratData()$Range, levels = c(stratData()$Range))
y <- as.matrix(stratData()$Count)
ggplot(stratData()$Count,aes(x,y)) +
geom_bar(stat="identity") +
labs(x = "Ranges") +
geom_text(aes(y = y + sign(y)/4,label = y))
})
}
shinyApp(ui, server)
Code 2:
stratData <- function(){
req(input$strat_values)
filter_exp1 <- parse(text=paste0("Period", "==", "'","2020-04", "'"))
dat_1 <- reactive({dat() %>% filter(eval(filter_exp1))})
breaks <- seq(min(dat_1()[[input$strat_values]]), max(dat_1()[[input$strat_values]]), length.out = 6)
tmp <- dat() %>%
filter(eval(filter_exp1)) %>%
mutate(sumvar = cut(!!sym(input$strat_values), breaks=breaks, include.lowest=TRUE, right = TRUE, dig.lab = 5)) %>%
group_by(sumvar)
tmp <- tmp %>%
summarise(Count = n(),Values = sum(!!sym("Values_2"))) %>%
complete(sumvar, fill = list(Count = 0,Values = 0)) %>%
ungroup %>%
mutate(Count_pct = Count/sum(Count)*100, Values_pct = Values/sum(Values)*100) %>%
dplyr::select(everything(), Count, Count_pct, Values, Values_pct) %>%
bind_rows(summarise_all(., ~(if(is.numeric(.)) sum(.) else "Total")))
names(tmp)[1] <- paste(input$strat_values, " Range")
Count <- tmp %>% pull(Count)
# Range <- tmp %>% pull(Range)
list(data = tmp,
# Range = Range[-length(Range)],
Count = as.data.frame(Count[-length(Count)]))
}
Resolved code:
Now with OP code correction and reflecting Stefan's comment for plot axis label reactivity; posting revised stratData() custom function where only one line was fixed (commented below) and the renderPlot() function where Stefan's axis-label solution is also commented:
stratData <- function(){
req(input$strat_values)
filter_exp1 <- parse(text=paste0("Period", "==", "'","2020-04", "'"))
dat_1 <- reactive({dat() %>% filter(eval(filter_exp1))})
breaks <- seq(min(dat_1()[[input$strat_values]]), max(dat_1()[[input$strat_values]]), length.out = 6)
tmp <- dat() %>%
filter(eval(filter_exp1)) %>%
mutate(Range = cut(!!sym(input$strat_values), breaks=breaks, include.lowest=TRUE, right = TRUE, dig.lab = 5)) %>%
group_by(Range)
tmp <- tmp %>%
summarise(Count = n(),Values = sum(!!sym("Values_2"))) %>%
complete(Range, fill = list(Count = 0,Values = 0)) %>%
ungroup %>%
mutate(Count_pct = Count/sum(Count)*100, Values_pct = Values/sum(Values)*100) %>%
dplyr::select(everything(), Count, Count_pct, Values, Values_pct) %>%
bind_rows(summarise_all(., ~(if(is.numeric(.)) sum(.) else "Total")))
names(tmp)[1] <- paste(input$strat_values, " Ranges")
Count <- tmp %>% pull(Count)
Range <- tmp %>% pull(var = 1) # var = 1 pulls left-most column from tmp dataframe
list(data = tmp,
Range = Range[-length(Range)],
Count = as.data.frame(Count[-length(Count)]))
}
output$stratPlot <- renderPlot({
x <- factor(stratData()$Range, levels = c(stratData()$Range))
y <- as.matrix(stratData()$Count)
ggplot(stratData()$Count,aes(x,y)) +
geom_bar(stat="identity") +
labs(x = paste(input$strat_values, " Ranges")) + # < Added line this per Stefan comment
geom_text(aes(y = y + sign(y)/4,label = y),
position = position_nudge(y = 0),
size = 5)
})
I am using this script to produce a table. In the second line, for collectionName, I am using "Organization X". I have many different organizations (Org Y, Org Z, .....) to create this table for. Is there a way to automate this? And to automate the naming of the object (currently "orgx" below)?
orgx <- df %>%
filter(str_detect(collectionName, c("Organization X"))) %>%
filter(str_detect(Year, paste(years, collapse = "|"))) %>%
corpus(text_field = "text") %>%
tokens(remove_punct = TRUE) %>%
tokens_select(stopwords('english'),selection='remove') %>%
tokens_tolower(keep_acronyms = FALSE) %>%
tokens_lookup(dictionary = dict, nomatch = TRUE) %>%
dfm() %>%
dfm_group(groups = "Title") %>%
dfm_weight(scheme = "prop") %>%
as.data.frame() %>%
mutate_at(vars(keyterms, true), funs(round(., 4)))
Get the column names specific to that organizations as vector, use that as pattern in str_detect by looping over the vector in map and return the output in a list
library(dplyr)
library(purrr)
library(stringr)
vec <- c("Organization X", "Organization Y")
out <- map(vec, ~
df %>%
filter(str_detect(collectionName, .x)) %>%
filter(str_detect(Year, paste(years, collapse = "|"))) %>%
corpus(text_field = "text") %>%
tokens(remove_punct = TRUE) %>%
tokens_select(stopwords('english'),selection='remove') %>%
tokens_tolower(keep_acronyms = FALSE) %>%
tokens_lookup(dictionary = dict, nomatch = TRUE) %>%
dfm() %>%
dfm_group(groups = "Title") %>%
dfm_weight(scheme = "prop") %>%
as.data.frame() %>%
mutate_at(vars(keyterms, true), funs(round(., 4)))
)
names(out) <- sub("^(...).*\\s+(\\S)$", "\\1\\2", vec)
It may be better to keep the output in a list. But, if we need to assign it to different objects, it can be done with list2env or assign
list2env(out, .GlobalEnv)
Prior to start creating my app with Shiny I've created a function (NextWordPrediction) that updates my dataframe based on an user's input as follows:
If input exists in df increase its Frequency by 1
If input does't exist in df add it
Function code:
NextWordPrediction <- function(input) {
dat <- dat %>%
filter(., N_gram == str_count(input, "\\S+") + 1) %>%
filter(grepl(paste("^", tolower(str_squish(input)), sep = ""), Word)) %>%
arrange(., desc(Prop))
if (nrow(dat) != 0) {
assign("dat",
dat %>%
mutate(Frequency = ifelse(Word == input &
N_gram == str_count(input, "\\S+"),
Frequency + 1,
Frequency)) %>%
group_by(., N_gram) %>%
mutate(., Prop = Frequency/ sum(Frequency)) %>%
data.frame(.),
envir = .GlobalEnv)
val <- dat$Word_to_Predict[1]
ans <- paste(str_squish(input), val)
return(list(ans, head(dat,5)))
} else if (nrow(dat) == 0 & word(input, 1) != "NA") {
assign("dat",
dat %>%
add_row(., Word = tolower(input), Frequency = + 1, N_gram = str_count(input, "\\S+"),
Word_to_Predict = word(input, -1)) %>%
group_by(., N_gram) %>%
mutate(., Prop = Frequency/ sum(Frequency)) %>%
data.frame(.),
envir = .GlobalEnv)
input_1 <- Reduce(paste, word(input, 2:str_count(input,"\\S+")))
return(NextWordPrediction(input_1))
} else if (word(input, 1) == "NA") {
ans <- paste("Word not in dictionary. We added this to our database!")
return(ans)
}
}
As a next step I want to extend this functionality to a Shiny app and I've tried the following without success. The function usability is functional but after an input my df is not updated accordingly.
server.R
library(shiny)
dat <- read.csv("dat_all.csv")
shinyServer(function(input, output) {
NextWordPrediction <- function(input) {
dat <- dat %>%
filter(., N_gram == str_count(input, "\\S+") + 1) %>%
filter(grepl(paste("^", tolower(str_squish(input)), sep = ""), Word)) %>%
arrange(., desc(Prop))
if (nrow(dat) != 0) {
assign("dat",
dat %>%
mutate(Frequency = ifelse(Word == input &
N_gram == str_count(input, "\\S+"),
Frequency + 1,
Frequency)) %>%
group_by(., N_gram) %>%
mutate(., Prop = Frequency/ sum(Frequency)) %>%
data.frame(.),
envir = .GlobalEnv)
val <- dat$Word_to_Predict[1]
ans <- paste(str_squish(input), val)
return(list(ans, head(dat,5)))
} else if (nrow(dat) == 0 & word(input, 1) != "NA") {
assign("dat",
dat %>%
add_row(., Word = tolower(input), Frequency = + 1, N_gram = str_count(input, "\\S+"),
Word_to_Predict = word(input, -1)) %>%
group_by(., N_gram) %>%
mutate(., Prop = Frequency/ sum(Frequency)) %>%
data.frame(.),
envir = .GlobalEnv)
input_1 <- Reduce(paste, word(input, 2:str_count(input,"\\S+")))
return(NextWordPrediction(input_1))
} else if (word(input, 1) == "NA") {
ans <- paste("Word not in dictionary. We added this to our database!")
return(ans)
}
}
output$predictiontext = reactive({
NextWordPrediction(input$text)[1]
})
output$predictiontable = renderTable({
NextWordPrediction(input$text)[2]
})
})
ui.R
library(shiny)
# Define UI for application that draws a histogram
shinyUI(fluidPage(
# Application title
titlePanel("NextWordPrediction"),
# Sidebar with a slider input for number of bins
sidebarLayout(
sidebarPanel(
textInput("text",
"Type something...",
"")
),
# Show a plot of the generated distribution
mainPanel(
wellPanel(
# Link to report
helpText(a('More information on the app',
href=link,
target = '_blank')
),
# Link to repo
helpText(a('Code repository',
href=link,
target = '_blank')
),
textOutput("predictiontext"),
tableOutput('predictiontable')
)
)
))
)
Update 1: Data
df<- data.frame(Word = c("hello", "she was great", "this is", "long time ago in"), Frequency = c(4, 3, 10, 1),
N_gram = c(1, 3, 2, 4), Prop = c(4/18, 3/18, 10/18, 1/18), Word_to_Predict = c(NA, "great", "is", "in"))
NextWordPrediction("she was") ## returns "she was" & "great"
NextWordPrediction("hours ago") ## returns "hours ago" & "in"
NextWordPrediction("words not in data") ## returns "Word not in dictionary. We added this to our database!" after trying "not in data", "in data" and adds "words not in data" to dataset
If I run the following code, everything runs fine:
library(dplyr)
library(tidyr)
library(shiny)
id <- 1:100
gender <- sample(c('M','F'), 100, replace=TRUE)
age <- sample(18:22, 100, replace=TRUE)
ethnicity <- sample(c('W','B','H','A','O'), 100, replace = TRUE)
grade <- sample(LETTERS[1:4], 100, replace=TRUE)
df <- cbind(id,gender,age,ethnicity,grade) %>% as.data.frame()
list1 <- list("id"="id","gender"="gender","age"="age","ethnicity"="ethnicity","grade"="grade")
list2 <- list("id"="id","gender"="gender","age"="age","ethnicity"="ethnicity","grade"="grade")
ui <-fluidPage(
selectInput("picker1", "PICKER 1", choices = list1, selected = "gender"),
selectInput("picker2", "PICKER 2", choices = list2, selected = "grade"),
tableOutput("crosstabs")
)
server <- function(input,output,session){
output$crosstabs <- renderTable({
t<-df %>% select_all() %>% select(-id) %>%
pivot_longer(cols = input$picker1) %>%
count(name,value, grade) %>% pivot_wider(names_from = grade, values_from = n)
t
})
}
shinyApp(ui,server)
But when I update output$crosstabs to include input$picker2, it returns an error. I'm rather confused, as I'm using the same structure, but simply with grade swapped out for input$picker2:
output$crosstabs <- renderTable({
t<-df %>% select_all() %>% select(-id) %>%
pivot_longer(cols = input$picker1) %>%
count(name,value, input$picker2) %>% pivot_wider(names_from = input$picker2, values_from = n)
t
})
What exactly is happening here?
It is a string and the count expects a unquoted column name, we can convert it to symbol and evaluate (!!)
server <- function(input,output,session){
output$crosstabs <- renderTable({
df %>%
select_all() %>%
select(-id) %>%
pivot_longer(cols = input$picker1) %>%
count(name,value, !!rlang::sym(input$picker2)) %>%
pivot_wider(names_from = input$picker2, values_from = n)
})
}
-testing