Filtering based on matching string patterns - R

I have a dataset that looks like this:
df <- data.frame("id" = c("Alpha", "Beta", "Gamma","Alpha","Beta","Gamma","Lambda","Tau"),
"group" = c("Alpha is good", "Alpha is good", "Alpha is good", "Beta is bad", "Beta is bad","Beta is bad","Beta is bad","Beta is bad"),
"Val" = c(2,2,2,5,5,5,5,5))
I would like to filter observations where the group name matches the id name. In sum, the final dataset should look like this:
final <- data.frame("id" = c("Alpha", "Beta"),
"group" = c("Alpha is good", "Beta is bad"),
"Val" = c(2,5))
The idea is that the function should be able to recognize if the string in "id" is also present in "group".
I hope this is clear
Thanks in advance for your help

We can use str_detect, which is vectorised (according to ?str_detect: "Vectorised over string and pattern.").
library(stringr)
library(dplyr)
df %>%
mutate_if(is.factor, as.character) %>%
filter(str_detect(group, id))
If there are overlapping elements in each group
df %>%
mutate_if(is.factor, as.character) %>%
group_by(group1 = group) %>%
filter(str_detect(group, id))

One base R possibility could be:
df[unlist(Map(grepl, df$id, df$group)), ]
id group Val
1 Alpha Alpha is good 2
5 Beta Beta is bad 5
Or even more elegantly using mapply() (based on a comment from @r2evans):
df[mapply(grepl, df$id, df$group), ]
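Note that both str_detect() and grepl() treat the id values as regular expressions. If the ids could contain regex metacharacters, a literal match is safer; here is a minimal variant (my own addition, not part of the original answers) using stringr::fixed():
library(dplyr)
library(stringr)
# fixed() forces a literal (non-regex) comparison and is vectorised over id
df %>%
  mutate_if(is.factor, as.character) %>%
  filter(str_detect(group, fixed(id)))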
Sample data:
df <- data.frame("id" = c("Alpha", "Beta", "Gamma","Alpha","Beta","Gamma","Lambda","Tau"),
"group" = c("Alpha is good", "Alpha is good", "Alpha is good", "Beta is bad", "Beta is bad","Beta is bad","Beta is bad","Beta is bad"),
"Val" = c(2,2,2,5,5,5,5,5),
stringsAsFactors = FALSE)

Related

Grouping By Multiple Selection Answer for Likert Package

I wanted to create a likert graph that is grouped by question i. I can create the likert graph for total responses ungrouped, but I'm uncertain how to reformat question 6 without losing the column for question i (i.e. do the reformatting below while also taking into account who selected what in question i).
What I want is the sufficiency of Q6 grouped by respondents' answers to question i.
Sample Dataframe:
SurveyClean2 <- data.frame(i = c("Mail,Email", "Mail", "Mail,Email,Podcast", "Radio,Podcast", "Radio", "Mail,Radio"), Q6_3 = c("Not Sufficient", "Very Sufficient", "Completely Sufficient", "Moderately Sufficient", "Moderately Sufficient", "Not Sufficient"))
Unnesting question i (this assumes the packages used below, dplyr, stringr, tidyr and purrr, are loaded):
UnnestQi <- SurveyClean2 %>%
as_tibble() %>%
mutate(i = str_split(i, ",")) %>%
unnest(i)
Survey2Q6 <- UnnestQi |> drop_na(Q6_3) |> drop_na(i)
Reformatting Question 6 to a Likert-friendly format:
clean_survey <- function(data, column, question) {
data %>%
dplyr::select(all_of({{column}})) %>%
dplyr::mutate(Question = question) %>%
dplyr::group_by(Question, across(1)) %>%
dplyr::count() %>%
dplyr::ungroup() %>%
tidyr::pivot_wider(names_from = 2, values_from = n)
}
# table that contains survey questions/columns and the question name
survey_table <- dplyr::tibble(
column = c("Q6_3"),
question = c("Expert advice")
)
# loop through your data and clean it, then bind as dataframe
LikertGroupqi62 <- purrr::map2_df(survey_table$column, survey_table$question, function(x, y){
clean_survey(Survey2Q6, x, y)}) |>
mutate(across(everything(), ~ifelse(is.na(.), 0, .)))
## Likert
LikertGroupqi62 <- LikertGroupqi62 |> dplyr::select(Question, `Not Sufficient`, `Slightly Sufficient`, `Moderately Sufficient`, `Very Sufficient`, `Completely Sufficient`)
Likert WITHOUT grouping:
likert(Question ~ ., LikertGroupqi62,
  ReferenceZero = 0,
  auto.key.in = list(columns = 1),
  main = list("Sufficiency of Cost-share Advice Based on Person or Agency Worked With"),
  col = c("#db6d00", "#924900", "#000000", "#004949", "#009292"),
  strip.left = FALSE, ylab = "", xlab = "Total Number of Respondents")
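One possible direction for the grouping itself (a sketch based only on the sample data above, not a confirmed answer): count Q6_3 within each unnested value of i and pivot wide, so each row of the Likert input corresponds to one answer option of question i. The object name LikertByQi and the values_fill choice are assumptions.
library(dplyr)
library(stringr)
library(tidyr)
LikertByQi <- SurveyClean2 %>%
  as_tibble() %>%
  mutate(i = str_split(i, ",")) %>%   # split the multi-select answers on commas
  unnest(i) %>%
  count(i, Q6_3) %>%                  # responses to Q6_3 within each answer of i
  pivot_wider(names_from = Q6_3, values_from = n, values_fill = 0) %>%
  rename(Question = i)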

Is there a way to create a network of word associations using a bi-partite network analysis in R?

I have a text file with words from historical accounts and I want to visualise the species and frequency of words associated with them.
So far I have tried the following code with a .txt file containing all the historical documents in one document, but I want to ask whether there is a specific CSV format that could then be read into R for a bipartite network graph:
"""library(ggraph)
library(ggplot2)
library(dplyr)
library(pdftools)
library(tm)
library(readtext)
library(tidytext)
library(igraph)
library(tidyr)
library(FactoMineR)
library(factoextra)
library(flextable)
library(GGally)
library(ggdendro)
library(network)
library(Matrix)
library(quanteda)
library(stringr)
library(quanteda.textstats)
options(stringsAsFactors = F)
options(scipen = 999)
options(max.print=1000)
# Read in text--------
wordbase <- readtext("mq_bird_stories.txt")
# List of extra words to remove---------
extrawords <- c("the", "can", "get", "Ccchants", "make", "making", "house", "torn", "tree", "man", "however", "upon", "instructs", "wife", "coming","without", "mother", "versions","variant", "version", "thus", "got","throws", "are", "has", "already", "asks", "sacra", "can", "brings", "one", "look", "sees", "tonaheiee", "wants", "later",
"dont", "even", "may", "but", "will", "turn", "sing", "swallows", "alba", "gives", "find", "other","tonaheieee", "away","day","comes","another",
"much", "first", "but", "see", "new", "back","goes", "go","songs", "returns", "take","takes","come",
"many", "less", "now", "well", "taught", "like", "puts", "slits", "sends", "tell","tells","open","mentions",
"often", "every", "said", "two", "and", "handsome", "husband", "bring", "lives","gets", "von", "den", "steinen", "handy")
# Clean the data-------
darwin <- wordbase %>%
paste0(collapse = " ") %>%
stringr::str_squish() %>%
stringr::str_remove_all("\\(") %>%
stringr::str_remove_all("\\)") %>%
stringr::str_remove_all("!") %>%
stringr::str_remove_all(",") %>%
stringr::str_remove_all(";") %>%
stringr::str_remove_all("\\?") %>%
stringr::str_split(fixed(".")) %>%
unlist() %>%
tm::removeWords(extrawords) %>%
paste0(collapse = " ")
# One method for calculating frequencies of bigrams------
# Process into a table of words
darwin_split <- darwin %>%
as_tibble() %>%
tidytext::unnest_tokens(words, value)
# Create data frame of bigrams-------
darwin_words <- darwin_split %>%
dplyr::rename(word1 = words) %>%
dplyr::mutate(word2 = c(word1[2:length(word1)], NA)) %>%
na.omit()
# Calculate frequency of bigrams-----
darwin2grams <- darwin_words %>%
dplyr::mutate(bigram = paste(word1, word2, sep = " ")) %>%
dplyr::group_by(bigram) %>%
dplyr::summarise(frequency = n()) %>%
dplyr::arrange(-frequency)
# Define stopwords
stps <- paste0(tm::stopwords(kind = "en"), collapse = "\\b|\\b")
# Remove stopwords from bigram table
darwin2grams_clean <- darwin2grams %>%
dplyr::filter(!str_detect(bigram, stps))
# Another method for calculating frequencies of bigrams
# Clean corpus
darwin_clean <- darwin %>%
stringr::str_to_title()
# Tokenize corpus----
darwin_tokzd <- quanteda::tokens(darwin_clean)
# Extract bigrams------
BiGrams <- darwin_tokzd %>%
quanteda::tokens_remove(stopwords("en")) %>%
quanteda::tokens_select(pattern = "^[A-Z]",
valuetype = "regex",
case_insensitive = FALSE,
padding = TRUE) %>%
quanteda.textstats::textstat_collocations(min_count = 1, tolower = FALSE)
# read in and process text
darwinsentences <- darwin %>%
stringr::str_squish() %>%
tokenizers::tokenize_sentences(.) %>%
unlist() %>%
stringr::str_remove_all("- ") %>%
stringr::str_replace_all("\\W", " ") %>%
stringr::str_squish()
# inspect data
head(darwinsentences)
darwincorpus <- Corpus(VectorSource(darwinsentences))
# clean corpus-----
darwincorpusclean <- darwincorpus %>%
tm::tm_map(removeNumbers) %>%
tm::tm_map(tolower) %>%
tm::tm_map(removeWords, stopwords()) %>%
tm::tm_map(removeWords, extrawords)
# create document term matrix
darwindtm <- DocumentTermMatrix(darwincorpusclean, control=list(bounds = list(global=c(1, Inf)), weighting = weightBin))
# convert dtm into sparse matrix
darwinsdtm <- Matrix::sparseMatrix(i = darwindtm$i, j = darwindtm$j,
x = darwindtm$v,
dims = c(darwindtm$nrow, darwindtm$ncol),
dimnames = dimnames(darwindtm))
# calculate co-occurrence counts
coocurrences <- t(darwinsdtm) %*% darwinsdtm
# convert into matrix
collocates <- as.matrix(coocurrences)
# inspect size of matrix
ncol(collocates)
#provide some summary stats
summary(rowSums(collocates))
#visualising collocations
# load function for co-occurrence calculation
source("https://slcladal.github.io/rscripts/calculateCoocStatistics.R")
# define term
coocTerm <- "pigeon"
# calculate co-occurrence statistics
coocs <- calculateCoocStatistics(coocTerm, darwinsdtm, measure="LOGLIK")
# inspect results
coocs[1:50]
coocdf <- coocs %>%
as.data.frame() %>%
dplyr::mutate(CollStrength = coocs,
Term = names(coocs)) %>%
dplyr::filter(CollStrength > 0)
###Make graph - visualize association strengths------
ggplot(coocdf, aes(x = reorder(Term, CollStrength, mean), y = CollStrength)) +
geom_point() +
coord_flip() +
theme_bw() +
labs(y = "")
##network (collocates_redux is assumed to be a reduced/trimmed version of the collocates matrix above)
net = network::network(collocates_redux,
directed = FALSE,
ignore.eval = FALSE,
names.eval = "weights")
# vertex names
network.vertex.names(net) = rownames(collocates_redux)
# inspect object
net
ggnet2(net,label = TRUE,
label.size = 4,
alpha = 0.2,
size.cut = 3,
edge.alpha = 0.3) +
guides(color = FALSE, size = FALSE)
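On the CSV-formatting part of the question: one common layout (an assumption on my part, not a confirmed recipe) is a two-column edge list with one row per species/word association, which igraph can read directly as a bipartite graph:
library(igraph)
# hypothetical CSV with columns "species" and "word", one row per association
assoc <- read.csv("species_word_pairs.csv", stringsAsFactors = FALSE)
g <- graph_from_data_frame(assoc, directed = FALSE)
# mark the two vertex classes so igraph treats the graph as bipartite
V(g)$type <- V(g)$name %in% assoc$word
is_bipartite(g)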
I'd suggest taking a look at the netCoin package. If you can transform your data into nodes and links data frames, then you can easily get a high quality network visualization:
#Example of links data frame
links <-
data.frame(
matrix(
c(
"Person A","Account 1", "not link",
"Person A","Account 2", "link",
"Person B","Account 2", "link",
"Person B","Account 3", "not link",
"Person B","Account 4", "link",
"Person C","Account 4", "link"
),
nrow = 6,
ncol = 3,
byrow = TRUE,
dimnames = list(NULL,
c("Source", "Target", "other_links_column"))
),
stringsAsFactors = FALSE
)
#Example of nodes data frame
nodes <-
data.frame(
matrix(
c(
"Person A","person",
"Person B","person",
"Person C","person",
"Account 1", "account",
"Account 2", "account",
"Account 3", "account",
"Account 4", "account"
),
nrow = 7,
ncol = 2,
byrow = TRUE,
dimnames = list(NULL,
c("name", "other_nodes_column"))
),
stringsAsFactors = FALSE
)
install.packages("netCoin") #may need to install the netCoin package
library(netCoin)
?netCoin #displays netCoin Help to see all the function options
graph_df <- netCoin(nodes = nodes, #Data frame of unique nodes and their attributes #Must contain name column
links = links, #Data frame of links and their attributes #Must contain Source and Target columns
cex = 1.25, #Font size
color = "other_nodes_column", #Column in node data frame to determine node color
shape = "other_nodes_column", #Column in node data frame to determine node shape
main = "This is the title of my visualization", #Visualization title
controls = 1:5, #Controls that will be shown in the visualization (maximum of 5)
dir = "folder-with-viz-output") #Output folder for the visualization #Entire folder should be exported as a zip file
plot(graph_df) #Command to display the visualization
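To connect this to the question's code, here is a sketch (under the assumption that the symmetric co-occurrence matrix collocates from the question is available) of one way to derive the links and nodes data frames via igraph:
library(igraph)
# turn the co-occurrence matrix into a weighted, undirected graph
g <- graph_from_adjacency_matrix(collocates, mode = "undirected",
                                 weighted = TRUE, diag = FALSE)
links <- igraph::as_data_frame(g, what = "edges")  # columns: from, to, weight
names(links)[1:2] <- c("Source", "Target")         # column names netCoin expects
nodes <- data.frame(name = V(g)$name, stringsAsFactors = FALSE)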

Add custom rows/header in Table1()

I'm using the table1 package to create summary statistics. My current table1() looks like this (code below):
I want to add another section, with a labeled row in bold called "Co-occurring disorder", that does not refer to a specific variable. I want two un-bolded rows after it, each reporting just the number of TRUE values in two distinct variables: "Mental Health" for one row and "Substance use" for the other. For example, if 12 people have TRUE for "Mental Health" and 7 people have TRUE for "Substance use", the rows would start like this:
Alternatively, how do I add a blank row to the table?
My current code is pasted below.
library(table1)
opp2$SEX <-
factor(opp2$SEX, levels=c(1,2),
labels=c("Male", "Female"))
opp2$REGION <-
factor(opp2$REGION, levels=c(1:5),
labels = c("Northeast", "North Central", "South", "West", "Unknown"))
opp2$GS <- factor(opp2$GS, levels = c(1:5),
labels = c("Male", "Female",
"Transmasculine", "Transfeminine", "Unknown"))
#units(opp2$AGE) <- "years"
labels <- list(variables=list(SEX="Sex",
AGE="Age (years)",
REGION="Region"),
groups=list("", "Cis", "TGM"))
strata <- c(list(Total=opp2), split(opp2, opp2$GS))
my.render.cont <- function(x) {
with(stats.apply.rounding(stats.default(x), digits=2),
c("",
"Median (IQR)"=sprintf("%s (± %s)", MEDIAN, IQR)))
}
my.render.cat <- function(x) {
c("", sapply(stats.default(x),
function(y) with(y, sprintf("%d (%0.0f %%)", FREQ, PCT))))
}
table1(strata, labels, groupspan=c(1,2, 3),
render.continuous=my.render.cont, render.categorical=my.render.cat)
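One partial workaround (a sketch that assumes logical columns named MENTAL_HEALTH and SUBSTANCE_USE and the strata/labels interface used above; it does not create a standalone bold header row) is to recode each logical as a labelled factor so its TRUE count appears as a "Yes" row:
# hypothetical column names; adjust to the real variable names in opp2
# (do this before building the strata list above)
opp2$MENTAL_HEALTH <- factor(opp2$MENTAL_HEALTH, levels = c(TRUE, FALSE),
                             labels = c("Yes", "No"))
opp2$SUBSTANCE_USE <- factor(opp2$SUBSTANCE_USE, levels = c(TRUE, FALSE),
                             labels = c("Yes", "No"))
# add display labels so the new variables show up in the table
labels$variables$MENTAL_HEALTH <- "Mental health"
labels$variables$SUBSTANCE_USE <- "Substance use"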

Undefined columns selected, how to solve?

When I try to run the following code I get an error:
value <- as.matrix(wsu.wide[, c(4, 3, 2)])
Error in `[.data.frame`(wsu.wide, , c(4, 3, 2)) : undefined columns selected
How do I get this line to work? It's part of dcasting my data.
This is the full code:
library(readxl)
library(reshape2)
Store_and_Regional_Sales_Database <- read_excel("~/Downloads/Data_Files/Store and Regional Sales Database.xlsx", skip = 2)
store <- Store_and_Regional_Sales_Database
freq <- table(store$`Sales Region`)
freq
rel.freq <- freq / nrow(store)
rel.freq
rel.freq.scaled <- rel.freq * 100
rel.freq.scaled
labs <- paste(names(rel.freq.scaled), "\n", "(", rel.freq.scaled, "%", ")", sep = "")
pie(rel.freq.scaled, labels = labs, main = "Pie Chart of Sales Region")
monitor <- store[which(store$`Item Description` == '24" Monitor'),]
wsu <- as.data.frame(monitor[c("Week Ending", "Store No.", "Units Sold")])
wsu.wide <- dcast(wsu, "Store No." ~ "Week Ending", value.var = "Units Sold")
value <- as.matrix(wsu.wide[, c(4, 3, 2)])
Thanks.
Edit:
This is my table called "monitor":
When I then run wsu <- as.data.frame(monitor[c("Week Ending", "Store No.", "Units Sold")]), I create another data frame with only the variables "Week Ending", "Store No." and "Units Sold".
However, when I run the wsu.wide line, the output I get is only this:
Why do I only get this small table when I'm asking to dcast my data?
After this I don't understand what is wrong.
The problem is at the line:
wsu.wide <- dcast(wsu, "Store No." ~ "Week Ending", value.var="Units Sold")
Instead of double quotation marks (") you should use backticks (`) in the formula:
wsu.wide <- dcast(wsu, `Store No.` ~ `Week Ending`, value.var = "Units Sold")
To avoid this kind of problem, it is better not to use spaces in R object names; for example, substitute the variable name Sales Region with sales_region, using an underscore. See e.g. Google's R Style Guide.
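For illustration, a small sketch (my addition, not part of the original answer) of renaming the three columns of wsu so the dcast formula needs no backticks at all:
# rename the columns of wsu (order: Week Ending, Store No., Units Sold)
names(wsu) <- c("week_ending", "store_no", "units_sold")
wsu.wide <- dcast(wsu, store_no ~ week_ending, value.var = "units_sold")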
Please see the code below; I simulated your data, as extracting it from the picture is quite cumbersome:
library(readxl)
library(reshape2)
#simulation
n <- 4
Store_and_Regional_Sales_Database <- data.frame(
a = seq_along(LETTERS[1:n]),
sr = LETTERS[1:n],
sr2 = '24" Monitor',
sr3 = 1:4,
sr4 = 2:5,
sr5 = 3:6)
names(Store_and_Regional_Sales_Database)[2:6] <- c(
"Sales Region", "Item Description",
"Week Ending", "Store No.", "Units Sold")
# algorithm
store <- Store_and_Regional_Sales_Database
freq <- table(store$`Sales Region`)
freq
rel.freq <- freq/nrow(store)
rel.freq
rel.freq.scaled <- rel.freq * 100
rel.freq.scaled
labs <- paste(names(rel.freq.scaled), "\n", "(", rel.freq.scaled, "%", ")", sep = "")
pie(rel.freq.scaled, labels = labs, main = "Pie Chart of Sales Region")
monitor <- store[which(store$`Item Description` == '24" Monitor'),]
wsu <- as.data.frame(monitor[c("Week Ending", "Store No.", "Units Sold")])
wsu.wide <- dcast(wsu, `Store No.` ~ `Week Ending`, value.var = "Units Sold")
value <- as.matrix(wsu.wide[ ,c(4,3,2)])
Output:
3 2 1
[1,] NA NA 3
[2,] NA 4 NA
[3,] 5 NA NA
[4,] NA NA NA

Function for label variable before plotting in R

I hope this question is not too easy for this forum (actually, I'm almost a bit embarrassed to ask it here, but I've been struggling with this small issue the whole day...)
I have dataframes that look like the following:
df <- data.frame(runif(4),
c("po", "pr", "po", "pr"),
c("Control 1","Control 1", "Treatment 1", "Treatment 1"))
names(df) <- list("values", "test_type", "group")
Now, I want to easily re-label the variables "test_type" and "group" for the plot afterwards (it's nicer to read "pretest" instead of "pr" in a presentation :-) ).
I could do it manually with:
df$test_type <- factor(df$test_type,
levels = c("pr", "po"),
labels = c("pretest", "posttest"))
df$group <- factor(df$group,
levels = c("Control 1", "Treatment 1"),
labels = c("control", "EST"))
In this case, I would have to repeat this for a lot more dataframes, which led me to write a function:
var_label <- function(df, test, groups){
# Create labels
df$test_type <- factor(df$test,
levels = c("pr", "po"),
labels = c("pretest", "posttest"))
df$group <- factor(df$groups,
levels = c("Control 1", "Treatment 1"),
labels = c("control", "EST"))
return(list(df$test_type, df$group))
}
Unfortunately, this doesn't work. I tried a lot of slightly different versions and also different commands from the Hmisc package, but none of them worked. I know I can solve this problem in another way, but I'm trying to write more efficient, shorter code and would be really interested in what I have to change to make this function work. Or, even better, do you have a suggestion for a more efficient approach?
Thank you a lot in advance!!
As I mentioned above, I think forcats::fct_relabel() is what you want here, along with dplyr::mutate_at(). Assuming that your relabeling needs are no more complex than what has been outlined in your question, the following should get you what you appear to be looking for.
####BEGIN YOUR DATAFRAME CREATION####
df <- data.frame(runif(4),
c("po", "pr", "po", "pr"),
c("Control 1","Control 1", "Treatment 1", "Treatment 1"))
names(df) <- list("values", "test_type", "group")
#####END YOUR DATAFRAME CREATION#####
# Load dplyr and forcats
library(dplyr)
library(forcats)
# create a map of labels and levels based on your implied logic
# the setup is label = level
label_map <- c("pretest" = "pr"
,"posttest" = "po"
,"control" = "Control 1"
,"EST" = "Treatment 1")
# create a function to exploit the label map
fct_label_select <- function(x, map) {
names(which(map == x))
}
# create a function which is responsive to a character vector
# as required by fct_relabel
fct_relabeler <- function(x, map) {
unlist(lapply(x, fct_label_select, map = map))
}
fct_relabeler(levels(df$test_type), map = label_map)
# function to meet your apparent needs
var_label <- function(df, cols, map){
df %>%
mutate_at(.vars = cols
,.fun = fct_relabeler
,map = map)
}
var_label(df = df, cols = c("test_type", "group"), map = label_map)
# values test_type group
# 1 0.05159681 posttest control
# 2 0.89050323 pretest control
# 3 0.42988881 posttest EST
# 4 0.32012811 pretest EST
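For what it's worth, the original function fails because df$test and df$groups look for columns literally named "test" and "groups" rather than the columns whose names are passed in. A compact alternative sketch (an assumption on my part, not part of the answer above) uses dplyr::recode, which returns the new labels directly:
library(dplyr)
relabel_df <- function(df) {
  df %>%
    mutate(
      test_type = recode(as.character(test_type),
                         pr = "pretest", po = "posttest"),
      group     = recode(as.character(group),
                         "Control 1" = "control", "Treatment 1" = "EST")
    )
}
relabel_df(df)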
