I am learning R and want to manually compute the Mann-Whitney U statistic and p-value using a normal approximation (and not use wilcox.test or equivalent). My pensioner's brain struggles with coding so it has taken me hours to produce the same answers as the textbook. However, my code to sum the 'StateRank' for the state with the fewest values is convoluted. How can I replace the commented section with more efficient code? I've hunted high and low, both here and on Google, but I don't even know which search terms to use! It won't surprise me to hear that there is a one-line solution but I'm no nearer knowing what it is.
library(tidyverse)
# Activity 9: aboriginal village size in Alaska and California
a.df <- data.frame(
Alaska = c(23, 26, 30, 33, 42, 45, 45, 50, 50.5, 96, 113, 557, NA),
Calif = c(39, 48, 53.5, 55, 57, 66, 77, 79, 108, 121, 162, 197, 309)
) %>%
pivot_longer(
cols = c("Alaska", "Calif"),
names_to = "State",
values_to = "Value",
values_drop_na = TRUE
) %>%
mutate(StateRank = rank(Value, ties.method = "average"))
# clumsy code to sort, then sum ranks (StateRank) for group with fewest values (nA)
#--------------------------------------------------------------------------------
asc_or_desc <- as.matrix(count(a.df, State))
if (as.numeric(asc_or_desc[1,2])>as.numeric(asc_or_desc[2,2])) {
a.df <- arrange(a.df, desc(State))
} else {
a.df <- arrange(a.df, State)
}
#--------------------------------------------------------------------------------
nA <- as.numeric(min(count(a.df, State, sort = TRUE)$n))
nB <- as.numeric(max(count(a.df, State, sort = TRUE)$n))
a.U <- sum(a.df$StateRank[1:nA])
a.E <- (nA*(nA+nB+1))/2 # Expectation of U
a.V <- (nA*nB*(nA+nB+1))/12 # Variance of U
a.Z <- (a.U - a.E)/sqrt(a.V)
a.P <- round((1 - round(pnorm(round(abs(a.Z), 2),
                              mean = 0, sd = 1), 4)) * 2, 3)
# all the rounding is to mimic statistical tables (so that
# the answer is the same as in the textbook that I use)
Please try this code and tell me if I am on the right track.
I replaced your 'clumsy' code with this one:
... %>%
group_by(State) %>%
mutate(mx = max(Value)) %>%
arrange(desc(mx), desc(Value)) %>%
select(-mx)
The whole code:
library(tidyverse)
# Activity 9: aboriginal village size in Alaska and California
a.df <- data.frame(
Alaska = c(23, 26, 30, 33, 42, 45, 45, 50, 50.5, 96, 113, 557, NA),
Calif = c(39, 48, 53.5, 55, 57, 66, 77, 79, 108, 121, 162, 197, 309)
) %>%
pivot_longer(
cols = c("Alaska", "Calif"),
names_to = "State",
values_to = "Value",
values_drop_na = TRUE
) %>%
mutate(StateRank = rank(Value, ties.method = "average")) %>%
group_by(State) %>%
mutate(mx = max(Value)) %>%
arrange(desc(mx), desc(Value)) %>%
select(-mx)
#--------------------------------------------------------------------------------
# nA and nB as in the original code:
nA <- as.numeric(min(count(a.df, State, sort = TRUE)$n))
nB <- as.numeric(max(count(a.df, State, sort = TRUE)$n))
a.U <- sum(a.df$StateRank[1:nA])
a.E <- (nA*(nA+nB+1))/2 # Expectation of U
a.V <- (nA*nB*(nA+nB+1))/12 # Variance of U
a.Z <- (a.U - a.E)/sqrt(a.V)
a.P <- round((1 - round(pnorm(round(abs(a.Z), 2),
                              mean = 0, sd = 1), 4)) * 2, 3)
# all the rounding is to mimic statistical tables (so that
# the answer is the same as in the textbook that I use)
I have two datasets, called mydata and pairs respectively.
I want to get all combinations of the id column from mydata with respect to the pairs, matching on the Value column in mydata. One way to achieve this is:
lapply(1:nrow(pairs), function(ind) {
  expand.grid(mydata[mydata[, 2] == pairs[ind, "a"], 1],
              mydata[mydata[, 2] == pairs[ind, "b"], 1])
})
However, I feel there must be more efficient alternatives. Maybe some sort of mapping?
(I am not asking for efficiency like expand.grid vs data.table::CJ)
Thank you in advance.
Data:
mydata <- structure(list(id = 1:40, Value = c(230, 27, 161, 19, 40, 157,
237, 237, 237, 61, 40, 27, 237, 230, 237, 157, 27, 157, 230,
157, 19, 161, 61, 27, 61, 61, 237, 157, 27, 40, 27, 40, 40, 61,
157, 61, 157, 40, 237, 19)), class = "data.frame", row.names = c(NA,
-40L))
pairs <- structure(list(a = c(40, 61, 157), b = c(161, 237, 27)), class = "data.frame", row.names = c(NA,
-3L))
One possibility is to use data.table to get the vectors being passed to expand.grid all in one go:
library(data.table)
dtPairs <- data.table(Value = unlist(pairs), expand.grid(ind1 = 1:nrow(pairs), ind2 = 1:ncol(pairs)))
dt <- as.data.table(mydata)[Value %in% dtPairs$Value, list(list(id)), by = Value]
dt[dtPairs, `:=`(ind1 = i.ind1, ind2 = i.ind2), on = "Value"]
setorder(dt, ind1, ind2)
ans2 <- lapply(dt[, list(list(V1)), by = ind1]$V1, expand.grid)
This involves a lot of overhead, so there won't be performance gains until the dataset gets much larger.
fwithDT <- function(data, pairs) {
  dtPairs <- data.table(Value = unlist(pairs),
                        expand.grid(ind1 = 1:nrow(pairs), ind2 = 1:ncol(pairs)))
  dt <- as.data.table(data)[Value %in% dtPairs$Value, list(list(id)), by = Value]
  dt[dtPairs, `:=`(ind1 = i.ind1, ind2 = i.ind2), on = "Value"]
  setorder(dt, ind1, ind2)
  lapply(dt[, list(list(V1)), by = ind1]$V1, expand.grid)
}
fOP <- function(data, pairs) {
return(lapply(1:nrow(pairs),function(ind) {expand.grid(data[data[,2]==pairs[ind,"a"],1],data[data[,2]==pairs[ind,"b"],1])}))
}
mydata2 <- mydata[rep(seq_len(nrow(mydata)), 1001), ]
mydata2[, "Value"] <- mydata2[, "Value"] + rep(seq(0, 1e6, 1e3), each = nrow(mydata))
pairs2 <- pairs[rep(seq_len(nrow(pairs)), 1001), ] + rep(seq(0, 1e6, 1e3), each = 3)
> library(microbenchmark)
> microbenchmark(ans1 <- fOP(mydata, pairs), ans2 <- fwithDT(mydata, pairs))
Unit: microseconds
expr min lq mean median uq max neval
ans1 <- fOP(mydata, pairs) 437.3 479.55 599.969 521.75 574.4 5875.2 100
ans2 <- fwithDT(mydata, pairs) 4537.5 4824.90 5187.012 5039.95 5301.1 13231.2 100
> microbenchmark(ans1 <- fOP(mydata2, pairs2), ans2 <- fwithDT(mydata2, pairs2), times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
ans1 <- fOP(mydata2, pairs2) 1640.9651 1727.1139 1755.6726 1742.130 1769.235 1958.2109 10
ans2 <- fwithDT(mydata2, pairs2) 306.3268 318.5617 334.1244 326.865 345.444 376.8728 10
> identical(ans1, ans2)
[1] TRUE
I have a sample of the data as follows:
df <- tribble(
~capacity1, ~capacity2, ~capacity3, ~capacity4, ~capacity5, ~capacity6, ~capacity7, ~capapcity8,
75, 88, 85, 71, 98, 76, 71, 57,
80, 51, 84, 72, 59, 81, 70, 64,
54, 65, 90, 66, 93, 88, 77, 59,
59, 87, 94, 75, 74, 53, 56, 87,
52, 55, 64, 77, 50, 64, 83, 87,
33, 22, 66, 67, 99, 87, 40, 90)
I want to get the following graph.
As you can see, capacity1 goes with capacity2 to produce one label, Capacity1; capacity3 with capacity4 gives Capacity2; capacity5 with capacity6 gives Capacity3; and capacity7 with capapcity8 gives Capacity4. Next, I would like to get p-values. It would also be good if each pair of boxes could be ordered together (e.g., capacity1 with capacity2 = Capacity1).
If we need pairwise plots, we can split into a list of datasets for each pair of columns, then use ggboxplot from ggpubr
library(dplyr)
library(tidyr)
library(purrr)
library(patchwork)
library(rstatix)
library(ggpubr)
lst1 <- df %>%
  # // split every 2 columns
  split.default(as.integer(gl(ncol(.), 2, ncol(.)))) %>%
  # // loop over the list
  map(~ {
    # // reshape to long format
    dat <- pivot_longer(.x, everything())
    # // get the t.test p value
    stat_test <- dat %>%
      t_test(value ~ name) %>%
      adjust_pvalue(method = "bonferroni") %>%
      add_significance("p.adj") %>%
      add_xy_position(x = "name")
    # // create the boxplot
    ggboxplot(dat, x = 'name', y = 'value') +
      stat_pvalue_manual(stat_test,
                         label = "p.adj", tip.length = 0.01)
  })
Now, we wrap the list of plots with wrap_plots from patchwork
wrap_plots(lst1)
Output:
Try this approach. Your data is in wide format, so first you have to transform it to long with pivot_longer(). After that you can use ggplot2 to sketch the plot with geom_boxplot(). To add p-values, define the proper test and use stat_compare_means() from ggpubr. Here is the code using t.test:
library(ggplot2)
library(ggpubr)
library(dplyr)
library(tidyr)
#Code
df %>% pivot_longer(everything()) %>%
ggplot(aes(x=name,y=value,fill=name,group=name))+
geom_boxplot()+
stat_compare_means(label = "p", method = "t.test",
ref.group = ".all.")+
labs(fill='Variable')
Output:
For grouping:
#Data for groups
groups <- data.frame(name=c("capacity1", "capacity2", "capacity3", "capacity4", "capacity5",
"capacity6", "capacity7", "capapcity8"),
group=paste0('Group.',c(1,1,2,2,3,3,4,4)),stringsAsFactors = F)
#Code
df %>% pivot_longer(everything()) %>%
left_join(groups) %>%
ggplot(aes(x=name,y=value,fill=name,group=name))+
geom_boxplot()+
facet_wrap(.~group,scales = 'free',nrow = 1,strip.position = 'bottom')+
labs(fill='Variable')+
theme(strip.placement = 'outside',strip.background = element_blank(),
legend.position = 'none')
Output:
I'm trying to conditionally replace values in multiple columns based on a string match in a different column, and I'd like to do it in a single line of code using the across() function, but I keep getting errors that don't quite make sense to me. I feel like this probably has a simple solution, so if anyone could point me in the right direction, that would be fantastic!
df <- data.frame("type" = c("Park", "Neighborhood", "Airport", "Park", "Neighborhood", "Neighborhood"),
"total" = c(34, 56, 75, 89, 21, 56),
"group_a" = c(30, 26, 45, 60, 3, 46),
"group_b" = c(4, 30, 30, 29, 18, 10))
# working but not concise
df %>%
mutate(total = ifelse(str_detect(type, "Park"), NA, total),
group_a = ifelse(str_detect(type, "Park"), NA, group_a),
group_b = ifelse(str_detect(type, "Park"), NA, group_b))
# concise but not working
df %>% mutate(across(total, group_a, group_b), ifelse(str_detect(type, "Park"), NA, .))
Update
We got a solution that works with my dummy dataset but is not working with my real data, so I am going to share a small snippet of my real data frame with the numbers changed and organization names hidden. When I run this line of code (df %>% mutate(across(c(Attempts, Canvasses, Completes)), ~ifelse(str_detect(long_name, "park-cemetery"), NA, .))) on these data, I get the following error message:
Error: Problem with `mutate()` input `..2`.
x Input `..2` must be a vector, not a `formula` object.
i Input `..2` is `~ifelse(str_detect(long_name, "park-cemetery"), NA, .)`.
This a small sample of the data that produces this error:
df <- structure(list(Org = c("OrgName", "OrgName", "OrgName", "OrgName",
"OrgName", "OrgName", "OrgName", "OrgName", "OrgName", "OrgName"
), nCode = c("M34", "R36", "R46", "X29", "M31", "K39", "Q12",
"Q39", "X41", "K27"), Attempts = c(100, 100, 100, 100, 100, 100,
100, 100, 100, 100), Canvasses = c(80, 80, 80, 80, 80, 80, 80,
80, 80, 80), Completes = c(50, 50, 50, 50, 50, 50, 50, 50, 50,
50), van_nocc_id = c(999, 999, 999, 999, 999, 999, 999, 999,
999, 999), van_name = c("M-Upper West Side", "SI-Rosebank", "SI-Tottenville",
"BX-park-cemetery-etc-Bronx", "M-Stuyvesant Town-Cooper Village",
"BK-Kensington", "Q-Broad Channel", "Q-Lindenwood", "BX-Wakefield",
"BK-East New York"), boro_short = c("M", "SI", "SI", "BX", "M",
"BK", "Q", "Q", "BX", "BK"), long_name = c("Upper West Side",
"Rosebank", "Tottenville", "park-cemetery-etc-Bronx", "Stuyvesant Town-Cooper Village",
"Kensington", "Broad Channel", "Lindenwood", "Wakefield", "East New York"
)), row.names = c(NA, -10L), class = "data.frame")
Final update
The curse of the misplaced closing bracket! Thanks to everyone for your help... the correct solution was df %>% mutate(across(c(Attempts, Canvasses, Completes), ~ifelse(str_detect(long_name, "park-cemetery"), NA, .)))
If you use the newly introduced function across() (which is the correct way to approach this task), you have to specify the function you want to apply inside across() itself. In this case the ifelse(...) call has to be a purrr-style lambda (so starting with ~). Check out the across() documentation and look at the arguments .cols and .fns.
df %>%
mutate(across(c(total, group_a, group_b), ~ifelse(str_detect(type, "Park"), NA, .)))
Output
# type total group_a group_b
# 1 Park NA NA NA
# 2 Neighborhood 56 26 30
# 3 Airport 75 45 30
# 4 Park NA NA NA
# 5 Neighborhood 21 3 18
# 6 Neighborhood 56 46 10
Here is a data.table solution.
require(data.table)
df <- data.frame("type" = c("Park", "Neighborhood", "Airport", "Park", "Neighborhood", "Neighborhood"),
"total" = c(34, 56, 75, 89, 21, 56),
"group_a" = c(30, 26, 45, 60, 3, 46),
"group_b" = c(4, 30, 30, 29, 18, 10))
setDT(df)
df[type == "Park", c("total", "group_a", "group_b") := NA]
Update: that didn't take long to figure out! Just needed to place the columns in a vector and keep the lambda inside across():
# concise AND working!
df %>% mutate(across(c(total, group_a, group_b), ~ifelse(str_detect(type, "Park"), NA, .)))
I had tried this initially but placed the columns in quotes... don't do that :)
I am struggling to extract the common values in a specific column across 5 different data frames. I know how to do this with two, but not with more.
df1$ID<-c(121, 122, 176)
df2$ID<-c(121, 88, 199)
df3$ID<-c(77, 121, 230)
df4$ID<-c(6, 88, 121)
df5$ID<-c(121, 122, 123)
In this example, my desired output would be:
result<-c(121)
Thanks!
We can get all the datasets in a list and then use intersect
Reduce(intersect, lapply(mget(paste0('df', 1:5)), `[[`, 'ID'))
#[1] 121
Or using purrr
library(purrr)
library(stringr)
library(dplyr)
mget(paste0('df', 1:5)) %>%
  map(~ .x %>%
        pull(ID)) %>%
  reduce(intersect)
reduce(intersect)
#[1] 121
data
df1 <- data.frame(ID = c(121, 122, 176))
df2 <- data.frame(ID = c(121, 88, 199))
df3 <- data.frame(ID = c(77, 121, 230))
df4 <- data.frame(ID = c(6, 88, 121))
df5 <- data.frame(ID = c(121, 122, 123))
I have the following code for creating a boxplot in ggplot2:
throughput <- c(1, 2, 3, 4, 5)
response_time_min <- c(9, 19, 29, 39, 49)
response_time_10 <- c(50, 55, 60, 60, 61)
response_time_med <- c(100, 100, 100, 100, 100)
response_time_90 <- c(201, 201, 250, 200, 230)
response_time_max <- c(401, 414, 309, 402, 311)
df <- data.frame(throughput, response_time_min, response_time_10, response_time_med,response_time_90, response_time_max)
df
library(ggplot2)
g <- ggplot(df) +
geom_boxplot(aes(x=factor(throughput),ymax = response_time_max,upper = response_time_90,
y = response_time_med,
middle = response_time_med,
lower = response_time_10,
ymin = response_time_min), stat = "identity")
g
But when I apply ggplotly(g), the graph does not render correctly. What can I do to make this work?
I don't think the 90th and 10th percentiles can be done directly. Assuming they are q3 and q1, respectively, the code below
library(plotly)
bp <- plot_ly(color = c("orange")) %>%
  add_trace(lowerfence = response_time_min, q1 = response_time_10,
            median = response_time_med, q3 = response_time_90,
            upperfence = response_time_max, type = "box") %>%
  layout(xaxis = list(title = "throughput"),
         yaxis = list(title = "response_time"))
bp
gives the following output: