Obtain common values from the same column of 5 different dataframes - r

I am struggling to extract the values common to a specific column of 5 different dataframes. I know how to do this with two, but not with more.
df1$ID<-c(121, 122, 176)
df2$ID<-c(121, 88, 199)
df3$ID<-c(77, 121, 230)
df4$ID<-c(6, 88, 121)
df5$ID<-c(121, 122, 123)
In this example, my desired output would be:
result<-c(121)
Thanks!

We can get all the datasets in a list and then use intersect
Reduce(intersect, lapply(mget(paste0('df', 1:5)), `[[`, 'ID'))
#[1] 121
Or using purrr
library(purrr)
library(stringr)
library(dplyr)
mget(paste0('df', 1:5)) %>%
  map(~ .x %>%
        pull(ID)) %>%
  reduce(intersect)
#[1] 121
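A third option, sketched here under the assumption that df1 to df5 are exactly as defined in the data section below: stack the ID columns with a source label and keep the IDs that occur in every source.
library(dplyr)
bind_rows(mget(paste0('df', 1:5)), .id = 'src') %>%
  distinct(src, ID) %>%   # one row per ID per data frame
  count(ID) %>%           # in how many data frames does each ID appear?
  filter(n == 5) %>%      # keep IDs present in all five
  pull(ID)
#[1] 121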
data
df1 <- data.frame(ID = c(121, 122, 176))
df2 <- data.frame(ID = c(121, 88, 199))
df3 <- data.frame(ID = c(77, 121, 230))
df4 <- data.frame(ID = c(6, 88, 121))
df5 <- data.frame(ID = c(121, 122, 123))

Related

Summing ranks for variable with fewest entries

I am learning R and want to manually compute the Mann-Whitney U statistic and p-value using a normal approximation (and not use wilcox.test or equivalent). My pensioner's brain struggles with coding so it has taken me hours to produce the same answers as the textbook. However, my code to sum the 'StateRank' for the state with the fewest values is convoluted. How can I replace the commented section with more efficient code? I've hunted high and low, both here and on Google, but I don't even know which search terms to use! It won't surprise me to hear that there is a one-line solution but I'm no nearer knowing what it is.
library(tidyverse)
# Activity 9: aboriginal village size in Alaska and California
a.df <- data.frame(
  Alaska = c(23, 26, 30, 33, 42, 45, 45, 50, 50.5, 96, 113, 557, NA),
  Calif = c(39, 48, 53.5, 55, 57, 66, 77, 79, 108, 121, 162, 197, 309)
) %>%
  pivot_longer(
    cols = c("Alaska", "Calif"),
    names_to = "State",
    values_to = "Value",
    values_drop_na = TRUE
  ) %>%
  mutate(StateRank = rank(Value, ties.method = "average"))
# clumsy code to sort, then sum ranks (StateRank) for group with fewest values (nA)
#--------------------------------------------------------------------------------
asc_or_desc <- as.matrix(count(a.df, State))
if (as.numeric(asc_or_desc[1, 2]) > as.numeric(asc_or_desc[2, 2])) {
  a.df <- arrange(a.df, desc(State))
} else {
  a.df <- arrange(a.df, State)
}
#--------------------------------------------------------------------------------
nA <- as.numeric(min(count(a.df, State, sort = TRUE)$n))
nB <- as.numeric(max(count(a.df, State, sort = TRUE)$n))
a.U <- sum(a.df$StateRank[1:nA])
a.E <- (nA*(nA+nB+1))/2 # Expectation of U
a.V <- (nA*nB*(nA+nB+1))/12 # Variance of U
a.Z <- (a.U - a.E)/sqrt(a.V)
a.P <- round((1 - round(pnorm(round(abs(a.Z), 2), mean = 0, sd = 1), 4)) * 2, 3)
# all the rounding is to mimic statistical tables (so that
# the answer is the same as in the textbook that I use)
Please try this code and tell me if I am on the right track.
I replaced your so-called clumsy code with this:
... %>%
  group_by(State) %>%
  mutate(mx = max(Value)) %>%
  arrange(desc(mx), desc(Value)) %>%
  select(-mx)
The whole code:
library(tidyverse)
# Activity 9: aboriginal village size in Alaska and California
a.df <- data.frame(
  Alaska = c(23, 26, 30, 33, 42, 45, 45, 50, 50.5, 96, 113, 557, NA),
  Calif = c(39, 48, 53.5, 55, 57, 66, 77, 79, 108, 121, 162, 197, 309)
) %>%
  pivot_longer(
    cols = c("Alaska", "Calif"),
    names_to = "State",
    values_to = "Value",
    values_drop_na = TRUE
  ) %>%
  mutate(StateRank = rank(Value, ties.method = "average")) %>%
  group_by(State) %>%
  mutate(mx = max(Value)) %>%
  arrange(desc(mx), desc(Value)) %>%
  select(-mx)
#--------------------------------------------------------------------------------
# nA and nB (the two group sizes) still need to be computed before the statistics
nA <- min(count(a.df, State)$n)
nB <- max(count(a.df, State)$n)
a.U <- sum(a.df$StateRank[1:nA])
a.E <- (nA*(nA+nB+1))/2   # Expectation of U
a.V <- (nA*nB*(nA+nB+1))/12  # Variance of U
a.Z <- (a.U - a.E)/sqrt(a.V)
a.P <- round((1 - round(pnorm(round(abs(a.Z), 2), mean = 0, sd = 1), 4)) * 2, 3)
# all the rounding is to mimic statistical tables (so that
# the answer is the same as in the textbook that I use)
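For what it's worth, the "sum the ranks of the smaller group" step can also be done without any arranging at all. A minimal base-R sketch, assuming the long-format a.df with StateRank from the first pipeline (before the group_by/arrange step):
n_by_state <- table(a.df$State)                      # observations per State
smallest   <- names(which.min(n_by_state))           # State with the fewest values
nA <- min(n_by_state)
nB <- max(n_by_state)
a.U <- sum(a.df$StateRank[a.df$State == smallest])   # rank sum for that State
The a.E, a.V, a.Z and a.P lines then follow unchanged.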

How to correct labels for boxplot and get the p-values for each pair in R

I have a sample of the data as follows:
df <- tribble(
  ~capacity1, ~capacity2, ~capacity3, ~capacity4, ~capacity5, ~capacity6, ~capacity7, ~capacity8,
  75, 88, 85, 71, 98, 76, 71, 57,
  80, 51, 84, 72, 59, 81, 70, 64,
  54, 65, 90, 66, 93, 88, 77, 59,
  59, 87, 94, 75, 74, 53, 56, 87,
  52, 55, 64, 77, 50, 64, 83, 87,
  33, 22, 66, 67, 99, 87, 40, 90
)
I want to get the following graph.
As you can see, capacity1 pairs with capacity2 to produce one label, Capacity1; capacity3 with capacity4 gives Capacity2; capacity5 with capacity6 gives Capacity3; and capacity7 with capacity8 gives Capacity4. Next, I would like to get p-values. It would also be good if each pair of boxes could be ordered together (e.g., capacity1 with capacity2 = Capacity1).
If we need pairwise plots, we can split into a list of datasets for each pair of columns, then use ggboxplot from ggpubr
library(dplyr)
library(tidyr)
library(purrr)
library(patchwork)
library(rstatix)
library(ggpubr)
lst1 <- df %>%
  # // split every 2 columns
  split.default(as.integer(gl(ncol(.), 2, ncol(.)))) %>%
  # // loop over the list
  map(~ {
    # // reshape to long format
    dat <- pivot_longer(.x, everything())
    # // get the t.test p value
    stat_test <- dat %>%
      t_test(value ~ name) %>%
      adjust_pvalue(method = "bonferroni") %>%
      add_significance("p.adj") %>%
      add_xy_position(x = "name")
    # // create the boxplot
    ggboxplot(dat, x = 'name', y = 'value') +
      stat_pvalue_manual(stat_test, label = "p.adj", tip.length = 0.01)
  })
Now, we wrap the list of plots with wrap_plots from patchwork
wrap_plots(lst1)
Output:
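If the panels should carry the combined labels Capacity1 to Capacity4 that the question asks for, one option (a sketch assuming lst1 from above; the Capacity1..Capacity4 labels are just the pair names made up in the question) is to add a title per plot before wrapping:
# title each pairwise plot "Capacity1" .. "Capacity4" before combining
lst1_titled <- imap(lst1, ~ .x + ggtitle(paste0("Capacity", .y)))
wrap_plots(lst1_titled)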
Try this approach. Your data is in wide format, so first you have to transform it to long with pivot_longer(). After that you can use ggplot2 to sketch the plot with geom_boxplot(). To add p-values you need to define the proper test and use stat_compare_means() from ggpubr. Here is the code using t.test:
library(ggplot2)
library(dplyr)
library(tidyr)
library(ggpubr) # for stat_compare_means()
#Code
df %>% pivot_longer(everything()) %>%
  ggplot(aes(x = name, y = value, fill = name, group = name)) +
  geom_boxplot() +
  stat_compare_means(label = "p", method = "t.test", ref.group = ".all.") +
  labs(fill = 'Variable')
Output:
For grouping:
#Data for groups
groups <- data.frame(
  name = c("capacity1", "capacity2", "capacity3", "capacity4",
           "capacity5", "capacity6", "capacity7", "capacity8"),
  group = paste0('Group.', c(1, 1, 2, 2, 3, 3, 4, 4)),
  stringsAsFactors = FALSE
)
#Code
df %>% pivot_longer(everything()) %>%
  left_join(groups) %>%
  ggplot(aes(x = name, y = value, fill = name, group = name)) +
  geom_boxplot() +
  facet_wrap(. ~ group, scales = 'free', nrow = 1, strip.position = 'bottom') +
  labs(fill = 'Variable') +
  theme(strip.placement = 'outside', strip.background = element_blank(),
        legend.position = 'none')
Output:
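To also print a p-value per pair inside each facet, here is a sketch under the assumption that a two-sample t.test per panel is what is wanted (ggplot stats are computed per facet, and each facet here contains exactly two boxes, so stat_compare_means() compares that pair):
df %>% pivot_longer(everything()) %>%
  left_join(groups, by = "name") %>%
  ggplot(aes(x = name, y = value, fill = name, group = name)) +
  geom_boxplot() +
  stat_compare_means(method = "t.test", label = "p.format") +  # one p-value per pair
  facet_wrap(. ~ group, scales = 'free', nrow = 1, strip.position = 'bottom') +
  labs(fill = 'Variable') +
  theme(strip.placement = 'outside', strip.background = element_blank(),
        legend.position = 'none')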

Intersecting row values from many R dataframes and calculate averages of the corresponding values

Below is the example:
df1 <- data.frame("names" = c('John','Peter','Jolie'), "value1" = c(21, 24, 26), "value2" = c(20, 23, 32))
df2 <- data.frame("names" = c('Sam','John','Jolie'), "value1" = c(35, 11, 10), "value2" = c(10, 28, 27))
df3 <- data.frame("names" = c('Louis','Jolie','John'), "value1" = c(42, 74, 26), "value2" = c(26, 53, 54))
df4 <- data.frame("names" = c('Ale','John','Jolie'), "value1" = c(61, 34, 76), "value2" = c(28, 63, 38))
df5 <- data.frame("names" = c('John','Jolie','peter'), "value1" = c(11, 84, 86), "value2" = c(50, 13, 68))
intersect_names <- Reduce(intersect, list(df1$names,df2$names,df3$names,df4$names,df5$names))
Using Reduce() and intersect(), I can get the intersection of all the names, but I also want the corresponding mean of value1 and value2 for each of the common names across the dataframes.
Expected Output dataframe:
names Value1 Value2
John 20.6 43
Jolie 54 32.6
For example, the value 20.6 was obtained by taking mean(c(21, 11, 26, 34, 11)).
We can create a list of dataframes, extract the rows matching intersect_names, and take the mean for each name.
list_df <- mget(paste0('df', 1:5))
intersect_names <- Reduce(intersect, lapply(list_df, `[[`, 'names'))
aggregate(.~names, do.call(rbind, lapply(list_df, function(x)
x[x$names %in% intersect_names, ])), mean)
The same using tidyverse functions:
library(dplyr)
library(purrr)
map_df(list_df, ~ .x %>% filter(names %in% intersect_names)) %>%
  group_by(names) %>%
  summarise(across(.fns = mean))
# names value1 value2
# <chr> <dbl> <dbl>
#1 John 20.6 43
#2 Jolie 54 32.6
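Yet another sketch, using semi_join() so the "present in every data frame" filter itself stays in dplyr (this assumes list_df and library(dplyr)/library(purrr) from above):
# names that survive a semi_join against every other data frame are the common ones
common <- Reduce(function(x, y) semi_join(x, y, by = "names"), list_df)["names"]
bind_rows(list_df) %>%
  semi_join(common, by = "names") %>%
  group_by(names) %>%
  summarise(across(c(value1, value2), mean))
which returns the same two rows as above.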

Calculate prop.test() p-value and effect size pairs in aggregated dataframe

After running an experiment, I gathered data and reshaped it into this format:
library(tidyverse)
df <- tibble::tribble(
  ~element,    ~sessions, ~begin, ~complete,
  "baseline",  256,       67,     15,
  "variation", 580,       167,    22
)
Then I aggregated as:
df %>%
  group_by(element) %>%
  mutate(sessions_to_begin = round(begin / sessions * 100, 2),
         sessions_to_complete = round(complete / sessions * 100, 2)) -> df_agg
df_agg <- tibble::tribble(
  ~element,    ~sessions, ~begin, ~complete, ~sessions_to_begin, ~sessions_to_complete,
  "baseline",  256,       67,     15,        26.17,              5.86,
  "variation", 580,       167,    22,        28.79,              3.79
)
I want to create a df that contains a proportion-test p-value for the baseline/variation pair for each of the metrics sessions_to_begin and sessions_to_complete.
It would look like:
df_test <- tibble::tribble(
  ~metric,                 ~baseline, ~variation, ~p_value, ~effect_size,
  "sessions_to_begin",     25.9,      29.5,       "-",      "-",
  "sessions_to_complete",  6.03,      3.95,       "-",      "-"
)
I assume that for the effect size this could be a viable option:
https://rdrr.io/cran/pwr/man/ES.h.html
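One direction this could take, sketched on the raw counts rather than the rounded percentages (prop.test() wants successes and trials, and pwr::ES.h() from the linked page gives Cohen's h). The helper name test_metric and the output shape are my own assumptions, not a tested solution:
library(dplyr)
library(purrr)
library(tibble)
library(pwr)   # for ES.h()

# hypothetical helper: compare one count column (begin or complete) against sessions
test_metric <- function(metric) {
  x  <- df[[metric]]       # successes per element (baseline, variation)
  n  <- df$sessions        # trials per element
  pt <- prop.test(x, n)    # 2-sample test of equal proportions
  tibble(metric      = paste0("sessions_to_", metric),
         baseline    = 100 * x[1] / n[1],
         variation   = 100 * x[2] / n[2],
         p_value     = pt$p.value,
         effect_size = ES.h(x[1] / n[1], x[2] / n[2]))
}

df_test <- map_dfr(c("begin", "complete"), test_metric)
df_test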

Getting estimate and p-value into dataframe

I am fairly new to R. My data looks something like this (only with 9000 columns and 66 rows)
Time <- c(0, 6.4, 8.6, 15.2, 19.4, 28.1, 42.6, 73, 73, 85, 88, 88, 88, 88, 88)
ID1 <- c(55030, 54539, 54937, 48897, 58160, 54686, 55393, 47191, 39805, 37601, 51328, 28882, 45587, 60061, 31892, 28670)
ID2 <- c(20485, 11907, 10571, 20974, 10462, 11149, 20970, NA, NA, 9295, NA, 8714, 24446, 10748, 9037, 11859)
ID3 <- c(93914, 44482, 43705, 51144, 49485, 43908, 44324, 37342, 18872, 39660,61673, 43837, 36528, 44738, 41648, 11100)
DF <- data.frame (Time, ID1, ID2, ID3)
I want to get a data frame that looks like this:
ID1, rho, p-value
ID2, rho, p-value
...
The rho and the p-value would be the results from a cor.test (Spearman) with Time and each ID column.
Among other things I've tried this:
results <- data.frame(ID="", Estimate="", P.value="")
estimates = numeric(16)
pvalues = numeric(16)
for (i in 2:4){
  test <- cor.test(DF[,1], DF[,i])
  estimates[i] = test$estimate
  pvalues[i] = test$p.value
}
And R gives me the following error:
Error: object 'test' not found
I've also tried:
result <- do.call(rbind, lapply(2:4, function(x) {
  cor.result <- cor.test(DF[,1], DF[,x])
  pvalue <- cor.result$p.value
  estimate <- cor.result$estimate
  return(data.frame(pvalue = pvalue, estimate = estimate))
}))
And R gives me a similar error
Error: object 'cor.result' not found
I'm sure it's an easy fix but I can't seem to figure it out. Any help is more than welcome.
This is what I got after running
dput(head(SmallDataset[,1:5]))
structure(list(Species = c("Human.hsapiens", "Chimpanzee.ptroglodytes",
"Gorilla.ggorilla", "Orangutan.pabelii", "Gibbon.nleucogenys",
"Macaque.mmulatta"), Time = c(0, 6.4, 8.61, 15.2, 19.43, 28.1
), ID1 = c(55030, 54539, 54937, 48897, 58160, 54686), ID2 = c(20485,
11907, 10571, 20974, 10462, 11149), ID3 = c(93914, 44482, 43705,
51144, 49485, 43908)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
My solution involves defining a function within a lapply call
library(dplyr)

### Create dataframe
Time <- c(0, 6.4, 8.6, 15.2, 19.4, 28.1, 42.6, 73, 73, 85, 88, 88, 88, 88, 88, 89)
ID1 <- c(55030, 54539, 54937, 48897, 58160, 54686, 55393, 47191, 39805, 37601, 51328, 28882, 45587, 60061, 31892, 28670)
ID2 <- c(20485, 11907, 10571, 20974, 10462, 11149, 20970, NA, NA, 9295, NA, 8714, 24446, 10748, 9037, 11859)
ID3 <- c(93914, 44482, 43705, 51144, 49485, 43908, 44324, 37342, 18872, 39660, 61673, 43837, 36528, 44738, 41648, 11100)
DF <- data.frame(Time, ID1, ID2, ID3)

## Run the correlations
l2 <- lapply(2:4, function(i) cor.test(DF$Time, DF[, i]))

## Define function to extract p_value and coefficients
l3 <- lapply(l2, function(i){
  return(tibble(estimate = i$estimate,
                p_value = i$p.value))
})

## Create a dataframe with the information
l4 <- bind_rows(l3) %>% mutate(ID = paste0("ID", 1:3))
l4
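Since the real data has ~9000 ID columns, a small variation of the same idea (a sketch, assuming DF as built above; tibble() is used just as in l3) loops over column names instead of the hard-coded "ID" labels:
# every column except Time is treated as an ID column
id_cols <- setdiff(names(DF), "Time")
l4 <- bind_rows(lapply(id_cols, function(nm) {
  test <- cor.test(DF$Time, DF[[nm]])
  tibble(ID = nm, estimate = unname(test$estimate), p_value = test$p.value)
}))
l4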
Consider building a list of data frames with lapply (an iteration function similar to a for loop, but it builds a list of objects of the same length as its input). Afterwards, row-bind all data frame elements together:
results <- lapply(2:4, function(i){
  test <- cor.test(DF[,1], DF[,i])
  data.frame(ID = names(DF)[i],
             estimate = unname(test$estimate),
             pvalues = unname(test$p.value))
})
final_df <- do.call(rbind, results)
final_df
# ID estimate pvalues
# 1 ID1 -0.6238591 0.009805341
# 2 ID2 -0.2270515 0.455676037
# 3 ID3 -0.4964092 0.050481533
NOTE: Your posted data for Time is missing an observation, so it cannot immediately be combined into data.frame() with the other vectors. To resolve this, I appended a sixth 88 at the end:
Time <- c(0, 6.4, 8.6, 15.2, 19.4, 28.1, 42.6, 73, 73, 85, 88, 88, 88, 88, 88, 88)
Using posted SmallDataset:
SmallDataset <- structure(...)
results <- lapply(3:5, function(i){
  # use [[ so cor.test() gets a numeric vector rather than a one-column tibble
  test <- cor.test(SmallDataset$Time, SmallDataset[[i]])
  data.frame(ID = names(SmallDataset)[i],
             estimate = unname(test$estimate),
             pvalues = unname(test$p.value))
})
final_df <- do.call(rbind, results)
final_df
# ID estimate pvalues
# 1 ID1 0.03251407 0.9512461
# 2 ID2 -0.41733336 0.4103428
# 3 ID3 -0.60732484 0.2010166
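The question asks for Spearman's rho specifically; if that is the intent, the same pattern works with method = "spearman" (a sketch, assuming DF from above; exact = FALSE avoids the exact-p-value warning when there are ties):
results_sp <- lapply(2:4, function(i){
  test <- cor.test(DF$Time, DF[[i]], method = "spearman", exact = FALSE)
  data.frame(ID = names(DF)[i],
             rho = unname(test$estimate),
             p.value = test$p.value)
})
do.call(rbind, results_sp)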
