I ran 5 imputations on a data set with missing values. For my purposes, I want to replace missing values with the mode from the 5 imputations. Let's say I have the following data sets, where df is my original data, ID is a grouping variable to identify each case, and imp is my imputed data:
df <- data.frame(ID = c(1,2,3,4,5),
var1 = c(1,NA,3,6,NA),
var2 = c(NA,1,2,6,6),
var3 = c(NA,2,NA,4,3))
imp <- data.frame(ID = c(1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5),
var1 = c(1,2,3,3,2,5,4,5,6,6,7,2,3,2,5,6,5,6,6,6,3,1,2,3,2),
var2 = c(4,3,2,3,2,4,6,5,4,4,7,2,4,2,3,6,5,6,4,5,3,3,4,3,2),
var3 = c(7,6,5,6,6,2,3,2,4,2,5,4,5,3,5,1,2,1,3,2,1,2,1,1,1))
I have a method that works, but it involves a ton of manual coding as I have ~200 variables total (I'm doing this on 3 different data sets with different variables). My code looks like this for one variable:
library(dplyr)
# mode of a vector of small positive integers: tabulate() counts occurrences of 1..max(codes),
# and which.max() then returns the most frequent value (ties go to the smaller value)
mode <- function(codes){
which.max(tabulate(codes))
}
var1 <- imp %>% group_by(ID) %>% summarise(var1 = mode(var1))
df3 <- df %>%
left_join(var1, by = "ID") %>%
mutate(var1 = coalesce(var1.x, var1.y)) %>%
select(-var1.x, -var1.y)
Thus, the original value in df is replaced with the mode only if the value was NA.
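For reference, dplyr's coalesce() takes the first non-missing value at each position, which is exactly what keeps the original value and only falls back to the imputed mode where the original was NA. A minimal illustration:
dplyr::coalesce(c(1, NA, 3), c(9, 9, 9))
#> [1] 1 9 3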
It is taking forever to keep manually coding this for every variable. I'm hoping there is an easier way to calculate the mode from the imputed data set for each variable by ID and then replace the NAs with that mode in the original data. I thought maybe I could put the variable names in a vector and somehow iterate through them with one piece of code where i takes each variable name in turn, but I didn't know where to go with that idea.
x <- colnames(df)
# Attempting to iterate through variable names using i
i = as.factor(x[[2]])
This is where I am stuck. Any help is much appreciated!
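For what it's worth, here is a rough sketch of how that iteration idea could look, reusing the per-variable pipeline and the mode() helper defined above (this is only a sketch, not a polished solution):
# Loop over the non-ID columns and repeat the join + coalesce step per variable
for (v in setdiff(colnames(df), "ID")) {
  m <- imp %>% group_by(ID) %>% summarise(!!v := mode(.data[[v]]))
  df <- df %>%
    left_join(m, by = "ID", suffix = c(".x", ".y")) %>%
    mutate(!!v := coalesce(.data[[paste0(v, ".x")]], .data[[paste0(v, ".y")]])) %>%
    select(-all_of(paste0(v, c(".x", ".y"))))
}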
Here is one option using the tidyverse. Essentially, we can pivot both data frames to long format, then join them together and coalesce in one step rather than column by column. The Mode function is taken from here.
library(tidyverse)
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
imp_long <- imp %>%
group_by(ID) %>%
summarise(across(everything(), Mode)) %>%
pivot_longer(-ID)
df %>%
pivot_longer(-ID) %>%
left_join(imp_long, by = c("ID", "name")) %>%
mutate(value = coalesce(value.x, value.y)) %>%
select(-c(value.x, value.y)) %>%
pivot_wider(names_from = "name", values_from = "value")
Output
# A tibble: 5 × 4
ID var1 var2 var3
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 6
2 2 5 1 2
3 3 3 2 5
4 4 6 6 4
5 5 3 6 3
You can use -
library(dplyr)
mode_data <- imp %>%
group_by(ID) %>%
summarise(across(starts_with('var'), Mode))
df %>%
left_join(mode_data, by = 'ID') %>%
transmute(ID,
across(matches('\\.x$'),
function(x) coalesce(x, .[[sub('x$', 'y', cur_column())]]),
.names = '{sub(".x$", "", .col)}'))
# ID var1 var2 var3
#1 1 1 3 6
#2 2 5 1 2
#3 3 3 2 5
#4 4 6 6 4
#5 5 3 6 3
mode_data has the Mode value for each of the var columns.
Join df and mode_data by ID.
Since all the pairs are named name.x and name.y, we can take each name.x column and replace .x with .y to get the corresponding paired column (.[[sub('x$', 'y', cur_column())]]).
Use coalesce to select the non-NA value in each pair.
Change the column name by removing .x from it ({sub(".x$", "", .col)}), so var1.x becomes just var1.
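A quick illustration of the two sub() patterns on the column name var1.x:
sub("x$", "y", "var1.x")   # "var1.y" - the paired column looked up via cur_column()
sub(".x$", "", "var1.x")   # "var1"   - the final column name with the suffix stripped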
where the Mode function is taken from here:
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
library(dplyr, warn.conflicts = FALSE)
imp %>%
group_by(ID) %>%
summarise(across(everything(), Mode)) %>%
bind_rows(df) %>%
group_by(ID) %>%
summarise(across(everything(), ~ coalesce(last(.x), first(.x))))
#> # A tibble: 5 × 4
#> ID var1 var2 var3
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 3 6
#> 2 2 5 1 2
#> 3 3 3 2 5
#> 4 4 6 6 4
#> 5 5 3 6 3
Created on 2022-01-03 by the reprex package (v2.0.1)
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
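To see why coalesce(last(.x), first(.x)) works after bind_rows(), here is a minimal illustration with made-up values: for each ID the first row is the Mode from imp and the last row is the original (possibly NA) value from df.
library(dplyr)
demo <- tibble(ID = c(1, 1), var1 = c(2, NA))  # first row: Mode, last row: original NA
demo %>%
  group_by(ID) %>%
  summarise(var1 = coalesce(last(var1), first(var1)))
#> returns var1 = 2, i.e. the Mode is used only because the original value was NA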
I have a list of vectors of characters, called l. For example:
set.seed(42) ## for sake of reproducibility
genes <- paste("gene",1:20,sep="")
tot = data.frame(term = sample(genes, 30, replace = T),
                 num = sample(1:10, 30, replace = T),
                 stringsAsFactors = FALSE)
s1<-sample(genes,2, replace=F)
s2<-sample(genes,4, replace=F)
s3<-sample(genes,3, replace=F)
s4<-sample(genes,2, replace=F)
s5<-sample(genes,2, replace=F)
s6<-sample(genes,3, replace=F)
l=list(s1,s2,s3,s4,s5,s6)
By considering tot[tot$term%in%l[[1]],], I obtain:
term num
1 gene17 4
3 gene1 6
7 gene17 2
26 gene1 6
and by running
df=tot[tot$term%in%l[[1]],]
sum(df$num)
I obtain the total of the second column, i.e. 18. For the other elements of the list I obtain, respectively: 32, 13, 19, 17, 29. This can be achieved with a for loop:
v<-vector()
for (j in 1:length(l)) {
df=tot[tot$term%in%l[[j]],]
v<-c(v,sum(df$num))
}
I would like to know if there is a simpler way of doing this.
It can be simplified with sapply
v2 <- sapply(l, function(j) sum(tot$num[tot$term %in% j]))
Checking with the OP's loop output:
identical(v, v2)
#[1] TRUE
Or a more compact way with map
library(purrr)
map_dbl(l, ~ sum(tot$num[tot$term %in% .x]))
Or with tidyverse
library(dplyr)
stack(setNames(l, seq_along(l))) %>%
group_by(ind) %>%
summarise(Sum = tot %>%
filter(term %in% values) %>%
pull(num) %>%
sum) %>%
pull(Sum)
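For clarity, stack() on a named list of vectors returns one row per element together with its list index, e.g. with an illustrative input:
stack(setNames(list(c("gene1", "gene2"), "gene3"), 1:2))
#>   values ind
#> 1  gene1   1
#> 2  gene2   1
#> 3  gene3   2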
Here is one tidyverse way:
library(tidyverse)
enframe(l, value = 'term') %>%
unnest(term) %>%
left_join(tot, by = 'term') %>%
group_by(name) %>%
summarise(num = sum(num, na.rm = TRUE))
# name num
#* <int> <int>
#1 1 18
#2 2 32
#3 3 13
#4 4 19
#5 5 17
#6 6 29
data=data.frame("StudentID"=c(1,2,3,4,5),
"Class"=c(1,2,2,3,3),
"Type"=c('A','A','B','B','B'))
Say you have the data shown above and you wish for summaries like this (a count and proportion for each value of Class and of Type).
What is an effective way to do this and write the output to a CSV in an organized way such as shown above?
Here is example data in case weights are involved and you want weighted counts and proportions:
data1=data.frame("StudentID"=c(1,2,3,4,5),
"Class"=c(1,2,2,3,3),
"Type"=c('A','A','B','B','B'),
"Weighting"=c(10,6,13,12,2))
One option is map
library(dplyr)
library(purrr)
map_dfr(names(data)[2:3], ~
data %>%
select(.x) %>%
group_by_at(.x) %>%
summarise(COUNT = n()) %>%
mutate(PROP = COUNT/sum(COUNT)))
# A tibble: 5 x 4
# Class COUNT PROP Type
#* <dbl> <int> <dbl> <fct>
#1 1 1 0.2 <NA>
#2 2 2 0.4 <NA>
#3 3 2 0.4 <NA>
#4 NA 2 0.4 A
#5 NA 3 0.6 B
Or with data.table by melting into 'long' format
library(data.table)
melt(setDT(data), id.var = 'StudentID')[, .(COUNT = .N),
.(variable, value)][, PROP := COUNT/sum(COUNT),.(variable)][]
Or with base R using table and prop.table
lapply(data[-1], function(x) {x1 <- table(x); x2 <- prop.table(x1); cbind(COUNT = x1, PROP = x2)})
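The question also includes data1 with a Weighting column, which none of the options above uses. A possible adaptation for weighted counts and proportions (a sketch, not part of the original answer) is to replace the row count with the sum of the weights:
library(dplyr)
library(purrr)
map_dfr(names(data1)[2:3], ~
  data1 %>%
    group_by(across(all_of(.x))) %>%
    summarise(COUNT = sum(Weighting), .groups = "drop") %>%
    mutate(PROP = COUNT / sum(COUNT)))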
Both summaries are simple; here I use dplyr. To combine them in the way you want, they need to be slapped together in a somewhat inelegant way. You can remove the col1 name if you want.
library(dplyr)
df1 <- data %>% group_by(Class) %>%
summarise(Count = n(), Prop = n() / nrow(data))
df2 <- data %>% group_by(Type) %>%
summarise(Count = n(), Prop = n() / nrow(data))
names(df1)[1] <- 'col1'
names(df2)[1] <- 'col1'
rbind(
c('Class', '', ''),
df1,
c('Type', '', ''),
df2
)
# A tibble: 7 x 3
col1 Count Prop
<chr> <chr> <chr>
1 Class "" ""
2 1 1 0.2
3 2 2 0.4
4 3 2 0.4
5 Type "" ""
6 A 2 0.4
7 B 3 0.6
I want to calculate the relative proportion by group for every column - except the grouping column - of a data frame. However, this should be programmed once so it can be used with different data frames, which will have a different number of columns with different names. Because I am relying heavily on dplyr in this project, I want to achieve this with dplyr.
I have read this topic, regarding a similar but less complex problem:
Use dynamic variable names in `dplyr`
and also vignette("programming", "dplyr"), but I am still not able to set the quotation correctly. I am really stuck at this point and would like some advice from more experienced developers.
To reproduce the problem, I have set up a minimal example with a data frame with randomly created data columns and a grouping column.
library(dplyr)
library(stringi)
df <- setNames(as.data.frame(matrix(sample(1:10, 999, replace = T), 333, 3)),
stri_rand_strings(3, 10, pattern = "[A-Za-z]"))
group <- c("group1","group2","group3")
df <- cbind(df, group)
The following function should achieve two things:
calculate the sum of every column in the data frame by group
calculate the relative proportions of every column in the data frame by group
propsum <- function(df, expr){
expr_quo <- enquo(expr)
sum <- paste(quo_name(expr), "sum", sep = ".")
prop <- paste(quo_name(expr), "prop", sep = ".")
df %>%
group_by(., group) %>%
mutate(., !! sum := sum(!! expr_quo),
!! prop := expr / !! sum * 100) -> df
return(df)
}
for(i in length(df)-1){
propsum(df, names(df)[i]) -> df_new
}
The expected result is a data frame with the initial columns, the sums by group for every initial column, and the relative proportions for every initial column by group. So in the example, the data frame should have 10 columns (1 grouping column, 3 initial data columns, 3 columns with sums by group, 3 columns with relative proportions by group).
However, I am getting the following error:
Error in sum(~names(df)[i]) : invalid 'type' (character) of argument
In the vignette, the code example for a similar task is:
my_mutate <- function(df, expr) {
expr <- enquo(expr)
mean_name <- paste0("mean_", quo_name(expr))
sum_name <- paste0("sum_", quo_name(expr))
mutate(df,
!! mean_name := mean(!! expr),
!! sum_name := sum(!! expr)
)
}
my_mutate(df, a)
#> # A tibble: 5 x 6
#> g1 g2 a b mean_a sum_a
#> <dbl> <dbl> <int> <int> <dbl> <int>
#> 1 1 1 5 4 3 15
#> 2 1 2 3 2 3 15
#> 3 2 1 4 1 3 15
#> 4 2 2 1 3 3 15
#> # … with 1 more row
I have tried a lot of different things so far, but I am not able to get the RHS to use the correct column. What am I doing wrong?
I have found a solution which I just want to share in case somebody faces a similar task.
The solution is to call rlang::parse_expr() explicitly to store the variable names as expressions.
Here is the working example:
library(dplyr)
library(stringi)
df <- setNames(as.data.frame(matrix(sample(1:10, 999, replace = T), 333, 3)),
stri_rand_strings(3, 10, pattern = "[A-Za-z]"))
group <- c("group1","group2","group3")
df <- cbind(df, group)
gpercentage <- function(df, a_var, p_var, sum_var){
df %>%
group_by(., group) %>%
mutate(., !! sum_var := sum(!! a_var),
!! p_var := !! a_var / sum(!! a_var)) -> df
return(df)
}
i <- 1
for(i in seq_along(1:(length(df)-1))){
a_var <- rlang::parse_expr(names(df)[i])
p_var <- rlang::parse_expr(paste(names(df)[i], "P", sep = "."))
sum_var <- rlang::parse_expr(paste(names(df)[i], "SUM", sep = "."))
df %>%
gpercentage(., a_var, p_var, sum_var) -> df
}
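For completeness, with dplyr >= 1.0 the same per-group sums and percentages can be produced without explicit tidy evaluation by passing a named list of functions to across(); this is a sketch (not part of the original answers), assuming all non-grouping columns are numeric:
library(dplyr)
df %>%
  group_by(group) %>%
  mutate(across(where(is.numeric),
                list(SUM = sum, P = ~ .x / sum(.x) * 100),
                .names = "{.col}.{.fn}")) %>%
  ungroup()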
We could achieve this as follows:
propsum <- function(df, grouping_column){
df %>%
group_by(!!sym(grouping_column)) %>%
summarise_all(list(sum,function(x)
length(x)/nrow(.) * 100)) %>%
tidyr::pivot_longer(cols=-1,
names_to = "Variable",
values_to = "Value") %>%
mutate(Variable = gsub("fn1","sum",Variable),
Variable = gsub("fn2","prop",Variable))
}
propsum(iris,"Species")
Using df in the question:
propsum(df,"group")
# A tibble: 18 x 3
group Variable Value
<fct> <chr> <dbl>
1 group1 dVFQteFGjs_sum 628
2 group1 wiQCPUeIvC_sum 599
3 group1 yBvktNXcfd_sum 644
4 group1 dVFQteFGjs_prop 33.3
5 group1 wiQCPUeIvC_prop 33.3
6 group1 yBvktNXcfd_prop 33.3
7 group2 dVFQteFGjs_sum 630
8 group2 wiQCPUeIvC_sum 606
9 group2 yBvktNXcfd_sum 656
10 group2 dVFQteFGjs_prop 33.3
11 group2 wiQCPUeIvC_prop 33.3
12 group2 yBvktNXcfd_prop 33.3
13 group3 dVFQteFGjs_sum 636
14 group3 wiQCPUeIvC_sum 581
15 group3 yBvktNXcfd_sum 635
16 group3 dVFQteFGjs_prop 33.3
17 group3 wiQCPUeIvC_prop 33.3
18 group3 yBvktNXcfd_prop 33.3
To get back to wide format (one can use pivot_wider; I find spread "faster" to use):
propsum(df,"group") %>%
tidyr::spread(Variable,Value)
# A tibble: 3 x 7
group dVFQteFGjs_prop dVFQteFGjs_sum wiQCPUeIvC_prop wiQCPUeIvC_sum
<fct> <dbl> <dbl> <dbl> <dbl>
1 grou~ 33.3 628 33.3 599
2 grou~ 33.3 630 33.3 606
3 grou~ 33.3 636 33.3 581
# ... with 2 more variables: yBvktNXcfd_prop <dbl>,
# yBvktNXcfd_sum <dbl>
I am trying to get the rows whose groups (defined by one column) have both positive and negative values in another column. Input is the data frame below:
data <- data.frame(X = c(1,3,5,7,7,8,9,10,10,11,11,12,12),
Y = sample(36476545:36476557),
timepoint = c(0,1,0,-0.31,1,1,1,1,-1,1,1,1,1)
)
Output looks something like this
X Y timepoint
4 7 36476557 -0.31
5 7 36476545 1.00
8 10 36476556 1.00
9 10 36476548 -1.00
I was looking at this link, but it is not what I am looking for.
After grouping by 'X', filter the groups that have both negative and positive 'timepoint' values: take the sign of 'timepoint' and keep groups where the number of distinct signs (n_distinct) is 2 (assuming there are no zeros).
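As a quick reminder, sign() maps negatives to -1, zero to 0, and positives to 1:
sign(c(-0.31, 1, 0))
#> [1] -1  1  0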
library(dplyr)
data %>%
group_by(X) %>%
filter(n_distinct(sign(timepoint)) == 2)
# A tibble: 4 x 3
# Groups: X [2]
# X Y timepoint
# <dbl> <int> <dbl>
#1 7 36476547 -0.31
#2 7 36476556 1
#3 10 36476549 1
#4 10 36476557 -1
NOTE: 'Y' values are different as the example was created with no set.seed
If there are zeros as well:
data %>%
group_by(X) %>%
filter(all(c(-1, 1) %in% sign(timepoint)))
Or using base R with ave
data[with(data, ave(sign(timepoint), X, FUN = function(x) length(unique(x))) == 2),]
Or another base R option with table
subset(data, X %in% names(which(rowSums(with(subset(data,
timepoint != 0), table(X, sign(timepoint))) > 0) == 2)))
In base R, we can use ave and select groups where there is at least one timepoint value greater than 0 and one timepoint value less than 0.
data[with(data, ave(timepoint > 0, X, FUN = function(x) any(x) & any(!x))), ]
# X Y timepoint
#4 7 36476553 -0.31
#5 7 36476551 1.00
#8 10 36476556 1.00
#9 10 36476554 -1.00
In dplyr this would be
library(dplyr)
data %>%
group_by(X) %>%
filter(any(timepoint > 0) & any(timepoint < 0))