Apply subsetting or row deleting function columnwise, based on grouping

A sample data frame (the real data has 500k observations by 20 variables):
set.seed(1)
dataframe <- data.frame()
IDs <- as.factor(sample(LETTERS[seq( from = 1, to = 3)], prob = c(0.2, 0.3, 0.5), 1000, replace = TRUE))
Var1 <- sample(x = c(20:1500), size = 1000, replace = TRUE)
Var2 <- sample(x = c(1:15), size = 1000, replace = TRUE)
Var3 <- sample(x = c(0.1:8.5), size = 1000, replace = TRUE)
Var4<- sample(x = c(12:255), size = 1000, replace = TRUE)
Var5 <- sample(x = c(14000000:15000000), size = 1000, replace = TRUE)
dataframe <- data.frame(IDs, Var1, Var2, Var3, Var4, Var5)
dataframe$Var5 <- as.POSIXlt(dataframe$Var5, origin = "1970-01-01")
For every subject in IDs, I want to remove all rows for which Var1 is not within the range (mean +/- 0.5 standard deviations) of that subject's Var1.
I guess the way to go is to use dplyr: pipe dataframe to group_by(IDs) and apply a function. If so, I need help with both the function and the dplyr commands.
My first attempt was to use a for loop with ID:
for(ID in levels(dataframe$IDs)){
  # Get 0.5 standard deviations
  sd05 <- sd(dataframe[which(dataframe$IDs == ID), "Var1"]) * 0.5
  # Get mean for subsetting
  mean_for_subset <- mean(dataframe[which(dataframe$IDs == ID), "Var1"])
  dataframe[which(dataframe[which(dataframe$IDs == ID), "Var1"] > (mean_for_subset + sd05)
                  & dataframe[which(dataframe$IDs == ID), "Var1"] < (mean_for_subset - sd05)), ] <- NULL
}
That gives warnings about is.na() being applied to something that is not a list or vector, and dataframe still has 1000 observations.

Using data.table:
library(data.table)
dataframe <- data.table(dataframe)
meanV1 <- dataframe[, mean(Var1)]
sdV1 <- 0.5 * dataframe[, sd(Var1)]
dataframe <- dataframe[Var1 < meanV1 + sdV1 & Var1 > meanV1 - sdV1]
Or, if this is to be done by IDs:
library(data.table)
dataframe <- data.table(dataframe)
dataframe[, c("mean1", "sd1") := list(mean(Var1), 0.5 * sd(Var1)), by = IDs]
dataframe <- dataframe[Var1 < mean1 + sd1 & Var1 > mean1 - sd1]
Then, to remove the helper columns:
dataframe[, c("mean1", "sd1") := NULL]
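A more compact per-group variant (just a sketch of the same idea) filters inside each group via .SD, so no helper columns are needed:
library(data.table)
dataframe <- data.table(dataframe)
# Keep, within each IDs group, only rows with Var1 inside mean(Var1) +/- 0.5 * sd(Var1)
dataframe <- dataframe[, .SD[Var1 < mean(Var1) + 0.5 * sd(Var1) &
                             Var1 > mean(Var1) - 0.5 * sd(Var1)], by = IDs]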
The same, done on two columns (Var1 and Var2):
library(data.table)
dataframe <- data.table(dataframe)
dataframe[, c("mean1", "sd1", "mean2", "sd2") := list(
  mean(Var1),
  0.5 * sd(Var1),
  mean(Var2),
  0.5 * sd(Var2)
), by = IDs]
dataframe <- dataframe[
  Var1 < mean1 + sd1 &
  Var1 > mean1 - sd1 &
  Var2 < mean2 + sd2 &
  Var2 > mean2 - sd2
]
dataframe[, c("mean1", "sd1", "mean2", "sd2") := NULL]
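And since the question mentions dplyr, a possible equivalent of the group_by/filter approach it describes, starting again from the original dataframe (a sketch, not benchmarked against the data.table versions):
library(dplyr)
dataframe %>%
  group_by(IDs) %>%
  filter(Var1 < mean(Var1) + 0.5 * sd(Var1),
         Var1 > mean(Var1) - 0.5 * sd(Var1)) %>%
  ungroup()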

Related

R datasummary: Show N and percent in one column

I am currently trying to display the count of factor levels (e.g., gender) and their relative frequency per group (e.g., treatment group) using datasummary. In addition, I would like to combine this with the display of quantitative variables (e.g., age) with their respective mean and standard deviation.
So far, I created a function to display mean and sd in one column and managed to calculate N and percentages. However, I am struggling with creating a function that displays N and percentage in one column, as well as with adding the empty column to the datasummary of the quantitative variable so that both frames can be combined (based on "Show count of unique values in datasummary" and "combine two different tables of descriptive statistics using data").
library(modelsummary)
library(magrittr)
library(dplyr)
set.seed(123)
iris$gender <- factor(sample(1:3, size = 150, replace = T),
labels = c("Male", "Female", "Other"))
iris$job <- factor(sample(1:5, size = 150, replace = T),
labels = c("Student", "Worker", "CEO", "Other", "None"))
empty <- function(...) ""
MeanSD = function(x) {
M = mean(x, na.rm = T)
SD = sd(x, na.rm = T)
MSD = paste(round(M, 2), " (",round(SD,2), ")", sep = "")
return(MSD)
}
#This function does not work properly
NP = function(x, y) {
N = N(x)
P = Percent(x, y, denom = "col")
out = paste(N, " (",P, ")", sep = "")
return(NP)
}
iris_tab1 <- iris %>% dplyr::select(Species,
Gender = gender,
Job = job,
Length = Sepal.Length)
tbl_1 <- datasummary((Heading("")*N + Heading("")*Percent(fn = function(x, y) 100 * length(x) / length(y), denom = "col"))*(Gender + Job)~Species,
data = iris_tab1,
fmt = 2,
output = 'data.frame'
)
tbl_1
#Cannot add the empty column
tbl_2 <- datasummary(Heading("")*(MeanSD)*Length~empty+Species,
data = iris_tab1,
output = 'data.frame'
)
tbl_2
empty is a function. MeanSD is a function. All functions need to go on the same side of the datasummary formula:
library(modelsummary)
library(magrittr)
library(dplyr)
set.seed(123)
iris$gender <- factor(sample(1:3, size = 150, replace = T),
labels = c("Male", "Female", "Other"))
iris$job <- factor(sample(1:5, size = 150, replace = T),
labels = c("Student", "Worker", "CEO", "Other", "None"))
empty <- function(...) ""
MeanSD = function(x) {
M = mean(x, na.rm = T)
SD = sd(x, na.rm = T)
MSD = paste(round(M, 2), " (", round(SD, 2), ")", sep = "")
return(MSD)
}
iris_tab1 <- iris %>%
dplyr::select(Species,
Gender = gender,
Job = job,
Length = Sepal.Length)
tbl_2 <- datasummary(Heading("") * Length ~ empty + MeanSD * Species,
data = iris_tab1,
output = "data.frame")
tbl_2
#> empty setosa versicolor virginica
#> 1 5.01 (0.35) 5.94 (0.52) 6.59 (0.64)
A simple illustration of the Percent function:
library(modelsummary)
dat <- mtcars
dat$cyl <- as.factor(dat$cyl)
fn <- function(x, y) {
  sprintf(
    "%s (%.1f%%)",
    length(x),
    length(x) / length(y) * 100)
}
datasummary(
cyl ~ Percent(fn = fn),
data = dat)
cyl    Percent
4      11 (34.4%)
6      7 (21.9%)
8      14 (43.8%)
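Building on that, one possible way to get the "N (percent)" column the question asks for is to let a single Percent() call do both, reusing the formula structure of tbl_1 and the iris_tab1 data from above (a sketch; NP here is just the question's helper rewritten to use the fn/denom interface):
NP <- function(x, y) sprintf("%s (%.1f%%)", length(x), 100 * length(x) / length(y))
tbl_1b <- datasummary(Heading("") * Percent(fn = NP, denom = "col") * (Gender + Job) ~ Species,
                      data = iris_tab1,
                      output = "data.frame")
tbl_1b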

data.table is slow compared to data.frame for group filtering

data.table is slow if I filter based on a numeric value (greater than or less than):
library(data.table)
df <- data.frame(group = sample(paste("group", 0:2000, sep = ""), 7000, replace = TRUE),
pvalue = 10^(sample(seq(from = -5, to = -1, by = 0.1), 7000, replace = TRUE)))
groups <- setdiff(unique(df$group), "group0")
# data.frame takes 0.16 sec
system.time( lapply(groups, function(r) {
df.temp <- df[df$group == r,]
any(df.temp[["pvalue"]] < 0.01, na.rm = TRUE)
}))
DT <- as.data.table(df)
setkeyv(DT, c("group"))
# data.table takes 0.9 sec
system.time(lapply(groups, function(r) any(DT[.(r), pvalue <= 0.01], na.rm = TRUE)))
Does anyone know what I could be doing wrong?
You are not using data.table correctly. It is expensive to call DT[...] many times. Instead, you could do something like the following:
setkeyv(DT, c("group"))
DT[!("group0"), any(pvalue <= 0.01), by = group]
group V1
1: group1 TRUE
2: group10 TRUE
3: group100 TRUE
4: group1000 TRUE
5: group1001 TRUE
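If the end goal is only the names of the groups that contain at least one small p-value, a sketch building on the same grouped call (pass is just a temporary column name):
passing <- DT[!("group0"), .(pass = any(pvalue <= 0.01, na.rm = TRUE)), by = group]
passing[pass == TRUE, group]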

Summing the products of multiple variables per row

I have a data.table as follows:
library(data.table)
set.seed(1)
DT <- data.table(panelID = sample(50,50), # Creates a panel ID
Country = c(rep("Albania",30),rep("Belarus",50), rep("Chilipepper",20)),
some_NA = sample(0:5, 6),
some_NA_factor = sample(0:5, 6),
Group = c(rep(1,20),rep(2,20),rep(3,20),rep(4,20),rep(5,20)),
Time = rep(seq(as.Date("2010-01-03"), length=20, by="1 month") - 1,5),
norm = round(runif(100)/10,2),
Income = sample(0:5, 6),
Happiness = sample(10,10),
Sex = round(rnorm(10,0.75,0.3),2),
Age = sample(100,100),
Educ = round(rnorm(10,0.75,0.3),2))
DT [, uniqueID := .I] # Creates a unique ID
DT[DT == 0] <- NA # https://stackoverflow.com/questions/11036989/replace-all-0-values-to-na
DT$some_NA_factor <- factor(DT$some_NA_factor)
Now, I would like to (for some artificial reason) sum the products of Income & Educ and of Sex & Age, for each observation, using data.table. Please note that my actual data has many more variables, some of which contain NAs. I tried:
DT<- setDT(DT)[, newvar:= sum((Income *Educ),
(Sex * Age), na.rm=TRUE)]
But that takes the sum of the columns. I also tried:
DT<- setDT(DT)[, newvar:= rowSums((Income *Educ),
(Sex * Age), na.rm=TRUE)]
But that does not work:
Error in base::rowSums(x, na.rm = na.rm, dims = dims, ...) :
'x' must be an array of at least two dimensions
What would be the correct way to do this in data.table?
DT[, newvar := rowSums(data.table(Income*Educ, Sex * Age), na.rm=TRUE)]
# Alternatively:
DT[, newvar := {x = Income*Educ; y = Sex * Age; fifelse(is.na(x), y, fifelse(is.na(y), x, x + y ))}]
Note:
setDT() is only necessary if the object is not a data.table yet, and assigning the result with <- is not needed when you use := inside the data.table.
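Since the real data has many more variables, a sketch of how the rowSums() approach could be generalized to an arbitrary list of column pairs (the pairs list below is an assumption for illustration):
pairs <- list(c("Income", "Educ"), c("Sex", "Age"))          # assumed list of column pairs
prods <- sapply(pairs, function(p) DT[[p[1]]] * DT[[p[2]]])  # one product column per pair
DT[, newvar2 := rowSums(prods, na.rm = TRUE)]                # row-wise sum, ignoring NAs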

Calculating the mean of the absolute value of all numerical columns

I want to calculate the mean of the absolute value of all numerical columns for the example dataset DT:
library(data.table)
set.seed(1)
DT <- data.table(panelID = sample(50,50), # Creates a panel ID
Country = c(rep("Albania",30),rep("Belarus",50), rep("Chilipepper",20)),
some_NA = sample(0:5, 6),
some_NA_factor = sample(0:5, 6),
Group = c(rep(1,20),rep(2,20),rep(3,20),rep(4,20),rep(5,20)),
Time = rep(seq(as.Date("2010-01-03"), length=20, by="1 month") - 1,5),
norm = round(runif(100)/10,2),
Income = round(rnorm(10,-5,5),2),
Happiness = sample(10,10),
Sex = round(rnorm(10,0.75,0.3),2),
Age = sample(100,100),
Educ = round(rnorm(10,0.75,0.3),2))
DT [, uniqueID := .I] # Creates a unique ID
DT[DT == 0] <- NA # https://stackoverflow.com/questions/11036989/replace-all-0-values-to-na
DT$some_NA_factor <- factor(DT$some_NA_factor)
I tried to calculate the means and the absolute means as follows:
mean_of_differences <- DT[,lapply(Filter(is.numeric,.SD),mean, na.rm=TRUE)]
mean_of_differences <- as.data.frame(t(mean_of_differences))
mean_of_differences <- round(mean_of_differences, digits=2)
mean_of_absolute_diff <- DT[,lapply(Filter(is.numeric,.SD),function(x) mean(abs(x),na.rm=TRUE))]
mean_of_absolute_diff <- as.data.frame(t(mean_of_absolute_diff))
mean_of_absolute_diff <- round(mean_of_differences, digits=2)
The mean of Income for the absolute differences is however negative (as it is for the normal mean), which obviously is not possible. If I look at my code I don't understand what I am doing wrong. What am I overlooking?
Here is a solution using data.table. (The thing being overlooked in the code above is the last line: it rounds mean_of_differences instead of mean_of_absolute_diff, which is why the "absolute" mean of Income comes out negative.) It (i) identifies numeric columns and (ii) obtains the mean of the absolute value of each numeric column.
Data
dt = data.table(
num1 = rnorm(100),
num2 = rnorm(100),
strv = sample(LETTERS, 100, replace = T)
)
Code
numcols = colnames(dt)[unlist(lapply(dt, is.numeric))] # Which columns are numeric?
# > numcols
# [1] "num1" "num2"
meandt = dt[, lapply(.SD, function(x) mean(abs(x))), .SDcols = numcols]
newcols = paste('mean_abs_', numcols, sep = ''); colnames(meandt) = newcols
# > meandt
# mean_abs_num1 mean_abs_num2
# 1: 0.8287523 0.8325123
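Applied to the DT from the question, the same pattern would look roughly like this (a sketch; it keeps every numeric column, including the ID columns, so drop those from numcols if they are not wanted):
numcols <- names(DT)[sapply(DT, is.numeric)]  # numeric columns of the question's DT
mean_of_absolute_diff <- DT[, lapply(.SD, function(x) mean(abs(x), na.rm = TRUE)), .SDcols = numcols]
round(t(mean_of_absolute_diff), digits = 2)   # one value per column, rounded as in the question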

Creating single column dataframe with the means of another dataset

I have a dataset which looks as follows:
library(data.table)
set.seed(1)
DF <- data.table(panelID = sample(50,50), # Creates a panel ID
Country = c(rep("A",30),rep("B",50), rep("C",20)),
Group = c(rep(1,20),rep(2,20),rep(3,20),rep(4,20),rep(5,20)),
Time = rep(seq(as.Date("2010-01-03"), length=20, by="1 month") - 1,5),
norm = round(runif(100)/10,2),
Income = sample(100,100),
Happiness = sample(10,10),
Sex = round(rnorm(10,0.75,0.3),2),
Age = round(rnorm(10,0.75,0.3),2),
Educ = round(rnorm(10,0.75,0.3),2))
DF <- as.data.table(DF) # Make sure it is a data.table
DF[, uniqueID := .I] # Add a unique ID
cols = sapply(DF, is.numeric) # Check numerical columns
DFm <- melt(DF[, cols, with = FALSE][, !"uniqueID"], id = "panelID") # https://stackoverflow.com/questions/57406654/speeding-up-a-function/57407959#57407959
DFm[, value := c(NA, diff(value)), by = .(panelID, variable)] # https://stackoverflow.com/questions/57406654/speeding-up-a-function/57407959#57407959
DF <- dcast(DFm, panelID + rowidv(DFm, cols = c("panelID", "variable")) ~ variable, value.var = "value") # ""
DF <- DF[DF[, !Reduce(`&`, lapply(.SD , is.na)), .SDcols = 3:ncol(DF)]] # Removes T1 for which there is no difference
Now what I would like to do is fairly simple. I want the mean of each column stored in a single column.
I tried:
mean_of_differences <- DF [, mean(sapply(.SD, is.numeric), na.rm=TRUE)]
mean_of_differences <- DF[,.SD[mean(sapply(.SD, is.numeric), na.rm=TRUE)]]
But somehow I cannot seem to get it right. I just end up with NAs or errors.
What am I overlooking?
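One possible way to get what is being described (a sketch: compute all the column means, then melt them so they end up in a single value column; note that panelID and the row-id column are numeric too, so drop them from numcols if they should be excluded):
numcols <- names(DF)[sapply(DF, is.numeric)]                       # numeric columns of the final DF
means <- DF[, lapply(.SD, mean, na.rm = TRUE), .SDcols = numcols]  # one-row table of column means
mean_of_differences <- melt(means, measure.vars = numcols,
                            variable.name = "variable", value.name = "mean")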
