I want to calculate the mean of the absolute value of all numerical columns for the example dataset DT:
library(data.table)
set.seed(1)
# Example panel data set with 100 rows. Columns given fewer than 100 values
# rely on data.table() recycling. The argument order is deliberate: it fixes
# the order of the random draws made after set.seed(1).
DT <- data.table(
  panelID = sample(50, 50), # Creates a panel ID
  Country = rep(c("Albania", "Belarus", "Chilipepper"), times = c(30, 50, 20)),
  some_NA = sample(0:5, 6),
  some_NA_factor = sample(0:5, 6),
  Group = rep(c(1, 2, 3, 4, 5), each = 20),
  Time = rep(seq(as.Date("2010-01-03"), length.out = 20, by = "1 month") - 1, 5),
  norm = round(runif(100) / 10, 2),
  Income = round(rnorm(10, -5, 5), 2),
  Happiness = sample(10, 10),
  Sex = round(rnorm(10, 0.75, 0.3), 2),
  Age = sample(100, 100),
  Educ = round(rnorm(10, 0.75, 0.3), 2)
)
DT[, uniqueID := .I] # Creates a unique ID
# Turn every zero into a missing value.
DT[DT == 0] <- NA # https://stackoverflow.com/questions/11036989/replace-all-0-values-to-na
# Store some_NA_factor as a factor so it is no longer treated as numeric.
DT[, some_NA_factor := factor(some_NA_factor)]
I tried to calculate the means and the absolute means as follows:
# Plain mean of every numeric column, transposed so that each column's mean
# becomes one row, then rounded to two digits.
mean_of_differences <- DT[,lapply(Filter(is.numeric,.SD),mean, na.rm=TRUE)]
mean_of_differences <- as.data.frame(t(mean_of_differences))
mean_of_differences <- round(mean_of_differences, digits=2)
# Mean of the absolute values of every numeric column, reshaped the same way.
mean_of_absolute_diff <- DT[,lapply(Filter(is.numeric,.SD),function(x) mean(abs(x),na.rm=TRUE))]
mean_of_absolute_diff <- as.data.frame(t(mean_of_absolute_diff))
# NOTE(review): the line below rounds mean_of_differences (the plain means),
# not mean_of_absolute_diff, so the absolute means computed above are
# overwritten by the plain means.
mean_of_absolute_diff <- round(mean_of_differences, digits=2)
The mean of Income for the absolute differences is however negative (as it is for the normal mean), which obviously is not possible. If I look at my code I don't understand what I am doing wrong. What am I overlooking?
Here is a solution using data.table. It (i) identifies numeric columns and (ii) obtains the mean of the absolute value of each numeric column.
Data
# Toy data: two numeric columns and one character column, 100 rows each.
dt <- data.table(
  num1 = rnorm(100),
  num2 = rnorm(100),
  # Spell out TRUE: T is an ordinary variable that can be reassigned.
  strv = sample(LETTERS, 100, replace = TRUE)
)
Code
# Which columns are numeric? vapply() always returns a logical vector, even
# for a table without columns.
numcols <- names(dt)[vapply(dt, is.numeric, logical(1))]
# > numcols
# [1] "num1" "num2"
# Mean of the absolute value of each numeric column. na.rm = TRUE keeps the
# result numeric when a column contains missing values.
meandt <- dt[, lapply(.SD, function(x) mean(abs(x), na.rm = TRUE)), .SDcols = numcols]
# Prefix the result columns and rename them by reference.
newcols <- paste0("mean_abs_", numcols)
setnames(meandt, newcols)
# > meandt
# mean_abs_num1 mean_abs_num2
# 1: 0.8287523 0.8325123
Related
Example data:
library(data.table)
set.seed(1)
# Example panel data set with 100 rows and a weight column `wt`. Columns given
# fewer than 100 values rely on data.table() recycling. The argument order is
# deliberate: it fixes the order of the random draws made after set.seed(1).
DT <- data.table(
  panelID = sample(50, 50), # Creates a panel ID
  Country = rep(c("Albania", "Belarus", "Chilipepper"), times = c(30, 50, 20)),
  some_NA = sample(0:5, 6),
  some_NA_factor = sample(0:5, 6),
  Group = rep(c(1, 2, 3, 4, 5), each = 20),
  Time = rep(seq(as.Date("2010-01-03"), length.out = 20, by = "1 month") - 1, 5),
  wt = 15 * round(runif(100) / 10, 2),
  Income = round(rnorm(10, -5, 5), 2),
  Happiness = sample(10, 10),
  Sex = round(rnorm(10, 0.75, 0.3), 2),
  Age = sample(100, 100),
  Educ = round(rnorm(10, 0.75, 0.3), 2)
)
DT[, uniqueID := .I] # Creates a unique ID # https://stackoverflow.com/questions/11036989/replace-all-0-values-to-na
# Store some_NA_factor as a factor so it is no longer treated as numeric.
DT[, some_NA_factor := factor(some_NA_factor)]
I would like to calculate the weighted mean of all numerical columns, so I tried:
# Attempted weighted mean of every numeric column per Country/Time group.
# NOTE(review): DT$wt is passed as the first argument and the column x as the
# second, and DT$wt refers to the whole wt column rather than the rows of the
# current group.
DT_w <- DT[,lapply(Filter(is.numeric,.SD), function(x) weighted.mean(DT$wt, x, na.rm=TRUE)), by=c("Country", "Time")]
But then it says:
Error in weighted.mean.default(DT$wt, x, na.rm = TRUE) :
'x' and 'w' must have the same length
I think I am perhaps misunderstanding the syntax. Am I doing this right?
Two issues:
when you use DT$wt that is an explicit call to the full wt column from the DT table - the by arguments won't work on it. The by arguments will only work on columns without the DT$ prefix.
The order of arguments for weighted.mean() is x first and w (weights) second - you seem to have this backwards
Fixing those two issues:
# Weighted mean (weights taken from wt) of every numeric column, computed
# separately for each Country/Time combination. The anonymous function stays
# inside j so that the bare name wt refers to the current group's rows.
DT_w <- DT[, lapply(
  Filter(is.numeric, .SD),
  function(column) weighted.mean(column, w = wt, na.rm = TRUE)
), by = .(Country, Time)]
# runs without errors
I have a very simple question, for which I could not find any answer. For an example I want to create, I want to give the following data.table a column with random years within a certain range say 2004-2010.
library(data.table)
set.seed(1)
# Example data set with 100 rows (no Time column). Columns given fewer than
# 100 values rely on data.table() recycling. The argument order is deliberate:
# it fixes the order of the random draws made after set.seed(1).
DT <- data.table(
  panelID = sample(50, 50), # Creates a panel ID
  Country = rep(c("Albania", "Belarus", "Chilipepper"), times = c(30, 50, 20)),
  some_NA = sample(0:5, 6),
  some_NA_factor = sample(0:5, 6),
  Group = rep(c(1, 2, 3, 4, 5), each = 20),
  norm = round(runif(100) / 10, 2),
  Income = round(rnorm(10, -5, 5), 2),
  Happiness = sample(10, 10),
  Sex = round(rnorm(10, 0.75, 0.3), 2),
  Age = sample(100, 100),
  Educ = round(rnorm(10, 0.75, 0.3), 2)
)
DT[, uniqueID := .I] # Creates a unique ID
# Turn every zero into a missing value.
DT[DT == 0] <- NA # https://stackoverflow.com/questions/11036989/replace-all-0-values-to-na
# Store some_NA_factor as a factor so it is no longer treated as numeric.
DT[, some_NA_factor := factor(some_NA_factor)]
We can use sample to select random years between 2004:2010 with replace = TRUE.
library(data.table)
# Draw one year per row, with replacement, from the range 2004-2010 and store
# it in a new column by reference.
year_range <- 2004:2010
DT[, random_year := sample(year_range, size = .N, replace = TRUE)]
I have a data.table as follows:
library(data.table)
set.seed(1)
# Example panel data set with 100 rows; Income is drawn from 0:5 here. Columns
# given fewer than 100 values rely on data.table() recycling. The argument
# order is deliberate: it fixes the order of the random draws after set.seed(1).
DT <- data.table(
  panelID = sample(50, 50), # Creates a panel ID
  Country = rep(c("Albania", "Belarus", "Chilipepper"), times = c(30, 50, 20)),
  some_NA = sample(0:5, 6),
  some_NA_factor = sample(0:5, 6),
  Group = rep(c(1, 2, 3, 4, 5), each = 20),
  Time = rep(seq(as.Date("2010-01-03"), length.out = 20, by = "1 month") - 1, 5),
  norm = round(runif(100) / 10, 2),
  Income = sample(0:5, 6),
  Happiness = sample(10, 10),
  Sex = round(rnorm(10, 0.75, 0.3), 2),
  Age = sample(100, 100),
  Educ = round(rnorm(10, 0.75, 0.3), 2)
)
DT[, uniqueID := .I] # Creates a unique ID
# Turn every zero into a missing value.
DT[DT == 0] <- NA # https://stackoverflow.com/questions/11036989/replace-all-0-values-to-na
# Store some_NA_factor as a factor so it is no longer treated as numeric.
DT[, some_NA_factor := factor(some_NA_factor)]
Now, I would like to (for some artificial reason) sum the products of Income & Educ and Sex & Age for each observation using data.table. Please note that my actual data has many more variables, some of which contain NAs. I tried:
# Attempt 1: sum() collapses both product vectors into one overall total,
# which := then assigns to every row of newvar.
DT<- setDT(DT)[, newvar:= sum((Income *Educ),
(Sex * Age), na.rm=TRUE)]
But that takes the sum of the columns. I also tried:
# Attempt 2: rowSums() receives the plain vector Income * Educ as its first
# argument; Sex * Age is passed as a second positional argument instead of
# being combined with it into a two-column object.
DT<- setDT(DT)[, newvar:= rowSums((Income *Educ),
(Sex * Age), na.rm=TRUE)]
But that does not work:
Error in base::rowSums(x, na.rm = na.rm, dims = dims, ...) :
'x' must be an array of at least two dimensions
What would be the correct way to do this in data.table?
# Row-wise total of the two products, skipping products that are NA.
DT[, newvar := rowSums(cbind(Income * Educ, Sex * Age), na.rm = TRUE)]
# Alternatively: use whichever product is available, or their sum when both are.
DT[, newvar := {
  inc_educ <- Income * Educ
  sex_age <- Sex * Age
  fifelse(
    is.na(inc_educ),
    sex_age,
    fifelse(is.na(sex_age), inc_educ, inc_educ + sex_age)
  )
}]
Note:
setDT() is only necessary if the data.frame is not a data.table yet. Assigning the result with <- is not needed when you use := within the data.table.
I have a dataset which looks as follows:
set.seed(1)
# Example panel data set with 100 rows. Columns given fewer than 100 values
# rely on data.table() recycling. The argument order is deliberate: it fixes
# the order of the random draws made after set.seed(1).
DF <- data.table(
  panelID = sample(50, 50), # Creates a panel ID
  Country = rep(c("A", "B", "C"), times = c(30, 50, 20)),
  Group = rep(c(1, 2, 3, 4, 5), each = 20),
  Time = rep(seq(as.Date("2010-01-03"), length.out = 20, by = "1 month") - 1, 5),
  norm = round(runif(100) / 10, 2),
  Income = sample(100, 100),
  Happiness = sample(10, 10),
  Sex = round(rnorm(10, 0.75, 0.3), 2),
  Age = round(rnorm(10, 0.75, 0.3), 2),
  Educ = round(rnorm(10, 0.75, 0.3), 2)
)
# DF is created by data.table() above, so no as.data.table() conversion is
# needed, and the unique ID only has to be added once.
DF[, uniqueID := .I] # Add a unique ID
# Turn every numeric column into its within-panel first difference.
# Step 1: logical flag per column of DF, TRUE where the column is numeric.
cols = sapply(DF, is.numeric) # Check numerical columns
# Step 2: keep the numeric columns, drop uniqueID, and melt to long format
# with panelID as the identifier.
DFm <- melt(DF[, cols, with = FALSE][, !"uniqueID"], id = "panelID") # https://stackoverflow.com/questions/57406654/speeding-up-a-function/57407959#57407959
# Step 3: within each panelID/variable group, replace value by the difference
# to the previous row; the first row of each group becomes NA.
DFm[, value := c(NA, diff(value)), by = .(panelID, variable)] # https://stackoverflow.com/questions/57406654/speeding-up-a-function/57407959#57407959
# Step 4: cast back to wide format, one row per panelID and per row index
# within the panelID/variable group.
DF <- dcast(DFm, panelID + rowidv(DFm, cols = c("panelID", "variable")) ~ variable, value.var = "value") # ""
# Step 5: drop the rows in which every column from the third one onwards is NA.
DF <- DF[DF[, !Reduce(`&`, lapply(.SD , is.na)), .SDcols = 3:ncol(DF)]] # Removes T1 for which there is no difference
Now what I would like to do is fairly simple. I want the mean of each column stored in a single column.
I tried:
# Attempt 1: sapply(.SD, is.numeric) yields one TRUE/FALSE flag per column, so
# mean() averages those flags rather than the values stored in the columns.
mean_of_differences <- DF [, mean(sapply(.SD, is.numeric), na.rm=TRUE)]
# Attempt 2: the same averaged flag is used as a row index into .SD.
mean_of_differences <- DF[,.SD[mean(sapply(.SD, is.numeric), na.rm=TRUE)]]
But somehow I cannot seem to get it right. I just end up with NAs or errors.
What am I overlooking?
Some sample data frame (real data has 500k observations by 20 variables):
set.seed(1)
# Sample data: 1000 rows with a three-level ID factor and five variables.
# The draws are made in the same order as before so the seed yields the same
# values.
IDs <- as.factor(sample(LETTERS[1:3], size = 1000, replace = TRUE, prob = c(0.2, 0.3, 0.5)))
Var1 <- sample(x = 20:1500, size = 1000, replace = TRUE)
Var2 <- sample(x = 1:15, size = 1000, replace = TRUE)
Var3 <- sample(x = 0.1:8.5, size = 1000, replace = TRUE)
Var4 <- sample(x = 12:255, size = 1000, replace = TRUE)
Var5 <- sample(x = 14000000:15000000, size = 1000, replace = TRUE)
# No empty data.frame() placeholder is needed; the frame is built in one step.
dataframe <- data.frame(IDs, Var1, Var2, Var3, Var4, Var5)
# Store the timestamp as POSIXct. POSIXlt is a list-based class that does not
# work well as a data-frame column.
dataframe$Var5 <- as.POSIXct(dataframe$Var5, origin = "1970-01-01")
For every subject in IDs, I want to remove all rows for which Var1 is not within the range (mean +/- 0.5 standard deviations) of Var1 for that subject.
I guess the way to go is to use dplyr, pipe dataframe to group_by(ID), and apply a function. If so, I need help with both the function and dplyr commands.
My first attempt was to use a for loop with ID:
# First attempt: loop over the ID levels and try to drop the rows whose Var1
# lies outside mean +/- 0.5 standard deviations of that ID.
for(ID in levels(dataframe$IDs)){
# Get 0.5 standard deviations
sd05 <- sd(dataframe[which(dataframe$IDs == ID), "Var1"]) * 0.5
# Get mean for subsetting
mean_for_subset <- mean(dataframe[which(dataframe$IDs == ID), "Var1"])
# NOTE(review): the condition below asks for Var1 to be above mean + sd05 AND
# below mean - sd05 at the same time. The positions returned by the outer
# which() are relative to the subset for this ID but are used to index rows
# of the whole dataframe, and the selected rows are assigned NULL.
dataframe[which( dataframe[which(dataframe$IDs == ID), "Var1"] > (mean_for_subset + sd05)
& dataframe[which(dataframe$IDs == ID), "Var1"] < (mean_for_subset - sd05))
,] <- NULL
}
That gives warnings as is.na() was not applied to vector or list and dataframe still has 1000 observations.
Using data.table:
library(data.table)
dataframe <- data.table(dataframe)
# Overall mean of Var1 and half of its standard deviation.
meanV1 <- mean(dataframe$Var1)
sdV1 <- sd(dataframe$Var1) * 0.5
# Keep the rows whose Var1 lies strictly inside mean -/+ 0.5 sd.
dataframe <- dataframe[Var1 > meanV1 - sdV1 & Var1 < meanV1 + sdV1]
Or, if this is to be done by ID:
library(data.table)
dataframe <- data.table(dataframe)
# Add each ID's mean of Var1 and half its standard deviation as helper columns.
dataframe[, `:=`(mean1 = mean(Var1), sd1 = 0.5 * sd(Var1)), by = IDs]
# Keep the rows whose Var1 lies strictly inside its own ID's mean -/+ 0.5 sd.
dataframe <- dataframe[Var1 > mean1 - sd1 & Var1 < mean1 + sd1]
Then, to remove the helper columns:
# Delete the two helper columns by reference.
helper_cols <- c("mean1", "sd1")
dataframe[, (helper_cols) := NULL]
Done on two columns:
library(data.table)
dataframe <- data.table(dataframe)
# Add, per ID, the mean and half the standard deviation of Var1 and Var2 as
# helper columns. All four values belong to one list(); the previous version
# closed the list after the second value, which left an unmatched ")" and did
# not parse.
dataframe[, c("mean1", "sd1", "mean2", "sd2") := list(
  mean(Var1),
  0.5 * sd(Var1),
  mean(Var2),
  0.5 * sd(Var2)
), by = IDs]
# Keep the rows where both Var1 and Var2 lie strictly inside their own ID's
# mean -/+ 0.5 sd.
dataframe <- dataframe[
  Var1 < mean1 + sd1 &
    Var1 > mean1 - sd1 &
    Var2 < mean2 + sd2 &
    Var2 > mean2 - sd2
]
# Drop the helper columns again.
dataframe[, c("mean1", "sd1", "mean2", "sd2") := NULL]