Generate a column with random years within a range - r

I have a very simple question, for which I could not find any answer. For an example I want to create, I want to give the following data.table a column with random years within a certain range say 2004-2010.
library(data.table)
set.seed(1)
# Toy panel data; the columns are supplied at lengths 6, 10, 50 and 100.
DT <- data.table(
  panelID = sample(50, size = 50), # panel identifier
  Country = rep(c("Albania", "Belarus", "Chilipepper"), times = c(30, 50, 20)),
  some_NA = sample(0:5, size = 6),
  some_NA_factor = sample(0:5, size = 6),
  Group = rep(c(1, 2, 3, 4, 5), each = 20),
  norm = round(runif(100) / 10, digits = 2),
  Income = round(rnorm(10, mean = -5, sd = 5), digits = 2),
  Happiness = sample(10, size = 10),
  Sex = round(rnorm(10, mean = 0.75, sd = 0.3), digits = 2),
  Age = sample(100, size = 100),
  Educ = round(rnorm(10, mean = 0.75, sd = 0.3), digits = 2)
)
DT[, uniqueID := .I] # row number serves as the unique ID
# Replace every 0 by NA, see
# https://stackoverflow.com/questions/11036989/replace-all-0-values-to-na
DT[DT == 0] <- NA
DT$some_NA_factor <- factor(DT$some_NA_factor)

We can use sample to select random years between 2004:2010 with replace = TRUE.
library(data.table)
# One draw per row (.N draws) from the years 2004 to 2010, with replacement.
DT[, random_year := sample(seq(2004L, 2010L), size = .N, replace = TRUE)]

Related

Calculating the weighted mean of all numerical columns

Example data:
library(data.table)
set.seed(1)
# Toy panel data with a weight column `wt` and a monthly `Time` column.
DT <- data.table(
  panelID = sample(50, size = 50), # panel identifier
  Country = rep(c("Albania", "Belarus", "Chilipepper"), times = c(30, 50, 20)),
  some_NA = sample(0:5, size = 6),
  some_NA_factor = sample(0:5, size = 6),
  Group = rep(c(1, 2, 3, 4, 5), each = 20),
  Time = rep(
    seq(as.Date("2010-01-03"), length.out = 20, by = "1 month") - 1,
    times = 5
  ),
  wt = 15 * round(runif(100) / 10, digits = 2),
  Income = round(rnorm(10, mean = -5, sd = 5), digits = 2),
  Happiness = sample(10, size = 10),
  Sex = round(rnorm(10, mean = 0.75, sd = 0.3), digits = 2),
  Age = sample(100, size = 100),
  Educ = round(rnorm(10, mean = 0.75, sd = 0.3), digits = 2)
)
DT[, uniqueID := .I] # row number serves as the unique ID
DT$some_NA_factor <- factor(DT$some_NA_factor)
I would like to calculate the weighted mean of all numerical columns, so I tried:
# Attempt: weighted mean of every numeric column per Country/Time group.
# NOTE: DT$wt is the whole, ungrouped column and sits in the `x` position, while
# the per-group column `x` is passed as the weights.
DT_w <- DT[,lapply(Filter(is.numeric,.SD), function(x) weighted.mean(DT$wt, x, na.rm=TRUE)), by=c("Country", "Time")]
But then it says:
Error in weighted.mean.default(DT$wt, x, na.rm = TRUE) :
'x' and 'w' must have the same length
I think I am perhaps misunderstanding the syntax. Am I doing this right?
Two issues:
when you use DT$wt that is an explicit call to the full wt column from the DT table - the by arguments won't work on it. The by arguments will only work on columns without the DT$ prefix.
The order of arguments for weighted.mean() is x first and w (weights) second - you seem to have this backwards
Fixing those two issues:
# Per Country/Time group: weighted mean of each numeric column, using the
# group's own `wt` values as weights.
DT_w <- DT[
  ,
  {
    numeric_cols <- Filter(is.numeric, .SD)
    lapply(numeric_cols, weighted.mean, w = wt, na.rm = TRUE)
  },
  by = c("Country", "Time")
]
# runs without errors

Creating a new variable when a function recognises an interaction term

Based on this link, I wrote the following code, which is part of a function:
Sample data:
# Building blocks for the sample panel.
panelID <- 1:50
year <- 2001:2010
country <- c("NLD", "BEL", "GER")
urban <- c("A", "B", "C")
indust <- c("D", "E", "F")
sizes <- c(1, 2, 3, 4, 5)
n <- 2 # observations per panelID
library(data.table)
set.seed(123)
# Each panelID gets n rows; panel-level attributes are drawn once per panelID
# and repeated with `each = n`. TRUE is spelled out because T can be reassigned.
DT <- data.table(
  panelID = rep(sample(panelID), each = n),
  country = rep(sample(country, length(panelID), replace = TRUE), each = n),
  year = c(replicate(length(panelID), sample(year, n))),
  some_NA = sample(0:5, 6),
  Factor = sample(0:5, 6),
  industry = rep(sample(indust, length(panelID), replace = TRUE), each = n),
  urbanisation = rep(sample(urban, length(panelID), replace = TRUE), each = n),
  size = rep(sample(sizes, length(panelID), replace = TRUE), each = n),
  income = round(runif(100) / 10, 2),
  sales = round(rnorm(10, 10, 10), 2),
  happiness = sample(10, 10),
  Sex = round(rnorm(10, 0.75, 0.3), 2),
  Age = sample(100, 100),
  educ = round(rnorm(10, 0.75, 0.3), 2)
)
DT[, uniqueID := .I] # Creates a unique ID
DT <- as.data.frame(DT)
Code:
depvar <- "happiness"
othervar <- "factor:income"
# NOTE: the sample data names this column "Factor" (capital F), not "factor".
insvar <- c("happiness","factor","income")
# When more than two names are listed, multiply the 2nd and 3rd named columns.
if (length(insvar)>2) {
DT$newvar <- DT[insvar[2]]*DT[insvar[3]]
# NOTE: `newvar` is created only as a column of DT, not as a standalone object.
othervar=newvar
}
The idea is that when othervar is a combination of two variables, othervar gets replaced by a new variable which is the combination of those two variables.
Right now I however get the error:
Error in `[.data.frame`(DT, insvar[2]) : undefined columns selected
How should I write this function properly?
If you change factor to Factor as the column is named and use DT$newvar the code runs and produces a new column, which I believe is what you are looking for.
depvar <- "happiness"
othervar <- "Factor:income"
insvar <- c("happiness", "Factor", "income")
# With more than two names, the 2nd and 3rd are the interaction components.
if (length(insvar) > 2) {
  # `[[` extracts plain vectors, so newvar becomes an ordinary numeric column
  # instead of a one-column data.frame nested inside DT.
  DT$newvar <- DT[[insvar[2]]] * DT[[insvar[3]]]
  othervar <- DT$newvar
}

Summing the products of multiple variables per row

I have a data.table as follows:
library(data.table)
set.seed(1)
# Toy panel data; zeros are turned into NA further down.
DT <- data.table(
  panelID = sample(50, size = 50), # panel identifier
  Country = rep(c("Albania", "Belarus", "Chilipepper"), times = c(30, 50, 20)),
  some_NA = sample(0:5, size = 6),
  some_NA_factor = sample(0:5, size = 6),
  Group = rep(c(1, 2, 3, 4, 5), each = 20),
  Time = rep(
    seq(as.Date("2010-01-03"), length.out = 20, by = "1 month") - 1,
    times = 5
  ),
  norm = round(runif(100) / 10, digits = 2),
  Income = sample(0:5, size = 6),
  Happiness = sample(10, size = 10),
  Sex = round(rnorm(10, mean = 0.75, sd = 0.3), digits = 2),
  Age = sample(100, size = 100),
  Educ = round(rnorm(10, mean = 0.75, sd = 0.3), digits = 2)
)
DT[, uniqueID := .I] # row number serves as the unique ID
# Replace every 0 by NA, see
# https://stackoverflow.com/questions/11036989/replace-all-0-values-to-na
DT[DT == 0] <- NA
DT$some_NA_factor <- factor(DT$some_NA_factor)
Now, I would like to (for some artificial reason) sum the products of income & education and Sex & Age for each observation, using data.table. Please note that my actual data has many more variables, some of which contain NAs. I tried:
# sum() collapses all of its arguments into one grand total, so every row of
# newvar receives the same value.
DT<- setDT(DT)[, newvar:= sum((Income *Educ),
(Sex * Age), na.rm=TRUE)]
But that takes the sum of the columns. I also tried:
# rowSums() expects a single matrix-like `x`; here the first product is a plain
# vector and the second product is passed as a separate positional argument.
DT<- setDT(DT)[, newvar:= rowSums((Income *Educ),
(Sex * Age), na.rm=TRUE)]
But that does not work:
Error in base::rowSums(x, na.rm = na.rm, dims = dims, ...) :
'x' must be an array of at least two dimensions
What would be the correct way to do this in data.table?
# Bind the two products into a two-column matrix and sum across each row,
# ignoring NAs.
DT[, newvar := rowSums(cbind(Income * Educ, Sex * Age), na.rm = TRUE)]
# Alternatively: if one product is NA take the other one, else add them.
DT[, newvar := {
  inc_educ <- Income * Educ
  sex_age <- Sex * Age
  both <- inc_educ + sex_age
  fifelse(is.na(inc_educ), sex_age, fifelse(is.na(sex_age), inc_educ, both))
}]
Note:
setDT() is only necessary if the data.frame is not a data.table yet. Assigning the result with <- is not needed when you use := within the data.table.

Calculating the mean of the absolute value of all numerical columns

I want to calculate the mean of the absolute value of all numerical columns for the example dataset DT:
library(data.table)
set.seed(1)
# Toy panel data; Income is drawn around -5, so its plain mean is negative.
DT <- data.table(
  panelID = sample(50, size = 50), # panel identifier
  Country = rep(c("Albania", "Belarus", "Chilipepper"), times = c(30, 50, 20)),
  some_NA = sample(0:5, size = 6),
  some_NA_factor = sample(0:5, size = 6),
  Group = rep(c(1, 2, 3, 4, 5), each = 20),
  Time = rep(
    seq(as.Date("2010-01-03"), length.out = 20, by = "1 month") - 1,
    times = 5
  ),
  norm = round(runif(100) / 10, digits = 2),
  Income = round(rnorm(10, mean = -5, sd = 5), digits = 2),
  Happiness = sample(10, size = 10),
  Sex = round(rnorm(10, mean = 0.75, sd = 0.3), digits = 2),
  Age = sample(100, size = 100),
  Educ = round(rnorm(10, mean = 0.75, sd = 0.3), digits = 2)
)
DT[, uniqueID := .I] # row number serves as the unique ID
# Replace every 0 by NA, see
# https://stackoverflow.com/questions/11036989/replace-all-0-values-to-na
DT[DT == 0] <- NA
DT$some_NA_factor <- factor(DT$some_NA_factor)
I tried to calculate the means and the absolute means as follows:
# Plain mean of every numeric column, transposed to one column and rounded.
mean_of_differences <- DT[,lapply(Filter(is.numeric,.SD),mean, na.rm=TRUE)]
mean_of_differences <- as.data.frame(t(mean_of_differences))
mean_of_differences <- round(mean_of_differences, digits=2)
# Mean of the absolute values of every numeric column, transposed likewise.
mean_of_absolute_diff <- DT[,lapply(Filter(is.numeric,.SD),function(x) mean(abs(x),na.rm=TRUE))]
mean_of_absolute_diff <- as.data.frame(t(mean_of_absolute_diff))
# NOTE(review): the next line rounds mean_of_differences, not
# mean_of_absolute_diff, so the absolute means are overwritten by the plain means.
mean_of_absolute_diff <- round(mean_of_differences, digits=2)
The mean of Income for the absolute differences is however negative (as it is for the normal mean), which obviously is not possible. If I look at my code I don't understand what I am doing wrong. What am I overlooking?
Here is a solution using data.table. It (i) identifies numeric columns and (ii) obtains the mean of the absolute value of each numeric column.
Data
# Small demo table: two numeric columns and one character column.
dt <- data.table(
  num1 = rnorm(100),
  num2 = rnorm(100),
  strv = sample(LETTERS, 100, replace = TRUE) # TRUE, not the reassignable T
)
Code
# Which columns are numeric? vapply() always returns a logical vector.
numcols <- colnames(dt)[vapply(dt, is.numeric, logical(1))]
# > numcols
# [1] "num1" "num2"
# Mean of the absolute values per numeric column. na.rm = TRUE keeps a column
# that contains NA from returning NA.
meandt <- dt[, lapply(.SD, function(x) mean(abs(x), na.rm = TRUE)), .SDcols = numcols]
newcols <- paste0("mean_abs_", numcols)
setnames(meandt, newcols) # rename the result columns by reference
# > meandt
# mean_abs_num1 mean_abs_num2
# 1: 0.8287523 0.8325123

Creating single column dataframe with the means of another dataset

I have a dataset which looks as follows:
# Example data: a simulated panel, followed by first differences of its
# numeric columns.
set.seed(1)
DF <- data.table(panelID = sample(50,50), # Creates a panel ID
Country = c(rep("A",30),rep("B",50), rep("C",20)),
Group = c(rep(1,20),rep(2,20),rep(3,20),rep(4,20),rep(5,20)),
Time = rep(seq(as.Date("2010-01-03"), length=20, by="1 month") - 1,5),
norm = round(runif(100)/10,2),
Income = sample(100,100),
Happiness = sample(10,10),
Sex = round(rnorm(10,0.75,0.3),2),
Age = round(rnorm(10,0.75,0.3),2),
Educ = round(rnorm(10,0.75,0.3),2))
# NOTE(review): uniqueID is assigned here and again two lines below; the
# second assignment repeats the first.
DF [, uniqueID := .I]
DF <- as.data.table(DF) # Make sure it is a data.table
DF [, uniqueID := .I] # Add a unique ID
# One logical flag per column: TRUE where the column is numeric.
cols = sapply(DF, is.numeric) # Check numerical columns
# Keep the numeric columns except uniqueID and reshape them to long format,
# with panelID as the id column.
DFm <- melt(DF[, cols, with = FALSE][, !"uniqueID"], id = "panelID") # https://stackoverflow.com/questions/57406654/speeding-up-a-function/57407959#57407959
# Within each panelID/variable pair, replace value by its first difference
# (NA for the first observation of the pair).
DFm[, value := c(NA, diff(value)), by = .(panelID, variable)] # https://stackoverflow.com/questions/57406654/speeding-up-a-function/57407959#57407959
# Reshape back to wide: rows are identified by panelID plus the row index
# within each panelID/variable pair; each variable becomes a column.
DF <- dcast(DFm, panelID + rowidv(DFm, cols = c("panelID", "variable")) ~ variable, value.var = "value") # ""
# Keep only the rows in which columns 3 onward are not all NA.
DF <- DF[DF[, !Reduce(`&`, lapply(.SD , is.na)), .SDcols = 3:ncol(DF)]] # Removes T1 for which there is no difference
Now what I would like to do is fairly simple. I want the mean of each column stored in a single column.
I tried:
# sapply(.SD, is.numeric) yields one TRUE/FALSE per column, so mean() here
# averages those flags rather than the values stored in the columns.
mean_of_differences <- DF [, mean(sapply(.SD, is.numeric), na.rm=TRUE)]
# Here the same scalar is used as a row index into .SD.
mean_of_differences <- DF[,.SD[mean(sapply(.SD, is.numeric), na.rm=TRUE)]]
But somehow I cannot seem to get it right. I just end up with NAs or errors.
What am I overlooking?

Resources