Convert dataframe to mids using mice in R - r

I have use mice to impute data, save the data as csv, and then run a Factor Analysis in SPSS and generated some factors. I now want to load the csv in R and run an imputed linear regression on the data. However, when I try to convert the dataframe to mids I get and error message saying:
library(mice)
# assign mtcars to a new dataframe
df <- mtcars
# loop 10 times
for (x in 1:10){
# create a fake imp number
a <- rep(x, 1, nrow(df))
# bind the fake imp number to the df
df2 <- cbind(df, a)
# crate a 10 folded version of mtcars with also the fake imp number
if (x ==1){
new_df <- df2
} else{
new_df <- rbind(new_df, df2)
}
}
# change the column name of the fake imp to ".imp"
names(new_df)[names(new_df) == 'a'] <- '.imp'
# convert df to mids
df_imp <- as.mids(new_df, .imp = .imp)
> Error in as.mids(df) : Original data not found. Use `complete(...,
> action = 'long', include = TRUE)` to save original data.
Can you please help me with this error?

From the as.mids() documentation.
This function converts imputed data stored in long format into an object of class mids. The original incomplete dataset needs to be available so that we know where the missing data are. The function is useful to convert back operations applied to the imputed data back in a mids object. It may also be used to store multiply imputed data sets from other software into the format used by mice.
The incomplete data is stored as imputation 0 in the long format. Therefore starting your procedure at 0 instead of 1 resolves the issue. (Also, you need quotes around .imp = '.imp' in the as.mids() call. Or, remove it and rely on the default. Or, just supply "a" as the imputation variable.)
library(mice)
df <- mtcars
for (x in 0:10){
a <- rep(x, 1, nrow(df))
df2 <- cbind(df, a)
if (x == 0){
new_df <- df2
} else{
new_df <- rbind(new_df, df2)
}
}
names(new_df)[names(new_df) == 'a'] <- '.imp'
df_imp <- as.mids(new_df)

Related

Unnest a ts class

My data has multiple customers data with different start and end dates along with their sales data.So I did simple exponential smoothing.
I applied the following code to apply ses
library(zoo)
library(forecast)
z <- read.zoo(data_set,FUN = function(x) as.Date(x) + seq_along(x) / 10^10 , index = "Date", split = "customer_id")
L <- lapply(as.list(z), function(x) ts(na.omit(x),frequency = 52))
HW <- lapply(L, ses)
Now my output class is list with uneven lengths.Can someone help me how to unnest or unlist the output in to a data frame and get the fitted values,actuals,residuals along with their dates,sales and customer_id.
Note : the reson I post my input data rather than data of HW is,the HW data is too large.
Can someone help me in R.
I would use tidyverse package to handle this problem.
map(HW, ~ .x %>%
as.data.frame %>% # convert each element of the list to data.frame
rownames_to_column) %>% # add row names as columns within each element
bind_rows(.id = "customer_id") # bind all elements and add customer ID
I am not sure how to relate dates and actual sales to your output (HW). If you explain it I might provide solution to that part of the problem too.
Firstly took all the unique customer_id into a variable called 'k'
k <- unique(data_set$customer_id)
Created a empty data frame
b <- data.frame()
extracted all the fitted values using a for loop and stored in 'a'.Using the rbind function attached all the fitted values to data frame 'b'
for(key in k){
print(a <- as.data.frame((as.numeric(HW_ses[[key]]$model$fitted))))
b <- rbind(b,a)
}
Finally using column bind function attached the input data set with data frame 'b'
data_set_final <- cbind(data_set,b)

Calculating log returns over columns of a data frame + store the results in a new data frame

My data frame contains 22 columns: "DATE", "INDEX" and S1, S2, S3 ... S20. There are over 4322 rows. I want to calculate log returns and store the results in a data frame. That should give me 4321 rows.
I run this code, but I am sure there is a much more elegant way to do the calculation in a short way.
# count the sum of rows in order to make the following formula work appropriately - (n-1)
n <- nrow(df)
# calculating the log returns (natural logarithm), of INDEX and S1-20
LogRet_INDEX <- log(df$INDEX[2:n])-log(df$INDEX[1:(n-1)])
LogRet_S1 <- log(df$S1[2:n])-log(df$S1[1:(n-1)])
LogRet_S2 <- log(df$S2[2:n])-log(df$S2[1:(n-1)])
LogRet_S3 <- log(df$S3[2:n])-log(df$S3[1:(n-1)])
LogRet_S4 <- log(df$S4[2:n])-log(df$S4[1:(n-1)])
LogRet_S5 <- log(df$S5[2:n])-log(df$S5[1:(n-1)])
LogRet_S6 <- log(df$S6[2:n])-log(df$S6[1:(n-1)])
LogRet_S7 <- log(df$S7[2:n])-log(df$S7[1:(n-1)])
LogRet_S8 <- log(df$S8[2:n])-log(df$S7[1:(n-1)])
LogRet_S9 <- log(df$S9[2:n])-log(df$S8[1:(n-1)])
LogRet_S10 <- log(df$S10[2:n])-log(df$S10[1:(n-1)])
LogRet_S11 <- log(df$S11[2:n])-log(df$S11[1:(n-1)])
LogRet_S12 <- log(df$S12[2:n])-log(df$S12[1:(n-1)])
LogRet_S13 <- log(df$S13[2:n])-log(df$S13[1:(n-1)])
LogRet_S14 <- log(df$S14[2:n])-log(df$S14[1:(n-1)])
LogRet_S15 <- log(df$S15[2:n])-log(df$S15[1:(n-1)])
LogRet_S16 <- log(df$S16[2:n])-log(df$S16[1:(n-1)])
LogRet_S17 <- log(df$S17[2:n])-log(df$S17[1:(n-1)])
LogRet_S18 <- log(df$S18[2:n])-log(df$S18[1:(n-1)])
LogRet_S19 <- log(df$S19[2:n])-log(df$S19[1:(n-1)])
LogRet_S20 <- log(df$S20[2:n])-log(df$S20[1:(n-1)])
# adding the results from the previous calculation (log returns) to a data frame
LogRet_df <- data.frame(LogRet_INDEX, LogRet_S1, LogRet_S2, LogRet_S3, LogRet_S4, LogRet_S5, LogRet_S6, LogRet_S7, LogRet_S8, LogRet_S9, LogRet_S10, LogRet_S11, LogRet_S12, LogRet_S13, LogRet_S14, LogRet_S15, LogRet_S16, LogRet_S17, LogRet_S18, LogRet_S19, LogRet_S20)
Is there a possibility to make this code shorter? Maybe some kind of loop or using a for argument? Since I am quite new to R, I try to improve my knowledge.
Any kind of help is highly appreciated!
You can use sapply to apply a function to each column of the data.frame.
What the code below does, is 1) take columns 2 to 22 from the data frame called df. 2) for each of this columns, calculate logarithm of the respective column and then calculate the difference between two neighboring rows. 3) when done, convert it to data.frame called df2
df2 <- as.data.frame(sapply(df[2:22], function(x) diff(log(x))))

How to combine multiply imputed data with mice?

I split a dataset into men and women, and then separately imputed it using the mice package.
#Generate predictormatrix
pred_gender_0<-quickpred(data_gender_0, include=c("age","weight_trunc"),exclude=c("ID","X","gender"),mincor = 0.1)
pred_gender_1<-quickpred(data_gender_1, include=c("age","weight_trunc"),exclude=c("ID","X","gender"),mincor = 0.1)
#impute the data with mice
imp_pred_gen0 <- mice(data_gender_0,
pred=pred_gender_0,
m=10,
maxit=5,
diagnostics=TRUE,
MaxNWts=3000) #i had to set this to 3000 because of an problematic unordered categorical variable
imp_pred_gen1 <- mice(data_gender_1,
pred=pred_gender_1,
m=10,
maxit=5,
diagnostics=TRUE,
MaxNWts=3000)
Now, I have two objects with 10 imputed datasets. One for men, one for women.
My question is, how do combine them?
Normally, I would just use:
comp_imp<-complete(imp,"long")
Should I:
use rbind.mids() to combine data of men and women and then convert it to long format?
do I first convert to long format and then use rbind.mids() or rbind()?
Thanks for any hints! =)
---------------------------------------------------------------------------
UPDATE - REPRODUCIBLE EXAMPLE
library("dplyr")
library("mice")
# We use nhanes-dataset from the mice-package as example
# first: combine age-category 2 and 3 to get two groups (as example)
nhanes$age[nhanes$age == 3] <- "2"
nhanes$age<-as.numeric(nhanes$age)
nhanes$hyp<-as.factor(nhanes$hyp)
#split data into two groups
nhanes_age_1<-nhanes %>% filter(age==1)
nhanes_age_2<-nhanes %>% filter(age==2)
#generate predictormatrix
pred1<-quickpred(nhanes_age_1, mincor=0.1, inc=c('age','bmi'), exc='chl')
pred2<-quickpred(nhanes_age_2, mincor=0.1, inc=c('age','bmi'), exc='chl')
# seperately impute data
set.seed(121012)
imp_gen1 <- mice(nhanes_age_1,
pred=pred1,
m=10,
maxit=5,
diagnostics=TRUE,
MaxNWts=3000)
imp_gen2 <- mice(nhanes_age_2,
pred=pred2,
m=10,
maxit=5,
diagnostics=TRUE,
MaxNWts=3000)
#------ ALTERNATIVE 1:
#combine imputed data
combined_imp<-rbind.mids(imp_gen1,imp_gen2)
complete_imp<-complete(combined_imp,"long")
#output
> combined_imp<-rbind.mids(imp_gen1,imp_gen2)
Warning messages:
1: In rbind.mids(imp_gen1, imp_gen2) :
Predictormatrix is not equal in x and y; y$predictorMatrix is ignored
.
2: In x$visitSequence == y$visitSequence :
longer object length is not a multiple of shorter object length
3: In rbind.mids(imp_gen1, imp_gen2) :
Visitsequence is not equal in x and y; y$visitSequence is ignored
.
> complete_imp<-complete(combined_imp,"long")
Error in inherits(x, "mids") : object 'combined_imp' not found
#------ ALTERNATIVE 2:
complete_imp1<-complete(imp_gen1,"long")
complete_imp2<-complete(imp_gen2,"long")
combined_imp<-rbind.mids(complete_imp1,complete_imp2)
#Output
> complete_imp1<-complete(imp_gen1,"long")
> complete_imp2<-complete(imp_gen2,"long")
> combined_imp<-rbind.mids(complete_imp1,complete_imp2)
Error in if (ncol(y) != ncol(x$data)) stop("The two datasets do not have the same number of columns\n") :
argument is of length zero
You can just use the following to create a new mids object which contains 10 imputed datasets of the men and women.
comp_imp <- rbind(pred_gender_0, pred_gender_1)
Doing this calls rbind.mids, not the regular bind function in R. The new object returned can be analysed in the usual way, e.g. using with.mids to fit your desired model to each of the imputed datasets.
I honestly have no knowledge of the package mice and just a faint idea about the concept of imputation.
I don't know what kind of analysis you would like to perform, but you say that normally you would do: comp_imp<-complete(imp,"long"), so I'll try to answer accordingly.
For me the first approach returns a data.frame, but without any missings. That is weird, since in complete(imp_gen1,"long") there is missing data in hyp. I don't know what rbind.mids is doing there.
I would therefore go with your second approach.
The result from complete(., "long") is a normal data.frame, hence there is no need to bind it with rbind.mids.
I would change your second approach to:
library(dplyr)
complete_imp1 <- complete(imp_gen1, "long")
complete_imp2 <- complete(imp_gen2, "long")
combined_imp <- bind_rows(complete_imp1, complete_imp2)
complete_imp1 <- complete(imp_gen1, "long") already returns the 10 (m parameter) imputed data frames, just count the total rows of complete_imp1 and multiply by m

Transfer of columns from one data frame to another under conditional statement

I have this code which was mostly written by one of the members here that exports all the graphs I need from my data set under the condition that the trendline coefficient is positive (increasing trendline).
lung <- read.csv("LAC.csv")
attach(lung) #data
age <- lung$Age
mirna <- data.frame(lung)
stuff <- data.frame(matrix(ncol = 500, nrow = 40))
pdf("test.pdf") # exports to pdf all the graphs
lapply(colnames(mirna)[-1],function(col){ #function for plotting
form <- formula(paste(col, "age", sep = "~"))
fit <- lm(form, mirna)
stuff_want <- stuff
if (coef(fit)[2] >0) { #plotting with condition
plot(form, df, xlab = "Age", main= "miRNA expression with increasing age")
abline(fit, col = 4)
}
})
dev.off()
This gives me a pdf file which I was hoping to use later to check which of the miRNA in the dataset are required and isolate the columns manually. However, I severely underestimated the number of mirRNA that meet the condition and now face a new conundrum on how to export the data from a column with and increasing trendline into a separate data frame which I would later save as a .csv file and use for further analysis.
Please keep in mind my knowledge of R is very limited although I am spending days in Rhelp and books. My idea was to create a separate data frame (stuff_want) to which the columns that satisfy the condition (coef(lm()) > 0) will be transferred. My initial thought was to use append() function and under the if condition write append(stuff_want, mirna, after = length(mirna)) followed by write.csv() function. The output of this is just NA filled .csv file.
Anyone able to explain to me why this is not working?
All the best,
Paulius
So here is one way (similar to #agstudy's comment), using the same made up data as in my previous answer
# make up some data
x <- seq(1,10,len=100)
set.seed(1) # for reproducible example
df <- data.frame(x,y1=1+2*x+rnorm(100),
y2=3-4*x+rnorm(100),
y3=2+0.001*x+rnorm(100))
# you start here...
result <- sapply(colnames(df)[-1],function(col){
form <- formula(paste(col,"x",sep="~"))
fit <- lm(form,df)
if(coef(fit)[2] > 0) TRUE else FALSE
})
cols <- names(result)[result]
cols
# [1] "y1" "y3"
This creates a named vector, result which elements have the same names as your response variables, and values = TRUE if that variable has positive slope, FALSE otherwise. Then
cols <- names(result)[result]
is a vector of the variable names with slope > 0. Finally, to extract the actual data, you would use:
stuff_want <- stuff[,cols]

Calculate statistics (e.g. average) across cells of identical data-frames

I am having a list of identically sorted dataframes. More specific these are the imputed dataframes which I get after doing Multiple imputations with the AmeliaII package. Now I want to create a new dataframe that is identical in structure, but contains the mean values of the cells calculated across the dataframes.
The way I achieve this at the moment is the following:
## do the Amelia run ------------------------------------------------------------
a.out <- amelia(merged, m=5, ts="Year", cs ="GEO",polytime=1)
## Calculate the output statistics ----------------------------------------------
left.side <- a.out$imputations[[1]][,1:2]
a.out.ncol <- ncol(a.out$imputations[[1]])
a <- a.out$imputations[[1]][,3:a.out.ncol]
b <- a.out$imputations[[2]][,3:a.out.ncol]
c <- a.out$imputations[[3]][,3:a.out.ncol]
d <- a.out$imputations[[4]][,3:a.out.ncol]
e <- a.out$imputations[[5]][,3:a.out.ncol]
# Calculate the Mean of the matrices
mean.right <- apply(abind(a,b,c,d,e,f,g,h,i,j,along=3),c(1,2),mean)
# recombine factors with values
mean <- cbind(left.side,mean.right)
I suppose that there is a much better way of doing this by using apply, plyr or the like, but as a R Newbie I am really a bit lost here. Do you have any suggestions how to go about this?
Here's an alternate approach using Reduce and plyr::llply
dfr1 <- data.frame(a = c(1,2.5,3), b = c(9.0,9,9), c = letters[1:3])
dfr2 <- data.frame(a = c(5,2,5), b = c(6,5,4), c = letters[1:3])
tst = list(dfr1, dfr2)
require(plyr)
tst2 = llply(tst, function(df) df[,sapply(df, is.numeric)]) # strip out non-numeric cols
ans = Reduce("+", tst2)/length(tst2)
EDIT. You can simplify your code considerably and accomplish what you want in 5 lines of R code. Here is an example using the Amelia package.
library(Amelia)
data(africa)
# carry out imputations
a.out = amelia(x = africa, cs = "country", ts = "year", logs = "gdp_pc")
# extract numeric columns from each element of a.out$impuations
tst2 = llply(a.out$imputations, function(df) df[,sapply(df, is.numeric)])
# sum them up and divide by length to get mean
mean.right = Reduce("+", tst2)/length(tst2)
# compute fixed columns and cbind with mean.right
left.side = a.out$imputations[[1]][1:2]
mean0 = cbind(left.side,mean.right)
If I understand your question correctly, then this should get you a long way:
#set up some data:
dfr1<-data.frame(a=c(1,2.5,3), b=c(9.0,9,9))
dfr2<-data.frame(a=c(5,2,5), b=c(6,5,4))
tst<-list(dfr1, dfr2)
#since all variables are numerical, use a threedimensional array
tst2<-array(do.call(c, lapply(tst, unlist)), dim=c(nrow(tst[[1]]), ncol(tst[[1]]), length(tst)))
#To see where you're at:
tst2
#rowMeans for a threedimensional array and dims=2 does the mean over the last dimension
result<-data.frame(rowMeans(tst2, dims=2))
rownames(result)<-rownames(tst[[1]])
colnames(result)<-colnames(tst[[1]])
#display the full result
result
HTH.
After many attempts, I've found a reasonably fast way to calculate cells' means across multiple data frames.
# First create an empty data frame for storing the average imputed values. This
# data frame will have the same dimensions of the original one
imp.df <- df
# Then create an array with the first two dimensions of the original data frame and
# the third dimension given by the number of imputations
a <- array(NA, dim=c(nrow(imp.df), ncol(imp.df), length(a.out$imputations)))
# Then copy each imputation in each "slice" of the array
for (z in 1:length(a.out$imputations)) {
a[,,z] <- as.matrix(a.out$imputations[[z]])
}
# Finally, for each cell, replace the actual value with the mean across all
# "slices" in the array
for (i in 1:dim(a)[1]) {
for (j in 1:dim(a)[2]) {
imp.df[i, j] <- mean(as.numeric(a[i, j,]))
}}

Resources