R-Hmisc impute by cluster result - r

I want to impute a variable x3 by the its mean corresponding to each cluster calculated considering other 2 variables X1 and X2.
I know that you can pass a function to impute from Hmisc package, like "mean" and it does the work. So I would like to pass a function that does all the following.
I use to write the code to do so:
df1 <- data.frame(x1=runif(1000,0,100),
x2=runif(1000,0,100),
x3=c(runif(900,0,100),rep(NA,100)))
I want to pass a function that does all of this:
clust<-kmeans(df1[,-grep('x3', colnames(df1))], 3)
df1$clust<-clust$cluster
library(plyr)
cc<-ddply(df1, 'clust',summarise, mean=mean(x3, na.rm=TRUE))
df2<-merge(df1,cc, by='clust')
df2$x3imputed2<-ifelse(is.na(df2$x3),df2$mean, df2$x3)
Is there a way to pass all this code as a function and use it in Hmisc? (I had a problem with ddply introducing x3 as a variable).
Something like the following:
ff<-function(i) {
clust<-kmeans(df1[,-grep(i, colnames(df1))], 3)
df1$clust<-clust$cluster
cc<-aggregate(df1[,i], by=list(clust=df1$clust), "mean", na.rm=TRUE)
df2<-merge(df1,cc, by='clust')
df2$x3imputed2<-ifelse(is.na(df2[, i]),df2$x, df2[,i])
}
f1$imputedx3<-with(df1, impute(x3,ff))
But I get an error:
empty cluster: try a better set of initial centers
And when I replace it by x3 I don't get the same error.

Try
library(lazyeval)
library(dplyr)
f1 <- function(dat, cname){
#get the third argument i.e, 'cname'
nm1 <- match.call()[[3]]
#paste 'imputed' for renaming the new column later
nm2 <- paste0(nm1, 'imputed')
#create an numeric column index that will be removed in kmeans calc
indx <- grep(cname, colnames(dat))
#get the 'kmeans' of the columns other than the 'cname'
clust <- kmeans(dat[,-indx],3)$cluster
#group by 'clust' and create new column with 'mutate'
dat %>%
group_by(clust=clust) %>%
mutate_(interp(~ifelse(is.na(v), mean(v, na.rm=TRUE), v),
v=as.name(cname))) %>%
#rename the column
setNames(., c(head(names(.),-1), nm2))
}
f1(df1, 'x3')
Or you could pass it without quotes by using v= lazy(cname)
f2 <- function(dat, cname){
nm1 <- match.call()[[3]]
nm2 <- paste0(nm1, 'imputed')
indx <- grep(nm1, colnames(dat))
clust <- kmeans(dat[,-indx],3)$cluster
dat %>%
group_by(clust=clust) %>%
mutate_(interp(~ifelse(is.na(v), mean(v, na.rm=TRUE), v),
v= lazy(cname))) %>%
setNames(., c(head(names(.),-1), nm2))
}
f2(df1, x3)

Related

Take row names from list of lists in R

Starting from a list of lists like outcome:
id <- c(1,2,3,4,5,1,2,3,4,5)
month <- c(3,4,2,1,5,7,3,1,8,9)
preds <- c(0.5,0.1,0.15,0.23,0.75,0.6,0.49,0.81,0.37,0.14)
l_1 <- data.frame(id, preds, month)
preds <- c(0.45,0.18,0.35,0.63,0.25,0.63,0.29,0.11,0.17,0.24)
l_2 <- data.frame(id, preds, month)
preds <- c(0.58,0.13,0.55,0.13,0.76,0.3,0.29,0.81,0.27,0.04)
l_3 <- data.frame(id, preds, month)
preds <- c(0.3,0.61,0.18,0.29,0.85,0.76,0.56,0.91,0.48,0.91)
l_4 <- data.frame(id, preds, month)
outcome <- list(l_1, l_2, l_3, l_4)
My interest is to take the assigned unique row values and create a new variable as if we do:
sample <- outcome[[1]]
sample$unique_id <- rownames(sample)
However, I don´t want to go manually because my list has 100 lists.
Moreover, I don´t want to assign values manually to each row because I want to preserve the row names generated by R.
Any clue?
We may also use rownames_to_column
library(dplyr)
library(purrr)
library(tibble)
map(outcome, ~ .x %>%
rownames_to_column('unique_id'))
With lapply and cbind:
lapply(outcome, function(x) {
cbind(unique_id=rownames(x), x)
})
Another base R option is to use Map
Map(function(x){
x$unique_id <- rownames(x)
x
}, outcome)
Try using lapply
lapply(outcome, function(x) {
x$unique_id <- rownames(x)
x
})

For loop to replace NA values in R

I want to write a for loop in R to replace NA values from one column of my dataframe and replace them for the mean of the values of the same column when 2 conditions are true.
When conditions are met, I want to assign the mean to NAs using observations from the same year and from the same group.
I wrote the following code, but I am struggling to write the conditions.
missing <- which(is.na(df$price))
for (i in 1:36){
x <- df[missing,]group
y <- df[missing,]year
selection <- df[conditions??,]$price
df[missing,]$price <- mean(selection, na.rm = TRUE)
}
You don't need a for loop, you can directly replace all the NAs with the mean(, na.rm=T) directly to calculate the mean of said column without NAs. This is for the general case:
df[is.na(df$price),]$price <- mean(df$price, na.rm = TRUE)
Using tidyverse you can achieve what you want:
library(tidyverse)
df %>% group_by(group, year) %>% mutate(price=ifelse(is.na(price), mean(price, na.rm=T), price))
Using data.table
dt <- data.table(df)
dt[,price:=fifelse(is.na(price), mean(price, na.rm=T), price), by=.(group,year)][]
A base R solution using by, which splits a data frame by the groups in the list in the second argument, and applies a function defined in the third:
result <- by(df,
list(df[["group"]], df[["year"]]),
function(x) {
x[is.na(x$price), "price"] <- mean(x[["price"]], na.rm = TRUE)
x
},
simplify = TRUE)
do.call(rbind, result)

calculate z-score across multiple dataframes in R

I have ten dataframes with equal number of rows and columns. They look like this:
df1 <- data.frame(geneID=c("AKT1","AKT2","AKT3","ALK",
"APC"),
CDKN2A=c(3490,9447,4368,908,204),
INPP4B=c(NA,9459,4395,1030,NA),
BCL2=c(NA,9480,4441,1209,NA),
IRS2=c(NA,NA,4639,1807,NA),
HRAS=c(3887,9600,4691,1936,1723))
df2 <- data.frame(geneID=c("AKT1","AKT2","AKT3","ALK",
"APC"),
CDKN2A=c(10892,17829,7156,1325,387),
INPP4B=c(NA,17840,7185,1474,NA),
BCL2=c(NA,17845,7196,1526,NA),
IRS2=c(NA,NA,12426,10244,NA),
HRAS=c(11152,17988,7545,2734,2423))
df3 <- data.frame(geneID=c("AKT1","AKT2","AKT3","ALK",
"APC"),
CDKN2A=c(11376,17103,8580,780,178),
INPP4B=c(NA,17318,9001,2829,NA),
BCL2=c(NA,17124,8621,1141,NA),
IRS2=c(NA,NA,8658,1397,NA),
HRAS=c(11454,17155,8683,1545,1345))
I would like to calculate z-score for each data frame, based on mean and variance across multiple dataframes. The z-score should be calculated as follows: z-score=(x-mean(x))/sd(x))).
I found that ddply function of plyr can do this job, but the solution was for single dataframe, while I have multiple dataframes as separate files with 18214 rows and 269 columns.
I would appreciate any suggestions.
Thank you very much for your help!
Olha
Here is one option where we bind the datasets together with bind_rows (from dplyr), then group by the grouping column and return the zscore transformed numeric columns
library(dplyr)
bind_rows(df1, df2, df3, .id = 'grp') %>%
group_by(geneID) %>%
mutate(across(where(is.numeric),
~(.- mean(., na.rm = TRUE))/sd(., na.rm = TRUE), .names = '{col}_zscore'))
NOTE: if we dont need new columns, then remove the .names part
If we need to do this in a loop, without binding into a single data.frame, can loop over the list
library(purrr)
list(df1, df2, df3) %>% # // automatically => mget(ls('^df\\d+$'))
map(~ .x %>%
mutate(across(where(is.numeric),
~(.- mean(., na.rm = TRUE))/sd(., na.rm = TRUE), .names = '{col}_zscore')))
Here is a base R solution with function scale.
df_list <- list(df1, df2, df3)
df_list2 <- lapply(df_list, function(DF){
i <- sapply(DF, is.numeric)
DF[i] <- lapply(DF[i], scale)
DF
})
S3 methods
Considering that scale is generic and that methods can be written for it, here is a data.frame method, then applied to the same list df_list.
scale.data.frame <- function(x, center = TRUE, scale = TRUE){
i <- sapply(x, is.numeric)
x[i] <- lapply(x[i], scale, center = center, scale = scale)
x
}
df_list3 <- lapply(df_list, scale)
identical(df_list2, df_list3)
#[1] TRUE

R read.csv loop with iterating variable in quotation marks

This is my code:
library(data.table)
library(stringr)
parameters <- c("conductivity","calcium","chloride","magnesium","phosphate","potassium","salinity","sodium","sulphate")
for (i in parameters){
i <- read.csv(str_c("./Data/Parameters/",i,".csv"), sep=",", header=FALSE)
i <- unique(i)
i <- subset(i, select=c(1,2,4,6))
i <- setnames(i, c("site","date", str_c("",i,""), "material"))
i[,3] <- as.numeric(i[,3])
i <- subset(i, i > 0)
}
Now, there are two things that don't work here.
The first is in the setnames function: it doesn't understand that it needs to label one of the columns of the CSV with the variable name.
The second is that it doesn't actually call the imported files with 'conductivity', 'calcium', etc but simply calls them all 'i'.
How can I fix this?
We can do this with map
library(stringr)
library(dplyr)
library(purrr)
map(parameters, ~ read_csv(str_c("./Data/Parameters/", .x, ".csv")) %>%
distinct %>%
select(1, 2, 4, 6) %>%
rename_at(3, ~ .x) %>%
mutate(!! .x := !! rlang::sym(.x)) %>%
filter_at(vars(.x), any_vars(. > 0))
)
Another option using data.table:
library(data.table)
parameters <- c("conductivity","calcium","chloride","magnesium","phosphate",
"potassium","salinity","sodium","sulphate")
lapply(parameters, function(x) {
DT <- unique(fread(paste0("./Data/Parameters/", x, ".csv"), header=FALSE))[,
V4 := as.numeric(V4)][
V4 > 0, paste0("V", c(1,2,4,6))]
setnames(DT, names(DT), c("site","date", x, "material"))
})

purrr::map_if How to return value if condition is FALSE?

I am running a function using purrr::map that will return an error if the dataframe does not contain numeric data (i.e., na.omit do not return any valid rows). I discovered map_if but it seems map_if returns .x if .p is false. Is there a way to return NA. This example should explain what I need:
library(openair)
library(tidyverse)
# Build test dataset
df1 <- mydata
df2 <- mydata
df2$no2 <- NA_real_
df3 <- mydata
dfx <- tibble(id = c(1, 2, 3), data = list(df1, df2, df3))
# polarPlot function will return error if dataframe does not contain numeric data (i.e., it only contains NAs)
polarPlot(df2, pollutant = "no2")
# Function to test length of dataframe (i.e., if 0 theneverything is NAs)
check_length <- function(x) (x %>% select(ws, wd, "no2") %>% na.omit() %>% nrow()) > 0
check_length(df1)
check_length(df2)
# purrr::map (is there a way for map_if to return NA if length == 0?)
dfx %>% mutate(mynewvar = map_if(.x = data, check_length, ~ polarPlot(.x, pollutant = "no2")))
In other words, I would like mynewvar[[2]] to return NA.
#dylanjm I posted a reprex not sure if you are not able to see it. As you suggested the function possibly is what I need.
possible_polarPlot <- possibly(polarPlot, otherwise = NA)
out <- dfx %>% mutate(mynewvar = map(.x = data, ~ possible_polarPlot(.x, pollutant = "no2")))
out$mynewvar[[2]] # Returns NA as I was looking for.

Resources