Replacing all negative values from a dataset - r

I have a dataframe with mixed data ranging from variables(or columns) with numerical values to variables(or columns) with factors.
I would like to use the following piece of code in R to replace all negative values with NA and subsequently remove the entire variable if more than 99% of observations for that variable are NA.
The first part should make sure there is no problem when encountering strings.
Would it be possible to simply start with:
# Mark every negative entry of column v1 as missing.
is.na(mydata$v1) <- which(mydata$v1 < 0)
But then not specific for v1 and only if the observation is not a string ?
Follow up:
This is how far I got with the explanation provided by @stas g. However, it does not seem that any variable was dropped from the df.
# Mixed data: wrap the survey object in a data frame.
# NOTE(review): WVS_Longitudinal_1981_2014_R_v2015_04_18 is not defined in this
# snippet; presumably it is already loaded in the session -- confirm.
df <- data.frame(WVS_Longitudinal_1981_2014_R_v2015_04_18)
# Columns whose class is "numeric" or "integer". This top-level `dat` is not
# referenced again below; foo() has its own `dat` parameter.
dat <- df[,sapply(df, function(x) {class(x)== "numeric" | class(x) ==
"integer"})]
# foo(dat, p): set the negative entries of `dat` to NA and return the columns
# whose NA share `ind` is below `p`.
# NOTE(review): `ind` is computed BEFORE the negatives are replaced, so the NAs
# created on the following line are not counted. This ordering is the likely
# reason why no column is dropped.
foo <- function(dat, p){
ind <- colSums(is.na(dat))/nrow(dat)
dat[dat < 0] <- NA
dat[, ind < p]
}
# Process the numeric part of the data separately.
# ii flags the columns whose class is "numeric" or "integer".
ii <- sapply(df, class) == "numeric" | sapply(df, class) == "integer"
dat.num <- foo(as.matrix(df[, ii]), 0.99)
# Then stick the two parts back together again: the columns not flagged by ii
# come first, followed by what foo() returned.
WVS <- data.frame(df[, !ii], dat.num)

It is impossible to know exactly how to help you without a minimal reproducible example, but assuming you have sample data like the following:
# Example data: 20 samples (rows) of 5 standard-normal variables (columns).
dat <- matrix(rnorm(100), nrow = 20)
# If an entry is negative, replace it with NA.
dat[dat < 0] <- NA
# Threshold for dropping a variable: NA share at or above which it is removed.
p <- 0.99
# Proportion of NA values in each column.
ind <- colMeans(is.na(dat))
# Only keep columns where the threshold is not exceeded. drop = FALSE keeps the
# result a matrix even when a single column survives.
dat <- dat[, ind < p, drop = FALSE]
If you have non-numeric variables and you are dealing with a data.frame, you could do something like this (assuming you don't care about the order of the columns):
# Build a small mixed-type example: a 20 x 5 matrix of standard-normal draws ...
dat <- matrix(rnorm(20 * 5), nrow = 20, ncol = 5)
# ... placed next to one column of letters (the first column of the data frame).
df <- data.frame(letters[1:20], dat)
# foo(dat, p): blank out negative entries and drop sparse columns.
#   dat  numeric matrix (or all-numeric data frame), one variable per column.
#   p    threshold; a column is kept only if its share of NA values is < p.
# Returns `dat` with negatives replaced by NA and the over-threshold columns
# removed. The result keeps two dimensions even if one or no column survives.
foo <- function(dat, p){
  # Replace negatives first, so the NAs created here count toward the share.
  dat[dat < 0] <- NA
  ind <- colMeans(is.na(dat))
  # which() skips undefined shares (e.g. input with zero rows).
  keep <- which(ind < p)
  dat[, keep, drop = FALSE]
}
# Process the numeric part of the data separately.
# is.numeric() flags integer as well as double columns; vapply() guarantees a
# logical vector with one entry per column.
ii <- vapply(df, is.numeric, logical(1))
# Feed the numeric part of the data to foo(). drop = FALSE keeps a data frame
# even if only one column is numeric, so as.matrix() gets a two-dimensional
# object with its column name intact.
dat.num <- foo(as.matrix(df[, ii, drop = FALSE]), 0.99)
# Then stick the two parts back together again: non-numeric columns first.
# drop = FALSE preserves the name of a lone non-numeric column.
data.frame(df[, !ii, drop = FALSE], dat.num)

The following approach, suggested by @YOLO, finally solved the issue:
# cleanFun(df, threshold = 0.99)
# Replace negative values in the numeric columns of `df` with NA, then drop
# every numeric column whose share of NA values is at least `threshold`.
#   df         data frame; non-numeric columns are returned untouched.
#   threshold  NA share (between 0 and 1) at or above which a numeric column
#              is dropped.
# Returns a data frame with the same rows and the surviving columns in their
# original order.
cleanFun <- function(df, threshold = 0.99){
  # Work on numeric columns only: comparing factors or strings with 0 is
  # either not meaningful or a string comparison.
  is_num <- vapply(df, is.numeric, logical(1))
  for (j in which(is_num)) {
    column <- df[[j]]
    column[which(column < 0)] <- NA
    df[[j]] <- column
  }
  # Share of NA values in every column.
  na_share <- vapply(df, function(column) mean(is.na(column)), numeric(1))
  # The mask has one entry per column of `df`, so it lines up with the columns
  # it is applied to; a mask built from the numeric columns alone would be
  # recycled and remove the wrong columns.
  drop_col <- is_num & !is.na(na_share) & na_share >= threshold
  df[!drop_col]
}
# Overwrite the data frame with its cleaned version.
your_df <- cleanFun(df = your_df)

Related

undefined columns selected and cannot xtfrm data frame error

I am trying to write a code that checks for outliers based on IQR and change those respective values to "NA". So I wrote this:
# Three independent standard-normal samples of 200 draws each.
dt <- rnorm(n = 200)
dg <- rnorm(n = 200)
dh <- rnorm(n = 200)
# One column per sample; the column names are taken from the variable names.
df <- data.frame(dt, dg, dh)
# List of relevant columns (positions of the first and third column).
l <- c(1, 3)
To check if the column contains any outliers and change their value to NA:
# vector.is.empty(x): TRUE when `x` has no elements, FALSE otherwise.
vector.is.empty <- function(x) {
  length(x) == 0
}
# For each listed column: look up its boxplot outliers and overwrite them
# with NA.
# NOTE(review): single-bracket df[IDX] yields a one-column data frame rather
# than a vector, so boxplot.stats() and %in% are handed a data frame. This is
# the likely source of the reported errors; df[[IDX]] (or df[, IDX]) would pass
# the column itself -- confirm.
for (i in 1:length(l)){
IDX <- l[i] # column position taken from the list of relevant columns
BP <- boxplot.stats(df[IDX]) # the outlier values are read from BP$out below
OutIDX <- which(df[IDX] %in% BP$out) # intended: row positions of the outliers
if (vector.is.empty(OutIDX)==FALSE){
for (u in 1:length(OutIDX)){
IDX2 <- OutIDX[u]
df[IDX2,IDX] <- NA # blank the cell in row IDX2, column IDX
}
}
}
So, when I run this code, I get the error messages quoted in the title ("undefined columns selected" and "cannot xtfrm data frame").
I've tried to search online for good answers, but I'm not sure why the column is reported as unspecified. Any clues here?
I would do something like that in order to replace the outliers:
# Set a seed (to make the example reproducible)
set.seed(31415)
# Generate the data.frame
df <- data.frame(dt = rnorm(100), dg = rnorm(100), dh = rnorm(100))
# A list to save the result of boxplot.stats(), one slot per column
l <- vector("list", ncol(df))
for (i in seq_len(ncol(df))) {
  l[[i]] <- boxplot.stats(df[, i])
  # %in% tests every value against the whole set of outliers; `==` would
  # recycle the outlier vector and miss values when a column has several.
  df[df[, i] %in% l[[i]]$out, i] <- NA
}
# Which values have been replaced?
lapply(l, function(x) x$out)

Picking 30 random data from data with sample()?

I am stuck.
We are asked to pick 30 random data from our dataset, then replace the picked values with NAs.
I'm stuck at the beginning, using the following function, as it selects 30 random data items from each column, while I want 30 random data picked among the whole dataset.
# Show 30 rows of data2 chosen at random (whole rows, every column).
data2[sample.int(nrow(data2), size = 30), ]
I hope you can help me out. Thank you for your help.
Do you mean to replace 30 random rows?
# Example data: start from the built-in iris data set.
data2 <- iris
# Draw 30 distinct row positions ...
throwouts <- sample.int(nrow(data2), size = 30)
# ... and blank every column in those rows.
data2[throwouts, ] <- NA
print(data2)
Do you mean to replace 30 values in random rows and random columns?
# Example data: start from the built-in iris data set.
data2 <- iris
# All the possible (row, column) positions, one per line.
coords <- expand.grid(seq_len(nrow(data2)), seq_len(ncol(data2)))
# Take 30 unique ones of all possible positions.
coords <- coords[sample(nrow(coords), 30), ]
# Mark the drawn cells in a logical matrix and blank them in one assignment.
# This follows the number of drawn positions instead of a fixed 1:30 loop.
hit <- matrix(FALSE, nrow = nrow(data2), ncol = ncol(data2))
hit[as.matrix(coords)] <- TRUE
data2[hit] <- NA
print(data2)
The following seems to be memory efficient, it uses a logical matrix of FALSE values and 30 TRUE values in random positions to assign NA's.
# Fix the random stream so the same 30 cells are chosen on every run.
set.seed(2020)
# One FALSE flag per cell of df1 ...
v <- logical(nrow(df1) * ncol(df1))
# ... with 30 randomly chosen flags switched on.
v[sample.int(length(v), size = 30)] <- TRUE
# Reshape the flags to the layout of df1; flagged cells become NA.
is.na(df1) <- matrix(v, nrow = nrow(df1), ncol = ncol(df1))
# The flag vector is no longer needed.
rm(v)
This can easily be written as a function.
# assignNA(x, n): set `n` randomly chosen cells of `x` to NA.
#   x  data frame or matrix.
#   n  number of cells to blank; must not exceed the number of cells in `x`.
# Returns a copy of `x` with the chosen cells set to NA.
assignNA <- function(x, n){
  # One flag per cell of `x`.
  v <- rep(FALSE, prod(dim(x)))
  # Switch on the requested number of flags (`n`, not a fixed 30).
  v[sample.int(length(v), n)] <- TRUE
  # Reshape the flags to the layout of `x`; flagged cells become NA.
  is.na(x) <- matrix(v, nrow = nrow(x))
  x
}
# Fix the random stream, then blank 30 random cells of df1.
set.seed(seed = 2020)
assignNA(x = df1, n = 30)
Tested with the data
# Test data: the built-in iris data set.
df1 <- datasets::iris

Calculate the mode of all non-numeric columns in a dataframe

I would like to calculate the mode of each column from a dataframe. I have found similar posts on how to determine the mode of a vector of rows in a dataframe (but most have been with numeric data).
# Example data: three columns of letters, four rows.
df <- data.frame(
  V1 = c("A", "B", "C", "C"),
  V2 = c("A", "A", "B", "C"),
  V3 = c("A", "B", "B", "C")
)
rownames(df) <- c(1, 2, 3, 4)
# Display the data frame.
df
I am using the following function:
# modefunc(x): return the position of the largest count produced by
# tabulate(x), or NA when several positions share the largest count.
# NOTE(review): tabulate() accepts only numeric or factor input (see the error
# quoted below), and which() returns a position in the count vector, not the
# original label -- confirm this is what is wanted.
modefunc <- function(x){
tabresult <- tabulate(x) # counts per integer bin
themode <- which(tabresult == max(tabresult)) # bin(s) with the highest count
if(sum(tabresult == max(tabresult))>1) themode <- NA # tie: no single mode
return(themode)
}
# NOTE(review): MARGIN = 1 applies modefunc to each ROW, while the text asks
# for the mode of each column (that would be MARGIN = 2). apply() on this data
# frame presumably passes character vectors to modefunc, which matches the
# tabulate() error -- confirm.
mode.vector <- apply(df, 1, modefunc)
Since my dataframe is not numeric, I unfortunately get the following error:
Error in tabulate(x) : 'bin' must be numeric or a factor
Any assistance with this would be helpful. Thanks in advance.

How to modify some but not all variables of a data frame?

Suppose there is a data.frame where some variables are coded as integers:
# a and e are stored as doubles; b, c and d are stored as integers.
a <- c(1, 2, 3, 4, 5)
b <- c(2L, 3L, 4L, 5L, 6L)
c <- c(5L, 1L, 0L, 9L, 2L)
d <- c(5L, 6L, 7L, 3L, 1L)
e <- c(2, 6, 1, 2, 3)
# One column per vector; the column names are taken from the variable names.
df <- data.frame(a, b, c, d, e)
# Show the storage type of every column.
str(df)
Suppose I want to convert columns b to d to numeric:
# Names of the columns to convert (positions 2 to 4 of df).
varlist <- names(df)[2:4]
# NOTE(review): likely reasons why this leaves df unchanged --
#  * inside the function, `x` is a column NAME (a string), so as.numeric(x)
#    converts the name, not the column;
#  * df$x refers to a column literally called "x", not to the column named by
#    the value of x;
#  * the assignment modifies a copy local to the function, so the outer df is
#    never written back.
lapply(varlist, function(x) {
df$x <- as.numeric(x, data=x)
})
str(df)
does not work.
I tried:
# Convert one column at a time.
# NOTE(review): b, c and d here are the standalone vectors created earlier,
# not the columns df$b, df$c and df$d; the data=df argument presumably has no
# effect on as.numeric() -- confirm.
df$b <- as.numeric(b, data=df)
df$c <- as.numeric(c, data=df)
df$d <- as.numeric(d, data=df)
# Show the storage type of every column after the conversion.
str(df)
which works fine.
Questions:
How do I do this (in a loop or better with lapply, [but I'm a Stata person and as such used to writing loops])?
And more generally: how do I apply any function to a list of variables in a data.frame
(e.g. multiply each variable on the list with some other variable [which always stays the same,
BONUS: or changes with each variable on the list])?
For the first question you can use sapply:
# Convert columns 2 to 4 to numeric in one step.
df[2:4] <- sapply(X = df[2:4], FUN = as.numeric)
for the second you should use mapply. For example to multiply the 3 variables(2 to 4) by some 3 different random scalars:
# Multiply columns 2 to 4 by three different random scalars. SIMPLIFY = FALSE
# keeps one vector per column instead of collapsing the result into a matrix.
df[2:4] <- mapply(function(x, y) df[[x]] * y, 2:4, rnorm(3), SIMPLIFY = FALSE)
# Convert columns 2 to 4 to numeric; lapply() returns one vector per column.
df[, 2:4] <- lapply(df[, 2:4], as.numeric)
As for your second question, if you want to say multiply column c by 5
# Scale column c by a constant factor of 5.
df[["c"]] <- 5 * df[["c"]]
Or any vector the same length as c, maybe a new column multiplying c by d
# New column cd: element-wise product of columns c and d.
df[["cd"]] <- df[["c"]] * df[["d"]]

return identical DF or vector instead of NULL

users,
I have data.frames which are NULL in my results, but I don't want them to be NULL. I want them to be the same as the beginning (unchanged). I'm working on a list of files and the aim of my code is to fill all the NA with data from my other data.frames (according to the best correlation coefficient). Here's a small example:
Imagine these are my 3 input data frames (10 rows each):
# Three example inputs with 10 rows each.
# ST1: complete series.
ST1 <- data.frame(x1 = 1:10)
# ST2: two gaps (rows 6 and 7).
ST2 <- data.frame(x2 = c(1:5, rep(NA, 2), 8:10))
# ST3: no data at all.
ST3 <- data.frame(x3 = rep(NA, 10))
The aim here is, for example: if there are NAs in ST1, ST1 must be filled with data from the file best correlated with ST1 (either ST2 or ST3 in this example).
As ST3 has no data here, I cannot have any correlation coefficient. So NAs from ST3 cannot be filled, and ST3 cannot also be used to fill another file. So ST3 has no use if you want. Nevertheless I want to keep ST3 unchanged during all my code.
So the problem in my code comes from data.frames with no data and so with only NAs.
For the moment my code would give this for "refill" (end of my code) (filled NA in my data.frames):
# Current result: ST2 is filled, but ST3 comes back as NULL.
ST1 <- data.frame(x1 = 1:10)
ST2 <- data.frame(x2 = as.numeric(1:10))
ST3 <- NULL
But actually, I want for results in "refill" this:
# Wanted result: ST2 is filled and ST3 is returned unchanged (all NA).
ST1 <- data.frame(x1 = 1:10)
ST2 <- data.frame(x2 = as.numeric(1:10))
ST3 <- data.frame(x3 = rep(NA, 10))
So for data.frames with only NAs, I don't want them to be NULL in "refill", but I want them to be identical as in input. I need this to have the same dimensions of data.frames between inputs and outputs.
If they are as NULL (like it is for the moment but I don't understand why and I want to change this), there will be 0 rows in this data.frame instead of 10 rows like the other data.frames.
So I think there's something wrong in my code in function "process.all" or "na.fill" or maybe "lst".
Here's my code; it is a reproducible example so that you can see my error (you'll see in head(refill) that ST2 is set to NULL).
Sorry if it is a bit long, but my error depends on other functions used earlier. I hope you've understood my problem and what I'm trying to do. Thanks for your help!
(For information, in function "process.all" and "na.fill": x is the data.frame I want to fill, and y is the file which will be used to fill x (so the best correlated file with x)).
Geoffrey
# Example data: four 20-row station tables, each written to a ';'-separated
# file in the working directory.
DF1 <- data.frame(x1 = c(rep(NA, 2), rnorm(18)), x2 = 31:50)
write.table(DF1, file = "ST001_2008.csv", sep = ";")
DF2 <- data.frame(x1 = c(rep(NA, 10), rnorm(10)), x2 = 1:20)
write.table(DF2, file = "ST002_2008.csv", sep = ";")
DF3 <- data.frame(x1 = rnorm(20), x2 = NA)
write.table(DF3, file = "ST003_2008.csv", sep = ";")
DF4 <- data.frame(x1 = 21:40, x2 = rnorm(20))
write.table(DF4, file = "ST004_2008.csv", sep = ";")
# Correlation table between the four stations, parsed from inline text; the
# first line supplies the column names and the first field of each following
# line the row name.
corhiver2008capt1 <- read.table(
  header = TRUE,
  text = " ST001 ST002 ST003 ST004
ST001 1.0000000 NA -0.4350665 0.3393549
ST002 NA NA NA NA
ST003 -0.4350665 NA 1.0000000 -0.4992513
ST004 0.3393549 NA -0.4992513 1.0000000"
)
# Read every matching station file from the working directory into a list of
# data frames.
lst <- lapply(
  list.files(pattern = "\\_2008.csv$"),
  function(path) {
    read.table(path, sep = ";", header = TRUE, stringsAsFactors = FALSE)
  }
)
# Label the list entries with the station codes.
Stations <- c("ST001", "ST002", "ST003", "ST004")
names(lst) <- Stations
# get.max.cor(station, mat): position of the highest correlation in row
# `station` of `mat`, ignoring the diagonal and NA entries. Returns NA when
# that row holds no finite off-diagonal value.
get.max.cor <- function(station, mat) {
  # Exclude the diagonal so it can never be selected.
  on_diagonal <- row(mat) == col(mat)
  mat[on_diagonal] <- -Inf
  best <- max(mat[station, ], na.rm = TRUE)
  if (!is.finite(best)) {
    return(NA)
  }
  which(mat[station, ] == best)
}
# Fill the data.frame with the data.frame which has the highest correlation
# coefficient.
# na.fill(x, y): fill the NA entries among the first 10 rows of the first
# column of x with predictions from a linear model of that column on the first
# column of y (rows 1:10).
#   x  data frame to be filled.
#   y  data frame used as predictor.
# Returns x with the gaps replaced -- except in the early exit below.
na.fill <- function(x, y){
# NOTE(review): when y has no finite value in rows 1:10 this returns y, not x,
# so the station is replaced by its partner. If y is NULL (e.g. the caller
# indexed the list with NA), the result would presumably be NULL as well, which
# looks like the source of the NULL entries described in the question;
# returning x instead would leave the station unchanged -- confirm.
if(all(!is.finite(y[1:10,1]))) return(y)
i <- is.na(x[1:10,1]) # gaps to fill
xx <- y[1:10,1] # predictor values
new <- data.frame(xx=xx)
x[1:10,1][i] <- predict(lm(x[1:10,1]~xx, na.action=na.exclude),new)[i]
x
}
# process.all(df.list, mat): run two filling passes over a list of station
# data frames and return the list with its original names.
#   df.list  list of data frames, one per station.
#   mat      correlation table between the stations.
# Pass 1 (f) fills each station from the entry selected by max.cor; pass 2 (g)
# tries the remaining stations in order of decreasing correlation.
process.all <- function(df.list, mat){
# f: fill station `station` from df.list[[ max.cor[station] ]] via na.fill().
# NOTE(review): where max.cor[station] is NA, the second argument is
# presumably NULL -- confirm how na.fill() should treat that case.
f <- function(station)
na.fill(df.list[[ station ]], df.list[[ max.cor[station] ]])
# g: second pass. It reads df.list as it stands when g is called, i.e. after
# the first pass has replaced it.
g <- function(station){
x <- df.list[[station]]
if(any(!is.finite(x[1:10,1]))){
mat[row(mat) == col(mat)] <- -Inf # exclude the diagonal
nas <- which(is.na(x[1:10,1])) # rows that are still missing
# Columns of mat ordered by decreasing value in this row, with the first and
# the last entry of that ordering removed.
ord <- order(mat[station, ], decreasing = TRUE)[-c(1, ncol(mat))]
for(y in ord){
# Use the first candidate that has values at every missing row, then stop.
if(all(!is.na(df.list[[y]][1:10,1][nas]))){
xx <- df.list[[y]][1:10,1]
new <- data.frame(xx=xx)
x[1:10,1][nas] <- predict(lm(x[1:10,1]~xx, na.action=na.exclude), new)[nas]
break
}
}
}
x
}
n <- length(df.list)
nms <- names(df.list)
# NOTE(review): max.cor is built from the global corhiver2008capt1 rather than
# from the `mat` argument -- confirm this is intended.
max.cor <- sapply(seq.int(n), get.max.cor, corhiver2008capt1)
df.list <- lapply(seq.int(n), f) # pass 1
df.list <- lapply(seq.int(n), g) # pass 2, on the output of pass 1
names(df.list) <- nms # restore the station names
df.list
}
# Run both filling passes over the station list.
refill <- process.all(lst, corhiver2008capt1)
# Combine the per-station list into a single data frame.
refill <- as.data.frame(refill) ########## HERE IS THE PROBLEM ######
# Display the result.
refill
How about
# ST3 holds no observed value at all, so there is nothing to fit or fill.
if (all(is.na(ST3))) {
  # skip whatever you normally would do and go to the next vector
}
This assumes, of course, that you don't have any problems with, say, a vector of 1999 NAs and one numerical value.

Resources