Convert NAs to a separate level in each variable using mutate() - r

I have 12 variables that contain NA values. I need to convert the NAs in each variable to a separate level, and the level value differs between variables. Following is the code:
#' Replace the missing values of one column of `tr` and return it as a factor.
#'
#' The data frame `tr` is looked up in the calling environment (it is not an
#' argument); the function works on a copy and returns that copy.
#'
#' @param colindex Position (or name) of the column to modify.
#' @param na_level Value that replaces every NA and becomes a factor level.
#' @return A copy of `tr` whose selected column is a factor without NAs.
Replace_NAs <- function(colindex, na_level) {
  # Pull the column itself; a bare name inside mutate() would refer to a
  # column literally called `cname`, not to the column stored in the variable.
  column <- tr[[colindex]]
  if (is.factor(column)) {
    # A factor only accepts values that are already levels, so register the
    # replacement as a level before assigning it.
    na_label <- as.character(na_level)
    levels(column) <- union(levels(column), na_label)
    column[is.na(column)] <- na_label
  } else {
    column[is.na(column)] <- na_level
  }
  tr[[colindex]] <- as.factor(column)
  tr
}
# Choose the NA replacement level for each of the 12 columns, then apply it.
for (i in 1:12) {
  # Membership must be tested with %in%: `i == 4|6|8` is read as
  # `(i == 4) | 6 | 8`, which is TRUE for every i, so the final `else`
  # branch could never run.
  na_level <- if (i == 5) {
    3
  } else if (i == 11) {
    5
  } else if (i %in% c(4, 6, 8)) {
    1
  } else {
    20
  }
  tr <- Replace_NAs(i, na_level)
}
Please help me. Thanks.

As Johan mentioned in the comments, you should include a reproducible example. Without that, we're left guessing at what exactly you want.
That said, here's my guess at what'll help you:
# Fill NAs group by group: column 5 with 3, column 11 with 5,
# columns 4, 6 and 8 with 1, and every other column with 20.
df %>%
  mutate_at(vars(5), ~ replace_na(., 3)) %>%
  mutate_at(vars(11), ~ replace_na(., 5)) %>%
  # The comma after vars() is required: without it the selection and the
  # formula merge into a single argument and no function is supplied.
  mutate_at(vars(4, 6, 8), ~ replace_na(., 1)) %>%
  mutate_at(vars(-c(4, 5, 6, 8, 11)), ~ replace_na(., 20))
Again, please provide a reproducible example with data and your desired output. A more robust answer to your question would explore applying a list of intended switches to your dataframe, but that would be overkill here.

Here's a way to do this using a for loop.
Consider this example :
tr <- data.frame(a = c(NA, 2, NA, 3), b = c(2, 3, NA, 4),
c = c(5, 6, NA, NA), d = c(1, 2, 3, NA))
tr
# a b c d
#1 NA 2 5 1
#2 2 3 6 2
#3 NA NA NA 3
#4 3 4 NA NA
Now prepare a list of column indices and a vector of replacement values
cols <- list(1, c(2, 3))
vals <- c(3, 5)
Use a for loop to replace the columns with the values
# Walk through the column groups; group i gets the i-th replacement value
# wherever its columns hold NA.
for (i in seq_along(cols)) {
  tr[cols[[i]]] <- replace(tr[cols[[i]]], is.na(tr[cols[[i]]]), vals[i])
}
For remaining columns
# Columns that appear in none of the groups in `cols` ...
f_cols <- seq_len(ncol(tr))[!seq_len(ncol(tr)) %in% unlist(cols)]
# ... fall back to the default replacement value 20.
tr[f_cols] <- replace(tr[f_cols], is.na(tr[f_cols]), 20)
tr
# a b c d
#1 3 2 5 1
#2 2 3 6 2
#3 3 5 5 3
#4 3 4 5 20
Notice that the NAs in column 1 are replaced with 3, the NAs in columns 2 and 3 are replaced with 5, and the NAs in the remaining columns are replaced with 20.

Related

For each value in a vector get the corresponding next smallest value

For each element in a vector, I want the corresponding next smaller value in the vector, without changing the original order of the elements.
For example, suppose the given vector is:
c(4, 5, 5, 10, 3, 7)
Then the result would be:
c(3, 4, 4, 7, 0, 5)
Note that since 3 does not have any smaller value, I want it to be replaced with 0.
Any help will be much appreciated. Thank you.
We may use
# Sort once up front instead of twice for every element of v1.
v1_sorted <- sort(v1)
# For each value, step one position left of its first occurrence in the
# sorted vector; position 0 selects nothing, so `[1]` turns that into NA.
vapply(v1, function(x) v1_sorted[match(x, v1_sorted) - 1][1], numeric(1))
[1] 3 4 4 7 NA 5
Or use a vectorized option
# Distinct values of v1, then the same values in ascending order.
v2 <- unique(v1)
v3 <- sort(v2)
# Candidate answers: every distinct value except the largest one.
v4 <- head(v3, -1)
# Position of the next smaller distinct value for each element of v1.
i1 <- match(v1, v3) - 1
# Position 0 means "nothing smaller exists"; mark it as missing.
i1 <- replace(i1, i1 == 0, NA)
v4[i1]
[1] 3 4 4 7 NA 5
data
v1 <- c(4, 5, 5, 10, 3, 7)
We can try the code below using outer + max.col
> m <- outer(v, u <- sort(unique(v)), `>`)
> replace(u[max.col(m, ties.method = "last")], rowSums(m) == 0, NA)
[1] 3 4 4 7 NA 5
Using findInterval:
# Ascending copy of x to search in.
sx <- sort(x)
# With left.open = TRUE the intervals are open on the left, so a value equal
# to an element of sx is placed below it; i then indexes the largest strictly
# smaller element (0 when there is none).
i <- findInterval(x, sx, left.open = TRUE)
# Index 0 would silently drop the element, so turn it into NA instead.
sx[replace(i, i == 0, NA_integer_)]
# [1] 3 4 4 7 NA 5

Best way for looping in a dataframe in R

I am trying to create a program to iterate through a R data table. I am trying to avoid for loops, because as far as I know they are slow.
library(data.table)

# creation of the data table
col <- c(0, 1, 0, 1, 0, 1)
Priority <- c(1, 2, 3, 4, 5, 6) # 1 highest, 6 lowest
IEC_category <- c("a", "b", "c", "d", "e", "f")
eventlog_overlap.dt <- data.table(col, Priority, IEC_category)

# comparison and assignment of the priority, done for all rows at once.
# `if ()` needs a single TRUE/FALSE, so it cannot be given whole columns;
# fifelse() makes the same decision row by row without an explicit loop.
eventlog_overlap.dt[, AlarmaPrior := fifelse(
  col == 1,
  fifelse(
    Priority <= shift(Priority, 1),
    IEC_category,           # write the actual category
    shift(IEC_category, 1)  # write the previous category
  ),
  NA_character_             # rows with col != 1 get no category
)]
Please provide the desired result. A dplyr attempt:
library(dplyr)
library(hablar)
# Rebuild the example data from the question as a plain data frame.
col <- c(0, 1, 0, 1, 0, 1)
Priority <- c(1, 2, 3, 4, 5, 6) # 1 highest, 6 lowest
IEC_category <- c("a", "b", "c", "d", "e", "f")
df <- data.frame(col, Priority, IEC_category)
# Rows with col == 1 take their own category when Priority is not above the
# previous row's Priority, otherwise the previous row's category; all other
# rows get NA.
mutate(
  df,
  AlarmaPrior = if_else_(
    col == 1,
    if_else_(Priority <= lag(Priority), IEC_category, lag(IEC_category)),
    NA
  )
)
gives you:
col Priority IEC_category AlarmaPrior
1 0 1 a <NA>
2 1 2 b a
3 0 3 c <NA>
4 1 4 d c
5 0 5 e <NA>
6 1 6 f e

Randomly sample contiguous rows from a data frame or matrix

I want to sample a number of contiguous rows from a data frame df.
df <- data.frame(C1 = c(1, 2, 4, 7, 9), C2 = c(2, 4, 6, 8, 10))
I am trying to get something similar to the following which allows me to sample 3 random rows and repeat the process 100 times.
test <- replicate(100, df[sample(1:nrow(df), 3, replace=T),], simplify=F)
By contiguous the result should be something like:
[[1]]
C1 C2
2 2 4
3 4 6
4 7 8
[[2]]
C1 C2
1 1 2
2 2 4
3 4 6
.
.
.
How could I achieve this?
We just need to sample the starting row index for a chunk.
#' Draw one block of consecutive rows at a random position.
#'
#' @param DF A data frame or matrix.
#' @param chunk.size Number of consecutive rows to return (a single
#'   non-negative number).
#' @return `chunk.size` contiguous rows of `DF`, kept in the same class as
#'   `DF` even when it has one column or `chunk.size` is 1; `NULL` when
#'   `chunk.size` exceeds `nrow(DF)`.
sample.block <- function (DF, chunk.size) {
  stopifnot(is.numeric(chunk.size), length(chunk.size) == 1, chunk.size >= 0)
  n_rows <- nrow(DF)
  if (chunk.size > n_rows) return(NULL)
  # Only the starting row is random; the rest of the block follows from it.
  start <- sample.int(n_rows - chunk.size + 1, 1)
  # seq_len() gives an empty index for chunk.size 0 (start:(start - 1) would
  # count backwards), and drop = FALSE stops the result collapsing to a vector.
  DF[start + seq_len(chunk.size) - 1L, , drop = FALSE]
}
replicate(100, sample.block(df, 3), simplify = FALSE)

How to add columns to data.frame based on vector length

I have a function runBootstrap whose output result is a vector of variable length (depending on # of values for cat, which itself is a product of test). Apologies that this isn't "minimal".
require(dplyr)
#' Map a pair of sampled values to a category code.
#'
#' @param combo Vector whose first two elements are the values to classify;
#'   only `combo[1]` and `combo[2]` are read.
#' @return The category: 4, 3, 2 or 1.
test <- function(combo) {
  first <- combo[1]
  second <- combo[2]
  # Scalar conditions use && / ||. The category is returned directly rather
  # than stored in a local named `cat`, which would shadow base::cat().
  if (first == 4) {
    4
  } else if ((first == 3 && second == 2) || (first == 2 && second == 2)) {
    3
  } else if ((first == 2 && second == 1) || (first == 1 && second == 2)) {
    2
  } else {
    1
  }
}
arg1.freqs <- c(0.5, 0.2, 0.1, 0.1)
arg2.freqs <- c(0.8, 0.2)
#' Simulate draws, classify each one with test(), and report category shares.
#'
#' @param arg1.freqs Sampling weights for the values 1:4 of the first variable.
#' @param arg2.freqs Sampling weights for the values 1:2 of the second variable.
#' @param n.sim Number of simulated draws; defaults to the 10000 that were
#'   previously hard-coded.
#' @return Numeric vector of length 2: the relative frequencies found in the
#'   first two rows of the per-category summary (NA where a row is absent).
runBootstrap <- function(arg1.freqs, arg2.freqs, n.sim = 10000) {
  # Draw x1 before y1 so the random-number stream is consumed in the same
  # order as before.
  x1 <- sample(1:4, n.sim, replace = TRUE, prob = arg1.freqs)
  y1 <- sample(1:2, n.sim, replace = TRUE, prob = arg2.freqs)
  sim.df <- data.frame(x1 = x1, y1 = y1)
  # Classify every draw from the plain vectors; indexing the data frame row
  # by row inside a for loop is far slower, and 1:nrow() misbehaves at 0 rows.
  sim.df$cat <- vapply(
    seq_len(n.sim),
    function(i) test(c(x1[i], y1[i])),
    numeric(1)
  )
  # Count the draws per category and convert the counts to shares.
  freq.df <- sim.df %>%
    group_by(cat) %>%
    summarise(n = n()) %>%
    mutate(freq = n / sum(n))
  freq.df <- as.data.frame(freq.df)
  # Column 3 is `freq`; return the value visibly instead of via an assignment.
  c(freq.df[1, 3], freq.df[2, 3])
}
In this current version there are only two values for cat so result is a vector of length 2; in a future version I will adjust code so that length(result) will equal # values of cat.
When using the function in a for loop, I would like to use the vector values to create new columns in an already existing data.frame df1. The code I've tried thus far is as follows:
df1$result <- NA
for (i in 1:nrow(df1)) {
df1$result[i] <- runBootstrap(arg1.freqs, arg2.freqs)
}
This clearly doesn't work unless the result vector is length = 1. But I don't know the length of the vector until the function runs (although once it runs it will be same length each iteration).
What I would like to achieve is the following:
Example 1: if length(result) == 2
df1.col x1 x2
1 1 1 1
2 2 2 2
3 3 3 3
4 4 4 4
5 5 5 5
6 6 6 6
Example 2: if length(result) == 3
df1.col x1 x2 x3
1 1 1 1 1
2 2 2 2 2
3 3 3 3 3
4 4 4 4 4
5 5 5 5 5
6 6 6 6 6
Thanks for any advice or direction.
edited for clarification
UPDATE - edited with solution
I got it to work as I wanted by creating a blank list, populating, then using rbind as follows:
#' Run runBootstrap() once per row of `df1` and append the results as columns.
#'
#' @param df1 Data frame that receives the new columns; one run per row.
#' @param arg1 Data frame whose columns 3:6 hold row i's first set of weights.
#' @param arg2 Data frame whose columns 3:4 hold row i's second set of weights.
#' @return `df1` with one extra column per element of the runBootstrap() result.
appendResults <- function(df1, arg1, arg2) {
  # seq_len() keeps a zero-row df1 from being indexed with c(1, 0).
  my.list <- lapply(seq_len(nrow(df1)), function(i) {
    arg1.freqs <- as.numeric(arg1[i, 3:6])
    arg2.freqs <- as.numeric(arg2[i, 3:4])
    runBootstrap(arg1.freqs, arg2.freqs)
  })
  # Stack the per-row result vectors into a matrix with one row per run.
  result.df <- do.call(rbind, my.list)
  cbind(df1, result.df)
}
Check this one, not sure what the result looks like, but this creates empty columns, equal to the length of results, with NAs:
# fake data frame
df1 <- data.frame(x = c(1, 2, 3), y = c("a", "b", "c"))
# say result has length 3
res <- c(5, 6, 7)
# build the names x1, ..., x<length(res)>; seq_along() produces no names for
# an empty `res`, whereas 1:length(res) would count down and give x1 and x0
new_cols <- paste0("x", seq_along(res))
# add those columns, filled with NA
df1[, new_cols] <- NA

R remove duplicate data from each column

I get CSV's with hundreds of different columns and would like to be able to output a new file with the duplicate values removed from each column. Everything that I have seen and tried uses a specific column. I just need each column to be unique values.
For Example My Data:
df <- data.frame(A = c(1, 2, 3, 4, 5, 6), B = c(1, 0, 1, 0, 0, 1), C = c("Mr.","Mr.","Mrs.","Miss","Mr.","Mrs."))
df
A B C
1 1 1 Mr.
2 2 0 Mr.
3 3 1 Mrs.
4 4 0 Miss
5 5 0 Mr.
6 6 1 Mrs.
I would like:
A B C
1 1 1 Mr.
2 2 0 Mrs.
3 3 Miss
4 4
5 5
6 6
Then I can:
write.csv(df, file = file.path(df, "df_No_Dupes.csv"), na="")
So I can use it as a reference for my next task.
read.csv and write.csv work best with tabular data. Your desired output is not a good example of this (every row does not have the same number of columns).
You can easily get all the unique value for your columns with
# lapply() always returns a list with one element per column. sapply() would
# collapse the result into a matrix whenever every column happens to have the
# same number of unique values.
vals <- lapply(df, unique)
Then you'd be better off saving this object with save() and load() to preserve the list as an R object.
Code snippet that works with a flexible number of columns, removes the duplicate values within each column, and preserves the column names:
# Replace every column of df by its unique values, one column per pass.
# cbind.fill() comes from rowr; it is called here with fill = NA, presumably
# to pad the shorter input up to the longer one -- confirm against rowr docs.
require(rowr)
df <- data.frame(A = c(1, 2, 3, 4, 5, 6), B = c(1, 0, 1, 0, 0, 1), C = c("Mr.","Mr.","Mrs.","Miss","Mr.","Mrs."))
#get the number of columns in the dataframe
n <- ncol(df)
#loop through the columns
# Each pass works on whatever column is currently first: its unique values
# are appended at position n + 1 and the first column is then removed, so the
# width stays n and after n passes every original column has been replaced.
for(i in 1:ncol(df)){
#append the unique values of the current first column, fill blanks with NAs
df <- cbind.fill(df,unique(df[,1]), fill = NA)
#rename the new column after the column it replaces
colnames(df)[n+1] <- colnames(df)[1]
#delete the old column
df[,1] <- NULL
}
df <- data.frame(A = c(1, 2, 3, 4, 5, 6), B = c(1, 0, 1, 0, 0, 1), C = c("Mr.","Mr.","Mrs.","Miss","Mr.","Mrs."))
for(i in 1:ncol(df)){
assign(paste("df_",i,sep=""), unique(df[,i]))
}
require(rowr)
df <- cbind.fill(df_1,df_2,df_3, fill = NA)
V1 V1 V1
1 1 1 Mr.
2 2 0 Mrs.
3 3 NA Miss
4 4 NA <NA>
5 5 NA <NA>
6 6 NA <NA>
or you could do
require(rowr)
df <- cbind.fill(df_1,df_2,df_3, fill = "")
df
V1 V1 V1
1 1 1 Mr.
2 2 0 Mrs.
3 3 Miss
4 4
5 5
6 6
If you want to avoid typing the name of each intermediate dataframe you can just use ls(pattern="df_") and get the objects named in that vector or use another loop.
If you want to change the column names back to their original values you can use:
colnames(output_df) <- colnames(input_df)
Then you can save the results however you like, e.g. with
saveRDS()
save()
or write it to a file.
Putting it all together:
library(rowr)
df <- data.frame(A = c(1, 2, 3, 4, 5, 6), B = c(1, 0, 1, 0, 0, 1), C = c("Mr.","Mr.","Mrs.","Miss","Mr.","Mrs."))
# Unique values of every column, kept in the original column order.
# Collecting them in a list avoids assign()/get() and the ls(pattern = "df_")
# lookup, which sorts names alphabetically (df_10 before df_2) and also
# matches unrelated objects such as df_output.
unique_cols <- lapply(df, unique)
# Bind all vectors in one call, passing fill = "" as in the earlier snippets;
# this works for any number of columns and needs no placeholder column to
# strip afterwards.
df_output <- do.call(cbind.fill, c(unname(unique_cols), list(fill = "")))
colnames(df_output) <- colnames(df)
write.csv(df_output, "df_out.csv", row.names = FALSE)
# Read the file back to check what was written.
verify_it_worked <- read.csv("df_out.csv")
verify_it_worked
A B C
1 1 1 Mr.
2 2 0 Mrs.
3 3 Miss
4 4
5 5
6 6

Resources