Removing columns that are all 0 in R

I am trying to remove all columns in my dataframe that contain only the value 0. My code, which I found on this site, is the following:
dataset = dataset[ ,colSums(dataset != 0) > 0]
However, I keep getting this error:
Error in [.data.frame(dataset, , colSums(dataset != 0) > 0) :
undefined columns selected

That happens because you have an NA in at least one column. Fix it like this:
dataset = dataset[ , colSums(dataset != 0, na.rm = TRUE) > 0]
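To see why the error appears, look at what colSums() returns when a column contains an NA. A quick check, using the example dataset defined under Data further down:
dataset <- data.frame(a = 1:3, b = 0, c = -1:1, d = c(-1, NA, 1), e = 0, f = "a")
colSums(dataset != 0)
#  a  b  c  d  e  f
#  3  0  2 NA  0  3
The NA in column d makes colSums(dataset != 0) > 0 evaluate to NA for that column, and an NA inside [ , ] column selection is what raises "undefined columns selected"; na.rm = TRUE drops the NA before summing.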

Here's some code that checks which columns are numeric (or integer) and drops those that contain nothing but zeros (and possibly NAs):
# example data
df <- data.frame(
  one   = rep(0, 100),
  two   = sample(letters, 100, TRUE),
  three = rep(0L, 100),
  four  = 1:100,
  stringsAsFactors = FALSE
)
# create a function that checks numeric columns for all zeros
only_zeros <- function(x) {
  if (class(x) %in% c("integer", "numeric")) {
    all(x == 0, na.rm = TRUE)
  } else {
    FALSE
  }
}
# apply that function to your data
df_without_zero_cols <- df[ , !sapply(df, only_zeros)]
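The same check can also be written compactly with base R's Filter(), which keeps the columns for which the predicate returns TRUE. A sketch of the same idea, using is.numeric() (which covers both integer and double) in place of the class() test:
# keep columns that are not "numeric and entirely zero (ignoring NAs)"
df_without_zero_cols <- Filter(function(x) !(is.numeric(x) && all(x == 0, na.rm = TRUE)), df)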

There is an alternative using all() (shown here on the example dataset under Data below):
dataset[, !sapply(dataset, function(x) all(x == 0))]
a c d f
1 1 -1 -1 a
2 2 0 NA a
3 3 1 1 a
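One caveat with all(x == 0): if a column contained nothing but zeros and NAs (unlike column d above, which has non-zero values), all() would return NA and the negated index would again give "undefined columns selected". Adding na.rm = TRUE guards against that; a sketch:
dataset[, !sapply(dataset, function(x) all(x == 0, na.rm = TRUE))]
# note: a column that is all NA would also be dropped, because all() on an empty set is TRUE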
For a large dataset, the time- and memory-consuming copy can be avoided by removing the columns by reference with data.table:
library(data.table)
cols <- which(sapply(dataset, function(x) all(x == 0)))
setDT(dataset)[, (cols) := NULL]
dataset
a c d f
1: 1 -1 -1 a
2: 2 0 NA a
3: 3 1 1 a
Data
dataset <- data.frame(a = 1:3, b = 0, c = -1:1, d = c(-1, NA, 1), e = 0, f ="a")
dataset
a b c d e f
1 1 0 -1 -1 0 a
2 2 0 0 NA 0 a
3 3 0 1 1 0 a

Related

Column type set by first element being evaluated in r/data.table

I have a function that returns NA under certain conditions and an integer otherwise (an integer vector in fact, but it doesn't matter now).
When I apply this function to groups of elements in a data.table and the first group returns NA, the whole column is set to logical, which then corrupts the values for the following groups. How can I prevent this behaviour?
Example:
library(data.table)
myfun <- function(x) {
  if (x == 0) {
    return(NA)
  } else {
    return(x * 2)
  }
}
DT <- data.table(x= c(0, 1, 2, 3), y= LETTERS[1:4])
DT
x y
1: 0 A
2: 1 B
3: 2 C
4: 3 D
The following should assign to column x2 the values c(NA, 2, 4, 6). Instead, I get c(NA, TRUE, TRUE, TRUE) with warnings:
DT[, x2 := myfun(x), by= y]
Warning messages:
1: In `[.data.table`(DT, , `:=`(x2, myfun(x)), by = y) :
Group 2 column 'x2': 2.000000 (type 'double') at RHS position 1 taken as TRUE when assigning to type 'logical'
2: In `[.data.table`(DT, , `:=`(x2, myfun(x)), by = y) :
Group 3 column 'x2': 4.000000 (type 'double') at RHS position 1 taken as TRUE when assigning to type 'logical'
3: In `[.data.table`(DT, , `:=`(x2, myfun(x)), by = y) :
Group 4 column 'x2': 6.000000 (type 'double') at RHS position 1 taken as TRUE when assigning to type 'logical'
DT
x y x2
1: 0 A NA
2: 1 B TRUE
3: 2 C TRUE
4: 3 D TRUE
Changing the order of the rows gives the expected result:
DT <- data.table(x= c(1, 2, 3, 0), y= LETTERS[1:4])
DT[, x2 := myfun(x), by= y]
DT
x y x2
1: 1 A 2
2: 2 B 4
3: 3 C 6
4: 0 D NA
I can preset the type of column x2:
DT <- data.table(x= c(0, 1, 2, 3), y= LETTERS[1:4])
DT[, x2 := integer()]
DT[, x2 := myfun(x), by= y]
DT
x y x2
1: 0 A NA
2: 1 B 2
3: 2 C 4
4: 3 D 6
but I wonder if there are better options that don't require me to set the column type beforehand.
This is with data.table v1.14.0, R 3.6.3
Do not let your function return NA; return NA_integer_ or NA_real_ instead.
Problem solved ;-)
myfun <- function(x) {
  if (x == 0) {
    return(NA_integer_)  # <-- !!
  } else {
    return(x * 2)
  }
}
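Since x is stored as double in this example, NA_real_ matches the type of x*2 exactly, so no further coercion is needed. A quick check of the idea (output as I would expect it):
library(data.table)
myfun <- function(x) if (x == 0) NA_real_ else x * 2
DT <- data.table(x = c(0, 1, 2, 3), y = LETTERS[1:4])
DT[, x2 := myfun(x), by = y]
DT
#    x y x2
# 1: 0 A NA
# 2: 1 B  2
# 3: 2 C  4
# 4: 3 D  6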

Dispatch values in list column to separate columns

I have a data.table with a list column "c":
df <- data.table(a = 1:3, c = list(1L, 1:2, 1:3))
df
a c
1: 1 1
2: 2 1,2
3: 3 1,2,3
I want to create separate columns for the values in "c".
I create a set of new columns F_1, F_2, F_3:
mmax <- max(df$a)
flux <- paste("F", 1:mmax, sep = "_")
df[, (flux) := 0]
df
a c F_1 F_2 F_3
1: 1 1 0 0 0
2: 2 1,2 0 0 0
3: 3 1,2,3 0 0 0
I want to dispatch values in "c" to columns F_1, F_2, F_3 like this:
df
a c F_1 F_2 F_3
1: 1 1 1 0 0
2: 2 1,2 1 2 0
3: 3 1,2,3 1 2 3
What I have tried:
comp_vect <- function(vec, mmax){
  vec <- vec %>% unlist()
  n <- length(vec)
  answr <- c(vec, rep(0, l = mmax - n))
}
df[ , ..flux := mapply(comp_vect, c, mmax)]
The expected data.table is:
> df
a c F_1 F_2 F_3
1: 1 1 1 0 0
2: 2 1,2 1 2 0
3: 3 1,2,3 1 2 3
I followed a radically different approach: I rbinded the list column and then dcasted it, obtaining the desired result. The last step is to set the names.
library(data.table)
df <- data.table(a = 1:3, d = list(1L, c(1L, 2L), c(1L, 2L, 3L)))
df2 <- df[, rbind(d), by = a][, dcast(.SD, a ~ V1, fill = 0)]
setnames(df2, 2:4, flux)[]
a F_1 F_2 F_3
1: 1 1 0 0
2: 2 1 2 0
3: 3 1 2 3
where flux is the vector of names that you defined in your question.
Please note that I avoided using the column name c, as it may be confused with the function c().
Solution:
# loop over the maximum number of values across all elements of the list
for (idx in seq(max(sapply(df$c, length)))) {
  set(x = df,
      i = NULL,
      j = paste0("F_", idx),           # column name
      value = sapply(df$c, function(x) {
        if (is.na(x[idx])) {
          return(0)                    # 0 instead of NA
        } else {
          return(x[idx])
        }
      }))
}
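Running the loop on the example data from the question should give the requested layout (note that set() creates the F_ columns itself, so the earlier df[, (flux) := 0] step is not strictly required):
df
#    a     c F_1 F_2 F_3
# 1: 1     1   1   0   0
# 2: 2   1,2   1   2   0
# 3: 3 1,2,3   1   2   3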
Explanation:
We can extract the values from the list like this:
sapply(df$c, function(ll) return(ll[1])) # first value
[1] 1 1 1
sapply(df$c, function(ll) return(ll[2])) # second value
[1] NA 2 2
sapply(df$c, function(ll) return(ll[3])) # third value
[1] NA NA 3
We see that if there is no value, we get an NA.
We need an iterator to extract all the values at position idx. For that, we find the number of values in each element of df$c (the list) and keep the maximum.
max(sapply(df$c, length))
[1] 3
If we want zeros instead of NAs, we need a small function inside the sapply to convert them:
vec <- c(NA, 5, 1, NA)
> sapply(vec, function(x) if(is.na(x)) return(0) else return(x))
[1] 0 5 1 0

Insert a blank row before zero

x<-c(0,1,1,0,1,1,1,0,1,1)
aaa<-data.frame(x)
How can I insert a blank row before each zero? When the first row is zero, do not add a blank row. Thank you.
Result:
0
1
1
.
0
1
1
1
.
0
1
1
Below we use a dot, but you can replace "." with NA or "" or something else, depending on what you want.
1) We can use Reduce and append:
Append <- function(x, y) append(x, ".", y - 1)
data.frame(x = Reduce(Append, setdiff(rev(which(aaa$x == 0)), 1), init = aaa$x))
2) gsub. Another possibility is to convert to a character string, use gsub and convert back:
data.frame(x = strsplit(gsub("(.)0", "\\1.0", paste(aaa$x, collapse = "")), "")[[1]])
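To see what the gsub step produces before the split (the regex inserts a dot before every 0 that follows another character, so the leading 0 is left alone):
paste(aaa$x, collapse = "")
# [1] "0110111011"
gsub("(.)0", "\\1.0", paste(aaa$x, collapse = ""))
# [1] "011.0111.011"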
3) We can create a two-row matrix in which the first row holds a dot wherever x is 0 and NA otherwise (with the first position forced to NA). Then unravel it to a vector column-wise and use na.omit to remove the NA values.
data.frame(x = na.omit(c(rbind(replace(ifelse(aaa$x == 0, ".", NA), 1, NA), aaa$x))))
4) We can lapply over aaa$x[-1], outputting c(".", 0) or 1. Unlist that and insert aaa$x[1] back in. No packages are used.
repl <- function(x) if (!x) c(".", 0) else 1
data.frame(x = c(aaa$x[1], unlist(lapply(aaa$x[-1], repl))))
5) Create a list of all but the first element and replace the 0's in that list with c(".", 0). Unlist that and insert the first element back in. No packages are used.
L <- as.list(aaa$x[-1])
L[x[-1] == 0] <- list(c(".", 0))
data.frame(x = c(aaa$x[1], unlist(L)))
6) Assuming aaa has two columns where the second column is character (NOT factor). Append a row of dots to aaa and then create an index vector using unlist and Map to access the appropriate row of the extended aaa.
aaa <- data.frame(x = c(0,1,1,0,1,1,1,0,1,1), y = letters[1:10],
                  stringsAsFactors = FALSE)
nr <- nrow(aaa); nc <- ncol(aaa)
fun <- function(ix, x) if (!is.na(x) & x == 0 & ix > 1) c(nr + 1, ix) else ix
rbind(aaa, rep(".", nc))[unlist(Map(fun, 1:nr, aaa$x)), ]
If we did want y to be a factor, note that we can't just add a dot to a factor unless it is one of that factor's levels, so there is the question of which levels the factor may have. To get around that, let us add an NA rather than a dot to the factor. Then we get the following, which is the same except that aaa has been redefined so that y is a factor, nc is no longer needed since we are assuming 2 columns, and rep(...) in the last line is replaced with c(".", NA).
aaa <- data.frame(x = c(0,1,1,0,1,1,1,0,1,1), y = letters[1:10])
nr <- nrow(aaa)
fun <- function(ix, x) if (!is.na(x) & x == 0 & ix > 1) c(nr + 1, ix) else ix
rbind(aaa, c(".", NA))[unlist(Map(fun, 1:nr, aaa$x)), ]
One dplyr and tidyr possibility may be:
library(dplyr)
library(tidyr)
aaa %>%
  uncount(ifelse(row_number() > 1 & x == 0, 2, 1)) %>%
  mutate(x = ifelse(x == 0 & lag(x == 1, default = first(x)), NA_integer_, x))
x
1 0
2 1
3 1
4 NA
5 0
6 1
7 1
8 1
9 NA
10 0
11 1
12 1
This does not add a blank row, since you have a numeric vector; instead it adds a row with NA. If you need a blank row, you can convert the column to character and then replace NA with "".
ind = with(aaa, ifelse(x == 0 & seq_along(x) > 1, 2, 1))
idx = rep(1:NROW(aaa), ind)
d = aaa[idx, , drop = FALSE]
# mark the first copy of each duplicated zero row, so the NA lands before the 0
transform(d, x = replace(x, duplicated(idx, fromLast = TRUE), NA))
Here is an option with rleid
library(data.table)
setDT(aaa)[, .(x = if(x[.N] == 1) c(x, NA) else x), rleid(x)][-.N, .(x)]
# x
# 1: 0
# 2: 1
# 3: 1
# 4: NA
# 5: 0
# 6: 1
# 7: 1
# 8: 1
# 9: NA
#10: 0
#11: 1
#12: 1
data.frame(x = unname(unlist(by(aaa$x,cumsum(aaa==0),c,'.'))))
x
1 0
2 1
3 1
4 .
5 0
6 1
7 1
8 1
9 .
10 0
11 1
12 1
13 .
My solution is
aaa <- data.frame(x = c(0,1,1,0,1,1,1,0,1,1), y = letters[1:10])
aaa$ind <- with(aaa, ifelse(x == 0 & seq_along(x) > 1, 2, 1))
aaa <- aaa[rep(1:nrow(aaa), aaa$ind), , ]
aaa[(aaa$ind == 2 & !grepl(".1", rownames(aaa))), ] <- NA
aaa$ind <- NULL
aaa
x y
1 0 a
2 1 b
3 1 c
4 NA <NA>
4.1 0 d
5 1 e
6 1 f
7 1 g
8 NA <NA>
8.1 0 h
9 1 i
10 1 j

Select columns based on columns sum

Any suggestions for selecting, for each row, the columns where the value is 1 and the column sum is also 1? In other words, I want to pick out the values that are unique to each individual and not shared with the others.
indv. X Y Z W T J
A 1 0 1 0 0 1
B 0 1 1 0 0 0
C 0 0 1 1 0 0
D 0 0 1 0 1 0
A: X, J
B: Y
C: W
D: T
Here you go! A solution in base R.
First we simulate your data, a data.frame with named rows and columns.
You can use sapply() to loop over the column indices.
A for-loop over the column indices will achieve the same thing.
Finally, save the results in a data.frame however you want.
# Simulate your example data
df <- data.frame(matrix(c(1, 0, 1, 0, 0, 1,
                          0, 1, 1, 0, 0, 0,
                          0, 0, 1, 1, 0, 0,
                          0, 0, 1, 0, 1, 0), nrow = 4, byrow = TRUE))
# Name rows and columns accordingly
names(df) <- c("X", "Y", "Z", "W", "T", "J")
rownames(df) <- c("A", "B","C", "D")
> df
X Y Z W T J
A 1 0 1 0 0 1
B 0 1 1 0 0 0
C 0 0 1 1 0 0
D 0 0 1 0 1 0
Then we select the columns whose sum == 1, i.e. the columns with unique values.
For every one of these columns, we find the row of this value.
# Select columns with unique values (if sum of column == 1)
unique.cols <- which(colSums(df) == 1)
# For every one of these columns, select the row where row-value==1
unique.rows <- sapply(unique.cols, function(x) which(df[, x] == 1))
> unique.cols
X Y W T J
1 2 4 5 6
> unique.rows
X Y W T J
1 2 3 4 1
The rows are not named correctly yet (they still carry the element names of unique.cols), so we use the rownames of df to get the row names.
# Data.frame of unique values
# Rows and columns in separate columns
df.unique <- data.frame(Cols = unique.cols,
                        Rows = unique.rows,
                        Colnames = names(unique.cols),
                        Rownames = rownames(df)[unique.rows],
                        row.names = NULL)
The result:
df.unique
Cols Rows Colnames Rownames
1 1 1 X A
2 2 2 Y B
3 4 3 W C
4 5 4 T D
5 6 1 J A
Edit:
This is how you could summarise the values per row using dplyr.
library(dplyr)
df.unique %>%
  group_by(Rownames) %>%
  summarise(paste(Colnames, collapse = ", "))
# A tibble: 4 x 2
Rownames `paste(Colnames, collapse = ", ")`
<fct> <chr>
1 A X, J
2 B Y
3 C W
4 D T
One idea is to use a row-wise apply to find the columns with 1, after filtering out the columns whose sum is not 1, i.e.
apply(df[colSums(df) == 1], 1, function(i) names(df[colSums(df) == 1])[i == 1])
$A
[1] "X" "J"
$B
[1] "Y"
$C
[1] "W"
$D
[1] "T"
You can play around with the output to get it to desired state, i.e.
apply(df[colSums(df) == 1], 1, function(i) toString(names(df[colSums(df) == 1])[i == 1]))
# A B C D
#"X, J" "Y" "W" "T"
Or
data.frame(cols = apply(df[colSums(df) == 1], 1, function(i) toString(names(df[colSums(df) == 1])[i == 1])))
# cols
#A X, J
#B Y
#C W
#D T
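Note that colSums(df) == 1 is evaluated repeatedly inside the apply() calls above; a small variation of the same idea (using the simulated df from the first answer) computes it once:
keep <- colSums(df) == 1
apply(df[keep], 1, function(i) toString(names(df)[keep][i == 1]))
# same output as above: "X, J" "Y" "W" "T"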
Here is an option with the tidyverse. We gather the dataset into 'long' format, group by 'key', filter the rows where 'val' is 1 and the sum of 'val' is 1, then group by 'indv.' and summarise 'key' by pasting the elements together.
library(dplyr)
library(tidyr)
gather(df1, key, val, -indv.) %>%
  group_by(key) %>%
  filter(sum(val) == 1, val == 1) %>%
  group_by(indv.) %>%
  summarise(key = toString(key))
# A tibble: 4 x 2
# indv. key
# <chr> <chr>
#1 A X, J
#2 B Y
#3 C W
#4 D T
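Since gather() is superseded in current tidyr, the same pipeline can be sketched with pivot_longer() (still assuming, as above, that df1 carries indv. as a regular column):
library(dplyr)
library(tidyr)
pivot_longer(df1, -indv., names_to = "key", values_to = "val") %>%
  group_by(key) %>%
  filter(sum(val) == 1, val == 1) %>%
  group_by(indv.) %>%
  summarise(key = toString(key))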

Take certain value in a data frame

I have a data.frame and would like to take a value from one of two columns, depending on the value in a third column.
I tried the apply function.
n <- c(2, 3, 0 ,1)
s <- c(0, 1, 1, 2)
b <- c("THIS", "FALSE", "NOT", "THIS")
df <- data.frame(n, s, b)
df <- sapply(df$Vals, FUN=function(x){ if(b[x]=="THIS") ? n[x] : s[x] } )
My logic is:
if(b at position x is equal to "This") {
add n[x] to the column df$Vals
} else {
add s[x] to the column df$Vals
}
Here x is a single row index.
Any recommendations on what I am doing wrong?
I appreciate your reply!
Like this:
df$Vals = with(df, ifelse(b=="THIS", n, s))
Or giving direct the resulting data.frame:
transform(df, Vals=with(df, ifelse(b=="THIS", n, s)))
# n s b Vals
#1 2 0 THIS 2
#2 3 1 FALSE 1
#3 0 1 NOT 1
#4 1 2 THIS 1
With your additional conditions:
# b == FALSE matches the string "FALSE" after coercion
func <- Vectorize(function(b, s, n) { if (b == 'THIS') return(n); if (b == FALSE) return(n + s); s })
df$Vals <- with(df, func(b, s, n))
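The same three conditions can also be written as a vectorised nested ifelse(), which avoids Vectorize(); a sketch equivalent to func() above:
df$Vals <- with(df, ifelse(b == "THIS", n,
                    ifelse(b == "FALSE", n + s, s)))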
Or you could use the row/column indexing
df$Vals <- df[1:2][cbind(1:nrow(df),(df$b!='THIS')+1)]
df
# n s b Vals
#1 2 0 THIS 2
#2 3 1 FALSE 1
#3 0 1 NOT 1
#4 1 2 THIS 1
