Related
Given the data frame below:
df <- data.frame(v1 = c(3, 0, 5, 1, 0),
v2 = c(2, 0, 0, 0, 0),
v3 = c(0, 0, 3, 0, 0),
v4 = c(0, 0, 0, 2, 0),
v5 = c(0, 0, 0, 0, 0),
v6 = c(0, 0, 0, 0, 7))
df
v1 v2 v3 v4 v5 v6
1 3 2 0 0 0 0
2 0 0 0 0 0 0
3 5 0 3 0 0 0
4 1 0 0 2 0 0
5 0 0 0 0 0 7
The desired result is the following data frame:
v1 v2 v3 v4 v5 v6
1 3 2 NA NA NA NA
2 NA NA NA NA NA NA
3 5 0 3 NA NA NA
4 1 0 0 2 NA NA
5 0 0 0 0 0 7
I'd like to replace all consecutive zeros in each row with NA under the condition that, looking at each row from left to right, there exists no non-zero number further down the row.
I've written a for loop to achieve this result, but this is really slow for a larger data frame:
# Cell-by-cell approach: visit every cell of df and set it to NA when the cell
# is zero and the values from that column to the end of its row sum to zero.
for(i in 1:nrow(df)) {
for (j in 1:ncol(df)){
# apply() recomputes the sums of columns j..ncol(df) for every row on each
# pass, although only the sum for row i is used.
if ((df[i,j] == 0) & (apply(df[j:ncol(df)], 1, sum)[i] == 0)){
df[i,j] <- NA
}
}
}
I'd like a more efficient solution.
There should be a more efficient way to do this but here is one attempt using apply :
# For each row, locate the last non-zero entry (position 0 when the row has
# none) and overwrite everything after it with NA.
t(apply(df, 1, function(values) {
  nonzero_pos <- which(values != 0)
  last_nonzero <- if (length(nonzero_pos) > 0) max(nonzero_pos) else 0
  n_values <- length(values)
  if (last_nonzero < n_values) {
    values[seq.int(last_nonzero + 1, n_values)] <- NA
  }
  values
}))
Another solution could be :
# Get the last non-zero column index for each row using `max.col` and pair it
# with the row index. An all-zero row is a tie across every column, so
# `ties.method = "last"` reports the last column for it.
n_col <- ncol(df)
has_nonzero <- rowSums(df != 0) > 0
inds <- cbind(seq_len(nrow(df)), max.col(df != 0, ties.method = "last"))
# All-zero rows have no non-zero value at all: use index 0 so that the whole
# row is replaced, whatever the number of columns is.
inds[!has_nonzero, 2] <- 0
# Remove rows where the last non-zero value is in the last column; there is
# nothing to replace for them. `drop = FALSE` keeps a matrix even when a
# single row remains.
inds <- inds[inds[, 2] < n_col, , drop = FALSE]
# Create a sequence from the column after the last non-zero value to the last
# column for each row and replace those cells with NA.
if (nrow(inds) > 0) {
  df[do.call(rbind, Map(function(x, y) cbind(x, y:n_col),
                        inds[, 1], inds[, 2] + 1))] <- NA
}
This solution uses apply to loop over the rows, but only vectorized functions within each row, so it effectively removes one non-vectorized loop.
# Replace the trailing run of zeros in every row of `df` with NA.
#
# df: a data frame (or matrix) of numeric values.
# Returns a matrix with one row per row of `df`; entries after the last
# non-zero value of each row are NA, and an all-zero row becomes all NA.
trailing_zeros_to_NA <- function(df) {
  n_cols <- ncol(df)
  blank_tail <- function(values) {
    # Work on the reversed row so the trailing zeros become leading ones.
    flipped <- rev(values)
    nonzero_pos <- which(flipped != 0)
    n_leading <- if (length(nonzero_pos) > 0) nonzero_pos[1] - 1 else n_cols
    flipped[seq_len(n_leading)] <- NA
    rev(flipped)
  }
  t(apply(df, 1, blank_tail))
}
Speed-up is about 20x.
# The question's nested-loop approach wrapped in a function, used as the
# reference implementation in the microbenchmark call that follows.
#
# df: a data frame of numeric values.
# Returns `df` with a cell set to NA when the cell is zero and the values from
# that column to the end of its row sum to zero.
original=function(df){
for(i in 1:nrow(df)) {
for (j in 1:ncol(df)){
# apply() recomputes the sums of columns j..ncol(df) for every row on each
# pass, although only the sum for row i is used.
if ((df[i,j] == 0) & (apply(df[j:ncol(df)], 1, sum)[i] == 0)){
df[i,j] <- NA
}
}
}
df
}
microbenchmark(original(df),trailing_zeros_to_NA(df))
# Unit: microseconds
# expr min lq mean median uq max neval
# original(df) 4672.029 5003.6895 5857.7659 5422.5345 6318.148 10676.192 100
# trailing_zeros_to_NA(df) 218.482 241.5945 312.7013 267.5695 315.370 2545.555 100
Same here, there must be a more elegant way, but you could use data.table and run a function for each individual row.
library(data.table)
dt <- data.table(ID = seq(1, 5, 1), # unique ID so every row is its own group
                 v1 = c(3, 0, 5, 1, 0),
                 v2 = c(2, 0, 0, 0, 0),
                 v3 = c(0, 0, 3, 0, 0),
                 v4 = c(0, 0, 0, 2, 0),
                 v5 = c(0, 0, 0, 0, 0),
                 v6 = c(0, 0, 0, 0, 7))
# Working left to right, blank a zero when every later column of the same row
# is zero as well. Each step only reads columns to its right, which have not
# been modified yet. NA_real_ keeps the replacement the same type as the column.
dt[, v1 := ifelse(v1 == 0 & sum(v2, v3, v4, v5, v6) == 0, NA_real_, v1),
   by = ID]
dt[, v2 := ifelse(v2 == 0 & sum(v3, v4, v5, v6) == 0, NA_real_, v2), by = ID]
dt[, v3 := ifelse(v3 == 0 & sum(v4, v5, v6) == 0, NA_real_, v3), by = ID]
dt[, v4 := ifelse(v4 == 0 & sum(v5, v6) == 0, NA_real_, v4), by = ID]
dt[, v5 := ifelse(v5 == 0 & sum(v6) == 0, NA_real_, v5), by = ID]
# The last column has nothing to its right, so it is blanked whenever it is
# zero itself. Testing v5 here would wipe a non-zero v6 that follows a zero v5.
dt[, v6 := ifelse(v6 == 0, NA_real_, v6), by = ID]
I am trying to remove all columns in my data frame that contain only the value 0. I am using the following code, which I found on this website.
dataset = dataset[ ,colSums(dataset != 0) > 0]
However, I keep getting an error:
Error in [.data.frame(dataset, , colSums(dataset != 0) > 0) :
undefined columns selected
It's because you have an NA in at least one column. Fix like this:
dataset = dataset[ , colSums(dataset != 0, na.rm = TRUE) > 0]
Here's some code that will check which columns are numeric (or integer) and drop those that contain all zeros and NAs:
# example data
# Four columns of 100 rows: two all-zero columns (double and integer), one
# character column of random letters, and one integer sequence.
df <- data.frame(
  one = rep(0, 100),
  two = sample(letters, 100, TRUE),
  three = rep(0L, 100),
  four = 1:100,
  stringsAsFactors = FALSE
)
# create function that checks numeric columns for all zeros
# Report whether a column consists solely of zeros, ignoring missing values.
#
# x: a vector, typically one column of a data frame.
# Returns TRUE when `x` is numeric (integer or double) and every non-missing
# value equals 0; FALSE otherwise, including for any non-numeric `x`.
only_zeros <- function(x) {
  # is.numeric() always yields a single TRUE/FALSE, whereas
  # `class(x) %in% ...` hands `if` a condition of length > 1 when `x` carries
  # several classes.
  if (!is.numeric(x)) {
    return(FALSE)
  }
  all(x == 0, na.rm = TRUE)
}
# apply that function to your data
# vapply() guarantees a logical mask, and drop = FALSE keeps the result a data
# frame even when only one column survives the filter.
df_without_zero_cols <- df[, !vapply(df, only_zeros, logical(1)), drop = FALSE]
There is an alternative using all():
dataset[, !sapply(dataset, function(x) all(x == 0))]
a c d f
1 1 -1 -1 a
2 2 0 NA a
3 3 1 1 a
In case of a large dataset, time and memory consuming copying can be avoided through removing the columns by reference
library(data.table)
cols <- which(sapply(dataset, function(x) all(x == 0)))
setDT(dataset)[, (cols) := NULL]
dataset
a c d f
1: 1 -1 -1 a
2: 2 0 NA a
3: 3 1 1 a
Data
dataset <- data.frame(a = 1:3, b = 0, c = -1:1, d = c(-1, NA, 1), e = 0, f ="a")
dataset
a b c d e f
1 1 0 -1 -1 0 a
2 2 0 0 NA 0 a
3 3 0 1 1 0 a
I have a data frame x.
x <- data.frame(a = c(10, 20, 30, 0), b = c(1, 2, 3, 0), c = c(1, 2, 3, 0), d = c(8, 16, 24, 0))
x
denominator_var <- "a"
numerator_vars <- c("b", "c", "d")
Using dplyr, I'm trying to add new columns (b_share, c_share, and d_share) such that each of them is equal to the corresponding column (b, c, and d) divided by a.
However, it is important to me to use NOT the original variable names but dynamic variable names.
My code below is not working. What's wrong?
x %>% mutate_at(vars(one_of(numerator_vars)),
funs(share = ifelse(!!(denominator_var) > 0, round(./!!(denominator_var) * 100, 2), 0)))
Thank you very much!
You can try using as.name before applying !!:
# as.name() turns the column name held in denominator_var into a symbol that
# `!!` can unquote. The share is the numerator as a percentage of the
# denominator, rounded to 2 digits, and 0 where the denominator is not positive.
x %>% mutate_at(vars(one_of(numerator_vars)), funs(share =
  ifelse((!!as.name(denominator_var)) > 0,
         round(. / (!!as.name(denominator_var)) * 100, 2),
         0)))
#    a b c  d b_share c_share d_share
# 1 10 1 1  8      10      10      80
# 2 20 2 2 16      10      10      80
# 3 30 3 3 24      10      10      80
# 4  0 0 0  0       0       0       0
You can get the result you want by quoting your denominator beforehand:
denominator_var <- quo(a)
x %>% mutate_at(numerator_vars,
funs(share = ifelse(!!(denominator_var) > 0,
round(./!!(denominator_var) * 100, 2),
0)))
Also note that you don't need to use vars for your vector numerator_vars.
Any suggestions on how to select, for each row, the columns where the value is 1 and the column sum is also 1? In other words, I want to select only the unique values, i.e. those not shared with the other individuals.
indv. X Y Z W T J
A 1 0 1 0 0 1
B 0 1 1 0 0 0
C 0 0 1 1 0 0
D 0 0 1 0 1 0
A: X, J
B: Y
C: W
D: T
Here you go! A solution in base r.
First we simulate your data, a data.frame with named rows and columns.
You can use sapply() to loop over the column indices.
A for-loop over the column indices will achieve the same thing.
Finally, save the results in a data.frame however you want.
# Simulate your example data
df <- data.frame(matrix(c(1, 0, 1, 0, 0, 1,
                          0, 1, 1, 0, 0, 0,
                          0, 0, 1, 1, 0, 0,
                          0, 0, 1, 0, 1, 0), nrow = 4, byrow = TRUE))
# Name the columns after the variables and the rows after the individuals
names(df) <- c("X", "Y", "Z", "W", "T", "J")
rownames(df) <- c("A", "B", "C", "D")
> df
X Y Z W T J
A 1 0 1 0 0 1
B 0 1 1 0 0 0
C 0 0 1 1 0 0
D 0 0 1 0 1 0
Then we select the columns whose sum == 1, i.e. the columns with unique values.
For every one of these columns, we find the row of this value.
# Select columns with unique values (if sum of column == 1)
unique.cols <- which(colSums(df) == 1)
# For every one of these columns, select the row where row-value==1
unique.rows <- sapply(unique.cols, function(x) which(df[, x] == 1))
> unique.cols
X Y W T J
1 2 4 5 6
> unique.rows
X Y W T J
1 2 3 4 1
The rows are not named correctly yet (they still carry the element names of unique.cols). So we index the rownames of df to get the row names.
# Data.frame of unique values
# Rows and columns in separate columns
df.unique <- data.frame(Cols = unique.cols,
Rows = unique.rows,
Colnames = names(unique.cols),
Rownames = rownames(df)[unique.rows],
row.names = NULL)
The result:
df.unique
Cols Rows Colnames Rownames
1 1 1 X A
2 2 2 Y B
3 4 3 W C
4 5 4 T D
5 6 1 J A
Edit:
This is how you could summarise the values per row using dplyr.
library(dplyr)
df.unique %>% group_by(Rownames) %>%
summarise(paste(Colnames, collapse=", "))
# A tibble: 4 x 2
Rownames `paste(Colnames, collapse = ", ")`
<fct> <chr>
1 A X, J
2 B Y
3 C W
4 D T
One idea is to use a row-wise apply to find the columns with 1, after we filter out the columns whose sum is not equal to 1, i.e.
apply(df[colSums(df) == 1], 1, function(i) names(df[colSums(df) == 1])[i == 1])
$A
[1] "X" "J"
$B
[1] "Y"
$C
[1] "W"
$D
[1] "T"
You can play around with the output to get it to desired state, i.e.
apply(df[colSums(df) == 1], 1, function(i) toString(names(df[colSums(df) == 1])[i == 1]))
# A B C D
#"X, J" "Y" "W" "T"
Or
data.frame(cols = apply(df[colSums(df) == 1], 1, function(i) toString(names(df[colSums(df) == 1])[i == 1])))
# cols
#A X, J
#B Y
#C W
#D T
Here is an option with tidyverse. We gather the dataset to 'long' format, group by 'key', filter the rows where 'val' is 1 and the sum of 'val' is 1, then group by 'indv.' and summarise the 'key' by pasting the elements together.
library(dplyr)
library(tidyr)
gather(df1, key, val, -indv.) %>%
group_by(key) %>%
filter(sum(val) == 1, val == 1) %>%
group_by(indv.) %>%
summarise(key = toString(key))
# A tibble: 4 x 2
# indv. key
# <chr> <chr>
#1 A X, J
#2 B Y
#3 C W
#4 D T
I have two binary columns:
col1 col2
0 1
0 0
1 0
1 1
I would like to merge these columns so that the result is 1 if the value 1 exists in either or both columns. Example output:
merged_col
1
0
1
1
The general merged I tried is this:
merge(df$col1, df$col2, all = TRUE)
Any idea how can I handle the values?
You can just treat them as logical values and use or...
df$col3 <- as.integer(df$col1|df$col2)
The code below should do what you need:
df <- data.frame(col1 = c(0, 0, 1, 1), col2 = c(1, 0, 0, 1))
df$merge_col <- ifelse(df$col1 == 1 | df$col2 == 1, 1, 0)