Sum of two Columns of Data Frame with NA Values - r

I have a data frame with some NA values. I need the sum of two of the columns. If a value is NA, I need to treat it as zero.
a b c d
1 2 3 4
5 NA 7 8
Column e should be the sum of b and c:
e
5
7
I have tried a lot of things, and done two dozen searches with no luck. It seems like a simple problem. Any help would be appreciated!

dat$e <- rowSums(dat[,c("b", "c")], na.rm=TRUE)
dat
# a b c d e
# 1 1 2 3 4 5
# 2 5 NA 7 8 7

dplyr solution, taken from here:
library(dplyr)
dat %>%
rowwise() %>%
mutate(e = sum(b, c, na.rm = TRUE))

Here is another solution, with concatenated ifelse():
dat$e <- ifelse(is.na(dat$b) & is.na(dat$c), dat$e <-0, ifelse(is.na(dat$b), dat$e <- 0 + dat$c, dat$b + dat$c))
# a b c d e
#1 1 2 3 4 5
#2 5 NA 7 8 7
Edit, here is another solution that uses with as suggested by #kasterma in the comments, this is much more readable and straightforward:
dat$e <- with(dat, ifelse(is.na(b) & is.na(c ), 0, ifelse(is.na(b), 0 + c, b + c)))

if you want to keep NA if both columns has it you can use:
Data, sample:
dt <- data.table(x = sample(c(NA, 1, 2, 3), 100, replace = T), y = sample(c(NA, 1, 2, 3), 100, replace = T))
Solution:
dt[, z := ifelse(is.na(x) & is.na(y), NA_real_, rowSums(.SD, na.rm = T)), .SDcols = c("x", "y")]
(the data.table way)

I hope that it may help you
Some cases you have a few columns that are not numeric. This approach will serve you both.
Note that: c_across() for dplyr version 1.0.0 and later
df <- data.frame(
TEXT = c("text1", "text2"), a = c(1,5), b = c(2, NA), c = c(3,7), d = c(4,8))
df2 <- df %>%
rowwise() %>%
mutate(e = sum(c_across(a:d), na.rm = TRUE))
# A tibble: 2 x 6
# Rowwise:
# TEXT a b c d e
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 text1 1 2 3 4 10
# 2 text2 5 NA 7 8 20

Related

Split df column of integers into individual digits in R

I have a df where one variable is an integer. I'd like to split this column into it's individual digits. See my example below
Group Number
A 456
B 3
C 18
To
Group Number Digit1 Digit2 Digit3
A 456 4 5 6
B 3 3 NA NA
C 18 1 8 NA
We can use read.fwf from base R. Find the max number of character (nchar) in 'Number' column (mx). Read the 'Number' column after converting to character (as.character), specify the 'widths' as 1 by replicating 1 with mx and assign the output to new 'Digit' columns in the data
mx <- max(nchar(df1$Number))
df1[paste0("Digit", seq_len(mx))] <- read.fwf(textConnection(
as.character(df1$Number)), widths = rep(1, mx))
-output
df1
# Group Number Digit1 Digit2 Digit3
#1 A 456 4 5 6
#2 B 3 3 NA NA
#3 C 18 1 8 NA
data
df1 <- structure(list(Group = c("A", "B", "C"), Number = c(456L, 3L,
18L)), class = "data.frame", row.names = c(NA, -3L))
Another base R option (I think #akrun's approach using read.fwf is much simpler)
cbind(
df,
with(
df,
type.convert(
`colnames<-`(do.call(
rbind,
lapply(
strsplit(as.character(Number), ""),
`length<-`, max(nchar(Number))
)
), paste0("Digit", seq(max(nchar(Number))))),
as.is = TRUE
)
)
)
which gives
Group Number Digit1 Digit2 Digit3
1 A 456 4 5 6
2 B 3 3 NA NA
3 C 18 1 8 NA
Using splitstackshape::cSplit
splitstackshape::cSplit(df, 'Number', sep = '', stripWhite = FALSE, drop = FALSE)
# Group Number Number_1 Number_2 Number_3
#1: A 456 4 5 6
#2: B 3 3 NA NA
#3: C 18 1 8 NA
Updated
I realized I could use max function for counting characters limit in each row so that I could include it in my map2 function and save some lines of codes thanks to an accident that led to an inspiration by dear #ThomasIsCoding.
library(dplyr)
library(tidyr)
library(purrr)
library(stringr)
df %>%
rowwise() %>%
mutate(map2_dfc(Number, 1:max(nchar(Number)), ~ str_sub(.x, .y, .y))) %>%
unnest(cols = !c(Group, Number)) %>%
rename_with(~ str_replace(., "\\.\\.\\.", "Digit"), .cols = !c(Group, Number)) %>%
mutate(across(!c(Group, Number), as.numeric, na.rm = TRUE))
# A tibble: 3 x 5
Group Number Digit1 Digit2 Digit3
<chr> <dbl> <dbl> <dbl> <dbl>
1 A 456 4 5 6
2 B 3 3 NA NA
3 C 18 1 8 NA
Data
df <- tribble(
~Group, ~Number,
"A", 456,
"B", 3,
"C", 18
)
Two base r methods:
no_cols <- max(nchar(as.character(df1$Number)))
# Using `strsplit()`:
cbind(df1, setNames(data.frame(do.call(rbind,
lapply(strsplit(as.character(df1$Number), ""),
function(x) {
length(x) <- no_cols
x
}
)
)
), paste0("Digit", seq_len(no_cols))))
# Using `regmatches()` and `gregexpr()`:
cbind(df1, setNames(data.frame(do.call(rbind,
lapply(regmatches(df1$Number, gregexpr("\\d", df1$Number)),
function(x) {
length(x) <- no_cols
x
}
)
)
), paste0("Digit", seq_len(no_cols))))

Variable names as Input in an R Function

I have a dataframe with several numeric variables along with factors. I wish to run over the numeric variables and replace the negative values to missing. I couldn't do that.
My alternative idea was to write a function that gets a dataframe and a variable, and does it. It didn't work either.
My code is:
NegativeToMissing = function(df,var)
{
df$var[df$var < 0] = NA
}
Error in $<-.data.frame(`*tmp*`, "var", value = logical(0)) : replacement has 0 rows, data has 40
what am I doing wrong ?
Thank you.
Here is an example with some dummy data.
df1 <- data.frame(col1 = c(-1, 1, 2, 0, -3),
col2 = 1:5,
col3 = LETTERS[1:5])
df1
# col1 col2 col3
#1 -1 1 A
#2 1 2 B
#3 2 3 C
#4 0 4 D
#5 -3 5 E
Now find columns that are numeric
numeric_cols <- sapply(df1, is.numeric)
And replace negative values
df1[numeric_cols] <- lapply(df1[numeric_cols], function(x) replace(x, x < 0 , NA))
df1
# col1 col2 col3
#1 NA 1 A
#2 1 2 B
#3 2 3 C
#4 0 4 D
#5 NA 5 E
You could also do
df1[df1 < 0] <- NA
With tidyverse, we can make use of mutate_if
library(tidyverse)
df1 %>%
mutate_if(is.numeric, funs(replace(., . < 0, NA)))
If you still want to change only one selected variable a solution withdplyr would be to use non-standard evaluation:
library(dplyr)
NegativeToMissing <- function(df, var) {
quo_var = quo_name(var)
df %>%
mutate(!!quo_var := ifelse(!!var < 0, NA, !!var))
}
NegativeToMissing(data, var=quo(val1)) # use quo() function without ""
# val1 val2
# 1 1 1
# 2 NA 2
# 3 2 3
Data used:
data <- data.frame(val1 = c(1, -1, 2),
val2 = 1:3)
data
# val1 val2
# 1 1 1
# 2 -1 2
# 3 2 3

Group data by factor level, then transform to data frame with colname being levels?

There is my problem that I can't solve it:
Data:
df <- data.frame(f1=c("a", "a", "b", "b", "c", "c", "c"),
v1=c(10, 11, 4, 5, 0, 1, 2))
data.frame:f1 is factor
f1 v1
a 10
a 11
b 4
b 5
c 0
c 1
c 2
# What I want is:(for example, fetch data with the number of element of some level == 2, then to data.frame)
a b
10 4
11 5
Thanks in advance!
I might be missing something simple here , but the below approach using dplyr works.
library(dplyr)
nlevels = 2
df1 <- df %>%
add_count(f1) %>%
filter(n == nlevels) %>%
select(-n) %>%
mutate(rn = row_number()) %>%
spread(f1, v1) %>%
select(-rn)
This gives
# a b
# <int> <int>
#1 10 NA
#2 11 NA
#3 NA 4
#4 NA 5
Now, if you want to remove NA's we can do
do.call("cbind.data.frame", lapply(df1, function(x) x[!is.na(x)]))
# a b
#1 10 4
#2 11 5
As we have filtered the dataframe which has only nlevels observations, we would have same number of rows for each column in the final dataframe.
split might be useful here to split df$v1 into parts corresponding to df$f1. Since you are always extracting equal length chunks, it can then simply be combined back to a data.frame:
spl <- split(df$v1, df$f1)
data.frame(spl[lengths(spl)==2])
# a b
#1 10 4
#2 11 5
Or do it all in one call by combining this with Filter:
data.frame(Filter(function(x) length(x)==2, split(df$v1, df$f1)))
# a b
#1 10 4
#2 11 5
Here is a solution using unstack :
unstack(
droplevels(df[ave(df$v1, df$f1, FUN = function(x) length(x) == 2)==1,]),
v1 ~ f1)
# a b
# 1 10 4
# 2 11 5
A variant, similar to #thelatemail's solution :
data.frame(Filter(function(x) length(x) == 2, unstack(df,v1 ~ f1)))
My tidyverse solution would be:
library(tidyverse)
df %>%
group_by(f1) %>%
filter(n() == 2) %>%
mutate(i = row_number()) %>%
spread(f1, v1) %>%
select(-i)
# # A tibble: 2 x 2
# a b
# * <dbl> <dbl>
# 1 10 4
# 2 11 5
or mixing approaches :
as_tibble(keep(unstack(df,v1 ~ f1), ~length(.x) == 2))
Using all base functions (but you should use tidyverse)
# Add count of instances
x$len <- ave(x$v1, x$f1, FUN = length)
# Filter, drop the count
x <- x[x$len==2, c('f1','v1')]
# Hacky pivot
result <- data.frame(
lapply(unique(x$f1), FUN = function(y) x$v1[x$f1==y])
)
colnames(result) <- unique(x$f1)
> result
a b
1 10 4
2 11 5
I'd like code this, may it helps for you
library(reshape2)
library(dplyr)
aa = data.frame(v1=c('a','a','b','b','c','c','c'),f1=c(10,11,4,5,0,1,2))
cc = aa %>% group_by(v1) %>% summarise(id = length((v1)))
dd= merge(aa,cc) #get the level
ee = dd[dd$aa==2,] #select number of level equal to 2
ee$id = rep(c(1,2),nrow(ee)/2) # reset index like (1,2,1,2)
dcast(ee, id~v1,value.var = 'f1')
all done!

How to keep track of duplicated rows in a data frame when do a reduction by unique(df)?

This is a followup question of this question.
Imagine the following data frame:
a <- c(rep("A", 3), rep("B", 3), rep("A",2))
b <- c(1,1,2,4,1,1,2,2)
df <-data.frame(a,b)
which gives
a b
1 A 1
2 A 1
3 A 2
4 B 4
5 B 1
6 B 1
7 A 2
8 A 2
I reduce it to it's unique rows by:
df_unique <- unique(df)
Now, I am wondering how can I keep track of the merged rows. I would like to create a new column in which each component has a list of row names that have been merged. Something like the following:
df_unique_informative =
a b track
1 A 1 [1,2]
3 A 2 [3,7,8]
4 B 4 [4]
5 B 1 [5,6]
res = aggregate(x = list(track = 1:NROW(df)), by = list(a = df$a, b = df$b), function(x) x)
# OR perhaps you want
#res = aggregate(x = list(track = 1:NROW(df)), by = list(a = df$a, b = df$b), function(x)
# paste(x, collapse = ", "))
res
# a b track
#1 A 1 1, 2
#2 B 1 5, 6
#3 A 2 3, 7, 8
#4 B 4 4
#Shorter code
res = aggregate(list(track = 1:NROW(df)), df[,1:2], '[')
Update
a <- c(rep("A", 3), rep("B", 3), rep("A",2))
b <- c(1,1,2,4,1,1,2,2)
c = letters[1:8]
df <-data.frame(a,b,c, stringsAsFactors = FALSE)
res = aggregate(x = list(track = 1:NROW(df)), by = list(a = df$a, b = df$b), function(x) df$c[x])
res
# a b track
#1 A 1 a, b
#2 B 1 e, f
#3 A 2 c, g, h
#4 B 4 d
Here is one option with tidyverse
library(tidyverse)
rownames_to_column(df, 'rn') %>%
group_by(a, b) %>%
summarise(track = list(rn))

Concatenating all rows within a group using dplyr

Suppose I have a dataframe like this:
hand_id card_id card_name card_class
A 1 p alpha
A 2 q beta
A 3 r theta
B 2 q beta
B 3 r theta
B 4 s gamma
C 1 p alpha
C 2 q beta
I would like to concatenate the card_id, card_name, and card_class into one single row per hand level A, B, C. So the result would look something like this:
hand_id combo_1 combo_2 combo_3
A 1-2-3 p-q-r alpha-beta-theta
B 2-3-4 q-r-s beta-theta-gamma
....
I attempted to do this using group_by and mutate, but I can't seem to get it to work
data <- read_csv('data.csv')
byHand <- group_by(data, hand_id) %>%
mutate(combo_1 = paste(card_id),
combo_2 = paste(card_name),
combo_3 = paste(card_class))
Thank you for your help.
You were kind of close!
library(tidyr)
library(dplyr)
data <- read_csv('data.csv')
byHand <- group_by(data, hand_id) %>%
summarise(combo_1 = paste(card_id, collapse = "-"),
combo_2 = paste(card_name, collapse = "-"),
combo_3 = paste(card_class, collapse = "-"))
or using summarise_each:
byHand <- group_by(data, hand_id) %>%
summarise_each(funs(paste(., collapse = "-")))
Here is another option using data.table
library(data.table)
setDT(data)[, lapply(.SD, paste, collapse="-") , by = hand_id]
# hand_id card_id card_name card_class
#1: A 1-2-3 p-q-r alpha-beta-theta
#2: B 2-3-4 q-r-s beta-theta-gamma
#3: C 1-2 p-q alpha-beta
Not very familiar with dplyr... so here's my attempt without dplyr
df <- read_csv('data.csv')
res <- lapply(split(df, df$hand_id),function(x){
sL <- apply(x[,-1], 2, function(y) paste(y, collapse = "-"))
d <- data.frame(x$hand_id[1], rbind(sL))
names(d) <- c("hand_id", "combo_1", "combo_2", "combo_3")
return(d)
})
res <- do.call("rbind",res)
rownames(res) <- NULL
Here's the output:
## hand_id combo_1 combo_2 combo_3
## 1 A 1-2-3 p-q-r alpha-beta-theta
## 2 B 2-3-4 q-r-s beta-theta-gamma
## 3 C 1-2 p-q alpha-beta
If you have NAs in your data, you can use na.omit() inline with str_c(). unique() will also work if you only want the distinct.
data:
hand_id card_id card_name card_class
<chr> <dbl> <chr> <chr>
1 A 1 p alpha
2 A 2 q beta
3 A 3 r theta
4 A NA NA NA
5 B 2 q beta
6 B 3 r theta
7 B 4 s gamma
8 C 1 p alpha
9 C 2 q beta
code:
data %>%
group_by(hand_id) %>%
summarize(card_id = str_c(na.omit(card_id), collapse = "-"),
card_name = str_c(na.omit(card_name), collapse = "-"),
card_class = str_c(na.omit(card_class), collapse = "-"))
output:
hand_id card_id card_name card_class
* <chr> <chr> <chr> <chr>
1 A 1-2-3 p-q-r alpha-beta-the…
2 B 2-3-4 q-r-s beta-theta-gam…
3 C 1-2 p-q alpha-beta

Resources