Determine the number of rows with NAs - r

I have a data frame as follows:
col1 col2 col3
1 23 17 NA
2 55 NA NA
3 24 12 13
4 34 23 12
I'm interested in finding the number of rows in col2 and col3 with NAs.
I was surprised that the following code only gave me 4 instead of 2:
numNAs <- rowSums(is.na(all[,2:3]))
Please help.

Another short solution:
> sum(!complete.cases(dat[-1]))
[1] 2
where dat is the name of your data frame.

DF <- read.table(text=" col1 col2 col3
1 23 17 NA
2 55 NA NA
3 24 12 13
4 34 23 12", header=TRUE)
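This gives the number of rows that contain any NA values in column 2 or 3 (rowSums counts the NAs per row; colSums would count them per column and only coincidentally give 2 for this data):
sum(rowSums(is.na(DF[,2:3])) > 0)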
[1] 2

Another solution:
data <- read.table(text='col1 col2 col3
23 17 NA
55 NA NA
24 12 13
34 23 12', header=T)
sum(apply(is.na(data[, -1]), 1, any))

test <- read.table(textConnection(" col1 col2 col3
1 23 17 NA
2 55 NA NA
3 24 12 13
4 34 23 12"))
> table(test$col2,useNA="ifany")
12 17 23 <NA>
1 1 1 1
> table(test$col3,useNA="ifany")
12 13 <NA>
1 1 2

Another solution adding columns 2 and 3:
> sum(is.na(all[,"col2"] + all[,"col3"]))
[1] 2

Related

Replace NA using a vector of column names

I have a data frame with columns containing NAs which I replace using replace_na. The problem is these column names can change in the future so I would like to put these column names in a vector and then use the vector in the replace_na function. I don't want to change the entire data frame in one go, just specified columns. When I try this as below, the code runs but it doesn't change the data frame. Can anyone suggest any edits to the code?
library(tidyverse)
col1<-c(9,NA,25,26,NA,51)
col2<-c(9,5,25,26,NA,51)
col3<-c(NA,3,25,26,NA,51)
col4<-c(9,1,NA,26,NA,51)
data<-data.frame(col1,col2,col3,col4, stringsAsFactors = FALSE)
columns<-c(col1,col2)
data<-data%>%
replace_na(list(columns=0))
A dplyr option:
columns <- c("col1" ,"col2")
dplyr::mutate(data, across(columns, replace_na, 0))
Returns:
col1 col2 col3 col4
1 9 9 NA 9
2 0 5 3 1
3 25 25 25 NA
4 26 26 26 26
5 0 0 NA NA
6 51 51 51 51
Another option would be using coalesce inside map_at:
The .at argument of map_at can be a character vector of the column names that you would like to modify.
We then use the coalesce function to specify the replacement value for the NAs:
library(dplyr)
library(purrr)
data %>%
map_at(c("col1","col2"), ~ coalesce(.x, 0)) %>%
bind_cols()
# A tibble: 6 x 4
col1 col2 col3 col4
<dbl> <dbl> <dbl> <dbl>
1 9 9 NA 9
2 0 5 3 1
3 25 25 25 NA
4 26 26 26 26
5 0 0 NA NA
6 51 51 51 51
The columns vector should contain the column names as strings; you can then use is.na as follows -
columns<-c("col1","col2")
data[columns][is.na(data[columns])] <- 0
data
# col1 col2 col3 col4
#1 9 9 NA 9
#2 0 5 3 1
#3 25 25 25 NA
#4 26 26 26 26
#5 0 0 NA NA
#6 51 51 51 51
Or using tidyverse -
library(dplyr)
library(tidyr)
data <- data %>% mutate(across(all_of(columns), replace_na, 0))

R merge two data.frame by id and sub-id while changing column names?

I have two dataframes of this format.
df1:
id x y
1 2 3
2 4 5
3 6 7
4 8 9
5 1 1
df2:
id id2 v v2
1 t 11 21
1 b 12 22
2 t 13 23
2 b 14 24
3 t 15 25
3 b 16 26
4 b 17 27
Hence, an id from the main data frame 'df1' will sometimes appear twice in 'df2' (the maximum), sometimes once, and sometimes not at all. The expected result would be:
df_merged:
id x y v.t v2.t v.b v2.b
1 2 3 11 21 12 22
2 4 5 13 23 14 24
3 6 7 15 25 16 26
4 8 9 NA NA 17 27
5 1 1 NA NA NA NA
I have used merge but due to the fact that id2 in df2 doesn't match, I get two instances of id in df_merged like so:
id x y v v2
1 ...
1 ...
Thanks in advance!
We can start by adjusting df2 to the right format then do a normal joining.
library(dplyr)
library(tidyr)
df2 %>% gather(key,val,-id,-id2) %>% #Transfer from wide to long format for v and v2
mutate(new_key=paste0(key,'.',id2)) %>% #Create a new id2 as new_key
select(-id2,-key) %>% #de-select the unnecessary columns
spread(new_key,val) %>% #Transfer back to wide format with the right format for id
right_join(df1) %>% #right join df1 (to include all rows in df1) using id
select(id,x,y,v.t,v2.t,v.b,v2.b) #rearrange the columns
Joining, by = "id"
id x y v.t v2.t v.b v2.b
1 1 2 3 11 21 12 22
2 2 4 5 13 23 14 24
3 3 6 7 15 25 16 26
4 4 8 9 NA NA 17 27
5 5 1 1 NA NA NA NA
You can solve this just using merge. Split df2 based on whether id2 equals b or t. Merge these two new objects with df1, and finally merge them together. The code includes one additional step to also include data found in df1 but not df2.
dfb <- merge(df1, df2[df2$id2=='b',], by='id')
dft <- merge(df1, df2[df2$id2=='t',], by='id')
dfRest <- df1[!df1$id %in% df2$id,]
dfAll <- merge(dfb[,c('id','x','y','v','v2')], dft[,c('id','v','v2')], by='id', all.x=T)
merge(dfAll, dfRest, all.x=T, all.y=T)
id x y v.x v2.x v.y v2.y
1 1 2 3 12 22 11 21
2 2 4 5 14 24 13 23
3 3 6 7 16 26 15 25
4 4 8 9 17 27 NA NA
5 5 1 1 NA NA NA NA

How to match and replace elements between two dataframes

I need to replace elements from one dataframe values into another dataframe.
For example:
df1:
id value
0 1 10
1 2 12
2 3 54
3 4 21
df2:
col1 col2 col3
0 1 2 3
1 1 1 3
2 1 3 4
3 1 1 5
Expected Output:
replaced values from df1 and applied to df2.
col1 col2 col3
0 10 12 54
1 10 10 54
2 10 54 21
3 10 10 5
How can I do this in R?
I would solve this problem in pandas as below:
dic=df1.set_index('id')['value'].to_dict()
print df2.replace(dic)
But I'm stuck in R.
Please help me to solve this problem?
We can loop through each column of df2 using lapply, find the matches for the id column in df1, replace the values where a match is found using ifelse, and keep the remaining values as they are.
df2[] <- lapply(df2, function(x) {
inds <- match(x, df1$id)
ifelse(is.na(inds),x, df1$value[inds])
})
df2
# col1 col2 col3
#0 10 12 54
#1 10 10 54
#2 10 54 21
#3 10 10 5
We could do this using named vector after creating a copy of the second dataset.
df3 <- df2
df3[] <- setNames(df1$value, df1$id)[as.matrix(df2)]
i1 <- is.na(df3)
df3[i1] <- df2[i1]
df3
# col1 col2 col3
#0 10 12 54
#1 10 10 54
#2 10 54 21
#3 10 10 5
What you can do:
Make a copy of df2:
df3=df2 # in R this is a copy not as in python
df3[]=df1$value[match(as.matrix(df2),df1$id)] # Match the columns
df3[is.na(df3)]=df2[is.na(df3)] # Reset Na to the previous value
df3
col1 col2 col3
0 10 12 54
1 10 10 54
2 10 54 21
3 10 10 5

Merge two data frames with different dimensions with partial overwrite in R

I've spent the better part of a day on this but I keep getting stuck. This wouldn't take me very long using index-match-match in Excel, but I'm newer to R and merging data doesn't seem very straight-forward. I've searched the site and found similar problems but no solutions specific to this type of issue.
I have two data frames. They have different lengths in both dimensions. a is 4x4 and b is 3x3. They partially overlap:
a <- data.frame("ID" = c(1:4), "A" = c(21:24), "B" = c(31:34), "C" = c(41:44))
a
ID A B C
1 1 21 31 41
2 2 22 32 42
3 3 23 33 43
4 4 24 34 44
and
b <- data.frame("ID" = c(4:6), "C" = c(22:24), "D" = c(32:34))
b
ID C D
1 4 22 32
2 5 23 33
3 6 24 34
I'm merging on "ID" number. My goal is to get them to look like
c <- data.frame("ID" = c(1:6), "A" = c(21:24, NA, NA), "B" = c(31:34, NA, NA), "C" = c(41:43,22:24), "D" = c(NA, NA, NA, 32:34))
c
ID A B C D
1 21 31 41 NA
2 22 32 42 NA
3 23 33 43 NA
4 24 34 22 32
5 NA NA 23 33
6 NA NA 24 34
As you can see, the final data frame combines the two and assigns NA to the missing information. In column "C", I would like b to overwrite a where it has numerical values. In this example, the value in c[4,"C"] should change from 44 to 22.
Most of this is simple enough. But getting column "C" correct has been a nightmare. I did the simple thing first:
merge(a, b, by = "ID", all = T)
It almost does the trick but ends up with duplicate column "C"s:
ID A B C.x C.y D
1 1 21 31 41 NA NA
2 2 22 32 42 NA NA
3 3 23 33 43 NA NA
4 4 24 34 44 22 32
5 5 NA NA NA 23 33
6 6 NA NA NA 24 34
This wouldn't be so bad if I could find out how to merge the duplicate columns correctly because then I could just run
merge(a[-4], b[-2], by = "ID", all = T)
ID A B D
1 1 21 31 NA
2 2 22 32 NA
3 3 23 33 NA
4 4 24 34 32
5 5 NA NA 33
6 6 NA NA 34
to merge everything else, then bring in the merged "C" after the fact.
But I can't figure out how to deal with this part of it:
merge(a[c(1,4)], b[c(1,2)], by = "ID", all = T)
ID C.x C.y ID C
1 1 41 NA 1 1 41
2 2 42 NA 2 2 42
3 3 43 NA -> 3 3 43
4 4 44 22 4 4 22
5 5 NA 23 5 5 23
6 6 NA 24 6 6 24
There's gotta be a way.
Thanks for your help!
For anyone else looking at this in the future, I realized this could also be solved using the following in base rather than dplyr:
df <- merge(a, b, by = "ID", all = T)
df[,"C"] <- ifelse(is.na(df[,"C.y"]), df[,"C.x"], df[,"C.y"])
df <- df[,-c(match("C.x", names(df)),match("C.y", names(df)))]
This ended up being the method I used because down the road I came to needing to perform some steps that were very difficult with dplyr for a novice (using variables inside mutate() and select()) and much more straightforward in base using the above syntax.
Thanks again to CPak, without whom I could not have figured this out.
Try this
library(dplyr)
starthere <- merge(a, b, by = "ID", all = T)
starthere %>%
mutate(C = ifelse(is.na(C.y), C.x, C.y)) %>%
select(-C.x, -C.y)
# ID A B D C
# 1 1 21 31 NA 41
# 2 2 22 32 NA 42
# 3 3 23 33 NA 43
# 4 4 24 34 32 22
# 5 5 NA NA 33 23
# 6 6 NA NA 34 24

Split all columns in one data frame and create two data frames in R

I have a single data frame (let's call it df) that looks like this:
col1 <- c("1/10", "2/30", "1/40", "3/23", "0/17", "7/14")
col2 <- c("2/44", "0/13", "4/55", "6/43", "0/19", "2/34")
col3 <- c("0/36", "0/87", "3/11", "2/12", "4/33", "0/12")
col4 <- c("1/76", "2/65", "2/21", "5/0", "2/26", "1/52")
df <- data.frame(col1,col2,col3,col4)
GOAL: In each cell there are two numbers separated by a "/". Create two data frames: one data frame with the LEFT number and another data frame with the RIGHT number.
The end result would ideally look like this:
df.left.numbers:
col1 col2 col3 col4
1 2 0 1
2 0 0 2
1 4 3 2
3 6 2 5
0 0 4 2
7 2 0 1
df.right.numbers:
col1 col2 col3 col4
10 44 36 76
30 13 87 65
40 55 11 21
23 43 12 0
17 19 33 26
14 34 12 52
I've used strsplit() but that is for 1 column splitting into two within ONE data frame. I also tried the separate() function in the tidyr package however that requires the name of a given column. I am iterating through all of them. I suppose I could write a loop, however I was wondering if anyone had an easier way of making this happen!
Thanks!!
Try this:
require(data.table)
lapply(split(unlist(
lapply(df,tstrsplit,"/"),recursive=FALSE),c("Left","Right")),
as.data.frame)
#$Right
# col12 col22 col32 col42
#1 10 44 36 76
#2 30 13 87 65
#3 40 55 11 21
#4 23 43 12 0
#5 17 19 33 26
#6 14 34 12 52
#$Left
# col11 col21 col31 col41
#1 1 2 0 1
#2 2 0 0 2
#3 1 4 3 2
#4 3 6 2 5
#5 0 0 4 2
#6 7 2 0 1
Not very elegant, but it is short and it works...
col1 <- c("1/10", "2/30", "1/40", "3/23", "0/17", "7/14")
col2 <- c("2/44", "0/13", "4/55", "6/43", "0/19", "2/34")
col3 <- c("0/36", "0/87", "3/11", "2/12", "4/33", "0/12")
col4 <- c("1/76", "2/65", "2/21", "5/0", "2/26", "1/52")
df <- data.frame(col1,col2,col3,col4,stringsAsFactors = FALSE)
dfLeft <- as.data.frame(lapply(df,function(x) gsub("\\/.+","",x)))
dfRight <- as.data.frame(lapply(df,function(x) gsub(".+\\/","",x)))
Another option with purrr package:
library(data.table)
library(purrr)
df %>%
map(tstrsplit, split="/") %>%
transpose() %>% map(as.data.frame) %>%
set_names(c("left", "right"))
#$left
# col1 col2 col3 col4
#1 1 2 0 1
#2 2 0 0 2
#3 1 4 3 2
#4 3 6 2 5
#5 0 0 4 2
#6 7 2 0 1
#$right
# col1 col2 col3 col4
#1 10 44 36 76
#2 30 13 87 65
#3 40 55 11 21
#4 23 43 12 0
#5 17 19 33 26
#6 14 34 12 52

Resources