Remove rows with all NA values after groupby r - r

I want to remove rows that contain only NAs after using group_by. Here is a sample dataset:
# Sample data: Col1 is the grouping key; Col2-Col4 hold the values, and the
# first row is NA in every value column.
df <- data.frame(
  Col1 = c("B", "B", "C", "D", "P1", "P2", "P3"),
  Col2 = c(NA, 8, NA, 9, 10, 8, 9),
  Col3 = c(NA, 7, 6, 8, NA, 7, 8),
  Col4 = c(NA, NA, 7, 7, NA, 7, 7)
)
I want to group by Col1 and remove rows whose column values are all NA.
So the desired output is:
Col1
Col2
Col3
Col4
B
8
7
NA
C
NA
6
7
D
9
8
7
P1
10
NA
NA
P2
8
7
7
P3
9
8
7
any help would be really appreciated.

You don't need group_by, you can use if_any.
library(dplyr)
filter(df, if_any(-Col1, ~ !is.na(.)))
# Col1 Col2 Col3 Col4
# 1 B 8 7 NA
# 2 C NA 6 7
# 3 D 9 8 7
# 4 P1 10 NA NA
# 5 P2 8 7 7
# 6 P3 9 8 7

There is no need for group-by, keep only rows where there is at least 1 non-na column, excluding Col1:
df[ rowSums(!is.na(df[, -1])) > 0, ]
# Col1 Col2 Col3 Col4
# 2 B 8 7 NA
# 3 C NA 6 7
# 4 D 9 8 7
# 5 P1 10 NA NA
# 6 P2 8 7 7
# 7 P3 9 8 7

With the help of r2evans answer, I was able to remove NAs after grouping by using group_by. Thanks everyone who answered.
library(dplyr)
df %>% group_by(Col1) %>% filter(if_any(everything(), ~ !is.na(.)))
Col1 Col2 Col3 Col4
B 8 7 NA
C NA 6 7
D 9 8 7
P1 10 NA NA
P2 8 7 7
P3 9 8 7

Related

Avoid the for loops-R

I have two data frames, x and y. For each value of x[,2], I look if the value is equal to the value of the elements of y[,1]. If so, I add a third column in the first data frame that contains the values of y[,2].
I managed to do that with loops, but how can I do this using vectors?
# Example tables: y[, 1] holds the lookup keys and y[, 2] the values to copy.
x <- data.frame(1:15, 15:1)
y <- data.frame(3:5, c(7.2, 8.5, 0.3))
# Compare the second column of every x row with the first column of every
# y row; on a match, write the y value into a third column of x.
for (row_x in seq_len(nrow(x))) {
  for (row_y in seq_len(nrow(y))) {
    if (x[row_x, 2] == y[row_y, 1]) {
      x[row_x, 3] <- y[row_y, 2]
    }
  }
}
Use a join instead of loops. Based on the loop comparison, the second column of 'x' is compared with the first column of 'y', so those columns are used in `on`. Then assign (:=) the second column (col2) of the second dataset to create the new column 'col3' in the first dataset.
library(data.table)
setDT(x)[y, col3 := i.col2, on = .(col2 = col1)]
-output
> x
col1 col2 col3
1: 1 15 NA
2: 2 14 NA
3: 3 13 NA
4: 4 12 NA
5: 5 11 NA
6: 6 10 NA
7: 7 9 NA
8: 8 8 NA
9: 9 7 NA
10: 10 6 NA
11: 11 5 0.3
12: 12 4 8.5
13: 13 3 7.2
14: 14 2 NA
15: 15 1 NA
data
x <- data.frame(col1 = 1:15, col2 = 15:1)
y <- data.frame(col1 = 3:5, col2 = c(7.2,8.5,0.3))
Update: Many thanks to #TrainingPizza, who drew my attention to the incorrect output of my first answer and also showed how it could work:
library(dplyr)
x %>%
rowwise() %>%
mutate(col3 = ifelse(col2 %in% y$col1, y$col2[y$col1==col2], NA))
col1 col2 col3
<int> <int> <dbl>
1 1 15 NA
2 2 14 NA
3 3 13 NA
4 4 12 NA
5 5 11 NA
6 6 10 NA
7 7 9 NA
8 8 8 NA
9 9 7 NA
10 10 6 NA
11 11 5 0.3
12 12 4 8.5
13 13 3 7.2
14 14 2 NA
15 15 1 NA
First answer (not correct)
Here is dplyr way how to avoid the for - loop:
library(dplyr)
x %>%
mutate(V3 = ifelse(V2 %in% y$V1, y$V2, NA))
V1 V2 V3
1 1 15 NA
2 2 14 NA
3 3 13 NA
4 4 12 NA
5 5 11 NA
6 6 10 NA
7 7 9 NA
8 8 8 NA
9 9 7 NA
10 10 6 NA
11 11 5 8.5
12 12 4 0.3
13 13 3 7.2
14 14 2 NA
15 15 1 NA

Summing Entries in Multiple Unequally-Sized Data Frames With Some (but not All) Rows and Columns the Same

I have the following three data frames:
df1 <- data.frame(A = 1:10, B = 3:12, D = 4:13)
df2 <- data.frame(C = 1:5, B = 4:8, E = 2:6)
df3 <- data.frame(A = 13:10, B = 19:16, F = 1:4)
rownames(df1) <- paste0("row", seq_len(nrow(df1)))
rownames(df2) <- paste0("row", c(1, 3, 5, 7, 11))
rownames(df3) <- paste0("row", c(12, 3, 10, 9))
# > df1
# A B D
# row1 1 3 4
# row2 2 4 5
# row3 3 5 6
# row4 4 6 7
# row5 5 7 8
# row6 6 8 9
# row7 7 9 10
# row8 8 10 11
# row9 9 11 12
# row10 10 12 13
# > df2
# C B E
# row1 1 4 2
# row3 2 5 3
# row5 3 6 4
# row7 4 7 5
# row11 5 8 6
# > df3
# A B F
# row12 13 19 1
# row3 12 18 2
# row10 11 17 3
# row9 10 16 4
When cells from different data frames have the same row and column names, I want to sum their values. In the cases where a cell has no matches (there aren't any other cells with the same row and column name), the final data frame will contain that original value. When a cell with a particular row name and column name combination doesn't exist in any of the original data frames, the final data frame will contain an NA in that position.
The final data frame should look like this data frame:
> df4
A B C D E G
row1 1 7 1 4 2 NA
row2 2 4 NA 5 1 NA
row3 15 28 2 6 5 2
row4 4 6 NA 7 3 NA
row5 5 13 3 8 8 NA
row6 6 8 NA 9 5 NA
row7 7 16 4 10 11 NA
row8 8 10 NA 11 7 NA
row9 19 27 NA 12 8 4
row10 21 29 NA 13 9 3
row11 NA 8 5 NA 6 NA
row12 13 19 NA NA NA 1
I'm imagining something with the Reduce() function that can be used on many data frames at once. Is the first step adding missing rows and columns to existing data frames with NAs in all the cells where values are missing?
Thanks!
I think this should work. With row AND column names and one data type, I prefer matrices to data frames, but you can convert the final matrix back to a data frame if you need.
# collect the inputs so they can all be handled the same way
df_list <- list(df1, df2, df3)
# union of every row name and every column name seen in any input
all_rows <- unique(unlist(lapply(df_list, rownames)))
all_cols <- unique(unlist(lapply(df_list, colnames)))
# start from an all-NA matrix labelled with the complete name sets
final_mat <- matrix(
  NA,
  nrow = length(all_rows),
  ncol = length(all_cols),
  dimnames = list(all_rows, all_cols)
)
# fold each data frame into the cells it covers
for (current in df_list) {
  rows <- rownames(current)
  cols <- colnames(current)
  # cells not touched so far are NA; treat them as 0 before adding
  cells <- final_mat[rows, cols, drop = FALSE]
  cells[is.na(cells)] <- 0
  final_mat[rows, cols] <- cells + as.matrix(current)
}
final_mat
# A B D C E F
# row1 1 7 4 1 2 NA
# row2 2 4 5 NA NA NA
# row3 15 28 6 2 3 2
# row4 4 6 7 NA NA NA
# row5 5 13 8 3 4 NA
# row6 6 8 9 NA NA NA
# row7 7 16 10 4 5 NA
# row8 8 10 11 NA NA NA
# row9 19 27 12 NA NA 4
# row10 21 29 13 NA NA 3
# row11 NA 8 NA 5 6 NA
# row12 13 19 NA NA NA 1
Here's a dplyr alternative -
Put the dataframes in a list
make rownames as a separate column from each dataframe
Combine them in one dataframe
For each rowname sum the values from all the columns. If a column has all the NA's we return NA. A shortcut to do that is to use hablar::sum_.
Order the data based on rownames
Use column_to_rownames to assign the values back as rownames.
library(dplyr)
# Stack the data frames with their row names kept in a `row` column, sum every
# value column per row name, then restore the row names in natural order.
df_list <- list(df1, df2, df3)
# rownames_to_column()/column_to_rownames() live in tibble and are not attached
# by library(dplyr), so they are called with an explicit namespace.
purrr::map_df(df_list, ~ .x %>% tibble::rownames_to_column("row")) %>%
  group_by(row) %>%
  # hablar::sum_ is used here, as described above, so an all-NA column gives NA
  summarise(across(A:F, hablar::sum_)) %>%
  # order(mixedorder(.)) turns the natural-sort permutation into ranks
  arrange(order(gtools::mixedorder(row))) %>%
  tibble::column_to_rownames("row")
# A B D C E F
#row1 1 7 4 1 2 NA
#row2 2 4 5 NA NA NA
#row3 15 28 6 2 3 2
#row4 4 6 7 NA NA NA
#row5 5 13 8 3 4 NA
#row6 6 8 9 NA NA NA
#row7 7 16 10 4 5 NA
#row8 8 10 11 NA NA NA
#row9 19 27 12 NA NA 4
#row10 21 29 13 NA NA 3
#row11 NA 8 NA 5 6 NA
#row12 13 19 NA NA NA 1
Using data.table
library(data.table)
rbindlist(list(setDT(df1, keep.rownames = TRUE),
setDT(df2, keep.rownames = TRUE),
setDT(df3, keep.rownames = TRUE)), fill = TRUE)[,
lapply(.SD, function(x) dplyr::na_if(sum(x, na.rm = TRUE), 0)), rn]

Calculate metrics for multiple columns based on subsets defined by other columns

I would like to calculate simple summary metrics for subsets of certain columns in a data frame, where the subsets are based on information in other columns of the same data frame. Let me illustrate:
colA <- c(NA,2,3,NA,NA,3,9,5,6,1)
colB <- c(9,3,NA,2,2,4,6,1,9,9)
colC <- c(NA,NA,5,7,3,9,8,1,2,3)
colAA <- c(NA,NA,6,NA,NA,NA,1,7,9,4)
colBB <- c(NA,2,NA,7,8,NA,2,7,9,4)
colCC <- c(NA,NA,3,7,5,8,9,9,NA,3)
df <- data.frame(colA,colB,colC,colAA,colBB,colCC)
> df
colA colB colC colAA colBB colCC
1 NA 9 NA NA NA NA
2 2 3 NA NA 2 NA
3 3 NA 5 6 NA 3
4 NA 2 7 NA 7 7
5 NA 2 3 NA 8 5
6 3 4 9 NA NA 8
7 9 6 8 1 2 9
8 5 1 1 7 7 9
9 6 9 2 9 9 NA
10 1 9 3 4 4 3
Here colAA should be subsetted by colA so that rows containing NAs in colA are removed:
> df1 <- subset(df, !is.na(colA))
> df1
colA colB colC colAA colBB colCC
2 2 3 NA NA 2 NA
3 3 NA 5 6 NA 3
6 3 4 9 NA NA 8
7 9 6 8 1 2 9
8 5 1 1 7 7 9
9 6 9 2 9 9 NA
10 1 9 3 4 4 3
Now I would like to calculate e.g. column length and percentage of non-NA values within the column:
> length(df1$colAA)
[1] 7
> (nrow(subset(df1, !is.na(colAA)))/length(df1$colAA))*100
[1] 71.42857
In an ideal world, the output would be written to another data frame, e.g.:
cat n perc_n
1 colAA 7 71
2 colBB 9 78
3 colCC 8 88
Any way to achieve this for all columns in a slighty more elegant/efficient manner? Any suggestions will be much appreciated!
You can pass the two sets of columns to Map:
res = Map(function(x,y) summary(y[!is.na(x)]), df[,1:3], df[, 4:6])
Since the post is tagged with data.table, I'd also recommend making a table like
# idcol is spelled out instead of relying on partial matching of `id`;
# fill = TRUE keeps this working when a summary has no "NA's" entry.
data.table::rbindlist(lapply(res, as.list), idcol = "col", fill = TRUE)
# col Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
# 1: colA 1 4 6 5.400 7.0 9 2
# 2: colB 2 3 7 5.571 7.5 9 2
# 3: colC 3 4 7 6.286 8.5 9 1
You can replace summary with whatever function you like that returns a named vector and it should still work with as.list + rbindlist.

How to combine rows in two dataframes into one dataframe in R?

This is what I want to do:
Combine
df1
Col1 Col2 Col3
7 1 8
6 2 9
3 6 3
and
df2
Col1 Col2 Col3
4 6 3
5 7 8
9 1 2
into this:
df3
Col1 Col2 Col3
7 1 8
6 2 9
3 6 3
4 6 3
5 7 8
9 1 2
And I'm sorry if someone has asked this already.
Thanks!
You can just do :
df3 <- rbind(df1, df2)

Loop function for comparing the columns

I have a very large data set including 400 string and numeric variables. I want to compare each pair of consecutive columns: 3 & 4, 5 & 6, etc. That is, I want to compare the third variable (.x) with the fourth (.y), the fifth with the sixth, the seventh with the eighth, and so on, in the following way: if (.y) is NA, then we replace the NA with the value from the corresponding row of (.x). For example, if number.y is NA, we replace the NA with the corresponding value from number.x, which would be 5. Again, if day.y is NA, we replace the NA in day.y with the corresponding value from day.x, which would be 3. How can I write a loop to do that?
A<-c(1,2,3,4,5,6,7,NA,NA,5,5,6)
B<-c(3,4,5,6,1,2,7,6,7,NA,NA,6)
number.x<-c(1,2,3,4,5,6,7,NA,NA,5,5,6)
number.y<-c(3,4,5,6,1,2,7,6,7,NA,NA,6)
day.x<-c(1,3,4,5,6,7,8,1,NA,3,5,3)
day.y<-c(4,5,6,7,8,7,8,1,2,3,5,NA)
school.x<-c("a","b","b","c","n","f","h","NA","F","G","z","h")
school.y<-c("a","b","b","c","m","g","h","NA","NA","G","H","T")
city.x<- c(1,2,3,7,5,8,7,5,6,7,5,1)
city.y<- c(1,2,3,5,5,7,7,NA,NA,3,4,5)
df<-data.frame(A,B,number.x,number.y,day.x,day.y,school.x,school.y,city.x,city.y)
This is a hacked approach to your question and it requires that every two columns are going to be compared against one another.
library(dplyr)
# Positions of the first column in each adjacent pair (1, 3, 5, ...).
pair_starts <- seq(1, length(df), by = 2)
# For each pair, fill NAs in either column from its partner. paste() turns the
# values into character, so every output column is character.
filled_pairs <- lapply(pair_starts, function(first) {
  second <- first + 1
  pair <- data.frame(col1 = df[, first], col2 = df[, second]) %>%
    mutate(col1 = ifelse(is.na(col1), paste(col2), paste(col1))) %>%
    mutate(col2 = ifelse(is.na(col2), paste(col1), paste(col2)))
  # restore the original column names of the pair
  names(pair) <- names(df[, c(first, second)])
  pair
})
# Bind all pairs behind a placeholder id column, then drop the placeholder.
df2 <- do.call(cbind, c(list(data.frame(id = 1:nrow(df))), filled_pairs))
df2[, -1]
number.x number.y day.x day.y school.x school.y city.x city.y
1 1 3 1 4 a a 1 1
2 2 4 3 5 b b 2 2
3 3 5 4 6 b b 3 3
4 4 6 5 7 c c 7 5
5 5 1 6 8 n m 5 5
6 6 2 7 7 f g 8 7
7 7 7 8 8 h h 7 7
8 6 6 1 1 NA NA 5 5
9 7 7 2 2 F F 6 6
10 5 5 3 3 G G 7 3
11 5 5 5 5 z H 5 4
12 6 6 3 3 h T 1 5
Consider the following base R solution. Essentially, it loops through a distinct list of column stem names (number, day, school, city) and replaces NA values in .x columns with the corresponding values in .y columns and vice versa. NOTE: The school columns require conversion from factor to character, and one of their rows has NA in both the .x and .y columns.
# CONVERT TO CHARACTER (NOTE: NA VALUE BECOME "NA" STRINGS)
df[c("school.x", "school.y")] <-
  lapply(df[c("school.x", "school.y")], as.character)
# SET UP FINAL DF
finaldf <- df
# OBTAIN UNIQUE LIST OF COLUMN STEMS (W/O .x AND .y SUFFIXES)
# Stems come from the names that actually end in .x/.y rather than from a
# fixed column offset, so this works for any number of leading columns.
suffixed <- grep("[.][xy]$", names(df), value = TRUE)
stems <- unique(sub("[.][xy]$", "", suffixed))
# keep only stems that have both a .x and a .y column
distinctcols <- stems[paste0(stems, ".x") %in% names(df) &
                        paste0(stems, ".y") %in% names(df)]
# LOOP THROUGH COLUMN STEMS REPLACING NA VALUES
for (col in distinctcols) {
  x_name <- paste0(col, ".x")
  y_name <- paste0(col, ".y")
  # REPLACE NA .x COLUMN VALUES (real NA or the string "NA")
  x_missing <- is.na(finaldf[[x_name]]) | finaldf[[x_name]] == "NA"
  finaldf[x_missing, x_name] <- finaldf[x_missing, y_name]
  # REPLACE NA .y COLUMN VALUES (computed after .x has been filled)
  y_missing <- is.na(finaldf[[y_name]]) | finaldf[[y_name]] == "NA"
  finaldf[y_missing, y_name] <- finaldf[y_missing, x_name]
}
OUTPUT
number.x number.y day.x day.y school.x school.y city.x city.y
1 1 3 1 4 a a 1 1
2 2 4 3 5 b b 2 2
3 3 5 4 6 b b 3 3
4 4 6 5 7 c c 7 5
5 5 1 6 8 n m 5 5
6 6 2 7 7 f g 8 7
7 7 7 8 8 h h 7 7
8 6 6 1 1 NA NA 5 5
9 7 7 2 2 F F 6 6
10 5 5 3 3 G G 7 3
11 5 5 5 5 z H 5 4
12 6 6 3 3 h T 1 5

Resources