Related
# Example data: items A1-A3 and B1-B3, with missing values scattered throughout.
df <- data.frame(
  A1 = c(6, 8, NA, 1, 5),
  A2 = c(NA, NA, 9, 3, 6),
  A3 = c(9, NA, 1, NA, 4),
  B1 = c(NA, NA, 9, 3, 6),
  B2 = c(9, NA, 1, NA, 4),
  B3 = c(NA, NA, 9, 3, 6)
)
I have a dataset with multiple questionnaires that each have multiple items. I would like to replace the missing data with the row mean of the observed values for each of the questionnaires (missing values in A items replaced by the row mean of A1 to A3, and missing values in B items replaced by the row mean of B1 to B3). What is the best way to do that?
You may try
# Example data: items A1-A3 and B1-B3, with missing values scattered throughout.
df <- data.frame(
  A1 = c(6, 8, NA, 1, 5),
  A2 = c(NA, NA, 9, 3, 6),
  A3 = c(9, NA, 1, NA, 4),
  B1 = c(NA, NA, 9, 3, 6),
  B2 = c(9, NA, 1, NA, 4),
  B3 = c(NA, NA, 9, 3, 6)
)

# Split the items by questionnaire prefix. Base R is used so the snippet runs
# without loading any package (note: startsWith() is case-sensitive).
df1 <- df[startsWith(names(df), "A")]
df2 <- df[startsWith(names(df), "B")]

# x1 / x2 hold the (row, col) positions of the missing cells; each missing
# cell is replaced by the mean of the observed values in the same row.
# A row with no observed value at all yields NaN.
x1 <- which(is.na(df1), arr.ind = TRUE)
df1[x1] <- rowMeans(df1, na.rm = TRUE)[x1[, 1]]
x2 <- which(is.na(df2), arr.ind = TRUE)
df2[x2] <- rowMeans(df2, na.rm = TRUE)[x2[, 1]]

df <- cbind(df1, df2)
df
A1 A2 A3 B1 B2 B3
1 6 7.5 9 9 9 9
2 8 8.0 8 NaN NaN NaN
3 5 9.0 1 9 1 9
4 1 3.0 2 3 3 3
5 5 6.0 4 6 4 6
You may use split.default to split data in different groups and replace NA with row-wise mean (taken from this answer https://stackoverflow.com/a/6918323/3962914 )
# Group the item columns by questionnaire: strip the trailing item number so
# that A1, A2, A3 all map to "A". The pattern is anchored at the end so digits
# that belong to the questionnaire name itself are left alone.
groups <- split.default(df, sub("\\d+$", "", names(df)))

# Within each questionnaire, replace every missing cell by the mean of the
# observed values in the same row (NaN when the whole row is missing).
filled <- lapply(groups, function(x) {
  k <- which(is.na(x), arr.ind = TRUE)
  x[k] <- rowMeans(x, na.rm = TRUE)[k[, 1]]
  x
})

# split.default() returns the groups sorted by group label, so the columns may
# come back in a different order than in df. Bind them under their own names
# and then restore the original column order, instead of overwriting the names
# positionally (which would mislabel columns when groups are interleaved).
result <- do.call(cbind, unname(filled))[names(df)]
result
# A1 A2 A3 B1 B2 B3
#1 6 7.5 9 9 9 9
#2 8 8.0 8 NaN NaN NaN
#3 5 9.0 1 9 1 9
#4 1 3.0 2 3 3 3
#5 5 6.0 4 6 4 6
You could also do:
library(dplyr)
library(tidyr) # pivot_wider() is exported by tidyr, not dplyr

df %>%
  # Long format: one row per (id, item number) with one column per questionnaire
  reshape(varying = names(.), direction = "long", sep = "") %>%
  group_by(id) %>%
  # Per original row (id), fill missing items with that row's questionnaire mean
  mutate(across(A:B, ~ replace(.x, is.na(.x), mean(.x, na.rm = TRUE)))) %>%
  # id_cols must be named: an unnamed argument here lands in `...`
  pivot_wider(
    id_cols = id,
    names_from = time,
    values_from = A:B,
    names_sep = ""
  ) %>%
  ungroup() %>%
  select(-id)
# A tibble: 5 x 6
A1 A2 A3 B1 B2 B3
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 6 7.5 9 9 9 9
2 8 8 8 NaN NaN NaN
3 5 9 1 9 1 9
4 1 3 2 3 3 3
5 5 6 4 6 4 6
We can use split.default with na.aggregate
library(purrr)
library(zoo)
library(dplyr)
library(stringr)
# Split the columns by questionnaire (item number removed from the name), then
# transpose each piece so that na.aggregate() fills along the original rows,
# transpose back and column-bind the pieces.
df %>%
  split.default(str_remove(names(.), "\\d+")) %>%
  map_dfc(function(items) {
    filled <- t(na.aggregate(t(items)))
    as_tibble(filled)
  })
# A tibble: 5 × 6
A1 A2 A3 B1 B2 B3
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 6 7.5 9 9 9 9
2 8 8 8 NaN NaN NaN
3 5 9 1 9 1 9
4 1 3 2 3 3 3
5 5 6 4 6 4 6
Span a matrix of rowMeans on the rows and replace the NA's. In an lapply that greps the questions.
do.call(cbind, lapply(c("A", "B"), function(q) {
  # Select the questionnaire's items by name prefix (grep() would also match
  # the letter in the middle of a name); drop = FALSE keeps a data frame even
  # when a questionnaire has a single item, so rowMeans() still works.
  s <- df[, startsWith(names(df), q), drop = FALSE]
  na <- is.na(s)
  # row(s) spreads each row mean across that row; keep only the NA positions.
  replace(s, na, rowMeans(s, na.rm = TRUE)[row(s)][na])
}))
# A1 A2 A3 B1 B2 B3
# 1 6 7.5 9 9 9 9
# 2 8 8.0 8 NaN NaN NaN
# 3 5 9.0 1 9 1 9
# 4 1 3.0 2 3 3 3
# 5 5 6.0 4 6 4 6
Data:
# Same example data, written as a plain data.frame() call.
df <- data.frame(
  A1 = c(6, 8, NA, 1, 5),
  A2 = c(NA, NA, 9, 3, 6),
  A3 = c(9, NA, 1, NA, 4),
  B1 = c(NA, NA, 9, 3, 6),
  B2 = c(9, NA, 1, NA, 4),
  B3 = c(NA, NA, 9, 3, 6)
)
I have a dataset with one variable with participant IDs and several variables with peer-nominations (in form of IDs).
I need to replace all numbers in the peer-nomination variables, that are not among the participant IDs, with NA.
Example: I have
ID PN1 PN2
1 2 5
2 3 4
4 6 2
5 2 7
I need
ID PN1 PN2
1 2 5
2 NA 4
4 NA 2
5 2 NA
Would be great if someone can help! Thank you so much in advance.
An alternative with Base R,
# Test every nomination against the ID column, reshape the flat result back to
# one column per nomination variable, and blank out the non-members.
df[, -1][!matrix(unlist(df[, -1]) %in% df[, 1], nrow = nrow(df))] <- NA
df
gives,
ID PN1 PN2
1 1 2 5
2 2 NA 4
3 4 NA 2
4 5 2 NA
library(tidyverse)
# For every column except ID: keep values that occur in ID, otherwise NA.
df %>%
  mutate(across(-ID, function(x) if_else(x %in% ID, x, NA_real_)))
which gives:
# ID PN1 PN2
# 1 1 2 5
# 2 2 NA 4
# 3 4 NA 2
# 4 5 2 NA
Data used:
# Participant IDs plus two peer-nomination columns.
df <- data.frame(
  ID = c(1, 2, 4, 5),
  PN1 = c(2, 3, 6, 2),
  PN2 = c(5, 4, 2, 7)
)
Here is a base R way.
The lapply loop on all columns except for the id column, uses function is.na<- to assign NA values to vector elements not in df1[[1]]. Then returns the changed vector.
# Every column after the first: values not found in the first (ID) column
# become NA; the modified vector is written back into df1.
df1[-1] <- lapply(df1[-1], function(nominee) {
  known <- nominee %in% df1[[1]]
  nominee[!known] <- NA
  nominee
})
# ID PN1 PN2
#1 1 2 5
#2 2 NA 4
#3 4 NA 2
#4 5 2 NA
Data in dput format
# Raw input from the question. The nominations 3, 6 and 7 are not participant
# IDs, so they are the values the code above is expected to turn into NA.
df1 <-
structure(list(ID = c(1L, 2L, 4L, 5L),
PN1 = c(2L, 3L, 6L, 2L), PN2 = c(5L, 4L, 2L, 7L)),
row.names = c(NA, -4L), class = "data.frame")
We could use mutate with case_when:
library(dplyr)
# PN* columns: keep (as numeric) the values that occur in ID, NA otherwise.
df %>%
  mutate(across(starts_with("PN"), function(x) {
    case_when(
      x %in% ID ~ as.numeric(x),
      TRUE ~ NA_real_
    )
  }))
Output:
# A tibble: 4 x 3
ID PN1 PN2
<int> <dbl> <dbl>
1 1 2 5
2 2 NA 4
3 4 NA 2
4 5 2 NA
With data.table you can (l)apply the function fifelse() to every column
you have selected with .SD & .SDcols.
library(data.table) # library() fails loudly if the package is missing
# Peer-nomination column names (anchored so only names starting with PN match)
cols <- grep("^PN", names(df), value = TRUE)
# .SD holds only the PN columns; ID is added back explicitly so the result has
# the requested ID / PN1 / PN2 layout. df itself is not modified.
df[, c(list(ID = ID),
       lapply(.SD, function(x) fifelse(!x %in% ID, NA_real_, x))),
   .SDcols = cols]
Data from #deschen:
# Example data, converted to a data.table by reference.
df <- data.frame(
  ID = c(1, 2, 4, 5),
  PN1 = c(2, 3, 6, 2),
  PN2 = c(5, 4, 2, 7)
)
setDT(df)
This question already has answers here:
Merging a lot of data.frames [duplicate]
(1 answer)
How do I replace NA values with zeros in an R dataframe?
(29 answers)
Closed 2 years ago.
I want to merge the following 3 data frames and fill the missing values with -1. I think I should use the function merge(), but I do not know exactly how to do it.
> df1
Letter Values1
1 A 1
2 B 2
3 C 3
> df2
Letter Values2
1 A 0
2 C 5
3 D 9
> df3
Letter Values3
1 A -1
2 D 5
3 B -1
desire output would be:
Letter Values1 Values2 Values3
1 A 1 0 -1
2 B 2 -1 -1 # fill missing values with -1
3 C 3 5 -1
4 D -1 9 5
code:
> dput(df1)
structure(list(Letter = structure(1:3, .Label = c("A", "B", "C"
), class = "factor"), Values1 = c(1, 2, 3)), class = "data.frame", row.names = c(NA,
-3L))
> dput(df2)
structure(list(Letter = structure(1:3, .Label = c("A", "C", "D"
), class = "factor"), Values2 = c(0, 5, 9)), class = "data.frame", row.names = c(NA,
-3L))
> dput(df3)
structure(list(Letter = structure(c(1L, 3L, 2L), .Label = c("A",
"B", "D"), class = "factor"), Values3 = c(-1, 5, -1)), class = "data.frame", row.names = c(NA,
-3L))
You can get data frames in a list and use merge with Reduce. Missing values in the new dataframe can be replaced with -1.
# Full outer join of all three frames on the shared key. Naming the key
# explicitly keeps merge() from also joining on any value column that two
# frames happen to share.
new_df <- Reduce(
  function(x, y) merge(x, y, by = "Letter", all = TRUE),
  list(df1, df2, df3)
)
# Cells with no match in one of the inputs come back as NA; fill them with -1.
new_df[is.na(new_df)] <- -1
new_df
# Letter Values1 Values2 Values3
#1 A 1 0 -1
#2 B 2 -1 -1
#3 C 3 5 -1
#4 D -1 9 5
A tidyverse way with the same logic :
library(dplyr)
library(purrr)
library(tidyr) # replace_na() is exported by tidyr
list(df1, df2, df3) %>%
  # Explicit key: no "Joining with `by = ...`" message and no accidental keys
  reduce(full_join, by = "Letter") %>%
  # Only the numeric value columns get -1; the Letter key is left untouched
  mutate(across(where(is.numeric), function(x) replace_na(x, -1)))
Here's a dplyr solution
library(dplyr)
library(tidyr) # replace_na() is exported by tidyr
df1 %>%
  full_join(df2, by = "Letter") %>%
  full_join(df3, by = "Letter") %>%
  # across(where(...)) is the current replacement for the superseded mutate_if()
  mutate(across(where(is.numeric), function(x) replace_na(x, -1)))
output:
Letter Values1 Values2 Values3
<chr> <dbl> <dbl> <dbl>
1 A 1 0 -1
2 B 2 -1 -1
3 C 3 5 -1
4 D -1 9 5
Here is my toy dataframe:
# Toy tibble, bound to `df` so the select() calls below can be run as written.
df <- structure(
  list(a = c(1, 2), b = c(3, 4), c = c(5, 6), d = c(7, 8)),
  row.names = c(NA, -2L),
  class = c("tbl_df", "tbl", "data.frame")
)
Now I want to reorder the columns, excluding one of them and keeping the others:
# Attempt from the question: `-a` is given first, so the later everything()
# selects `a` again and it ends up as the last column (b, c, d, a).
df %>% select(-a, d, everything())
I want my df to be :
d b c
7 3 5
8 4 6
I get the following:
b c d a
<dbl> <dbl> <dbl> <dbl>
1 3 5 7 1
2 4 6 8 2
Keep the -a last in the select. Even though we removed a at the beginning, the everything() at the end still checks the column names of the whole dataset
# d first, then all columns, and finally drop a (the exclusion must come last).
select(df, d, everything(), -a)
# A tibble: 2 x 3
# d b c
# <dbl> <dbl> <dbl>
#1 7 3 5
#2 8 4 6
I would like to split each row of a (numeric) data frame into two rows. For example, part of the original data frame looks like this (nrow(original data frame) > 2800000):
ID X Y Z value_1 value_2
1 3 2 6 22 54
6 11 5 9 52 71
3 7 2 5 2 34
5 10 7 1 23 47
And after splitting each row, we get:
ID X Y Z
1 3 2 6
22 54 NA NA
6 11 5 9
52 71 NA NA
3 7 2 5
2 34 NA NA
5 10 7 1
23 47 NA NA
the "value_1" and "value_2" columns are split and each element is set to a new row. For example, value_1 = 22 and value_2 = 54 are set to a new row.
Here is one option with data.table. We convert the 'data.frame' to 'data.table' by creating a column of rownames (setDT(df1, keep.rownames = TRUE)). Subset the columns 1:5 and 1, 6, 7 in a list, rbind the list element with fill = TRUE option to return NA for corresponding columns that are not found in one of the datasets, order by the row number ('rn') and assign (:=) the row number column to 'NULL'.
library(data.table)
# Convert by reference and keep the row names as a first column 'rn'.
setDT(df1, keep.rownames = TRUE)[]
rbindlist(
  list(
    # rn + ID, X, Y, Z
    df1[, 1:5, with = FALSE],
    # rn + value_1, value_2, renamed so they stack under ID and X
    setnames(df1[, c(1, 6:7), with = FALSE], 2:3, c("ID", "X"))
  ),
  fill = TRUE
  # 'rn' is character: sort it numerically, otherwise rows come back as
  # 1, 10, 11, ..., 2 once there are ten or more of them.
  # Assumes default (integer-like) row names.
)[order(as.integer(rn))][, rn := NULL][]
# ID X Y Z
#1: 1 3 2 6
#2: 22 54 NA NA
#3: 6 11 5 9
#4: 52 71 NA NA
#5: 3 7 2 5
#6: 2 34 NA NA
#7: 5 10 7 1
#8: 23 47 NA NA
A hadleyverse corresponding to the above logic would be
library(dplyr)
tibble::rownames_to_column(df1[1:4]) %>%
  # Stack value_1 / value_2 under ID / X; Y and Z are filled with NA
  bind_rows(., setNames(tibble::rownames_to_column(df1[5:6]),
                        c("rowname", "ID", "X"))) %>%
  # 'rowname' is character: sort numerically so row 10 does not come before
  # row 2. Assumes default (integer-like) row names.
  arrange(as.integer(rowname)) %>%
  select(-rowname)
# ID X Y Z
#1 1 3 2 6
#2 22 54 NA NA
#3 6 11 5 9
#4 52 71 NA NA
#5 3 7 2 5
#6 2 34 NA NA
#7 5 10 7 1
#8 23 47 NA NA
data
# Example input: four rows, integer columns.
df1 <- data.frame(
  ID = c(1L, 6L, 3L, 5L),
  X = c(3L, 11L, 7L, 10L),
  Y = c(2L, 5L, 2L, 7L),
  Z = c(6L, 9L, 5L, 1L),
  value_1 = c(22L, 52L, 2L, 23L),
  value_2 = c(54L, 71L, 34L, 47L)
)
Here's a (very slow) pure R solution using no extra packages:
# Replicate your matrix
input_df <- data.frame(ID = rnorm(10000),
X = rnorm(10000),
Y = rnorm(10000),
Z = rnorm(10000),
value_1 = rnorm(10000),
value_2 = rnorm(10000))
# Preallocate memory to a data frame
output_df <- data.frame(
matrix(
nrow = nrow(input_df)*2,
ncol = ncol(input_df)-2))
# Loop through each row in turn.
# Put the first four elements into the current
# row, and the next two into the current+1 row
# with two NAs attached.
for(i in seq(1, nrow(output_df), 2)){
output_df[i,] <- input_df[i, c(1:4)]
output_df[i+1,] <- c(input_df[i, c(5:6)],NA,NA)
}
colnames(output_df) <- c("ID", "X", "Y", "Z")
Which results in
> head(output_df)
X1 X2 X3 X4
1 0.5529417 -0.93859275 2.0900276 -2.4023800
2 0.9751090 0.13357075 NA NA
3 0.6753835 0.07018647 0.8529300 -0.9844643
4 1.6405939 0.96133195 NA NA
5 0.3378821 -0.44612782 -0.8176745 0.2759752
6 -0.8910678 -0.37928353 NA NA
This should work
data <- read.table(text= "ID X Y Z value_1 value_2
1 3 2 6 22 54
6 11 5 9 52 71
3 7 2 5 2 34
5 10 7 1 23 47", header = TRUE)

# First four columns form the "top" rows, the remaining columns the "bottom"
# rows. Select by position: setdiff() on data frames compares columns by
# value and de-duplicates rows, which breaks the row alignment.
data1 <- data[, 1:4]
data2 <- data[, 5:6]
names(data2) <- names(data1)[seq_len(ncol(data2))]

# Pad the bottom rows with NA for the columns they do not have, then stack.
data2[setdiff(names(data1), names(data2))] <- NA
combined <- rbind(data1, data2)

# Interleave: row i of data1 followed by row i of data2 (which is row i + n).
n <- nrow(data1)
interleaved <- combined[kronecker(seq_len(n), c(0, n), "+"), ]
interleaved
Though why you would need to do this beats me.