Fill missing values in a data frame - r

Hey, I need to fill in the missing values of a data frame. The logic is easy: if there is a value in M[i, j + 1], use M[i, j + 1]; otherwise use M[i, j - 1]. The tricky part is that I need to fill the missing values from the beginning of each row up to the column after the last non-NA value of that row, not only the cells adjacent to non-empty cells.
Here is the data
library(dplyr)

# Example data: column 1 is the row label, columns 2-8 hold the values.
# Everything is character because the label shares the vector with the values.
a1 <- c("a", 9, 8, rep(NA, 5))
a2 <- c("b", NA, NA, NA, NA, 3, NA, 4)
a3 <- c("c", 11, 6, 7, NA, NA, NA, 6)
M <- rbind(a1, a2, a3)

# Number of real data columns; the helper column added below must never be
# read as if it were data.
n_cols <- ncol(M)

# First non-NA value of every row, keyed by the row label. It is the fallback
# for leading NAs that have no filled neighbour yet. Keying by label (instead
# of by position) keeps the lookup correct when a row is entirely NA or when
# group_by() reorders the rows.
first_value <- setNames(
  apply(M[, -1, drop = FALSE], 1, function(r) r[!is.na(r)][1]),
  M[, 1]
)

# Convert to a data frame and record, per row, the position of the last
# non-NA cell (positions count the label column as column 1).
M <- M %>%
  as.data.frame(stringsAsFactors = FALSE) %>%
  group_by(V1) %>%
  do(mutate(
    .,
    last_non_na_col = max(apply(., 1, function(x) max(which(!is.na(x)))))
  ))

for (i in seq_len(nrow(M))) {
  # Fill up to one column past the last non-NA value, but never past the data.
  last_col <- min(M$last_non_na_col[i] + 1, n_cols)
  # seq_len() yields an empty range when last_col < 3 (3:last_col would count
  # backwards instead).
  for (j in seq_len(max(last_col - 2, 0)) + 2) {
    if (is.na(M[[j]][i])) {
      # Prefer the value to the right, then the value to the left, then the
      # row's first non-NA value.
      nxt <- if (j < n_cols) M[[j + 1]][i] else NA
      prv <- M[[j - 1]][i]
      M[[j]][i] <- if (!is.na(nxt)) {
        nxt
      } else if (!is.na(prv)) {
        prv
      } else {
        first_value[[M$V1[i]]]
      }
    }
  }
  # The first data column has no left neighbour: copy from its (now filled)
  # right neighbour.
  if (is.na(M[[2]][i])) {
    M[[2]][i] <- M[[3]][i]
  }
}
The raw data is like this
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
a1 "a" "9" "8" NA NA NA NA NA
a2 "b" NA NA NA NA "3" NA "4"
a3 "c" "11" "6" "7" NA NA NA "6"
The output of my code is the following, which is correct. Please notice that for cell M[3,5], the filled value should be 7 (the number before it), not 6 (the nearest number after it).
V1 V2 V3 V4 V5 V6 V7 V8 last_non_na_col
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <int>
1 a 9 8 8 NA NA NA NA 3
2 b 3 3 3 3 3 4 4 8
3 c 11 6 7 7 7 6 6 8
I did this with a for loop. Can anyone help me do this in the tidyverse?
Thanks,
Cathy

As we have a tbl_df, we could use tidyverse methods
library(tidyverse)
# Long format: one row per (V1, column) pair.
long <- gather(M, key, val, -V1)
long <- group_by(long, V1)
# Within each row label, pull the next non-NA value backwards over the gaps.
long <- fill(long, val, .direction = "up")
# Trailing NAs are untouched by the upward fill; set the first of them to the
# last non-NA value of the group.
long <- mutate(long, val = {
  first_gap <- which(is.na(val))[1]
  last_seen <- val[tail(which(!is.na(val)), 1)]
  replace(val, first_gap, last_seen)
})
# Back to wide format (the result is printed).
spread(long, key, val)
# A tibble: 3 x 8
# Groups: V1 [3]
# V1 V2 V3 V4 V5 V6 V7 V8
# <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#1 a 9 8 8 NA NA NA NA
#2 b 3 3 3 3 3 4 4
#3 c 11 6 7 5 5 6 6
In the OP's for loop, we could use na.locf (to fill up the NA elements by the adjacent non-NA elements - from zoo package)
library(zoo)
# Position of the last non-NA cell in each row (label column counts as 1).
last_non_na_col <- c(3, 8, 8)
n_cols <- ncol(M)
for (i in seq_len(nrow(M))) {
  # Backward fill: every NA takes the next non-NA value to its right.
  # as.list() assigns one scalar per column, which works for both plain data
  # frames and tibbles (a bare length-7 vector does not fit a single tibble
  # row).
  M[i, -1] <- as.list(
    na.locf(unlist(M[i, -1]), fromLast = TRUE, na.rm = FALSE)
  )
  # Only trailing NAs are left; fill up to one column past the last value.
  last_col <- min(n_cols, last_non_na_col[i] + 1)
  # seq_len() gives an empty range when last_col < 3 (3:last_col would count
  # backwards).
  for (j in seq_len(max(last_col - 2, 0)) + 2) {
    if (is.na(M[[j]][i])) {
      # There is no column to the right of the last one.
      nxt <- if (j < n_cols) M[[j + 1]][i] else NA
      M[[j]][i] <- if (!is.na(nxt)) nxt else M[[j - 1]][i]
    }
  }
}
M
# A tibble: 3 x 8
# Groups: V1 [3]
# V1 V2 V3 V4 V5 V6 V7 V8
# <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#1 a 9 8 8 NA NA NA NA
#2 b 3 3 3 3 3 4 4
#3 c 11 6 7 5 5 6 6
NOTE: Here, we created the last_non_na_col as a vector instead of a separate column in the dataset for easiness in indexing
data
# Example input as dput() output: character columns V1-V8, classed as a
# grouped tibble with grouping variable V1. Row "c" has V6 = "5" here, whereas
# the question's a3 vector has NA in that position.
# NOTE(review): the vars/indices/labels attributes look like the layout of an
# older dplyr grouped_df - confirm it still loads with your dplyr version.
M <- structure(list(V1 = c("a", "b", "c"), V2 = c("9", NA, "11"),
V3 = c("8", NA, "6"), V4 = c(NA, NA, "7"), V5 = c(NA_character_,
NA_character_, NA_character_), V6 = c(NA, "3", "5"), V7 = c(NA_character_,
NA_character_, NA_character_), V8 = c(NA, "4", "6")), .Names = c("V1",
"V2", "V3", "V4", "V5", "V6", "V7", "V8"), row.names = c(NA,
-3L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"),
vars = "V1", drop = TRUE, indices = list(
0L, 1L, 2L), group_sizes = c(1L, 1L, 1L), biggest_group_size = 1L,
labels = structure(list(
V1 = c("a", "b", "c")), row.names = c(NA, -3L),
class = "data.frame", vars = "V1", drop = TRUE, .Names = "V1"))

Related

R function (lapply {ifelse})? to return values in a dataframe based on an index df

I have a fairly large dataset of locations and associated points. All the points and locations are index values. I am trying to convert the index values to location codes from another df in a concise way while keeping "na" values. Here is what I have tried.
# Three rows of index values; mixing numbers with the string "na" makes each
# vector character.
a <- c(1,2,3,5, "na", "na")
b <- c(2,1,5,7,3,6)
c <- c(3,6,2,4,1,"na")
df <- rbind(a,b,c)
df <- as.data.frame(df)
# Lookup table: d is the index, e the location code. The codes are numeric
# literals, so a leading zero is not kept (0115 is stored as 115).
d <- c(1,2,3,4,5,6,7,8)
e <- c(0115,0116,1117,1119,1237,1456,1901,2135)
df2 <- cbind(d,e)
df2 <- as.data.frame(df2)
# Attempted lookup over columns 2-6. ifelse() returns the element of df2$e at
# the same position as each element of the test vector; it does not look up
# the row of df2 whose d equals x.
test <- lapply(df[2:6], function(x){ifelse(x %in% df2$d, df2$e, "na")})
The result is repeating values across rows of the first columns associated value. Any ideas how to resolve this. Apparently I don't understand how lapply functions.
use NA and not "na" in your data.
# Long format: one row per (V1, variable) pair, with the index stored in d.
df3 <- melt(data = df, id.vars = "V1", value.name = "d")
# Full outer join on the index so unmatched rows from either side are kept.
merge(x = df3, y = df2, by = "d", all = TRUE)
# d V1 variable e
# 1 1 3 V5 115
# 2 1 2 V2 115
# 3 2 1 V2 116
# 4 2 3 V3 116
# 5 3 1 V3 1117
# 6 3 2 V5 1117
# 7 4 3 V4 1119
# 8 5 2 V3 1237
# 9 5 1 V4 1237
# 10 6 3 V2 1456
# 11 6 2 V6 1456
# 12 7 2 V4 1901
# 13 8 NA <NA> 2135
# 14 NA 1 V6 NA
# 15 NA 1 V5 NA
# 16 NA 3 V6 NA
Data
# Example input as dput() output: df holds numeric index values (with NA) in
# V1-V6, rows named a/b/c.
df <- structure(list(V1 = c(1, 2, 3), V2 = c(2, 1, 6), V3 = c(3, 5,
2), V4 = c(5, 7, 4), V5 = c(NA, 3, 1), V6 = c(NA, 6, NA)), .Names = c("V1",
"V2", "V3", "V4", "V5", "V6"), row.names = c("a", "b", "c"), class = "data.frame")
# Lookup table: d is the index, e the corresponding code.
df2 <- structure(list(d = c(1, 2, 3, 4, 5, 6, 7, 8), e = c(115, 116,
1117, 1119, 1237, 1456, 1901, 2135)), .Names = c("d", "e"), row.names = c(NA,
-8L), class = "data.frame")
Using lapply
# Replace every index in columns 2-6 with its code from df2.
# match() finds the row of df2 whose d equals the index, so the lookup does
# not depend on df2$d being exactly 1, 2, 3, ... in order. Indices that are NA
# (or absent from df2$d) yield NA.
lapply(df[2:6], function(x) {
  df2$e[match(x, df2$d)]
})
# $V2
# [1] 116 115 1456
#
# $V3
# [1] 1117 1237 116
#
# $V4
# [1] 1237 1901 1119
#
# $V5
# [1] NA 1117 115
#
# $V6
# [1] NA 1456 NA

Sort data based on conditions

I have a data frame (x) in R with 5 numeric columns. In addition, the sorting order to be followed is given as a vector, i.e.
1, 0, 2, 4, 3
dataset
v1 v2 v3 v4 v5
1 2 3 4 5
3 13 12 1 4
6 4 6 5 3
Expected result
v1 v2 v3 v4 v5
3 13 12 1 4
1 2 3 4 5
6 4 6 5 3
This vector defines the sorting order: the first column is sorted first, then the 3rd column, then the 5th column, and then the 4th column. Manually it can be done as
# Sort the rows by column 1, then 3, then 5, then 4. order() keeps ties in
# their current order, so the last sort is the most significant key.
# The closing parenthesis of order() must come before the comma: the comma
# belongs to x[rows, ], which selects rows.
x <- x[order(x[[1]]), ]
x <- x[order(x[[3]]), ]
x <- x[order(x[[5]]), ]
x <- x[order(x[[4]]), ]
rownames(x) <- NULL
Problem is for 5 columns, it is easy but it is complicated for 100s of columns.
any lead to this will be appreciated.
Thanks
We can do a match on the original vector and then use a for loop to get the output
# vec[k] is the sort priority of column k (0 = not a sort key). Looking up
# priorities 1, 2, 3, ... in vec gives the column positions in the order the
# sorts must be applied; priorities that do not occur are dropped.
i1 <- match(seq_along(x), vec, nomatch = 0)
i1 <- i1[i1 != 0]
for (i in i1) {
  # x[[i]] passes order() a plain vector. x[i] is a one-column data frame,
  # and recent R versions warn when order() is given a data frame.
  x <- x[order(x[[i]]), ]
}
x
# v1 v2 v3 v4 v5
# 2 3 13 12 1 4
# 1 1 2 3 4 5
# 3 6 4 6 5 3
data
# Example input as dput() output: three rows, integer columns v1-v5.
x <- structure(list(v1 = c(1L, 3L, 6L), v2 = c(2L, 13L, 4L), v3 = c(3L,
12L, 6L), v4 = c(4L, 1L, 5L), v5 = c(5L, 4L, 3L)), .Names = c("v1",
"v2", "v3", "v4", "v5"), class = "data.frame", row.names = c(NA,
-3L))
# Sort priority per column, in column order; 0 marks a column that is not
# used for sorting.
vec <- c(1, 0, 2, 4, 3)

Dynamically Create Variables Based on Binary Indicators in R

I have user-level data that looks like this:
ID V1 V2 V3 V4
001 1 0 1 0
002 0 1 0 1
003 0 0 0 0
004 1 1 1 0
In the above example, I would like an elegant solution (likely using tidyr) to dynamically refactor this to appear as:
ID Num_Vars Var1 Var2 Var3
001 2 V1 V3 NA
002 2 V2 V4 NA
003 0 NA NA NA
004 3 V1 V2 V3
Note that this example is simplified and there are actually many variables. The point is to have code that detects how many variables should be created, based on the maximum number of 1s in Var1-VarX that are populated for any user.
This feels like some fairly standard reshaping: convert to long, manipulate by group, convert back to wide:
# Long format: one row per (ID, variable) pair.
long <- gather(df, key = var, value = value, -ID)
# Keep only the variables that are switched on, grouped per ID.
flagged <- filter(group_by(long, ID), value != 0)
# Count the active variables per ID and number them Var1, Var2, ...
labelled <- mutate(
  flagged,
  Num_Vars = n(),
  Var_Label = paste0("Var", seq_len(n()))
)
# One column per Var label; the constant `value` column is no longer needed.
wide <- select(spread(labelled, key = Var_Label, value = var), -value)
# Re-attach IDs that had no active variable (they were removed by the filter).
full_join(wide, distinct(df, ID))
# Source: local data frame [4 x 5]
# Groups: ID [?]
#
# ID Num_Vars Var1 Var2 Var3
# <int> <int> <chr> <chr> <chr>
# 1 1 2 V1 V3 <NA>
# 2 2 2 V2 V4 <NA>
# 3 4 3 V1 V2 V3
# 4 3 NA <NA> <NA> <NA>
Using this data reproducibly shared with dput():
# Example input as dput() output: integer ID 1-4 and 0/1 indicator columns
# V1-V4.
df = structure(list(ID = 1:4, V1 = c(1L, 0L, 0L, 1L), V2 = c(0L, 1L,
0L, 1L), V3 = c(1L, 0L, 0L, 1L), V4 = c(0L, 1L, 0L, 0L)), .Names = c("ID",
"V1", "V2", "V3", "V4"), class = "data.frame", row.names = c(NA,
-4L))
We can use melt/dcast from data.table
library(data.table)
# setDT() converts df to a data.table by reference; melt() then gives one row
# per (ID, variable) pair.
long <- melt(setDT(df), id.vars = "ID")
# Number of active indicators per ID, added by reference.
long[, Num_vars := sum(value), by = ID]
# Keep the active indicators only.
active <- long[value != 0]
# Join onto the full ID list so IDs without any active indicator are kept.
with_all_ids <- active[df[, "ID", with = FALSE], on = "ID"]
# Wide format: Var1, Var2, ... hold the names of the active variables.
dcast(with_all_ids, ID + Num_vars ~ paste0("Var", rowid(ID)),
      value.var = "variable")
# ID Num_vars Var1 Var2 Var3
#1: 1 2 V1 V3 NA
#2: 2 2 V2 V4 NA
#3: 3 NA NA NA NA
#4: 4 3 V1 V2 V3

Melt the data by separating the columns

I have a data frame df resulted from dplyr's summarise_each
V1_mean V2_mean V3_mean V4_mean V5_mean V1_median V2_median V3_median V4_median V5_median V1_my_mode V2_my_mode V3_my_mode V4_my_mode V5_my_mode V1_sum
1 3 4 NA 3.75 5 3 4 NA 4 5 1 2 NA 4 5 4
V2_sum V3_sum V4_sum V5_sum
1 4 4 4 4
How can I put it in a following format?
var mean median my_mode sum
1 V1 3 3 1 4
2 V2 4 4 2 4
3 V3 NA NA NA 4
4 V4 3.75 4 4 4
5 V5 5 5 5 4
df
# dput() output of the one-row summary data frame: for each of V1-V5 there is
# a <name>_mean, <name>_median, <name>_my_mode and <name>_sum column. The
# my_mode columns are character (V3_my_mode is NA); the sum columns are
# integer.
structure(list(V1_mean = 3, V2_mean = 4, V3_mean = NA_real_,
V4_mean = 3.75, V5_mean = 5, V1_median = 3, V2_median = 4,
V3_median = NA_real_, V4_median = 4, V5_median = 5, V1_my_mode = "1",
V2_my_mode = "2", V3_my_mode = NA, V4_my_mode = "4", V5_my_mode = "5",
V1_sum = 4L, V2_sum = 4L, V3_sum = 4L, V4_sum = 4L, V5_sum = 4L), class = "data.frame", .Names = c("V1_mean",
"V2_mean", "V3_mean", "V4_mean", "V5_mean", "V1_median", "V2_median",
"V3_median", "V4_median", "V5_median", "V1_my_mode", "V2_my_mode",
"V3_my_mode", "V4_my_mode", "V5_my_mode", "V1_sum", "V2_sum",
"V3_sum", "V4_sum", "V5_sum"), row.names = c(NA, -1L))
Here's an approach that makes one small modification: Changes my_mode to myMode in the column names. This makes it easy for separate to work after having melted the data (using gather):
library(dplyr)
library(tidyr)
df %>%
  # Rename *_my_mode to *_myMode so that every column name contains exactly
  # one "_" and separate() splits it into exactly two pieces.
  setNames(sub("my_mode", "myMode", names(.), fixed = TRUE)) %>%
  # Long format: one row per original column.
  gather(var, val, starts_with("V")) %>%
  # Split "V1_mean" into the variable name (V1) and the statistic (V2).
  separate(var, into = c("V1", "V2")) %>%
  # One column per statistic.
  spread(V2, val)
# V1 mean median myMode sum
# 1 V1 3 3 1 4
# 2 V2 4 4 2 4
# 3 V3 <NA> <NA> <NA> 4
# 4 V4 3.75 4 4 4
# 5 V5 5 5 5 4

R - How to transform a list of lists to matrix with unique indexes

I have a list which looks like this
[[1]]
users V1
1 28 3
2 33 1
3 35 4
4 260 1
[[2]]
users V1
1 33 2
2 260 1
3 285 13
How can I create a table like this one using R?
users V1 V2
1 28 3 NA
2 33 1 2
3 35 4 NA
4 260 1 1
5 285 NA 13
You can try
# Fold the list with a full outer join on `users`, two data frames at a time.
Reduce(
  function(left, right) merge(left, right, by = "users", all = TRUE),
  lst
)
# users V1.x V1.y
#1 28 3 NA
#2 33 1 2
#3 35 4 NA
#4 260 1 1
#5 285 NA 13
Another option would be to use join_all from plyr. But, this requires the column names other than the one used in the by= to be named differently.
library(plyr)
# Names of the non-key columns, made unique ("V1", "V1.1", ...), because
# join_all() needs them to differ between the data frames.
value_cols <- sapply(lst, colnames)[-1, ]
nm1 <- make.names(value_cols, unique = TRUE)
# Give the non-key column(s) of one data frame a new name.
rename_value_col <- function(x, y) {
  names(x)[-1] <- y
  x
}
# Full join of all renamed data frames on `users`.
join_all(Map(rename_value_col, lst, nm1), by = "users", type = "full")
# users V1 V1.1
#1 28 3 NA
#2 33 1 2
#3 35 4 NA
#4 260 1 1
#5 285 NA 13
data
# Example input as dput() output: a list of two data frames, each with an
# integer `users` key and an integer `V1` value column.
lst <- list(structure(list(users = c(28L, 33L, 35L, 260L), V1 = c(3L,
1L, 4L, 1L)), .Names = c("users", "V1"), class = "data.frame",
row.names = c("1", "2", "3", "4")), structure(list(users = c(33L, 260L, 285L),
V1 = c(2L, 1L, 13L)), .Names = c("users", "V1"), class = "data.frame",
row.names = c("1", "2", "3")))
You can use do.call with merge on your list.
# Example input: two data frames sharing the `users` key.
l <- list(
  structure(
    list(users = c(28, 33, 35, 260), V1 = c(3, 1, 4, 1)),
    .Names = c("users", "V1"), row.names = c(NA, -4L), class = "data.frame"
  ),
  structure(
    list(users = c(33, 260, 285), V1 = c(2, 1, 13)),
    .Names = c("users", "V1"), row.names = c(NA, -3L), class = "data.frame"
  )
)
# Full outer join on `users`. merge() joins exactly two data frames, so the
# list is folded pairwise with Reduce(); this also works for lists of one or
# of more than two data frames, where do.call(merge, ...) would pass the
# extra data frames to the wrong arguments.
merged <- Reduce(
  function(left, right) merge(left, right, by = "users", all = TRUE),
  l
)
merged
# users V1.x V1.y
#1 28 3 NA
#2 33 1 2
#3 35 4 NA
#4 260 1 1
#5 285 NA 13

Resources