cbind in for loop with unequal number of rows - r

This is my data frame
>head(dat)
Var1 Freq
1 89 2
2 95 2
3 97 1
4 99 2
5 103 2
6 104 2
I want to iterate over a for loop and append dat$Freq using cbind. Is it possible to append each Freq to the same Var1 when NA occurs in Freq?

I think OP is looking to merge a list of data.frames instead of cbind
Following should do the trick.
DF.LIST <- lapply(1:5, function(x) {
rows <- sample(1:5, 1)
data.frame(Var1 = sample(1:5, rows), Freq = sample(5:10, rows))
})
DF.LIST
## [[1]]
## Var1 Freq
## 1 2 6
## 2 4 7
## 3 3 9
## 4 5 10
##
## [[2]]
## Var1 Freq
## 1 3 10
## 2 2 9
##
## [[3]]
## Var1 Freq
## 1 4 5
## 2 3 6
##
## [[4]]
## Var1 Freq
## 1 1 6
## 2 2 10
## 3 5 7
## 4 3 9
## 5 4 8
##
## [[5]]
## Var1 Freq
## 1 5 10
##
OPTION 1
The problem with using the Reduce and merge combo directly on such a list is that merge will end up joining on both the Var1 and Freq columns. To avoid that, we first rename the second column in each data.frame by appending an index number. After that, the Reduce and merge combo should give what the OP wants.
# Give each Freq column a unique, index-based name so that merge() joins on
# Var1 only instead of on both Var1 and Freq.
for (i in seq_along(DF.LIST)) {
  names(DF.LIST[[i]]) <- c("Var1", paste0("Freq", i))
}
# Full outer join across the whole list: a Var1 value missing from one data
# frame shows up as NA in that data frame's Freq column.
Reduce(function(...) merge(..., all = TRUE), DF.LIST)
## Var1 Freq1 Freq2 Freq3 Freq4 Freq5
## 1 1 NA NA NA 6 NA
## 2 2 6 9 NA 10 NA
## 3 3 9 10 6 9 NA
## 4 4 7 NA 5 8 NA
## 5 5 10 NA NA 7 10
OPTION 2
You can try following on original DF.LIST directly, but you still need to take care of the column names in the result then.
# Full outer join on Var1 only; TRUE is spelled out because T can be reassigned.
Reduce(function(...) merge(..., by = "Var1", all = TRUE), DF.LIST)
## Var1 Freq.x Freq.y Freq.x Freq.y Freq
## 1 1 NA 6 NA 9 NA
## 2 2 7 NA NA 8 NA
## 3 3 NA 5 NA 7 7
## 4 4 NA 7 5 5 10
## 5 5 9 9 7 6 NA
Warning messages:
1: In merge.data.frame(..., by = "Var1", all = T) :
column names ‘Freq.x’, ‘Freq.y’ are duplicated in the result
2: In merge.data.frame(..., by = "Var1", all = T) :
column names ‘Freq.x’, ‘Freq.y’ are duplicated in the result

Related

Summing Entries in Multiple Unequally-Sized Data Frames With Some (but not All) Rows and Columns the Same

I have the following three data frames:
df1 <- data.frame(A = 1:10, B = 3:12, D = 4:13)
df2 <- data.frame(C = 1:5, B = 4:8, E = 2:6)
df3 <- data.frame(A = 13:10, B = 19:16, F = 1:4)
rownames(df1) <- paste0("row", seq_len(nrow(df1)))
rownames(df2) <- paste0("row", c(1, 3, 5, 7, 11))
rownames(df3) <- paste0("row", c(12, 3, 10, 9))
# > df1
# A B D
# row1 1 3 4
# row2 2 4 5
# row3 3 5 6
# row4 4 6 7
# row5 5 7 8
# row6 6 8 9
# row7 7 9 10
# row8 8 10 11
# row9 9 11 12
# row10 10 12 13
# > df2
# C B E
# row1 1 4 2
# row3 2 5 3
# row5 3 6 4
# row7 4 7 5
# row11 5 8 6
# > df3
# A B F
# row12 13 19 1
# row3 12 18 2
# row10 11 17 3
# row9 10 16 4
When cells from different data frames have the same row and column names, I want to sum their values. In the cases where a cell has no matches (there aren't any other cells with the same row and column name), the final data frame will contain that original value. When a cell with a particular row name and column name combination doesn't exist in any of the original data frames, the final data frame will contain an NA in that position.
The final data frame should look like this data frame:
> df4
A B C D E G
row1 1 7 1 4 2 NA
row2 2 4 NA 5 1 NA
row3 15 28 2 6 5 2
row4 4 6 NA 7 3 NA
row5 5 13 3 8 8 NA
row6 6 8 NA 9 5 NA
row7 7 16 4 10 11 NA
row8 8 10 NA 11 7 NA
row9 19 27 NA 12 8 4
row10 21 29 NA 13 9 3
row11 NA 8 5 NA 6 NA
row12 13 19 NA NA NA 1
I'm imagining something with the Reduce() function that can be used on many data frames at once. Is the first step adding missing rows and columns to existing data frames with NAs in all the cells where values are missing?
Thanks!
I think this should work. With row AND column names and one data type, I prefer matrices to data frames, but you can convert the final matrix back to a data frame if you need.
# gather the data frames into one list
df_list <- list(df1, df2, df3)
# every row name and column name that occurs in any of the data frames
all_rows <- unique(unlist(lapply(df_list, rownames)))
all_cols <- unique(unlist(lapply(df_list, colnames)))
# result matrix covering the full set of names, all cells NA to begin with
final_mat <- matrix(
  NA,
  nrow = length(all_rows),
  ncol = length(all_cols),
  dimnames = list(all_rows, all_cols)
)
# fold each data frame into the cells it covers
for (d in df_list) {
  r <- rownames(d)
  k <- colnames(d)
  cells <- final_mat[r, k]
  # a cell that is still NA has no contribution yet, so it counts as 0
  cells[is.na(cells)] <- 0
  final_mat[r, k] <- cells + as.matrix(d)
}
final_mat
# A B D C E F
# row1 1 7 4 1 2 NA
# row2 2 4 5 NA NA NA
# row3 15 28 6 2 3 2
# row4 4 6 7 NA NA NA
# row5 5 13 8 3 4 NA
# row6 6 8 9 NA NA NA
# row7 7 16 10 4 5 NA
# row8 8 10 11 NA NA NA
# row9 19 27 12 NA NA 4
# row10 21 29 13 NA NA 3
# row11 NA 8 NA 5 6 NA
# row12 13 19 NA NA NA 1
Here's a dplyr alternative -
Put the dataframes in a list
make rownames as a separate column from each dataframe
Combine them in one dataframe
For each rowname sum the values from all the columns. If a column has all the NA's we return NA. A shortcut to do that is to use hablar::sum_.
Order the data based on rownames
Use column_to_rownames to assign the values back as rownames.
library(dplyr)
df_list <- list(df1, df2, df3)
# rownames_to_column() and column_to_rownames() are tibble functions that
# dplyr does not re-export, so they are called with an explicit namespace.
purrr::map_df(df_list, ~ .x %>% tibble::rownames_to_column("row")) %>%
  group_by(row) %>%
  # hablar::sum_ gives NA (not 0) when every value in the group is NA
  summarise(across(A:F, hablar::sum_)) %>%
  # order(mixedorder(row)) is the natural-order rank, so row2 precedes row10
  arrange(order(gtools::mixedorder(row))) %>%
  tibble::column_to_rownames("row")
# A B D C E F
#row1 1 7 4 1 2 NA
#row2 2 4 5 NA NA NA
#row3 15 28 6 2 3 2
#row4 4 6 7 NA NA NA
#row5 5 13 8 3 4 NA
#row6 6 8 9 NA NA NA
#row7 7 16 10 4 5 NA
#row8 8 10 11 NA NA NA
#row9 19 27 12 NA NA 4
#row10 21 29 13 NA NA 3
#row11 NA 8 NA 5 6 NA
#row12 13 19 NA NA NA 1
Using data.table
library(data.table)
rbindlist(list(setDT(df1, keep.rownames = TRUE),
setDT(df2, keep.rownames = TRUE),
setDT(df3, keep.rownames = TRUE)), fill = TRUE)[,
lapply(.SD, function(x) dplyr::na_if(sum(x, na.rm = TRUE), 0)), rn]

Applying custom function to each row uses only first value of argument

I am trying to recode NA values to 0 in a subset of columns using the following dataset:
set.seed(1)
df <- data.frame(
id = c(1:10),
trials = sample(1:3, 10, replace = T),
t1 = c(sample(c(1:9, NA), 10)),
t2 = c(sample(c(1:7, rep(NA, 3)), 10)),
t3 = c(sample(c(1:5, rep(NA, 5)), 10))
)
Each row has a certain number of trials associated with it (between 1-3), specified by the trials column. columns t1-t3 represent scores for each trial.
The number of trials indicates the subset of columns in which NAs should be recoded to 0: NAs that are within the number of trials represent missing data, and should be recoded as 0, while NAs outside the number of trials are not meaningful, and should remain NAs. So, for a row where trials == 3, an NA in column t3 would be recoded as 0, but in a row where trials == 2, an NA in t3 would remain an NA.
So, I tried using this function:
# Set to 0 every NA found among the first `num.sun + 2` elements of `x`
# and return the modified vector.
replace0 <- function(x, num.sun) {
  leading <- x[1:(num.sun + 2)]
  na_pos <- which(is.na(leading))
  x[na_pos] <- 0
  x
}
This works well for single vectors. When I try applying the same function to a data frame with apply(), though:
apply(df, 1, replace0, num.sun = df$trials)
I get a warning saying:
In 1:(num.sun + 2) :
numerical expression has 10 elements: only the first used
The result is that instead of having the value of num.sun change every row according to the value in trials, apply() simply uses the first value in the trials column for every single row. How could I apply the function so that the num.sun argument changes according to the value of df$trials?
Thanks!
Edit: as some have commented, the original example data had some non-NA scores that didn't make sense according to the trials column. Here's a corrected dataset:
df <- data.frame(
id = c(1:5),
trials = c(rep(1, 2), rep(2, 1), rep(3, 2)),
t1 = c(NA, 7, NA, 6, NA),
t2 = c(NA, NA, 3, 7, 12),
t3 = c(NA, NA, NA, 4, NA)
)
Another approach:
# create an index (row, col) of the NA values
w <- which(is.na(df), arr.ind = TRUE)
# create an index with the max column by row where an NA is allowed to be replaced by a zero
m <- matrix(c(seq_len(nrow(df)), (df$trials + 2)), ncol = 2)
# subset 'w' such that only the NA's which fall in the scope of 'm' remain;
# drop = FALSE keeps 'i' a two-column matrix even when only one NA qualifies
# (a bare vector would make df[i] address whole columns instead of cells)
i <- w[w[, 2] <= m[, 2][match(w[, 1], m[, 1])], , drop = FALSE]
# use 'i' to replace the allowed NA's with a zero
df[i] <- 0
which gives:
> df
id trials t1 t2 t3
1 1 1 3 NA 5
2 2 2 2 2 NA
3 3 2 6 6 4
4 4 3 0 1 2
5 5 1 5 NA NA
6 6 3 7 0 0
7 7 3 8 7 0
8 8 2 4 5 1
9 9 2 1 3 NA
10 10 1 9 4 3
You could easily wrap this in a function:
# Replace NAs with 0 in the cells that fall within each row's trial count.
#
# df: data frame with a `trials` column; for each row, column index
#     `trials + 2` is the last column in which an NA is replaced.
# Returns `df` with the in-scope NAs set to 0; every other cell is unchanged.
replace.NA.with.0 <- function(df) {
  # (row, col) position of every NA in df
  w <- which(is.na(df), arr.ind = TRUE)
  # last column index in which an NA may be zeroed, one entry per row of df
  last_col <- df$trials + 2
  # drop = FALSE keeps a two-column matrix even when exactly one NA qualifies;
  # a bare vector would make df[i] select whole columns instead of cells
  i <- w[w[, 2] <= last_col[w[, 1]], , drop = FALSE]
  df[i] <- 0
  df
}
Now, using replace.NA.with.0(df) will produce the above result.
As noted by others, some rows (1, 3 & 10) have more values than trials. You could tackle that problem by rewriting the above function to:
# Make the score columns consistent with each row's trial count.
#
# df: data frame with a `trials` column; for each row, column index
#     `trials + 2` is treated as the last column that belongs to that row.
# Every NA is first set to 0, then every cell to the right of column
# `trials + 2` is set to NA. Returns the modified data frame.
replace.with.NA.or.0 <- function(df) {
  # every missing value starts out as 0 ...
  df[is.na(df)] <- 0
  # ... then cells beyond the row's last column are blanked. The limit is
  # computed here from `df` (not read from a global) and the column count
  # comes from ncol(df) rather than a hard-coded 5.
  last_col <- df$trials + 2
  # beyond[r, c] is TRUE when column c lies to the right of last_col[r]
  beyond <- outer(last_col, seq_len(ncol(df)), FUN = "<")
  df[beyond] <- NA
  df
}
Now, using replace.with.NA.or.0(df) produces the following result:
id trials t1 t2 t3
1 1 1 3 NA NA
2 2 2 2 2 NA
3 3 2 6 6 NA
4 4 3 0 1 2
5 5 1 5 NA NA
6 6 3 7 0 0
7 7 3 8 7 0
8 8 2 4 5 NA
9 9 2 1 3 NA
10 10 1 9 NA NA
Here I just rewrite your function using double subsetting x[paste0('t',x['trials'])], which overcomes the problem in the other two solutions with row 6
# For one row `x` (a named vector holding `trials` and the t<k> scores), set
# the score named "t<trials>" to 0 when it is NA; return the row.
replace0 <- function(x) {
  target <- paste0('t', x['trials'])
  if (is.na(x[target])) {
    x[target] <- 0
  }
  x
}
t(apply(df, 1, replace0))
id trials t1 t2 t3
[1,] 1 1 3 NA 5
[2,] 2 2 2 2 NA
[3,] 3 2 6 6 4
[4,] 4 3 NA 1 2
[5,] 5 1 5 NA NA
[6,] 6 3 7 NA 0
[7,] 7 3 8 7 0
[8,] 8 2 4 5 1
[9,] 9 2 1 3 NA
[10,] 10 1 9 4 3
Here is a way to do it:
x <- is.na(df)
df[x & t(apply(x, 1, cumsum)) > 3 - df$trials] <- 0
The output looks like this:
> df
id trials t1 t2 t3
1 1 1 3 NA 5
2 2 2 2 2 NA
3 3 2 6 6 4
4 4 3 0 1 2
5 5 1 5 NA NA
6 6 3 7 0 0
7 7 3 8 7 0
8 8 2 4 5 1
9 9 2 1 3 NA
10 10 1 9 4 3
> x <- is.na(df)
> df[x & t(apply(x, 1, cumsum)) > 3 - df$trials] <- 0
> df
id trials t1 t2 t3
1 1 1 3 NA 5
2 2 2 2 2 NA
3 3 2 6 6 4
4 4 3 0 1 2
5 5 1 5 NA NA
6 6 3 7 0 0
7 7 3 8 7 0
8 8 2 4 5 1
9 9 2 1 3 NA
10 10 1 9 4 3
Note: rows 1, 3 and 10 are problematic, since they have more non-NA values than trials.
Here's a tidyverse way, note that it doesn't give the same output as other solutions.
Your example data shows results for trials that "didn't happen", I assumed your real data doesn't.
library(tidyverse)
df %>%
nest(matches("^t\\d")) %>%
mutate(data = map2(data,trials,~mutate_all(.,replace_na,0) %>% select(.,1:.y))) %>%
unnest
# id trials t1 t2 t3
# 1 1 1 3 NA NA
# 2 2 2 2 2 NA
# 3 3 2 6 6 NA
# 4 4 3 0 1 2
# 5 5 1 5 NA NA
# 6 6 3 7 0 0
# 7 7 3 8 7 0
# 8 8 2 4 5 NA
# 9 9 2 1 3 NA
# 10 10 1 9 NA NA
Using the more commonly used gather strategy this would be:
df %>%
gather(k,v,matches("^t\\d")) %>%
arrange(id) %>%
group_by(id) %>%
slice(1:first(trials)) %>%
mutate_at("v",~replace(.,is.na(.),0)) %>%
spread(k,v)
# # A tibble: 10 x 5
# # Groups: id [10]
# id trials t1 t2 t3
# <int> <int> <dbl> <dbl> <dbl>
# 1 1 1 3 NA NA
# 2 2 2 2 2 NA
# 3 3 2 6 6 NA
# 4 4 3 0 1 2
# 5 5 1 5 NA NA
# 6 6 3 7 0 0
# 7 7 3 8 7 0
# 8 8 2 4 5 NA
# 9 9 2 1 3 NA
# 10 10 1 9 NA NA

Transpose multiple columns as column names and fill with values in R

The sample data as following:
x <- read.table(header=T, text="
ID CostType1 Cost1 CostType2 Cost2
1 a 10 c 1
2 b 2 c 20
3 a 1 b 50
4 a 40 c 1
5 c 2 b 30
6 a 60 c 3
7 c 10 d 1
8 a 20 d 2")
I want the values of the cost-type columns (CostType1 and CostType2) to become the names of new columns, each filled with the corresponding cost for that cost type. If there's no match, the cell is filled with NA. The ideal format is the following:
a b c d
1 10 NA 1 NA
2 NA 2 20 NA
3 1 50 NA NA
4 40 1 NA NA
5 NA 30 2 NA
6 60 NA 3 NA
7 NA NA 10 1
8 20 NA NA 2
A solution using tidyverse. We can first get how many groups are there. In this example, there are two groups. We can convert each group, combine them, and then summarize the data frame with the first non-NA value in the column.
library(tidyverse)
# Get the group numbers
g <- (ncol(x) - 1)/2
x2 <- map_dfr(1:g, function(i){
# Transform the data frame one group at a time
x <- x %>%
select(ID, ends_with(as.character(i))) %>%
spread(paste0("CostType", i), paste0("Cost", i))
return(x)
}) %>%
group_by(ID) %>%
# Select the first non-NA value if there are multiple values
summarise_all(funs(first(.[!is.na(.)])))
x2
# # A tibble: 8 x 5
# ID a b c d
# <int> <int> <int> <int> <int>
# 1 1 10 NA 1 NA
# 2 2 NA 2 20 NA
# 3 3 1 50 NA NA
# 4 4 40 NA 1 NA
# 5 5 NA 30 2 NA
# 6 6 60 NA 3 NA
# 7 7 NA NA 10 1
# 8 8 20 NA NA 2
A base solution using reshape
x1 <- setNames(x[,c("ID", "CostType1", "Cost1")], c("ID", "CostType", "Cost"))
x2 <- setNames(x[,c("ID", "CostType2", "Cost2")], c("ID", "CostType", "Cost"))
reshape(data=rbind(x1, x2), idvar="ID", timevar="CostType", v.names="Cost", direction="wide")

R language check missing data for columns and rows

I have a data frame sells and I want to check the missing data in both rows and columns
What I did for rows is:
sells[, complete.cases(sells)]
nrows(sells[, complete.cases(sells)])
but I didn't know how to solve it for columns
Help please
First let's take the iris dataframe and insert randomly some NA's:
iris.demo <- iris
iris.nas <- matrix(as.logical(sample(FALSE:TRUE, size = 150*5,
prob = c(.9,.1),replace = TRUE)),ncol = 5)
iris.demo[iris.nas] <- NA
For rows, it is pretty straightforward:
sum(complete.cases(iris.demo))
# [1] 75
For columns, two possibilities (among several possible others):
Transposing the whole dataframe
sum(complete.cases(t(iris.demo)))
# [1] 0 # 0 columns are complete
Using lapply to count the "non-missing" on every column and see if it's equal to nrow:
sum(lapply(iris.demo, function(x) sum(!is.na(x))) == nrow(iris.demo))
# [1] 0
You could do it like this:
set.seed(1)
(sells <- data.frame(replicate(2, sample(c(1:3, NA), 10, T)), x3 = 1:10))
# X1 X2 x3
# 1 NA 2 1
# 2 1 3 2
# 3 3 2 3
# 4 1 1 4
# 5 2 NA 5
# 6 2 3 6
# 7 1 NA 7
# 8 2 1 8
# 9 NA 3 9
# 10 2 2 10
Rows:
sells[complete.cases(sells), ]
# X1 X2 x3
# 1 2 1 1
# 2 2 1 2
# 3 3 3 3
# 9 3 2 9
nrow(sells[complete.cases(sells), ])
# [1] 6
Columns:
sells[, sapply(sells, function(col) any(is.na(col)))]
# X1 X2
# 1 2 1
# 2 2 1
# 3 3 3
# 4 NA 2
# 5 1 NA
# 6 NA 2
# 7 NA 3
# 8 3 NA
# 9 3 2
# 10 1 NA
sum(sapply(sells, function(col) any(is.na(col))))
# [1] 2

Appending Dataset in R

I have 2 datasets:
Data1:
Var1 Var2 Var3 Var4
10 10 2 3
9 2 8 3
6 4 4 8
7 3 10 8
Data2:
Var1 Var5 Var3 Var6
3 6 6 4
1 2 5 1
9 2 2 9
2 6 3 2
Now I want to append these 2 datasets
Final Data:
Var1 Var2 Var3 Var4 Var5 Var6
10 10 2 3
9 2 8 3
6 4 4 8
7 3 10 8
3 4 6 6
1 1 2 5
9 9 2 2
2 2 6 3
I can't use rbind to create this dataset. Can anybody please tell me the method to create this dataset? Also, suppose I want to append multiple (more than 2) datasets. What's the procedure?
I recommend the function rbind.fill of the plyr package:
library(plyr)
rbind.fill(Data1, Data2)
# Var1 Var2 Var3 Var4 Var5 Var6
#1 10 10 2 3 NA NA
#2 9 2 8 3 NA NA
#3 6 4 4 8 NA NA
#4 7 3 10 8 NA NA
#5 3 NA 6 NA 6 4
#6 1 NA 5 NA 2 1
#7 9 NA 2 NA 2 9
#8 2 NA 3 NA 6 2
The major advantage of this technique is that it's not limited to two data frames, but allows combining any number of data frames.
If the data still needs to be read from disk, you can do something like:
file_list = list.files()
data_list = lapply(file_list, read.table)
data_combined = do.call("rbind.fill", data_list)
merge(Data1, Data2, all=TRUE, sort=FALSE)
Var1 Var3 Var2 Var4 Var5 Var6
1 10 2 10 3 NA NA
2 9 8 2 3 NA NA
3 6 4 4 8 NA NA
4 7 10 3 8 NA NA
5 3 6 NA NA 6 4
6 1 5 NA NA 2 1
7 9 2 NA NA 2 9
8 2 3 NA NA 6 2
EDIT: A way to combine multiple frames
As detailed here.
Combining more than 2 frames
Data3
Var1 Var3 Var5 Var6
1 2 6 4 1
2 10 1 6 1
3 1 6 3 1
4 9 5 5 7
We'll need to put your data into a list and use a nice package called reshape.
datalist <- list(Data1, Data2, Data3)
library(reshape)
merge_recurse(datalist)
Var1 Var3 Var2 Var4 Var5 Var6
1 10 2 10 3 NA NA
2 9 8 2 3 NA NA
3 6 4 4 8 NA NA
4 7 10 3 8 NA NA
5 3 6 NA NA 6 4
6 1 5 NA NA 2 1
7 9 2 NA NA 2 9
8 2 3 NA NA 6 2
9 2 6 NA NA 4 1
10 10 1 NA NA 6 1
11 1 6 NA NA 3 1
12 9 5 NA NA 5 7
# Open a new directory and keep only the data files to be combined
# Read every file in a directory and stack the contents row-wise.
#
# path: directory to read from; defaults to the working directory, which is
#       what a zero-argument call has always used.
# Each file is read with read.table(header = TRUE, sep = " ").
# Returns the row-bound data frame, or NULL when the directory has no files.
combinedfiles <- function(path = getwd()) {
  # list the directory once; full paths so `path` need not be the working dir
  files <- list.files(path, full.names = TRUE)
  if (length(files) == 0) {
    return(NULL)
  }
  tables <- lapply(files, read.table, header = TRUE, sep = " ")
  # bind once at the end instead of growing the result inside a loop
  do.call(rbind, tables)
}
#Then type the following in the console
combinedfiles()
a version using apply loops (which do not suffer from the rbind slowdown):
# Read all files with a given extension from a directory and combine them
# with plyr::rbind.fill.
#
# file_path: directory containing the files.
# extension: file extension to match, without the leading dot.
# Each file is read with read.table(header = TRUE, sep = " ").
# Returns whatever plyr::rbind.fill produces for the list of tables read.
combined_files <- function(file_path, extension = "csv") {
  # fail loudly if plyr is missing; require() would only return FALSE
  if (!requireNamespace("plyr", quietly = TRUE)) {
    stop("Package 'plyr' is required for rbind.fill()", call. = FALSE)
  }
  file_list <- list.files(
    file_path,
    # anchor the pattern so only names ending in ".<extension>" match
    pattern = paste0("\\.", extension, "$"),
    # full paths, so reading works when file_path is not the working directory
    full.names = TRUE
  )
  data_list <- lapply(file_list, read.table, header = TRUE, sep = " ")
  do.call(plyr::rbind.fill, data_list)
}
Try this:
# read.table() already returns a data frame, so no as.data.frame() is needed
data1 <- read.table("data1", header = TRUE, sep = " ")
data2 <- read.table("data2", header = TRUE, sep = " ")
# all = TRUE keeps unmatched rows from both inputs; the misspelled `all.Y`
# argument was never recognised by merge() and has been removed
merge(data1, data2, all = TRUE)

Resources