Add/match rows with NA to matrix based on missing unique IDs - r

I am using a panel data set and intend to model it as a dynamic affiliation network using SAOMs. The data is unfortunately very messy and a pain to deal with.
I have managed to create adjacency matrices for each panel wave. However, over time the panel grew and shrank as people joined and left. I need every matrix to have the same number of rows, in the same order according to the unique IDs (which show up as row names when inspecting the objects in R). Every "added ID" should show 10s across its whole row.
Here is a reproducible example that should make the issue clear and also shows what I aim for. I assume this can be solved by smart use of the merge() function, but I could not get it to work:
wave1 <- matrix(c(0,0,1,1,0,1,1,0,1,1), nrow = 5, ncol = 2, dimnames = list(c("1","2","4","5","9"), c("group1","group2")))
wave2 <- matrix(c(0,1,1,0,1,0,1,1), nrow = 4, ncol = 2, dimnames = list(c("1","4","8","9"), c("group1","group2")))
wave1_c <- matrix(c(0,0,1,1,10,0,1,1,0,0,10,1), nrow = 6, ncol = 2, dimnames = list(c("1","2","4","5","8","9"), c("group1","group2")))
wave2_c <- matrix(c(0,10,1,10,1,0,1,10,0,10,1,1), nrow = 6, ncol = 2, dimnames = list(c("1","2","4","5","8","9"), c("group1","group2")))
The numbers in the matrices are arbitrary except for the 10s. Thanks in advance.

Here is a solution in base R using data frames and merge().
Merge with a full outer join (all = TRUE), keep only the columns that belong to each wave, and replace the NAs with 10:
## [2:3] keeps only the two columns that came from the first matrix; column 1 is Row.names with the IDs
dwave1_c <- merge(wave1, wave2, by = 'row.names', all = TRUE, suffixes = c("", ""))[2:3]
dwave2_c <- merge(wave2, wave1, by = 'row.names', all = TRUE, suffixes = c("", ""))[2:3]
dwave1_c[is.na(dwave1_c)] <- 10
dwave2_c[is.na(dwave2_c)] <- 10
as.matrix(dwave1_c)
as.matrix(dwave2_c)
Update: do the merge once and split the result.
both <- merge(wave1, wave2, by = 'row.names', all = TRUE)
Output.
  Row.names group1.x group2.x group1.y group2.y
1         1        0        1        0        1
2         2        0        1       NA       NA
3         4        1        0        1        0
4         5        1        1       NA       NA
5         8       NA       NA        1        1
6         9        0        1        0        1
dwave1_c <- both[,2:3]; colnames(dwave1_c) <- colnames(wave1)
dwave2_c <- both[,4:5]; colnames(dwave2_c) <- colnames(wave2)
dwave1_c[is.na(dwave1_c)] <- 10
dwave2_c[is.na(dwave2_c)] <- 10
Show result.
as.matrix(dwave1_c)
as.matrix(dwave2_c)
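One caveat: after the merge the IDs live in the Row.names column rather than in the row names, so the as.matrix() calls above return matrices without ID labels. A small addition (using the both object from the update) puts them back:
rownames(dwave1_c) <- rownames(dwave2_c) <- both$Row.names
as.matrix(dwave1_c)  # rows are now labelled 1, 2, 4, 5, 8, 9
as.matrix(dwave2_c)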
First try.
## Convert matrix to dataframe.
df1 <- as.data.frame(wave1)
df2 <- as.data.frame(wave2)
## Merge df1 and df2 by row name.
m_df1_df2 <- merge(df1, df2, by = 'row.names', all = TRUE)
rownames(m_df1_df2) <- m_df1_df2$Row.names
# Rows not in df1, but in df2,
# rows not in df2, but in df1
not1_2 <- m_df1_df2[is.na(m_df1_df2$group1.x),][c("group1.x", "group2.x")] # not in df1, in df2
not2_1 <- m_df1_df2[is.na(m_df1_df2$group1.y),][c("group1.y", "group2.y")] # not in df2, in df1
## Same column names.
colnames(not1_2) <- colnames(df1)
colnames(not2_1) <- colnames(df2)
## append
df1_c <- rbind(df1, not1_2)
df2_c <- rbind(df2, not2_1)
## order by row name
df1_c <- df1_c[order(row.names(df1_c)), ]
df2_c <- df2_c[order(row.names(df2_c)), ]
## replace NA by 10
df1_c[is.na(df1_c)] <- 10
df2_c[is.na(df2_c)] <- 10
as.matrix(df1_c)
as.matrix(df2_c)

Converting wave1 and wave2 to data frames in my first attempt is redundant and can be omitted, at the expense of some implicit coercions (merge() and rbind() coerce the matrices to data frames anyway).
## merge wave1 and wave2 by row name.
m_df1_df2 <- merge(wave1, wave2, by = 0, all = TRUE)
rownames(m_df1_df2) <- m_df1_df2$Row.names
# rows not in set 1, but in set 2,
# rows not in set 2, but in set 1.
not1_2 <- m_df1_df2[is.na(m_df1_df2$group1.x),][c("group1.x", "group2.x")]
not2_1 <- m_df1_df2[is.na(m_df1_df2$group1.y),][c("group1.y", "group2.y")]
## Same column names.
colnames(not1_2) <- colnames(wave1)
colnames(not2_1) <- colnames(wave2)
## append.
wave1_c <- rbind(wave1, not1_2)
wave2_c <- rbind(wave2, not2_1)
## order by row name.
wave1_c <- wave1_c[order(row.names(wave1_c)), ]
wave2_c <- wave2_c[order(row.names(wave2_c)), ]
## replace NA by 10.
wave1_c[is.na(wave1_c)] <- 10
wave2_c[is.na(wave2_c)] <- 10
## show result.
wave1_c
wave2_c

Solution using setdiff.
## rownames present in one wave but missing from the other.
rn_not2_1 <- setdiff(rownames(wave1), rownames(wave2)) # in wave1, not in wave2
rn_not1_2 <- setdiff(rownames(wave2), rownames(wave1)) # in wave2, not in wave1
## missing rows to add.
add_to_1 <- wave2[rn_not1_2,,drop=FALSE]
add_to_2 <- wave1[rn_not2_1,,drop=FALSE]
add_to_1[,] <- 10
add_to_2[,] <- 10
## append.
wave1_c <- rbind(wave1, add_to_1)
wave2_c <- rbind(wave2, add_to_2)
## order by row name.
wave1_c <- wave1_c[order(row.names(wave1_c)), ]
wave2_c <- wave2_c[order(row.names(wave2_c)), ]
## show result.
wave1_c
wave2_c
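Since the real data has more than two waves, here is a possible generalisation of the setdiff idea (not part of the answers above, just a sketch; the helper name pad_waves and the fill value are illustrative): collect the union of all IDs once, then pad and reorder every wave against that master set.
pad_waves <- function(waves, fill = 10) {
  ## union of all IDs across waves, sorted numerically when possible
  all_ids <- unique(unlist(lapply(waves, rownames)))
  all_ids <- if (!anyNA(suppressWarnings(as.numeric(all_ids)))) {
    all_ids[order(as.numeric(all_ids))]
  } else {
    sort(all_ids)
  }
  lapply(waves, function(w) {
    missing_ids <- setdiff(all_ids, rownames(w))
    add <- matrix(fill, nrow = length(missing_ids), ncol = ncol(w),
                  dimnames = list(missing_ids, colnames(w)))
    rbind(w, add)[all_ids, , drop = FALSE]  # same rows, same order, every wave
  })
}
padded <- pad_waves(list(wave1 = wave1, wave2 = wave2))
padded$wave1
padded$wave2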

Related

Assign() to specific indices of vectors, vectors specified by string names

I'm trying to assign values to specific indices of a long list of vectors (in a loop), where each vector is specified by a string name. The naive approach
testVector1 <- c(0, 0, 0)
vectorName <- "testVector1"
indexOfInterest <- 3
assign(x = paste0(vectorName, "[", indexOfInterest, "]"), value = 1)
doesn't work; instead it creates a new object named "testVector1[3]" (the goal was to change the value of testVector1 to c(0, 0, 1)).
I know the problem is solvable by overwriting the whole vector:
temporaryVector <- get(x = vectorName)
temporaryVector[indexOfInterest] <- 1
assign(x = vectorName, value = temporaryVector)
but I was hoping for a more direct approach.
Is there some alternative to assign() that solves this?
Similarly, is there a way to assign values to specific elements of columns in data frames, where both the data frames and columns are specified by string names?
If you must do this, you can do it with eval(parse()):
valueToAssign <- 1
stringToParse <- paste0(
  vectorName, "[", indexOfInterest, "] <- ", valueToAssign
)
eval(parse(text = stringToParse))
testVector1
# [1] 0 0 1
But this is not recommended. Better to put the desired objects in a named list, e.g.:
testVector1 <- c(0, 0, 0)
dat <- data.frame(a = 1:5, b = 2:6)
l <- list(
  testVector1 = testVector1,
  dat = dat
)
Then you can assign to them by name or index:
vectorName <- "testVector1"
indexOfInterest <- 3
dfName <- "dat"
colName <- "a"
rowNum <- 3
valueToAssign <- 1
l[[vectorName]][indexOfInterest] <- valueToAssign
l[[dfName]][rowNum, colName] <- valueToAssign
l
# $testVector1
# [1] 0 0 1
#
# $dat
#   a b
# 1 1 2
# 2 2 3
# 3 1 4
# 4 4 5
# 5 5 6
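For the data-frame part of the question: if the data frame has to remain a free-standing object rather than a list element, the get()/assign() round trip from the question carries over directly. This is only a sketch reusing the dat, dfName, colName, rowNum and valueToAssign objects defined above.
tmp <- get(dfName)                     # fetch the data frame by its name
tmp[rowNum, colName] <- valueToAssign  # modify the requested cell
assign(dfName, tmp)                    # write it back under the same name
dat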

Sum observations from two columns, looping over many columns in R

I have searched high and low, but am stuck on how to approach this. I have two data frames whose corresponding columns I want to sum, row by row, looping over many columns. If I were to do this manually, I would want:
df1[1,1]+df2[1,1]
df1[2,1]+df2[2,1]
etc. I've found many helpful examples of how to do something like:
apply(df[,c("a","d")], 1, sum)
though I want to do this over lots of columns. Also, while it's not entirely relevant, I want to phrase my question as close to my reality as possible, so my example below includes NA's, since my actual data contains many missing values.
# make a data frame, df1, with three columns
a <- sample(1:100, 50, replace = T)
b <- sample(100:300, 50, replace = T)
c <- sample(2:50, 50, replace = T)
df1 <- cbind(a,b,c)
# make another data frame, df2, with three columns
x <- sample(1:100, 50, replace = T)
y <- sample(100:300, 50, replace = T)
z <- sample(2:50, 50, replace = T)
df2 <- cbind(x,y,z)
A function to make it possible to randomly throw a few NAs in, from http://www.r-bloggers.com/function-to-generate-a-random-data-set/:
NAins <- NAinsert <- function(df, prop = .1){
  n <- nrow(df)
  m <- ncol(df)
  ## how many cells to set to NA
  num.to.na <- ceiling(prop * n * m)
  ## sample that many distinct cells and translate to row/column indices
  id <- sample(0:(m*n - 1), num.to.na, replace = FALSE)
  rows <- id %/% m + 1
  cols <- id %% m + 1
  sapply(seq(num.to.na), function(x){
    df[rows[x], cols[x]] <<- NA
  })
  return(df)
}
Add the NAs to the frames (assigning the result back, since NAins() returns a modified copy):
df1 <- NAins(df1, .2)
df2 <- NAins(df2, .14)
Then I tried to seq along the columns in each data frame and use apply() with the margin set to 1 to sum each row. This doesn't work:
for(i in seq_along(df1)){
for(j in seq_along(df2)){
apply(c(df1[,i], col2[j]), 1, function(x) sum(x, na.rm = T))}}
Thanks for any help!
You should be able to just replace NA with 0, and then add with "+":
replace(df1, is.na(df1), 0) + replace(df2, is.na(df2), 0)
#    X  Y  Z
# 1  7 19  6
# 2 11 12  1
# 3 16 14 11
# 4 13  7 13
# 5 10  2 11
Alternatively, if you have more than just two data.frames, you can collect them in a list and use Reduce:
Reduce("+", lapply(mget(c("df1", "df2", "df3")), function(x) replace(x, is.na(x), 0)))
Here's some sample data (and what I think is an easier way to create it):
set.seed(1) ## Set a seed so others can reproduce your sample data
dfmaker <- function() {
  setNames(
    data.frame(
      replicate(3, sample(c(NA, 1:10), 5, TRUE), FALSE)),
    c("X", "Y", "Z"))
}
df1 <- dfmaker()
df1
#   X  Y Z
# 1 2  9 2
# 2 4 10 1
# 3 6  7 7
# 4 9  6 4
# 5 2 NA 8
df2 <- dfmaker()
df2
#    X  Y  Z
# 1  5 10  4
# 2  7  2 NA
# 3 10  7  4
# 4  4  1  9
# 5  8  2  3
df3 <- dfmaker()
You can stack the data frames into a 3-dimensional array and sum across it with apply():
install.packages('abind')
library(abind)
## stack the two data frames into a rows x columns x data-set array
arr <- abind(list(df1, df2), along = 3)
## sum over the third dimension, ignoring NAs
results <- apply(arr, MARGIN = c(1, 2), FUN = function(x) sum(x, na.rm = TRUE))
results
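Not part of the original answer, but as a possible shortcut with the same arr array: rowSums() can collapse the third dimension directly and is equivalent to the apply() call above.
rowSums(arr, na.rm = TRUE, dims = 2)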

Link data.frame and matrix

I have a data.frame and a matrix with the same rows but a different number of columns.
All elements of the matrix are integers, but the data.frame includes character values in some columns.
I want to link the rows of these two objects, i.e. if I delete a row in the matrix, the same row in the data.frame should be deleted automatically, and when I sort the data.frame by one of its columns, the rows of the matrix should be sorted accordingly.
Added note: I want to keep the matrix as an integer matrix, so I cannot simply use cbind.
There are (at least) two solutions to this. The easy option is to make a new data.frame that includes the columns of both, as follows:
Sample data
set.seed(123)
df <- data.frame(ID = 1:26, Group = sample(c("A", "B"), 26, TRUE))
mat <- matrix(rnorm(78), ncol = 3, dimnames = list(1:26, paste0("Val", 1:3)))
Make new data.frame, storing names of matrix columns for later reference:
new_df <- cbind(df, mat)
mat_cols <- colnames(mat)
Do some subsetting:
new_df <- new_df[seq(1, 25, 2), ]
Extract matrix back out whenever needed:
as.matrix(new_df[, mat_cols])
The other option is to use an S3 or S4 class. The Bioconductor package Biobase has, for example, an ExpressionSet class which can hold a matrix and phenotype data, and subsetting works to subset both (though the matrix has the rows and columns the opposite way round).
If you wanted to do that more simply (ExpressionSets can be relatively complex to get your head around), here's an S3 implementation:
as.JoinedUp <- function(data_frame, matrix) {
  stopifnot(is.data.frame(data_frame), is.matrix(matrix),
            nrow(data_frame) == nrow(matrix))
  x <- list(data_frame = data_frame, matrix = matrix)
  class(x) <- "JoinedUp"
  x
}

## subsetting: the same row index i is applied to both parts; columns j
## (character names) are routed to whichever part owns them
`[.JoinedUp` <- function(x, i = NULL, j = NULL) {
  if (is.null(i)) {
    i <- 1:nrow(x$data_frame)
  }
  if (is.null(j)) {
    j <- union(colnames(x$data_frame), colnames(x$matrix))
  }
  stopifnot(is.character(j))
  x$data_frame <- x$data_frame[i, intersect(j, colnames(x$data_frame)), drop = FALSE]
  x$matrix <- x$matrix[i, intersect(j, colnames(x$matrix)), drop = FALSE]
  x
}

## replacement: values are split by column name and written into the
## data.frame part and/or the matrix part
`[<-.JoinedUp` <- function(x, i = NULL, j = NULL, value) {
  if (is.null(j)) {
    j <- union(colnames(x$data_frame), colnames(x$matrix))
  }
  if (is.null(i)) {
    i <- 1:nrow(x$data_frame)
  }
  stopifnot(is.character(j))
  if (!is.matrix(value) & !is.data.frame(value)) {
    value <- as.data.frame(t(value), stringsAsFactors = FALSE)
  }
  stopifnot(ncol(value) == length(j))
  if (any(j %in% colnames(x$data_frame))) {
    df_cols <- intersect(j, colnames(x$data_frame))
    x$data_frame[i, df_cols] <- value[, match(df_cols, j)]
  }
  if (any(j %in% colnames(x$matrix))) {
    mat_cols <- intersect(j, colnames(x$matrix))
    x$matrix[i, mat_cols] <- data.matrix(value[, match(mat_cols, j)])
  }
  x
}
Examples:
new_obj <- as.JoinedUp(df, mat)
new_obj[1:3, ]
new_obj[, c("ID", "Val1")]
new_obj[10:15, ]$matrix
new_obj <- new_obj[order(new_obj$matrix[, "Val1"]), ]
new_obj[1:5, c("ID", "Val1")] <- data.frame(ID = 20:24, Val1 = 0)
This is only a skeleton of what you'd need; you'd probably also want to define methods for dim, nrow, ncol, etc.
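For instance, here is a minimal sketch of what a dim method (and a simple print method) could look like; counting the data.frame and matrix columns together is just one possible convention, not something from the original answer. Because nrow() and ncol() are defined via dim(), they then work automatically.
## one possible convention: rows are shared, columns are the union of both parts
dim.JoinedUp <- function(x) {
  c(nrow(x$data_frame), ncol(x$data_frame) + ncol(x$matrix))
}
## compact console display instead of dumping the whole list
print.JoinedUp <- function(x, ...) {
  cat("JoinedUp object:", nrow(x$data_frame), "rows,",
      ncol(x$data_frame), "data.frame columns,", ncol(x$matrix), "matrix columns\n")
  invisible(x)
}
dim(new_obj)  # nrow(new_obj) and ncol(new_obj) now work through dim()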
Try this example:
#dummy data
set.seed(123)
df1 <- data.frame(ID=1:3, x=letters[1:3])
m1 <- matrix(c(1:3,runif(6)), ncol=3)
#cbind data.frame and matrix, results in a data.frame object
res <- cbind(df1, m1)
res
#   ID x 1         2         3
# 1  1 a 1 0.2875775 0.8830174
# 2  2 b 2 0.7883051 0.9404673
# 3  3 c 3 0.4089769 0.0455565
#subset 2nd row
res[ 2,]
#   ID x 1         2         3
# 2  2 b 2 0.7883051 0.9404673
#order by 4th column
res[ order(res[ ,4 ]), ]
#   ID x 1         2         3
# 1  1 a 1 0.2875775 0.8830174
# 3  3 c 3 0.4089769 0.0455565
# 2  2 b 2 0.7883051 0.9404673
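Since the question notes that the matrix part should stay a numeric matrix, one possible follow-up to this cbind approach (just a sketch, assuming the column layout above where the first two columns come from df1) is to pull the matrix back out after subsetting or sorting:
## recover the numeric part as a matrix again
m1_back <- as.matrix(res[ , -(1:2)])
m1_back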

R extract matching values from list of data frames

I have a relatively large amount of data stored in a list of data frames with several columns.
For each element of the list I wish to check one column against a reference and, where there is a match, extract the value held in another column of the same element and place it in a new summary matrix.
e.g. with the following example code:
add1 = c("N1","N1","N1")
coords1 = c(1,2,3)
vals1 = c("a","b","c")
extra1 = c("x","y","x")
add2 = c("N2","N2","N2","N2")
coords2 = c(2,3,4,5)
vals2 = c("b","c","d","e")
extra2 = c("z","y","x","x")
add3 = c("N3","N3","N3")
coords3 = c(1,3,5)
vals3 = c("a","c","e")
extra3 = c("z","z","x")
df1 <- data.frame(add1, coords1, vals1, extra1)
df2 <- data.frame(add2, coords2, vals2, extra2)
df3 <- data.frame(add3, coords3, vals3, extra3)
list_all <- list(df1, df2, df3)
coordinate.extract <- unique(unlist(lapply(list_all, "[[", 2)))
my_matrix <- matrix(0, ncol = length(list_all),
                    nrow = length(coordinate.extract))
my_matrix_new <- cbind(as.character(coordinate.extract), my_matrix)
I would like to end up with:
my_matrix_new =
  V1 V2 V3 V4
  1  a     a
  2  b  b
  3  c  c  c
  4     d
  5     e  e
i.e. the 3rd column of each list element is chosen based on the value of the second column.
I hope this is clear.
Thanks,
Matt
I would use a data.frame as there are mixed classes. You can try merge with Reduce to get the expected output: select the 2nd and 3rd columns in each list element, change the column name of the 2nd so it is the same across all the list elements, merge, and if needed replace the NA elements with ''.
lst1 <- lapply(list_all, function(x) {names(x)[2] <- 'V1';x[2:3] })
res <- Reduce(function(...) merge(..., by='V1', all=TRUE), lst1)
res[-1] <- lapply(res[-1], as.character)
res[is.na(res)] <- ''
res
#  V1 vals1 vals2 vals3
#1  1     a           a
#2  2     b     b
#3  3     c     c     c
#4  4           d
#5  5           e     e
We can change the column names
names(res) <- paste0('V', seq_along(res))
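If a character matrix like my_matrix_new is really wanted rather than a data frame, a small final step (assuming the res object from above, after the rename) could be:
my_matrix_new <- as.matrix(res)
my_matrix_new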

R - co-locate columns with the same name after merge

Situation
I have two data frames, df1 and df2, with the same column headings:
x <- c(1,2,3)
y <- c(3,2,1)
z <- c(3,2,1)
names <- c("id","val1","val2")
df1 <- data.frame(x, y, z)
names(df1) <- names
a <- c(1, 2, 3)
b <- c(1, 2, 3)
c <- c(3, 2, 1)
df2 <- data.frame(a, b, c)
names(df2) <- names
And am performing a merge
#library(dplyr) # not needed for merge
joined_df <- merge(x=df1, y=df2, c("id"),all=TRUE)
This gives me the columns in the joined_df as id, val1.x, val2.x, val1.y, val2.y
Question
Is there a way to co-locate the columns that had the same heading in the original data frames, to give the column order in the joined data frame as id, val1.x, val1.y, val2.x, val2.y?
Note that in my actual data frame I have 115 columns, so I'd like to stay clear of using joined_df <- joined_df[, c(1, 2, 4, 3, 5)] if possible.
Update/Edit: also, I would like to maintain the original order of the column headings, so sorting alphabetically is not an option (on my actual data, that is; I realise it would work with the example I have given).
My desired output is
  id val1.x val1.y val2.x val2.y
1  1      3      1      3      3
2  2      2      2      2      2
3  3      1      3      1      1
Update with solution for general case
The accepted answer solves my issue nicely.
I've adapted the code slightly here to use the original column names, without having to hard-code them in the rep function.
#specify columns used in merge
merge_cols <- c("id")
# identify duplicate columns and remove those used in the 'merge'
dup_cols <- names(df1)
dup_cols <- dup_cols [! dup_cols %in% merge_cols]
# replicate each duplicate column name and append an 'x' and 'y'
dup_cols <- rep(dup_cols, each=2)
var <- c("x", "y")
newnames <- paste(dup_cols, ".", var, sep = "")
#create new column names and sort the joined df by those names
newnames <- c(merge_cols, newnames)
joined_df <- joined_df[newnames]
How about something like this
numrep <- rep(1:2, each = 2)
numrep
var <- c("x", "y")
var
newnames <- paste("val", numrep, ".", var, sep = "")
newdf <- cbind(joined_df$id, joined_df[newnames])
names(newdf)[1] <- "id"
Which should give you a data frame like this:
  id val1.x val1.y val2.x val2.y
1  1      3      1      3      3
2  2      2      2      2      2
3  3      1      3      1      1
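Another general option, not from the answers above but a sketch that assumes merge()'s default .x/.y suffixes: strip the suffixes and order the columns by where each base name appears in df1, which preserves the original column order without hard-coding anything.
base_names <- sub("\\.(x|y)$", "", names(joined_df))
joined_df  <- joined_df[order(match(base_names, names(df1)))]
names(joined_df)
# [1] "id"     "val1.x" "val1.y" "val2.x" "val2.y"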
