I have created a data frame, in the data frame there are 3 sites and I have created a nested for loop to create my desired matrices. THe overall objective is find a more efficient way to do this for each of the 3 sites instead of just the one.
The outputs from the nested for loop (EDmatrix and timelags) are the expected results for the other two sites. I would like to find a more efficient way of obtaining these matrices as well as be able to do it for all site instead of just the one in this example.
set.seed(123)
d1 = sample.int(50, 27)
d2 = sample.int(50, 27)
d3 = sample.int(50, 27)
year <- c(1990:1998)
site <- c(rep("a", 9), rep("b", 9), rep("c", 9))
ED = function(x,y){
#x and y are vectors of spp abundances
#they must be the same length!
if(length(x)!=length(y)) stop("Bad abundances!")
out = sqrt(sum((x-y)^2))
out
}
df <- data.frame(site, year, d1 = d1, d2 = d2, d3 = d3)
Here is the code to get the expected output for only a single site, but I would like to be able to do this for all of the sites in the data frame df.
subdf = subset(df,site=="a") # subset data for one site
EDmatrix = matrix(NA,dim(subdf)[1],dim(subdf)[1]) # create a place to store the dissimilarity values
timeLags = matrix(NA,dim(subdf)[1],dim(subdf)[1]) # create a place to store the time lags
# First loop through all "j" years from 1 to the total number of years
# Now loop through all "k" years from 1 to the total number of years
for(j in 1: length(subdf$year)){
for(k in 1: length(subdf$year)){
# grab density data for year "j"
jdensity <- subdf[j,-c(1:2)]
# grab density data for year "k"
kdensity <- subdf[k,-c(1:2)]
# calculate and store (in the EDmatrix) the ED value based on the data for year j and k
EDmatrix[j,k] <- ED(jdensity, kdensity)
# calculate and store (in timeLags) the time lag (the absolute value of the difference
# in time between year j and k
timeLags[j,k] <- abs(subdf[j, 2] - subdf[k, 2])
}# exit k loop
}# exit j loop
EDmatrix[lower.tri(EDmatrix, diag=T)]=NA # set duplicate entries to NA
timeLags[lower.tri(timeLags, diag=T)]=NA # set duplicate entries to NA
y = as.vector(EDmatrix) # turn the matrix into a vector
x = as.vector(timeLags)
We may use outer for this operation
library(dplyr)
library(tidyr)
library(purrr)
f1 <- function(dat, i, j) {
subdat <- dat %>%
select(starts_with('d'))
jdensity <- subdat[i, ]
kdensity <- subdat[j,]
EDtmp <- ED(jdensity, kdensity)
timetmp <- abs(dat$year[i] - dat$year[j])
tibble(EDtmp, timetmp)
}
f2 <- function(dat, s1, s2) {
mat <- outer(s1, s2, Vectorize(\(i, j) list(f1(dat, i, j))))
EDmatrix <- matrix(map_dbl(mat, ~ .x$EDtmp), length(s1), length(s1))
timeLags <- matrix(map_dbl(mat, ~ .x$timetmp), length(s1), length(s1))
EDmatrix[lower.tri(EDmatrix, diag=TRUE)]=NA
timeLags[lower.tri(timeLags, diag=TRUE)]=NA
y = as.vector(EDmatrix)
x = as.vector(timeLags)
tibble(y, x)
}
out1 <- df %>%
group_by(site) %>%
summarise(out = f2(cur_data(), row_number(), row_number()),
.groups = 'drop') %>%
unnest(out)
-checking with OP's output
> out1$x[out1$site == "a"]
[1] NA NA NA NA NA NA NA NA NA 1 NA NA NA NA NA NA NA NA 2 1 NA NA NA NA NA NA NA 3 2 1 NA NA NA NA NA NA 4 3 2 1 NA NA NA NA NA 5 4 3
[49] 2 1 NA NA NA NA 6 5 4 3 2 1 NA NA NA 7 6 5 4 3 2 1 NA NA 8 7 6 5 4 3 2 1 NA
> x
[1] NA NA NA NA NA NA NA NA NA 1 NA NA NA NA NA NA NA NA 2 1 NA NA NA NA NA NA NA 3 2 1 NA NA NA NA NA NA 4 3 2 1 NA NA NA NA NA 5 4 3
[49] 2 1 NA NA NA NA 6 5 4 3 2 1 NA NA NA 7 6 5 4 3 2 1 NA NA 8 7 6 5 4 3 2 1 NA
> out1$y[out1$site == "a"]
[1] NA NA NA NA NA NA NA NA NA 30.675723 NA NA NA NA
[15] NA NA NA NA 41.388404 18.055470 NA NA NA NA NA NA NA 42.485292
[29] 33.136083 25.729361 NA NA NA NA NA NA 38.288379 41.581246 34.770677 39.433488 NA NA
[43] NA NA NA 13.038405 38.379682 49.264592 54.083269 40.865633 NA NA NA NA 16.431677 25.317978
[57] 36.701499 47.549974 36.359318 15.362291 NA NA NA 34.799425 54.680892 54.018515 49.254441 26.019224 35.791060 41.484937
[71] NA NA 9.433981 34.842503 46.108568 42.801869 45.199558 19.924859 25.079872 38.652296 NA
> y
[1] NA NA NA NA NA NA NA NA NA 30.675723 NA NA NA NA
[15] NA NA NA NA 41.388404 18.055470 NA NA NA NA NA NA NA 42.485292
[29] 33.136083 25.729361 NA NA NA NA NA NA 38.288379 41.581246 34.770677 39.433488 NA NA
[43] NA NA NA 13.038405 38.379682 49.264592 54.083269 40.865633 NA NA NA NA 16.431677 25.317978
[57] 36.701499 47.549974 36.359318 15.362291 NA NA NA 34.799425 54.680892 54.018515 49.254441 26.019224 35.791060 41.484937
[71] NA NA 9.433981 34.842503 46.108568 42.801869 45.199558 19.924859 25.079872 38.652296 NA
Related
Consider this sorted vector:
x <- c(3, 4, 6, 8, 9, 10, 18)
Is there an easy way to complete the sequence with NAs, i.e. get:
#[1] 3 4 NA 6 NA 8 9 10 NA NA NA NA NA NA NA 18
You can create the full sequence and add NAs on unmatched numbers:
s <- seq(min(x), max(x))
s[!s %in% x] <- NA
#[1] 3 4 NA 6 NA 8 9 10 NA NA NA NA NA NA NA 18
You could create even create functions that take as input the vector, a minimum and maximum and add NAs to non matching numbers:
fill_seq <-
function(x, min = NULL, max = NULL, by = 1){
min = if(is.null(min)) min(x) else min
max = if(is.null(max)) max(x) else max
s <- seq(min, max, by = by)
s[!s %in% x] <- NA
s
}
Examples:
fill_seq(x)
# [1] 3 4 NA 6 NA 8 9 10 NA NA NA NA NA NA NA 18
fill_seq(x, min = 1, max = 20)
# [1] NA NA 3 4 NA 6 NA 8 9 10 NA NA NA NA NA NA NA 18 NA NA
fill_seq(x, by = 0.5)
# [1] 3 NA 4 NA NA NA 6 NA NA NA 8 NA 9 NA 10 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 18
Using replace.
replace(array(dim=max(x)), x, x)[-seq_len(min(x) - 1)]
# [1] 3 4 NA 6 NA 8 9 10 NA NA NA NA NA NA NA 18
Assume a data.frame:
df <- data.frame(name = c("a","b","c","d","e"),rank = c(1,1,4,3,2))
name rank
a 1
b 1
c 4
d 3
e 2
Based on the above data.frame, I want to create a new one that holds the count of transitions from one rank to another. So the output would be something like this:
name 1to1 1to2 1to3 1to4 2to1 2to2 2to3 2to4 3to1 3to2 3to3 3to4 4to1 4to2 4to3 4to4
1 b 1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
2 c NA NA NA 1 NA NA NA NA NA NA NA NA NA NA NA NA
3 d NA NA NA NA NA NA NA NA NA NA NA NA NA NA 1 NA
4 e NA NA NA NA NA NA NA NA NA 1 NA NA NA NA NA NA
One way to do this would be to run a for loop and then using ifs but I am pretty sure there should be a more efficient way of doing this.
For example, if item d has a rank of 3 and item c is ranked as 4 then the code should increase the count of the 4to3 column under d's row (as per example above). Please let me know if this is unclear and I appreciate all the help.
P.S. colnames are not that important.
You could use Map to create sequences for extracting the transitions and collapse them into the desired form using paste.
tmp <- sapply(Map(seq, 1:(nrow(df1)-1), 2:nrow(df1)), function(i) df1$rank[i])
v <- apply(tmp, 2, function(x) paste(x, collapse="to"))
Then create a grid with all permutations
to <- apply(expand.grid(1:4, 1:4), 1, function(x) paste(x, collapse="to"))
and compare them with the actual transitions to get the resulting binary structure; create a data frame out of it.
res <- data.frame(name=df1$name[-1], t(sapply(v, function(i) setNames(+(i == to), to))))
Afterwards, you may convert the zeroes to NA using
res[res == 0] <- NA
Result
res
# name X1to1 X2to1 X3to1 X4to1 X1to2 X2to2 X3to2 X4to2 X1to3 X2to3 X3to3 X4to3 X1to4 X2to4 X3to4 X4to4
# 1to1 b 1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
# 1to4 c NA NA NA NA NA NA NA NA NA NA NA NA 1 NA NA NA
# 4to3 d NA NA NA NA NA NA NA NA NA NA NA 1 NA NA NA NA
# 3to2 e NA NA NA NA NA NA 1 NA NA NA NA NA NA NA NA NA
Data
df1 <- structure(list(name = structure(1:5, .Label = c("a", "b", "c",
"d", "e"), class = "factor"), rank = c(1, 1, 4, 3, 2)), class = "data.frame", row.names = c(NA,
-5L))
Set every non-NA value that has a non-NA value to "his left" to NA.
Data
a <- c(3,2,3,NA,NA,1,NA,NA,2,1,4,NA)
[1] 3 2 3 NA NA 1 NA NA 2 1 4 NA
Desired Output
[1] 3 NA NA NA NA 1 NA NA 2 NA NA NA
My working but ugly solution:
IND <- !(is.na(a)) & data.table::rleidv(!(is.na(a))) %>% duplicated
a[IND]<- NA
a
There's gotta be a better solution ...
Alternatively,
a[-1][diff(!is.na(a)) == 0] <- NA; a
# [1] 3 NA NA NA NA 1 NA NA 2 NA NA NA
OK for brevity...
a[!is.na(dplyr::lag(a))]<-NA
a
[1] 3 NA NA NA NA 1 NA NA 2 NA NA NA
You can do a simple ifelse statement where you add your vector with a lagged vector a. If the result is NA then the value should remain the same. Else, NA, i.e.
ifelse(is.na(a + dplyr::lag(a)), a, NA)
#[1] 3 NA NA NA NA 1 NA NA 2 NA NA NA
tidyr's spread function only takes column names without quotes. Is there a way I can pass in a variable that contains the column name
for eg
# example using gather()
library("tidyr")
dummy.data <- data.frame("a" = letters[1:25], "B" = LETTERS[1:5], "x" = c(1:25))
dummy.data
var = "x"
dummy.data %>% gather(key, value, var)
This gives an error
Error: All select() inputs must resolve to integer column positions.
The following do not:
* var
Which is solved using match function which gives the required column position
dummy.data %>% gather(key, value, match(var, names(.)))
But this same approach doesn't work for the spread function
dummy.data %>% spread(a, match(var, names(.)))
Error: Invalid column specification
Do gather and spread functions take different column specification. gather takes a column index while spread doesn't mention what it wants
If you want to use standard evaluation you need to use gather_ or spread_
These 2 give the same results
dummy.data %>% gather_("key", "value", var)
dummy.data %>% gather(key, value, match(var, names(.)))
And this works :
dummy.data %>% spread_("a",var)
# B a b c d e f g h i j k l m n o p q r s t u v w x y
# 1 A 1 NA NA NA NA 6 NA NA NA NA 11 NA NA NA NA 16 NA NA NA NA 21 NA NA NA NA
# 2 B NA 2 NA NA NA NA 7 NA NA NA NA 12 NA NA NA NA 17 NA NA NA NA 22 NA NA NA
# 3 C NA NA 3 NA NA NA NA 8 NA NA NA NA 13 NA NA NA NA 18 NA NA NA NA 23 NA NA
# 4 D NA NA NA 4 NA NA NA NA 9 NA NA NA NA 14 NA NA NA NA 19 NA NA NA NA 24 NA
# 5 E NA NA NA NA 5 NA NA NA NA 10 NA NA NA NA 15 NA NA NA NA 20 NA NA NA NA 25
I have a function that takes as input a dataframe with certain columns
columns =['a', 'b',...,'z']
Now I have a dataframe DF with only few of these columns DF_columns = ['f', 'u', 'z']
How can I create a dataframe that has all the columns with value NA if the columns are not in DF and that coincides with DF on the columns ['f', 'u', 'z']
Example:
d = data.frame('g'=c(1,2,3), 's' = c(4,2,3))
columns = letters[1:21]
columns
[1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s" "t"
[21] "u"
> d
g s
1 1 4
2 2 2
3 3 3
>
x.or.na <- function(x, df) if (x %in% names(df)) df[[x]] else NA
as.data.frame(Map(x.or.na, columns, list(d)))
set.seed(42)
DF <- setNames(as.data.frame(matrix(sample(1:15, 15, replace=TRUE), ncol=3)), c('f', 'u', 'z') )
DF
# f u z
#1 14 8 7
#2 15 12 11
#3 5 3 15
#4 13 10 4
#5 10 11 7
res <- do.call(`data.frame`,lapply(split(letters[4:26], letters[4:26]),
function(x){x1 <- match(x, colnames(DF)); if(!is.na(x1)) DF[,x1] else NA}))
res
# d e f g h i j k l m n o p q r s t u v w x y z
#1 NA NA 14 NA NA NA NA NA NA NA NA NA NA NA NA NA NA 8 NA NA NA NA 7
#2 NA NA 15 NA NA NA NA NA NA NA NA NA NA NA NA NA NA 12 NA NA NA NA 11
#3 NA NA 5 NA NA NA NA NA NA NA NA NA NA NA NA NA NA 3 NA NA NA NA 15
#4 NA NA 13 NA NA NA NA NA NA NA NA NA NA NA NA NA NA 10 NA NA NA NA 4
#5 NA NA 10 NA NA NA NA NA NA NA NA NA NA NA NA NA NA 11 NA NA NA NA 7
Using dplyr
library(dplyr)
DF %>%
do({x1 <-data.frame(., setNames(as.list(rep(NA, sum(!letters[4:26] %in% names(DF)))),
setdiff(letters[4:26], names(DF))))
x1[,order(colnames(x1))] })
# d e f g h i j k l m n o p q r s t u v w x y z
#1 NA NA 14 NA NA NA NA NA NA NA NA NA NA NA NA NA NA 8 NA NA NA NA 7
#2 NA NA 15 NA NA NA NA NA NA NA NA NA NA NA NA NA NA 12 NA NA NA NA 11
#3 NA NA 5 NA NA NA NA NA NA NA NA NA NA NA NA NA NA 3 NA NA NA NA 15
#4 NA NA 13 NA NA NA NA NA NA NA NA NA NA NA NA NA NA 10 NA NA NA NA 4
#5 NA NA 10 NA NA NA NA NA NA NA NA NA NA NA NA NA NA 11 NA NA NA NA 7
This is quite easy (in terms of syntax) and efficient (in terms of speed) using the data.table package:
require(data.table) ## 1.9.2+
setDT(d)[, setdiff(columns, names(d)) := NA] ## (1)
setcolorder(d, columns) ## (2)
setDF(d) ## (3)
setDT converts d to a data.table, after which we use the := operator to create new columns by reference. There are many ways to use :=, but highlighted here is the use case LHS := RHS. Here LHS is a vector of column names and RHS is the value. NA is provided only once on the RHS, which gets automatically recycled for all other columns. Note that NA by default is logical type in R.
If required you can reorder the columns of d in the same order as columns using setcolorder.
Again, if necessary, you can convert the data.table back to a data.frame, using the function setDF, which again modifies the object by reference. But it's available in the development version v1.9.3 only for now.
Here are a few methods and their timings.
createDF1 <- function(colVec, data)
{
m <- matrix(, nrow = nrow(data), ncol = length(colVec),
dimnames = list(NULL, colVec))
m[, names(data)] <- as.matrix(data)
data.frame(apply(m, 2, as.numeric))
}
createDF2 <- function(colVec, data)
{
rr <- setNames(rep(list(rep(NA_integer_, nrow(data))), length(colVec)), .
nm = colVec)
rr[match(names(data), colVec)] <- data
as.data.frame(rr)
}
createDF3 <- function(colVec, data)
{
rr <- setNames(replicate(length(colVec),
list(rep(NA_integer_, nrow(data)))),
nm = colVec)
rr[match(names(d), colVec)] <- data
as.data.frame(rr)
}
Create a 3,000,000 x 3 data frame to test on:
columns <- letters[1:21]
d <- data.frame(g = 1:3e6L, s = 1:3e6L, j = 1:3e6L)
Run some tests:
system.time({ createDF1(columns, d) })
# user system elapsed
# 5.022 1.023 6.054
system.time({ createDF2(columns, d) })
# user system elapsed
# 0.007 0.004 0.011
system.time({ createDF3(columns, d) })
# user system elapsed
# 0.105 0.077 0.183
Of these three, it looks like rep(list(rep(NA_integer_, nrow(data))), length(columns)) is the way to go, and replace values from that.
Setup:
set.seed(1)
DF_all <- setNames(data.frame(matrix(rnorm(5*26), nrow=5, ncol=26)), letters)
DF <- DF_all[, c('f','u','z')]
Create a new empty dataframe and populate with your columns:
DF2 <- setNames(data.frame(matrix(nrow=5, ncol=26)), letters)
DF2[, c('f','u','z')] <- DF[, c('f','u','z')]
Result:
> DF2
a b c d e f g h i j k l m n o p q r s t u v w x y z
1 NA NA NA NA NA -0.05612874 NA NA NA NA NA NA NA NA NA NA NA NA NA NA -0.62036668 NA NA NA NA 0.71266631
2 NA NA NA NA NA -0.15579551 NA NA NA NA NA NA NA NA NA NA NA NA NA NA 0.04211587 NA NA NA NA -0.07356440
3 NA NA NA NA NA -1.47075238 NA NA NA NA NA NA NA NA NA NA NA NA NA NA -0.91092165 NA NA NA NA -0.03763417
4 NA NA NA NA NA -0.47815006 NA NA NA NA NA NA NA NA NA NA NA NA NA NA 0.15802877 NA NA NA NA -0.68166048
5 NA NA NA NA NA 0.41794156 NA NA NA NA NA NA NA NA NA NA NA NA NA NA -0.65458464 NA NA NA NA -0.32427027
[<- could be used to fill up the missing columns with NA.
`[<-`(d,, setdiff(columns, names(d)), NA)[columns]
#`[<-`(d,, columns[!columns %in% names(d)], NA)[columns] #Alternative
# a b c d e f g h i j k l m n o p q r s t u
#1 NA NA NA NA NA NA 1 NA NA NA NA NA NA NA NA NA NA NA 4 NA NA
#2 NA NA NA NA NA NA 2 NA NA NA NA NA NA NA NA NA NA NA 2 NA NA
#3 NA NA NA NA NA NA 3 NA NA NA NA NA NA NA NA NA NA NA 3 NA NA
Or directly adding the missing columns to the original data.frame
d[columns[!columns %in% names(d)]] <- NA
d[columns]
# a b c d e f g h i j k l m n o p q r s t u
#1 NA NA NA NA NA NA 1 NA NA NA NA NA NA NA NA NA NA NA 4 NA NA
#2 NA NA NA NA NA NA 2 NA NA NA NA NA NA NA NA NA NA NA 2 NA NA
#3 NA NA NA NA NA NA 3 NA NA NA NA NA NA NA NA NA NA NA 3 NA NA
Or in a function:
f <- function(DF, COL) {
d[columns[!columns %in% names(d)]] <- NA
d[columns]
}
f(d, columns)
# a b c d e f g h i j k l m n o p q r s t u
#1 NA NA NA NA NA NA 1 NA NA NA NA NA NA NA NA NA NA NA 4 NA NA
#2 NA NA NA NA NA NA 2 NA NA NA NA NA NA NA NA NA NA NA 2 NA NA
#3 NA NA NA NA NA NA 3 NA NA NA NA NA NA NA NA NA NA NA 3 NA NA
Data
d <- data.frame('g'=c(1,2,3), 's' = c(4,2,3))
columns <- letters[1:21]