R error in '[<-.data.frame'… replacement has # items, need #

I have a function that I've executed many times, but it is now throwing an error that I do not understand. I'm trying to apply the function over a list.
I did not write the function and I have limited experience with functions, so I'm not sure how to troubleshoot the code.
function:
myfun<-function(Year, SampleID, Species, Abundance, resamps) {
  library(vegan)
  counter<-1
  simbaseline<-data.frame(array(NA,dim=c(length(unique(Year)),5)))
  names(simbaseline)<-c("Year", "Jaccard","Horn","Bray","Pearson")
  simnext<-data.frame(array(NA,dim=c(length(unique(Year)),5)))
  names(simnext)<-c("Year", "Jaccard","Horn","Bray","Pearson")
  simhind<-data.frame(array(NA,dim=c(length(unique(Year)),5)))
  names(simhind)<-c("Year", "Jaccard","Horn","Bray","Pearson")
  counter2<-1
  # getting vector with number of samples per year
  nsamples<-c()
  for(y in unique(Year)){
    nsamples<-c(nsamples, length(unique(SampleID[Year==y])))
  }
  t<-1
  minsample<-min(nsamples)
  for(repeats in 1:resamps){
    raref<-data.frame(array(NA,dim=c(1,3)))
    names(raref)<-c("Year","Species","Abundance")
    for(y in unique(Year)){
      # getting samples for this year
      samps<-unique(SampleID[Year==y])
      # re-sampling samples to equalize number of samples
      sam<-as.character(sample(samps,minsample,replace=T))
      # getting data that belongs to the bootstrapped samples
      rarefyear<-data.frame(SampleID[which(SampleID %in% sam & Year == y)],
                            Species[which(SampleID %in% sam & Year == y)],
                            Abundance[which(SampleID %in% sam & Year == y)])
      names(rarefyear)<-c("SampleID", "Species", "Abundance")
      # calculating pooled abundances of each species to store
      spabun<-tapply(as.numeric(rarefyear[,3]),rarefyear[,2],sum)
      spar<-data.frame(rep(y, length(spabun)),names(spabun),spabun, row.names=NULL)
      names(spar)<-c("Year","Species","Abundance")
      raref<-rbind(raref,spar)
      counter<-counter+1
    }
    # calculating year by species table of abundance
    rareftabtemp<-with(raref,tapply(Abundance,list(Year,Species),function(x)x))
    rareftabtemp[is.na(rareftabtemp)]<-0
    Pearsoncor<-cor(t(log(rareftabtemp+1)), method="pearson")
    # calculating between year similarities (NOT DISTANCE!) with Jaccard, Morisita-Horn, Bray and Pearson correlations
    Jacsim<-as.matrix(1-vegdist(rareftabtemp, method="jaccard"))
    Hornsim<-as.matrix(1-vegdist(rareftabtemp, method="horn"))
    Braysim<-as.matrix(1-vegdist(rareftabtemp, method="bray"))
    n<-length(unique(Year))
    simbaseline[counter2:(counter2+n-2),]<-
      cbind(unique(Year)[2:n],Jacsim[2:n],Hornsim[2:n],Braysim[2:n],Pearsoncor[2:n])
    simnext[counter2:(counter2+n-2),]<-
      cbind(unique(Year)[2:n],Jacsim[row(Jacsim)-col(Jacsim)==1],
            Hornsim[row(Hornsim)-col(Hornsim)==1],
            Braysim[row(Braysim)-col(Braysim)==1],
            Pearsoncor[row(Pearsoncor)-col(Pearsoncor)==1])
    # added hindcasting
    simhind[counter2:(counter2+n-2),]<-
      cbind(unique(Year)[1:(n-1)],
            Jacsim[row(Jacsim) %in% 1:(max(row(Jacsim))-1) &
                     col(Jacsim)==max(col(Jacsim))],
            Hornsim[row(Hornsim)%in%1:(max(row(Hornsim))-1) &
                      col(Hornsim)==max(col(Hornsim))],
            Braysim[row(Braysim)%in%1:(max(row(Braysim))-1) &
                      col(Braysim)==max(col(Braysim))],
            Pearsoncor[row(Pearsoncor)%in%1:(max(row(Pearsoncor))-1) &
                         col(Pearsoncor)==max(col(Pearsoncor))])
    counter2<-counter2+n
  }
  baselinesim<-data.frame(unique(Year)[2:n],
                          tapply(simbaseline$Jaccard,simbaseline$Year,mean),
                          tapply(simbaseline$Horn,simbaseline$Year,mean),
                          tapply(simbaseline$Bray,simbaseline$Year,mean),
                          tapply(simbaseline$Pearson,simbaseline$Year,mean))
  names(baselinesim)<-c("Year", "Jaccard","Horn","Bray","Pearson")
  nextsim<-data.frame(unique(Year)[2:n],
                      tapply(simnext$Jaccard,simnext$Year,mean),
                      tapply(simnext$Horn,simnext$Year,mean),
                      tapply(simnext$Bray,simnext$Year,mean),
                      tapply(simnext$Pearson,simnext$Year,mean))
  names(nextsim)<-c("Year", "Jaccard","Horn","Bray","Pearson")
  hindcastsim<-data.frame(unique(Year)[1:(n-1)],
                          tapply(simhind$Jaccard,simhind$Year,mean),
                          tapply(simhind$Horn,simhind$Year,mean),
                          tapply(simhind$Bray,simhind$Year,mean),
                          tapply(simhind$Pearson,simhind$Year,mean))
  names(hindcastsim)<-c("Year", "Jaccard","Horn","Bray","Pearson")
  a<-list(baselinesim,nextsim,hindcastsim)
  return(a)
}
error:
Error in `[<-.data.frame`(`*tmp*`, counter2:(counter2 + n - 2), , value = c(NA, : replacement has 2 items, need 5
Traceback:
6: stop(sprintf(ngettext(m, "replacement has %d item, need %d",
       "replacement has %d items, need %d"), m, n * p), domain = NA)
5: `[<-.data.frame`(`*tmp*`, counter2:(counter2 + n - 2), , value = structure(c(NA,
       2009), .Dim = 2:1))
4: `[<-`(`*tmp*`, counter2:(counter2 + n - 2), , value = structure(c(NA,
       2009), .Dim = 2:1))
3: myfun(x$Year, x$Bay, x$Species, x$Abundance, 20)
2: FUN(X[[i]], ...)
1: lapply(summer.split, function(x) myfun(x$Year, x$Bay,
       x$Species, x$Abundance, 20))
Again, the function has worked many times before without error.
Someone seems to have asked a similar question before, which was answered by @Marat Talipov, but I'm not experienced enough to make sense of the solution.
The answer was:
This error pops up when you're unlucky and i <- runif(n) < 1/2 consists only of FALSE, i.e. no permutations happen. You need to add a check in the swap function to fix this problem.
R error in '[<-.data.frame'... replacement has # items, need #
A subset of my data can be found here:
https://fil.email/sI4Kyhaj
The data was split by "Bay" to generate the list.
Note that the function may not throw the error on a different machine, because the error only seems to occur intermittently.

The problem is located in the piece of code below (and in similar indexed reassignments of data frames elsewhere in your code):
simbaseline[counter2:(counter2+n-2),]<-
cbind(unique(Year)[2:n],Jacsim[2:n],Hornsim[2:n],Braysim[2:n],Pearsoncor[2:n])
What happens is that the number of rows you are trying to assign on the left-hand side is not equal to the number of rows supplied on the right-hand side. To avoid this, you can use an intermediate data frame filled with, e.g., NAs and then reassign that temporary data frame to your target data frame. Please see the simulated code below with explanations of how it can be done:
# simulation
df <- data.frame(i = 1:10, l = letters[1:10], stringsAsFactors = FALSE)
head(df)
# i l
# 1 1 a
# 2 2 b
# 3 3 c
# 4 4 d
# 5 5 e
# 6 6 f
# with error
df[1:5, ] <- cbind(1:3, c("a", "b", "c"))
# Error in `[<-.data.frame`(`*tmp*`, 1:5, , value = c("1", "2", "3", "a", :
# replacement has 6 items, need 10
# without error
dftemp_in <- cbind(1:3, c("a", "b", "c"))
dftemp_out <- df[1:5, ]
dftemp_out[] <- NA
dftemp_out[seq(nrow(dftemp_in)), ] <- dftemp_in
df[1:5, ] <- dftemp_out
df
# i l
# 1 1 a
# 2 2 b
# 3 3 c
# 4 <NA> <NA>
# 5 <NA> <NA>
# 6 6 f
# 7 7 g
# 8 8 h
# 9 9 i
# 10 10 j
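Applied to your function, the same padding idea would look roughly like this for the first assignment (a sketch only, reusing the variable names from myfun; it guards against the size mismatch rather than addressing why the right-hand side comes up short):
# Sketch: build the right-hand side first, then pad it with NAs so it
# always has as many rows and columns as the slice being replaced.
block <- cbind(unique(Year)[2:n], Jacsim[2:n], Hornsim[2:n],
               Braysim[2:n], Pearsoncor[2:n])
rows  <- counter2:(counter2 + n - 2)
pad   <- matrix(NA, nrow = length(rows), ncol = ncol(simbaseline))
pad[seq_len(nrow(block)), seq_len(ncol(block))] <- block
simbaseline[rows, ] <- pad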

Related

Counting unique occurrence of two values per ID, considering presence of third value

The title is not great - I apologise that I was not able to summarise the question better. I am conceptually a bit lost and wondered if there is a better approach for the following:
What I have:
I have two columns, ID and eye. Eyes can be coded as "r", "l" or "b" (right/left/both eyes). The column does not have to contain all of these values, and it can include NA.
What I want:
I want to count the number of distinct eyes by ID. If "b" occurs, "r" or "l" for the same ID should not be counted separately (because the right and left eyes are already part of "both eyes").
Ideally base R only:
My approach uses base R only, and I would much prefer a base R solution, because this is intended for a package. (Actually, the core of this function is already part of a package, but I wonder if this can be improved).
Other solutions very welcome:
The final function will also be applied to data frames with 10^6 rows and thousands of IDs, so the computation by group should be fast. My solution already seems fairly fast (I have not done a formal test, though). I would therefore think any dplyr::group_by solution would not be an option (at least in my attempts).
# sample data
set.seed(42)
id <- letters[sample(11, replace = TRUE)]
foo1 <- data.frame(id, eye = sample(c("r", "l", "b"), 11, replace = TRUE))
foo2 <- data.frame(id, eye = "r")
foo3 <- foo2
foo3$eye[1:5] <- NA
foo4 <- data.frame(id, eye = "b")
count_eyes <- function(x, pat_col, eye) {
  # reduce to unique combinations of patient and eye, then count occurrence of
  # "eye" by patient. Results in matrix of 0/1
  eye_tab <- table(unique(x[, c(pat_col, eye)]))
  # cases where "b" does not exist must also work (foo2 and foo3)
  if(any(grepl("b", colnames(eye_tab)))){
    # whenever "b" is present, "r" and "l" will be set to 0,
    # so it will not be counted in the next step
    # "r" and "l" might not occur
    if(any(grepl("r|l", colnames(eye_tab)))){
      eye_tab[, c("r","l")][eye_tab[, "b"] == 1] <- 0
    }
  }
  # I chose the programmatic approach because the column names might not be present
  # I add up the 1s in each column. Because "r" is set to 0 previously, I have to
  # add the count for "b" again to get the real number of right eyes.
  n_b <- unname(colSums(eye_tab[, colnames(eye_tab) == "b", drop = FALSE]))
  n_right <- sum(unname(colSums(eye_tab[, colnames(eye_tab) == "r", drop = FALSE])), n_b)
  n_left <- sum(unname(colSums(eye_tab[, colnames(eye_tab) == "l", drop = FALSE])), n_b)
  c(r = n_right, l = n_left)
}
Expected result:
lapply(mget(c("foo1", "foo2", "foo3", "foo4")), count_eyes, pat_col = "id", eye = "eye")
#> $foo1
#> r l
#> 7 6
#>
#> $foo2
#> r l
#> 8 0
#>
#> $foo3
#> r l
#> 6 0
#>
#> $foo4
#> r l
#> 8 8
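As an aside on the speed concern above, an informal timing check could be sketched as follows (this assumes the microbenchmark package and uses made-up data of roughly the intended size):
library(microbenchmark)
# simulate ~10^6 rows with thousands of IDs, including some NAs
big <- data.frame(id  = sample(5000, 1e6, replace = TRUE),
                  eye = sample(c("r", "l", "b", NA), 1e6, replace = TRUE))
microbenchmark(count_eyes(big, "id", "eye"), times = 10)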
The code could be shortened if we convert the column to a factor with the levels specified:
count_eyes <- function(x, pat_col, eye) {
  nm1 <- c('r', 'l')
  x$eye <- factor(x$eye, levels = c("b", nm1)) # // convert to factor
  # reduce to unique combinations of patient and eye, then count occurrence of
  # "eye" by patient. Results in matrix of 0/1
  eye_tab <- table(unique(x[, c(pat_col, eye)]))
  # cases where "b" does not exist must also work (foo2 and foo3)
  if(any(grepl("b", colnames(eye_tab)))){
    # whenever "b" is present, "r" and "l" will be set to 0,
    # so it will not be counted in the next step
    # "r" and "l" might not occur
    if(any(grepl(paste(nm1, collapse="|"), colnames(eye_tab)))){
      eye_tab[, nm1][eye_tab[, "b"] == 1] <- 0
    }
  }
  out <- colSums(eye_tab)
  out[nm1] + out['b']
}
Testing:
lapply(mget(paste0('foo', 1:4)), count_eyes, pat_col = "id", eye = "eye")
#$foo1
#r l
#7 6
#$foo2
#r l
#8 0
#$foo3
#r l
#6 0
#$foo4
#r l
#8 8
Here's another approach with split and rowSums:
count_eyes <- function(x, pat_col, eye) {
  rowSums(sapply(split(subset(x, select = eye),
                       subset(x, select = pat_col)),
                 function(y) {
                   c(r = any(y %in% c("b", "r")),
                     l = any(y %in% c("b", "l")))
                 }))
}
lapply(mget(ls(pattern="foo")),count_eyes, "id", "eye")
$foo1
r l
5 4
$foo2
r l
6 0
$foo3
r l
4 0
$foo4
r l
6 6

permute dataframe but must have unique rows

Say I have a dataframe like this:
d <- data.frame(time = c(1,3,5,6,11,15,15,18,18,20), side = c("L", "R", "R", "L", "L", "L", "L", "R","R","R"), id = c(1,2,1,2,4,3,4,2,1,1), stringsAsFactors = F)
d
time side id
1 1 L 1
2 3 R 2
3 5 R 1
4 6 L 2
5 11 L 4
6 15 L 3
7 15 L 4
8 18 R 2
9 18 R 1
10 20 R 1
I wish to permute the id variable and keep the other two constant. However, importantly, in my final permutations I do not want to have the same id on the same side at the same time. For instance, there are two times/sides where this might occur. In the original data at time 15 and 18 there are two unique ids at the same side (left for time 15 and right for time 18). If I permute using sample there is a chance that the same id shows up at the same time/side combination.
For example,
set.seed(11)
data.frame(time=d$time, side=d$side, id=sample(d$id))
time side id
1 1 L 1
2 3 R 1
3 5 R 4
4 6 L 1
5 11 L 4
6 15 L 2
7 15 L 3
8 18 R 2
9 18 R 2
10 20 R 1
Here, id=2 appears on two rows at time 18 on side "R". This is not allowed in the permutation I need.
One solution would be to brute force this - e.g. say I needed 100 permutations, I could generate 500 and discard those that fail the criterion. However, in my real data I have hundreds of rows, and just using sample almost always leads to a failure. I wonder if there is a better algorithm for doing this? Perhaps a birth-death algorithm?
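For reference, the rejection idea sketched in words above might look something like this (the helper has_clash is made up for illustration):
# generate many candidate shuffles and keep only those with no repeated
# id within any time/side combination
has_clash <- function(d, ids) {
  any(ave(ids, d$time, d$side, FUN = anyDuplicated) > 0)
}
cands <- replicate(500, sample(d$id), simplify = FALSE)
good  <- Filter(function(ids) !has_clash(d, ids), cands)
length(good)   # how many of the 500 candidates survive the filter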
Setup:
library(tidyverse)
d <- data.frame(time = c(1,3,5,6,11,15,15,18,18,20), side = c("L", "R", "R", "L", "L", "L", "L", "R","R","R"), id = c(1,2,1,2,4,3,4,2,1,1), stringsAsFactors = F)
d <- rownames_to_column(d)
I want the rownames to put it back in order at the end.
You need a function that takes a vector (like your id vector) and returns a sample of size n with the constraint that the values have to be different, as in the following (which assumes the sampling you want can actually take place, i.e. you haven't run out of items to sample). For convenience this also returns the "leftovers" that weren't sampled:
samp_uniq_n <- function(vec, n) {
  x <- vec
  out <- rep(NA, n)
  for(i in 1:n) {
    # Here would be a good place to make sure sampling is even possible.
    probs <- prop.table(table(x))
    out[i] <- sample(unique(x), 1, prob = probs)
    x <- x[x != out[i]]
    vec <- vec[-min(which(vec == out[i]))]
  }
  return(list(out = out, vec = vec))
}
Now, we need to split the data into a list of rows that have the same time and side and start the sampling with the largest such:
id <- d$id
d_split <- d %>% select(-id) %>% split(., list(d$time, d$side), drop = TRUE)
d_split_desc <- d_split[order(-sapply(d_split, nrow))]
Then we can do the sampling itself:
for(i in seq_along(d_split_desc)) {
  samp <- samp_uniq_n(id, nrow(d_split_desc[[i]]))
  this_id <- samp$out
  d_split_desc[[i]]$id <- this_id
  id <- samp$vec
}
Finally, some cleanup:
d_permute <- do.call(rbind, d_split_desc) %>%
  arrange(as.numeric(rowname)) %>%
  select(-rowname)
Putting all this in a big function is an annoyance I'll leave to anyone who is interested.
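For completeness, a rough sketch of such a wrapper could look like this (the name permute_ids is made up; it assumes samp_uniq_n() from above is defined, takes the original d with columns time, side and id, and uses base subsetting to stay self-contained):
permute_ids <- function(d) {
  d$rowname <- seq_len(nrow(d))                      # remember original row order
  id <- d$id
  d_split <- split(d[setdiff(names(d), "id")],
                   list(d$time, d$side), drop = TRUE)
  d_split <- d_split[order(-sapply(d_split, nrow))]  # largest groups first
  for (i in seq_along(d_split)) {
    samp <- samp_uniq_n(id, nrow(d_split[[i]]))
    d_split[[i]]$id <- samp$out
    id <- samp$vec
  }
  out <- do.call(rbind, d_split)
  out <- out[order(out$rowname), ]                   # restore original order
  out$rowname <- NULL
  rownames(out) <- NULL
  out
}
permute_ids(d)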

Error in mean and mode in R

I have the following data. I want to get the mean of each column, which for nominal columns means the mode.
df1<-data.frame(c("a","a"),c("b","d"),c(1,5),c(4,8))
names(df1)<-c("x","y","z","w")
df1
x y z w
1 a b 1 4
2 a d 5 8
df2<-as.data.frame(matrix(0,ncol=4,nrow=1))
names(df2)<-c("x","y","z","w")
df2$x<-names(table(df1$x))[table(df1$x)==max(table(df1$x))]
df2$y<-names(table(df1$y))[table(df1$y)==max(table(df1$y))]
df2$z<-mean(df1$z)
df2$w<-mean(df1$w)
If one of the nominal columns has no single most frequent value (as with column y above, where "b" and "d" each occur once), the expression for the mode returns more than one value, and the following error shows up.
Error in `$<-.data.frame`(`*tmp*`, y, value = c("b", "d")) :
replacement has 2 rows, data has 1
What can I do to fix this error? Thank you so much for your help.
You can write a function to calculate either the mean or mode of each column:
get.mean.mod <- function(df) {
  data.frame(lapply(df, function(x) {
    if (is.numeric(x)) return(mean(x))
    freq <- table(x)
    names(freq)[which.max(freq)]
  }))
}
get.mean.mod(df1)
# x y z w
# 1 a b 3 6

Run a separate function for each item depending on the value of another variable with dplyr

I have a dataset which contains a categorical variable. Depending on the value of this variable, I want to run a different function for each such value. All the possible functions have the same return type. I might wish to run say, sin() if category is 'A', cos() if category is 'B', and tan() if category is 'C'.
The real application for this is in simulating populations, where outcomes depend on the values of categories, but sometimes in very different ways.
Toy example
library(dplyr)
category=c('A','B','C')
N <- 100
pop <- as.data.frame(ID <- seq(1:N))
pop <- as.tbl(pop)
pop$Category <- sample(category,N,replace=TRUE)
pop$score <- runif(N)
pop
tf <- function(x, EXPR) {
  switch(EXPR,
         A = cos(x),
         B = sin(x),
         C = tan(x))
}
pop$results <- tf(pop$score, pop$Category)
This code fails, reasonably enough, with the error message:
Error in switch(EXPR, A = cos(x), B = sin(x), C = tan(x)) : EXPR must be a length 1 vector
I have looked carefully at dplyr and do, and I can easily see how to run the same function for each category separately. However, I need a function that depends on the category value.
Suggestions greatly appreciated.
The rowwise function is what you need to force it to evaluate row by row:
pop <- data.frame(ID = 1:100,
                  category = sample(c("A", "B", "C"), 100, replace = TRUE),
                  score = runif(100))
exprs <- function(category, score) {
  if(category == "A")
    ret <- sin(score)
  if(category == "B")
    ret <- cos(score)
  if(category == "C")
    ret <- tan(score)
  ret
}
pop %>%
  rowwise %>%
  mutate(answer = exprs(category, score))
Source: local data frame [100 x 4]
Groups:
# A tibble: 100 × 4
ID category score answer
<int> <fctr> <dbl> <dbl>
1 1 C 0.5219332 0.5751317
2 2 C 0.9266336 1.3314972
3 3 B 0.2729260 0.9629863
4 4 B 0.6575110 0.7915158
5 5 B 0.0910481 0.9958580
6 6 C 0.9968752 1.5467554
7 7 A 0.3429183 0.3362369
8 8 A 0.9101669 0.7896062
9 9 B 0.9291849 0.5984872
10 10 C 0.8913347 1.2379742
# ... with 90 more rows
You can use Vectorize():
set.seed(42)
category=c('A','B','C')
N <- 10
pop <- data.frame(ID=seq(1:N), Category=sample(category,N,replace=TRUE), score=runif(N), stringsAsFactors = FALSE)
tf <- function(x, EXPR) switch(EXPR,
                               'A' = cos(x),
                               'B' = sin(x),
                               'C' = tan(x))
TF <- Vectorize(tf)
pop$result <- TF(pop$score, pop$Category)
or (thanks to @42 for the comment):
pop$result <- mapply(tf, pop$score, pop$Category)
The error appears because you are passing the complete vector instead of one record at a time. I used lapply to call your function for each row, and it works:
library(dplyr)
category=c('A','B','C')
N <- 100
pop <- data.frame(ID = seq(1:N))
pop$Category <- sample(category,N,replace=TRUE)
pop$Category <- as.factor(pop$Category)
pop$score <- runif(N)
tf <- function(x, EXPR) {
  switch(EXPR,
         A = cos(x),
         B = sin(x),
         C = tan(x))
}
## call tf for every row in the dataframe
pop$results <- lapply(seq_len(nrow(pop)), function(i) {
  tf(pop$score[i], pop$Category[i])
}) %>% unlist
Thanks

Length of Trend - Panel Data

I have a well balanced panel data set which contains NA observations. I will be using LOCF, and would like to know how many consecutive NA's are in each panel before carrying observations forward. LOCF is a procedure whereby missing values can be "filled in" using the "last observation carried forward". This can make sense in some time-series applications; perhaps we have weather data in 5 minute increments: a good guess at the value of a missing observation might be an observation made 5 minutes earlier.
Obviously, it makes more sense to carry an observation forward one hour within one panel than it does to carry that same observation forward to the next year in the same panel.
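As a tiny illustration of the idea (assuming the zoo package):
library(zoo)
# carry the last non-missing value forward over the NA runs
na.locf(c(1, NA, NA, 4, NA), na.rm = FALSE)
#> [1] 1 1 1 4 4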
I am aware that you can set a "maxgap" argument using zoo::na.locf; however, I want to get a better feel for my data. Please see a simple example:
require(data.table)
set.seed(12345)
### Create a "panel" data set
data <- data.table(id = rep(1:10, each = 10),
                   date = seq(as.POSIXct('2012-01-01'),
                              as.POSIXct('2012-01-10'),
                              by = '1 day'),
                   x = runif(100))
### Randomly assign NA's to our "x" variable
na <- sample(1:100, size = 52)
data[na, x := NA]
### Calculate the max number of consecutive NA's by group...this is what I want:
### ID Consecutive NA's
# 1 1
# 2 3
# 3 3
# 4 3
# 5 4
# 6 5
# ...
# 10 2
### Count the total number of NA's by group...this is as far as I get:
data[is.na(x), .N, by = id]
All solutions are welcome, but data.table solutions are highly preferred; the data file is large.
This will do it:
data[, max(with(rle(is.na(x)), lengths[values])), by = id]
I just ran rle to find all consecutive NA's and picked the max length.
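As a side note, once the run lengths suggest a sensible cut-off, the fill itself could then be done per panel with zoo::na.locf and its maxgap argument, for example:
library(zoo)
# carry values forward within each id, but leave gaps longer than 3 NAs alone
data[, x_filled := na.locf(x, na.rm = FALSE, maxgap = 3), by = id]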
Here's a rather convoluted answer to the comment question of recovering the date ranges for the above max:
data[, {
  tmp = rle(is.na(x))
  tmp$lengths[!tmp$values] = 0          # modify rle result to ignore non-NA's
  n = which.max(tmp$lengths)            # find the index in rle of longest NA sequence
  tmp = rle(is.na(x))                   # let's get back to the unmodified rle
  start = sum(tmp$lengths[0:(n-1)]) + 1 # and find the start and end indices
  end = sum(tmp$lengths[1:n])
  list(date[start], date[end], max(tmp$lengths[tmp$values]))
}, by = id]
You can use rle with the modification suggested here (and pasted below) to count NA values.
foo <- data[, rle(x), by=id]
foo[is.na(values), max(lengths), by=id]
# id V1
# 1: 1 1
# 2: 2 3
# 3: 3 3
# 4: 4 3
# 5: 5 4
# 6: 6 5
# 7: 7 3
# 8: 8 5
# 9: 9 2
# 10: 10 2
Amended rle function:
rle <- function (x)
{
  if (!is.vector(x) && !is.list(x))
    stop("'x' must be an atomic vector")
  n <- length(x)
  if (n == 0L)
    return(structure(list(lengths = integer(), values = x),
                     class = "rle"))
  #### BEGIN NEW SECTION PART 1 ####
  naRepFlag<-F
  if(any(is.na(x))){
    naRepFlag<-T
    IS_LOGIC<-ifelse(typeof(x)=="logical",T,F)
    if(typeof(x)=="logical"){
      x<-as.integer(x)
      naMaskVal<-2
    }else if(typeof(x)=="character"){
      naMaskVal<-paste(sample(c(letters,LETTERS,0:9),32,replace=T),collapse="")
    }else{
      naMaskVal<-max(0,abs(x[!is.infinite(x)]),na.rm=T)+1
    }
    x[which(is.na(x))]<-naMaskVal
  }
  #### END NEW SECTION PART 1 ####
  y <- x[-1L] != x[-n]
  i <- c(which(y), n)
  #### BEGIN NEW SECTION PART 2 ####
  if(naRepFlag){
    x[which(x==naMaskVal)]<-NA
    if(IS_LOGIC)
      x<-as.logical(x)
  }
  #### END NEW SECTION PART 2 ####
  structure(list(lengths = diff(c(0L, i)), values = x[i]),
            class = "rle")
}
