Related
Problem: I have several (10+) custom functions, each defining a step in the workflow. I want to run a nested function of these steps over a large data frame for n (50+) periods iteratively. My current function achieves the result but it is too slow and not very elegant.
Example Input
id x_1975 z_1975
1 1 1 NA
2 2 2 NA
3 3 3 NA
4 4 4 NA
5 5 5 NA
Step 1:
Compare initial x values (x_1975) against a cutoff=3. If x is greater than 3, then the z value should be "Y".
Step 2:
If z value is "Y", then x value in next year should be x times 2. Otherwise, it should be x times 5. Although the z values can be skipped altogether, I need the categorical column to create summary stats.
Note:
The data set I am working with has 20 variables that need to be calculated based on some similar logics.
Desired Output
id x_1975 z_1975 x_1976 z_1976 x_1977 z_1977 x_1978
1 1 1 <NA> 5 Y 10 Y 20
2 2 2 <NA> 10 Y 20 Y 40
3 3 3 <NA> 15 Y 30 Y 60
4 4 4 Y 8 Y 16 Y 32
5 5 5 Y 10 Y 20 Y 40
6 6 6 Y 12 Y 24 Y 48
What I have tried:
Tried setting the data in long format. But found it complicated to iterate over rows.
Pre-allocated all columns with appropriate class. That reduced run time a little although not enough.
Have been trying to use purrr::compose to nest all the functions. But I am not being able to make it work.
Reproducible Example
library(dplyr)
library(purrr)
# Create Data Frame
n <- 6
dat <- data.frame(id=1:n,
x_1975=seq(1,6,1),
z_1975=NA)
cut_off <- 3
# Functions
# Set a value for "z_" variables in period t by comparing "x_" value in period t against the the cut_off value.
func_1 <- function(dat,yr){
# pre-define variables
z <- paste0("z_",yr)
x <- paste0("x_",yr)
# Caclulate values for "z_" in period t
dat <- dat %>% mutate(!!sym(z):=
case_when(!!sym(x)>cut_off ~ "Y",
TRUE~as.character(NA)
))
}
# Calculate the value for "x_" variables in period t+1 based on "z_" variables in period t.
func_2 <- function(dat,yr){
# pre-define variables
x <- paste0("x_",yr+1)
x_lag <- paste0("x_",yr)
z <- paste0("z_",yr)
# Calculate "x_" value for t+1
dat <- dat %>% mutate(!!sym(x):=case_when(
!!sym(z)=="Y"~!!sym(x_lag)*2,
TRUE~!!sym(x_lag)*5
))
}
# Join function 1 and function 2 together. The joined function needs to iterate over the `dat` from beginning year to ending year
joined_func <- function(dat,beginning,ending){
for (year in seq(beginning,ending,1)){
dat <- func_1(dat,year)
# Output of step 1 is used as input for step 2
dat <- func_2(dat,year)
}
return(dat)
}
# Run the code from 1975 to 2025.The data_output has the desired output, but need to reduce runtime.
data_output <- joined_func(dat,1975,1977)
# Tried to use the compose function from purrr. but getting error.
my_funs <- c(func_1, func_2)
f1 <- invoke(compose, my_funs)
joined_func_2 <- function(dat,beginning,ending){
for (year in seq(beginning,ending,1)){
dat <- f1(dat,year=year)
}
}
data_output_2 <- joined_func_2(dat,1975,1977)
# Error message:
# Error in f1(dat, year = year) : unused argument (year = year).
Questions
a) how do I make purrr::compose work? b) any other way to achieve efficiency?
Would really appreciate if someone could help me on this!
I have a function that I've executed many times but is now throwing an error, which I do not understand. I'm trying to apply the function over a list.
I did not write the function and I have limited experience with functions. So, I'm not sure how to troubleshoot the code.
function:
myfun<-function(Year, SampleID, Species, Abundance, resamps) {
library(vegan)
counter<-1
simbaseline<-data.frame(array(NA,dim=c(length(unique(Year)),5)))
names(simbaseline)<-c("Year", "Jaccard","Horn","Bray","Pearson")
simnext<-data.frame(array(NA,dim=c(length(unique(Year)),5)))
names(simnext)<-c("Year", "Jaccard","Horn","Bray","Pearson")
simhind<-data.frame(array(NA,dim=c(length(unique(Year)),5)))
names(simhind)<-c("Year", "Jaccard","Horn","Bray","Pearson")
counter2<-1
# getting vector with number of samples per year
nsamples<-c()
for(y in unique(Year)){
nsamples<-c(nsamples, length(unique(SampleID[Year==y])))
}
t<-1
minsample<-min(nsamples)
for(repeats in 1:resamps){
raref<-data.frame(array(NA,dim=c(1,3)))
names(raref)<-c("Year","Species","Abundance")
for(y in unique(Year)){
#getting samples for this year
samps<-unique(SampleID[Year==y])
# re-sampling samples to equalize number of samples
sam<-as.character(sample(samps,minsample,replace=T))
# getting data that belongs to bootstraped samples
rarefyear<-data.frame(SampleID[which(SampleID %in% sam & Year == y)],
Species[which(SampleID %in% sam & Year == y)],
Abundance[which(SampleID %in% sam & Year == y)])
names(rarefyear)<-c("SampleID", "Species", "Abundance")
# calculating pooled abundances of eahc species to store
spabun<-tapply(as.numeric(rarefyear[,3]),rarefyear[,2],sum)
spar<-data.frame(rep(y, length(spabun)),names(spabun),spabun, row.names=NULL)
names(spar)<-c("Year","Species","Abundance")
raref<-rbind(raref,spar)
counter<-counter+1
}
# calculating year by species table of abundance
rareftabtemp<-with(raref,tapply(Abundance,list(Year,Species),function(x)x))
rareftabtemp[is.na(rareftabtemp)]<-0
Pearsoncor<-cor(t(log(rareftabtemp+1)), method="pearson")
# calculating between year similarities (NOT DISTANCE!) with Jaccard, Morisita-Horn, Bray and Pearson correlations
Jacsim<-as.matrix(1-vegdist(rareftabtemp, method="jaccard"))
Hornsim<-as.matrix(1-vegdist(rareftabtemp, method="horn"))
Braysim<-as.matrix(1-vegdist(rareftabtemp, method="bray"))
n<-length(unique(Year))
simbaseline[counter2:(counter2+n-2),]<-
cbind(unique(Year)[2:n],Jacsim[2:n],Hornsim[2:n],Braysim[2:n],Pearsoncor[2:n])
simnext[counter2:(counter2+n-2),]<-
cbind(unique(Year)[2:n],Jacsim[row(Jacsim)-col(Jacsim)==1],
Hornsim[row(Hornsim)-col(Hornsim)==1],
Braysim[row(Braysim)-col(Braysim)==1],
Pearsoncor[row(Pearsoncor)-col(Pearsoncor)==1])
# added hindcasting
simhind[counter2:(counter2+n-2),]<-
cbind(unique(Year)[1:(n-1)],
Jacsim[row(Jacsim) %in% 1:(max(row(Jacsim))-1) &
col(Jacsim)==max(col(Jacsim))],
Hornsim[row(Hornsim)%in%1:(max(row(Hornsim))-1) &
col(Hornsim)==max(col(Hornsim))],
Braysim[row(Braysim)%in%1:(max(row(Braysim))-1) &
col(Braysim)==max(col(Braysim))],
Pearsoncor[row(Pearsoncor)%in%1:(max(row(Pearsoncor))-1) &
col(Pearsoncor)==max(col(Pearsoncor))])
counter2<-counter2+n
}
baselinesim<-data.frame(unique(Year)[2:n],
tapply(simbaseline$Jaccard,simbaseline$Year,mean),
tapply(simbaseline$Horn,simbaseline$Year,mean),
tapply(simbaseline$Bray,simbaseline$Year,mean),
tapply(simbaseline$Pearson,simbaseline$Year,mean))
names(baselinesim)<-c("Year", "Jaccard","Horn","Bray","Pearson")
nextsim<-data.frame(unique(Year)[2:n],
tapply(simnext$Jaccard,simnext$Year,mean),
tapply(simnext$Horn,simnext$Year,mean),
tapply(simnext$Bray,simnext$Year,mean),
tapply(simnext$Pearson,simnext$Year,mean))
names(nextsim)<-c("Year", "Jaccard","Horn","Bray","Pearson")
hindcastsim<-data.frame(unique(Year)[1:(n-1)],
tapply(simhind$Jaccard,simhind$Year,mean),
tapply(simhind$Horn,simhind$Year,mean),
tapply(simhind$Bray,simhind$Year,mean),
tapply(simhind$Pearson,simhind$Year,mean))
names(hindcastsim)<-c("Year", "Jaccard","Horn","Bray","Pearson")
a<-list(baselinesim,nextsim,hindcastsim)
return(a)
}
error:
Error in [<-.data.frame(*tmp*, counter2:(counter2 + n - 2), , value = c(NA, : replacement has 2 items, need 5
Traceback
6.
stop(sprintf(ngettext(m, "replacement has %d item, need %d",
"replacement has %d items, need %d"), m, n * p), domain = NA)
5.
`[<-.data.frame`(`*tmp*`, counter2:(counter2 + n - 2), , value = structure(c(NA,
2009), .Dim = 2:1))
4.
`[<-`(`*tmp*`, counter2:(counter2 + n - 2), , value = structure(c(NA,
2009), .Dim = 2:1))
3.
myfun(x$Year, x$Bay, x$Species, x$Abundance, 20)
2.
FUN(X[[i]], ...)
1.
lapply(summer.split, function(x) myfun(x$Year, x$Bay,
x$Species, x$Abundance, 20))
Again, the function worked
Someone seems to have asked a similar question before and was answered by #Marat Talipov but I'm not experienced enough to make sense of what the solution was.
The answer was:
This error pops up when you're unlucky and i <- runif(n) < 1/2 consists only of FALSE, i.e. no permutations happen. You need to add a check in the swap function to fix this problem.
R error in '[<-.data.frame'... replacement has # items, need #
A subset of my data can be found here:
https://fil.email/sI4Kyhaj
The data was split by "Bay" to generate the list
Note that the function may not throw an error on a different machine because it seems to occur periodically.
The problem is located at the peace of code below (and similar indexed reasignment of dataframe's expressions in your code):
simbaseline[counter2:(counter2+n-2),]<-
cbind(unique(Year)[2:n],Jacsim[2:n],Hornsim[2:n],Braysim[2:n],Pearsoncor[2:n])
What happend is that the amount of rows you are trying to assign in the left-hand experession is not equal to the right-hand one. To avoid it you can use intermediate dataframe filled-in with e.g. NAs then reassign temporary dataframe to your target data frame. Please see the simulated code below with explanations how it can be done:
# simulation
df <- data.frame(i = 1:10, l = letters[1:10], stringsAsFactors = FALSE)
head(df)
# i l
# 1 1 a
# 2 1 1
# 3 a a
# 4 4 d
# 5 5 e
# 6 6 f
# with error
df[1:5, ] <- cbind(1:3, c("a", "b", "c"))
# Error in `[<-.data.frame`(`*tmp*`, 1:5, , value = c("1", "2", "3", "a", :
# replacement has 6 items, need 10
# without error
dftemp_in <- cbind(1:3, c("a", "b", "c"))
dftemp_out <- df[1:5, ]
dftemp_out[] <- NA
dftemp_out[seq(nrow(dftemp_in)), ] <- dftemp_in
df[1:5, ] <- dftemp_out
df
# i l
# 1 1 a
# 2 2 b
# 3 3 c
# 4 <NA> <NA>
# 5 <NA> <NA>
# 6 6 f
# 7 7 g
# 8 8 h
# 9 9 i
# 10 10 j
I would like to use the vector:
time.int<-c(1,2,3,4,5) #vector to be use as a "guide"
and the database:
time<-c(1,1,1,1,5,5,5)
value<-c("s","s","s","t","d","d","d")
dat1<- as.data.frame(cbind(time,value))
to create the following vector, which I can then add to the first vector "time.int" into a second database.
freq<-c(4,0,0,0,3) #wished result
This vector is the sum of the events that belong to each time interval, there are four 1 in "time" so the first value gets a four and so on.
Potentially I would like to generalize it so that I can decide the interval, for example saying sum in a new vector the events in "times" each 3 numbers of time.int.
EDIT for generalization
time.int<-c(1,2,3,4,5,6)
time<-c(1,1,1,2,5,5,5,6)
value<-c("s","s","s","t", "t","d","d","d")
dat1<- data.frame(time,value)
let's say I want it every 2 seconds (every 2 time.int)
freq<-c(4,0,4) #wished result
or every 3
freq<-c(4,4) #wished result
I know how to do that in excel, with a pivot table.
sorry if a duplicate I could not find a fitting question on this website, I do not even know how to ask this and where to start.
The following will produce vector freq.
freq <- sapply(time.int, function(x) sum(x == time))
freq
[1] 4 0 0 0 3
BTW, don't use the construct as.data.frame(cbind(.)). Use instead
dat1 <- data.frame(time,value))
In order to generalize the code above to segments of time.int of any length, I believe the following function will do it. Note that since you've changed the data the output for n == 1 is not the same as above.
fun <- function(x, y, n){
inx <- lapply(seq_len(length(x) %/% n), function(m) seq_len(n) + n*(m - 1))
sapply(inx, function(i) sum(y %in% x[i]))
}
freq1 <- fun(time.int, time, 1)
freq1
[1] 3 1 0 0 3 1
freq2 <- fun(time.int, time, 2)
freq2
[1] 4 0 4
freq3 <- fun(time.int, time, 3)
freq3
[1] 4 4
We can use the table function to count the event number and use merge to create a data frame summarizing the information. event_dat is the final output.
# Create example data
time.int <- c(1,2,3,4,5)
time <- c(1,1,1,1,5,5,5)
# Count the event using table and convert to a data frame
event <- as.data.frame(table(time))
# Convert the time.int to a data frame
time_dat <- data.frame(time = time.int)
# Merge the data
event_dat <- merge(time_dat, event, by = "time", all = TRUE)
# Replace NA with 0
event_dat[is.na(event_dat)] <- 0
# See the result
event_dat
time Freq
1 1 4
2 2 0
3 3 0
4 4 0
5 5 3
Attempting to calculate differences between every two values in a row then sum the total differences for each dataframe in a list. I know for/while loops in R absolutely suck. I had this working before, but I've broken it. Can someone suggest how to optimize this using an alternative in the apply family? Current code:
for (i in 1:length(refdata)) { #for each dataframe in a list
refdif <- as.data.frame(matrix(0, ncol = 1, nrow = nrow(refdata[[i]])))
refdif1 <- c()
for (z in 1:ncol(refdata[[i]])) { #for each column in a dataframe
for(x in 1:nrow(refdata[[i]])) { #for each row in a dataframe
refdif <- (refdata[[i]][x,z] - refdata[[i]][x,z+1]) #difference of first value + the enxt
refdif1[x,1] <- (refdif1[x,1] + refidf) #sum of latest difference
}
}
print(refdif1) #where I can conduct tests on each individual dataframe with a column of sums of differences
}
example data:
list 1 refdata[[1]]
$`1`
var1 var2 var3 var4
1 1 2 3 4
2 5 6 7 8
$`2`
var1 var2 var3 var4
1 1 2 3 4
2 5 6 7 8
var 1 + 2 has the difference calculated, var 3 and 4 has the difference calculated, then each difference is summed together and placed in a new dataframe in a single column. (5-6) + (7-8), (1-2) + (3-4), etc etc:
$`1`
dif
1 -2
2 -2
$`2`
dif
1 -2
2 -2
One way to do it (per unlisted dataframe) could be by using logical vectors for indexing - their values are recycled - that way calculating the difference between every other column and finally summing the resulting df row-wise.
refdata1<-rowSums(refdata[c(T,F)]-refdata[c(F,T)])
Edit
Exact output can be obtained by
lapply(refdata, function(df){ data.frame(dif=rowSums(df[c(T,F)]-df[c(F,T)])) })
thx Heroka
# Create test data
x <- rbind(1:4, 5:8)
refdata <- list(x,x)
# Calculate results (all elements should have an even number of columns)
lapply(refdata, FUN = function(x) x %*% rep_len(c(1, -1), NCOL(x)))
I am a new guy in R and really unsure how to filter data in date frame.
I have created a data frame with two columns including monthly date and corresponding temperature. It has a length of 324.
> head(Nino3.4_1974_2000)
Month_common Nino3.4_degree_1974_2000_plain
1 1974-01-15 -1.93025
2 1974-02-15 -1.73535
3 1974-03-15 -1.20040
4 1974-04-15 -1.00390
5 1974-05-15 -0.62550
6 1974-06-15 -0.36915
The filter rule is to select the temperature which are greater or equal to 0.5 degree. Also, it has to be at least continuously 5 months.
I have eliminate the data with less than 0.5 degree temperature (see below).
for (i in 1) {
el_nino=Nino3.4_1974_2000[which(Nino3.4_1974_2000$Nino3.4_degree_1974_2000_plain >= 0.5),]
}
> head(el_nino)
Month_common Nino3.4_degree_1974_2000_plain
32 1976-08-15 0.5192000
33 1976-09-15 0.8740000
34 1976-10-15 0.8864501
35 1976-11-15 0.8229501
36 1976-12-15 0.7336500
37 1977-01-15 0.9276500
However, i still need to extract continuously 5 months. I wish someone could help me out.
If you can always rely on the spacing being one month, then let's temporarily discard the time information:
temps <- Nino3.4_1974_2000$Nino3.4_degree_1974_2000_plain
So, since every temperature in that vector is always separated by one month, we just have to look for runs where the temps[i]>=0.5, and the run has to be at least 5 long.
If we do the following:
ofinterest <- temps >= 0.5
we'll have a vector ofinterest with values TRUE FALSE FALSE TRUE TRUE .... etc where it's TRUE when temps[i] was >= 0.5 and FALSE otherwise.
To rephrase your problem then, we just need to look for occurences of at least five TRUE in a row.
To do this we can use the function rle. ?rle gives:
> ?rle
Description
Compute the lengths and values of runs of equal values in a vector
- or the reverse operation.
Value:
‘rle()’ returns an object of class ‘"rle"’ which is a list with
components:
lengths: an integer vector containing the length of each run.
values: a vector of the same length as ‘lengths’ with the
corresponding values.
So we use rle which counts up all the streaks of consecutive TRUE in a row and consecutive FALSE in a row, and look for at least 5 TRUE in a row.
I'll just make up some data to demonstrate:
# for you, temps <- Nino3.4_1974_2000$Nino3.4_degree_1974_2000_plain
temps <- runif(1000)
# make a vector that is TRUE when temperature is >= 0.5 and FALSE otherwise
ofinterest <- temps >= 0.5
# count up the runs of TRUEs and FALSEs using rle:
runs <- rle(ofinterest)
# we need to find points where runs$lengths >= 5 (ie more than 5 in a row),
# AND runs$values is TRUE (so more than 5 'TRUE's in a row).
streakIs <- which(runs$lengths>=5 & runs$values)
# these are all the el_nino occurences.
# We need to convert `streakIs` into indices into our original `temps` vector.
# To do this we add up all the `runs$lengths` up to `streakIs[i]` and that gives
# the index into `temps`.
# that is:
# startMonths <- c()
# for ( n in streakIs ) {
# startMonths <- c(startMonths, sum(runs$lengths[1:(n-1)]) + 1
# }
#
# However, since this is R we can vectorise with sapply:
startMonths <- sapply(streakIs, function(n) sum(runs$lengths[1:(n-1)])+1)
Now if you do Nino3.4_1974_2000$Month_common[startMonths] you'll get all the months in which the El Nino started.
It boils down to just a few lines:
runs <- rle(Nino3.4_1974_2000$Nino3.4_degree_1974_2000_plain>=0.5)
streakIs <- which(runs$lengths>=5 & runs$values)
startMonths <- sapply(streakIs, function(n) sum(runs$lengths[1:(n-1)])+1)
Nino3.4_1974_2000$Month_common[startMonths]
Here's one way using the fact that the months are regular always one month apart. Than the problem reduces to finding 5 consecutive rows with temps >= 0.5 degrees:
# Some sample data
d <- data.frame(Month=1:20, Temp=c(rep(1,6),0,rep(1,4),0,rep(1,5),0, rep(1,2)))
d
# Use rle to find runs of temps >= 0.5 degrees
x <- rle(d$Temp >= 0.5)
# The find the last row in each run of 5 or more
y <- x$lengths>=5 # BUG HERE: See update below!
lastRow <- cumsum(x$lengths)[y]
# Finally, deduce the first row and make a result matrix
firstRow <- lastRow - x$lengths[y] + 1L
res <- cbind(firstRow, lastRow)
res
# firstRow lastRow
#[1,] 1 6
#[2,] 13 17
UPDATE I had a bug that detected runs with 5 values less than 0.5 too. Here's the updated code (and test data):
d <- data.frame(Month=1:20, Temp=c(rep(0,6),1,0,rep(1,4),0,rep(1,5),0, 1))
x <- rle(d$Temp >= 0.5)
y <- x$lengths>=5 & x$values
lastRow <- cumsum(x$lengths)[y]
firstRow <- lastRow - x$lengths[y] + 1L
res <- cbind(firstRow, lastRow)
res
# firstRow lastRow
#[2,] 14 18