Replace character in a df for numeric vector in R - r

I would like to replace characters for specifics numeric vector.
I have this df:
First Second Third
A C D
F R K
and I also have vectors like these
A = c(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
R = c(0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
N = c(0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
I have tried several times but I can't do it. Does anyone have some advice or idea?

An option would be to unlist (convert to character if it is factor) and then use mget to return the values for that object in a list
lst1 <- mget(as.character(unlist(df)))

Related

How can I calculate jaccard vertex similarity with weights in igraph

I have a square matrix that represents directed interactions, with values representing the magnitude of the "flow" from row i to column j.
mat <- structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.59734154600838,
0.962276996464401, 0.996554553573577, 0.988150008522967, 0.581536975261071,
0.280105566896129, 0.0520717823071291, 0.0443864046117343, 0.0162858335588474,
0, 0, 0, 0, 0, 0, 0, 0.111900863185923, 0.289483837277475, 0.338036619790556,
0.973201117894343, 0.876145758734938, 0.280105566896129, 0.245172586054694,
0.101440228047504, 0.0136022221272776, 0, 0, 0, 0, 0, 0, 0.073088274682518,
0.21588462733217, 0.258134862678946, 0.93528472971792, 0.921844796228768,
0.318790697187933, 0.280105566896129, 0.117928032625428, 0.016073037487081,
0, 0, 0, 0, 0, 0, 0, 0.0119602547215087, 0.0174757225504163,
0.443466799224191, 0.941024455005652, 0.632609306727839, 0.57418820480725,
0.280105566896129, 0.043827579210664, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0.0547471528159807, 0.884304818335752, 0.937495721370637,
0.925118019265575, 0.280105566896129, 0.055967839940851, 0.0122649398400715,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0679263578760456, 0.104884821422108,
0.569814755335506, 0.853130344409379, 0.280105566896129, 0.0728699300735904,
0.0339371561178606, 0.012188886551821, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0.0219303360220489, 0.843994038605239, 0.759918325154657,
0.280105566896129, 0.143508732965731, 0.0556400089034765, 0.0296286033644999,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0.421151438381493, 0.977746695038157,
0.499880491267235, 0.280105566896129, 0.116686808742586, 0.0639605586005988,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0495967410949283, 0.841406989124245,
0.85505217514437, 0.578265483357174, 0.280105566896129, 0.163154497800251,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.499941945587477, 0.993657104473566,
0.807475685951474, 0.45318772928331, 0.280105566896129), .Dim = c(15L,
15L))
I am interested in calculating the weighted linkage similarity (both in and out flows) of all vertices in the network, so taking magnitude into account.
Using igraph, I can calculate the Jaccard similarity but without considering weights
library(igraph)
bin <- mat
bin[bin > 0] <- 1
similarity(graph_from_adjacency_matrix(bin),
mode = "all",
method = "jaccard")
# this gives the same result as the one above
similarity(graph_from_adjacency_matrix(mat, weighted = T),
mode = "all",
method = "jaccard")
Using the code from this blogpost, I was able to calculate the Jaccard similarity of outflows and inflows and combine them.
# outflow similarity
sim.jac.out <- matrix(0, nrow=nrow(mat), ncol=nrow(mat))
pairs <- t(combn(1:nrow(mat), 2))
for (i in 1:nrow(pairs)) {
num <- sum(sapply(1:ncol(mat), function(x) (min(mat[pairs[i,1],x], mat[pairs[i,2],x]))))
den <- sum(sapply(1:ncol(mat), function(x) (max(mat[pairs[i,1],x], mat[pairs[i,2],x]))))
sim.jac.out[pairs[i,1],pairs[i,2]] <- num/den
sim.jac.out[pairs[i,2],pairs[i,1]] <- num/den
}
sim.jac.out[which(is.na(sim.jac.out))] <- 0
diag(sim.jac.out) <- 1
# inflow similarity
sim.jac.in <- matrix(0, nrow=nrow(mat), ncol=nrow(mat))
pairs <- t(combn(1:nrow(t(mat)), 2))
for (i in 1:nrow(pairs)) {
num <- sum(sapply(1:ncol(t(mat)), function(x) (min(t(mat)[pairs[i,1],x], t(mat)[pairs[i,2],x]))))
den <- sum(sapply(1:ncol(t(mat)), function(x) (max(t(mat)[pairs[i,1],x], t(mat)[pairs[i,2],x]))))
sim.jac.in[pairs[i,1],pairs[i,2]] <- num/den
sim.jac.in[pairs[i,2],pairs[i,1]] <- num/den
}
sim.jac.in[which(is.na(sim.jac.in))] <- 0
diag(sim.jac.in) <- 1
# total similariry
sim.jac.all <- (sim.jac.in + sim.jac.out)/2
So the general question is, does this make sense?
But more specifically, I would be interested to know if there is a way to incorporate link weights in the calculation of similarity with igraph.
In my real dataset, I need to do this several times iteratively (swapping individuals), for a large number of networks, so my method would take forever. I believe igraph uses C++ under the hood.

r: Manipulate data so that columns with same values combine in particular ways

I have a dataframe where each column is made up of zero along with one other number. For example:
I want to manipulate the dataframe so that columns that contain the same other number become one column where the value stays as the other number if the other number was present in every row, otherwise it turns to zero.
So for instance, I would want the dataframe above to look like
..1 ..2 ..3
1 2 3
0 2 0
0 0 0
1 0 0
The first row of the dataframe is 1 because the values were both 1 in the first row of the original. The second row of the first column is 0 because there were a 1 and a 0 in the row.
Here is some reproducible data:
structure(list(...1 = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), ...2 = c(1, 0,
0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0), ...3 = c(2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), ...4 = c(3,
0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0,
0, 0, 0, 0, 0, 0), ...5 = c(3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), ...6 = c(3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), row.names = c(NA,
-28L), class = "data.frame")
Here is a possible solution in base R, where dat is the data frame you provide in the question. We find the unique value for each column, assuming there is only one nonzero value in each column. Then we loop through the groups of columns with each unique value, applying the function all() to each row of the subsetted dataframe to identify rows with all nonzero values. Multiply the resulting logical vector by the value itself to get the desired result. Then store this vector in a list and bind to a data frame.
col_vals <- apply(dat, 2, max)
columns <- list()
for (val in unique(col_vals)) {
columns[[length(columns) + 1]] <- val * apply(dat[, col_vals == val, drop = FALSE], 1, all)
}
as.data.frame(do.call(cbind, columns))

R: Why isn't this matrix 3d linear interpolation working correctly?

I have a matrix of values and zeros, where zero= NA. The values are interspersed around the matrix, and what I want to do is interpolate the values of all the NA values. This is the data:
I'm trying to guess all of these values by taking all the known values in my matrix, and multiplying the value by the distance (such that the further away a point is, the less influence it has). This is what the interpolated result looks like:
As you can see, this method is not very effective, it does affect the NAs nearest to the known values, but then they quickly converge onto an average value. I think this is due to the fact that it's taking the ENTIRE RANGE, which has many ups and downs... rather than just the points nearest to it.
Obviously, matrix operations aren't my specialty... what do I need to change to correctly do the linear-interpolation?
Here's the code:
library(dplyr)
library(plotly)
Cont <- structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1816, 2320, 1406, 2028, 1760, 1932, 1630,
1835, 1873, 1474, 1671, 2073, 1347, 2131, 2038, 1969, 2036, 1602,
1986, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 2311, 1947, 2094, 1947, 2441, 1775, 1461, 1260,
1494, 2022, 1863, 1587, 2082, 1567, 1770, 2065, 1404, 1809, 1972,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 2314, 1595, 2065, 1870, 2178, 1410, 1994, 1979, 2111,
1531, 1917, 1559, 2109, 1921, 1606, 1469, 1601, 1771, 1771), .Dim = c(19L,
30L))
## First get real control values
idx <- which(Cont > 0, arr.ind=TRUE)
V <- Cont[idx]
ControlValues <- data.frame(idx,V)
## Make data.frame of values to fill
toFill <- which(Cont == 0, arr.ind=TRUE) %>% as.data.frame
toFill$V <- 0
## And now figure out the weighted value of each point
for (i in 1:nrow(toFill)){
toFill[i,] -> CurrentPoint
Xs <- (1/abs(CurrentPoint[,1] - ControlValues[,1]))
Xs[is.infinite(Xs)] <- 0
Xs <- Xs/sum(Xs)/100
Ys <- (1/abs(CurrentPoint[,2] - ControlValues[,2]))
Ys[is.infinite(Ys)] <- 0
Ys <- Ys/sum(Ys)/100
ControlValues1 <- data.frame(Xs,Ys)
toFill[i,3] <- sum(rowMeans(ControlValues1) * ControlValues$V)*100
}
## add back in the controls and reorder
bind_rows(ControlValues,toFill) -> Both
Both %>% arrange(row,col) -> Both
## and plot the new surface
NewCont <- matrix(Both$V,max(Both$row),max(Both$col),byrow = T)
plot_ly(z=NewCont, type="surface",showscale=FALSE)
One approach to interpolate and extrapolate data in R is to use the akima package. The following performs bi-linear interpolation followed by extrapolation using as input the known data points in the data frame ControlValues to fill the zeroes in Cont.
library(akima)
library(plotly)
NewCont <- akima::interp(x=ControlValues[,1], y=ControlValues[,2], z=ControlValues[,3],
xo=1:nrow(Cont), yo=1:ncol(Cont), linear=TRUE)$z
NewCont[,1:9] <- akima::interp.old(x=ControlValues[,1], y=ControlValues[,2],
z=ControlValues[,3], xo=1:nrow(Cont),
yo=1:9, ncp=2, extrap=TRUE)$z
plot_ly(z=NewCont, type="surface",showscale=FALSE)
Notes:
The first call to akima::interp performs the bi-linear interpolation. See the help page ?akima::interp for usage and details.
A key point is that the inputs x, y, and z for the known data points need not be on a x-y grid. In this case, these are the columns of ControlValues.
The output of akima::interp is a list whose z component is a matrix of interpolated values over the grid whose x and y coordinates are defined by the inputs xo and yo, respectively. In this case, these are just the row and column indices of Cont
As stated in the help page
z-values for points outside the convex hull are returned as NA.
In this case, the first nine columns of the output corresponding to yo=1:9 will be NAs.
The second call to akima::interp (actually akima::interp.old) performs the data extrapolation to fill in the NAs left by the first call. See this SO quation/answer for the details of this usage.
The above approach gives the following result
Another approach to perform bi-linear interpolation is to use the interp.surface function in the fields package. This approach is mentioned because the implementation is an R-script, which can be listed by typing the function name interp.surface at the R command line.
library(fields)
loc <- make.surface.grid(list(x=1:nrow(Cont), y=1:ncol(Cont)))
NewCont2 <- matrix(interp.surface(list(x=sort(unique(ControlValues[,1])),
y=sort(unique(ControlValues[,2])),
z=matrix(ControlValues[,3],
nrow=length(unique(ControlValues[,1])),
ncol=length(unique(ControlValues[,2])))),
loc), nrow=nrow(Cont), ncol=ncol(Cont))
NewCont2[,1:9] <- akima::interp.old(x=ControlValues[,1], y=ControlValues[,2],
z=ControlValues[,3], xo=1:nrow(Cont),
yo=1:9, ncp=2, extrap=TRUE)$z
Here, the requirements are the opposite to those for akima::interp. Specifically, the known data points must lie on a x-y grid. However, the coordinates to interpolate need not be on a grid and is instead a matrix containing corresponding column vectors of x and y coordinates where each tuple (x[i],y[i]) is a x-y coordinate to interpolate. Since the data points in ControlValues are on a grid, these requirements are also satisfied for this case. See the help page ?interp.surface for usage and details.
Notes:
sort(unique(ControlValues[,1])) and sort(unique(ControlValues[,2])) simply gives the x and y coordinates for the grid of known data points
The z component in the list is simply the z values for the known data points reshaped as a matrix over the grid of known data points
The matrix of coordinates to interpolate is generated by make.surface.grid using as x and y coordinates the row and column indices of Conf, respectively
A coordinate to interpolate that lies outside the grid of known points will result in a interpolated value of NA
interp.surface returns a vector of z values corresponding to the coordinates to interpolate. This is then rehaped to a matrix over the grid of coordinates to interpolate, which has dimensions nrow(Cont) by ncol(Cont)
Finally, it is easy to verify that the two approaches give the same result
print(max(abs(NewCont - NewCont2)))
##[1] 4.547474e-13

how do I rebuild data frame based on columns identified in a numeric vector?

I'm using R to complete some GA driven searches.
Returned from my GA script is the resulting chromosome, returned as a binary numeric of length 40.
An example is: c(0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0).
I also have a corresponding data frame with 40 columns.
Using the data in the numeric vector, how do I efficiently build a (or re-build the) data frame so that it contains only those columns represented by the 1's in my numeric vector?
Building a sample data.frame and assigning your sample vector to x:
df <- as.data.frame(matrix(sample(1:100, 400, replace=T), ncol=40))
x <- c(0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0)
I can subset:
df[ ,x==1]
or:
df[, as.logical(x)]

Add consecutive elements of a vector until a value

I would like to calculate the minimum number of consecutive elements in a vector that when added (consecutively) would be less than a given value.
For example in the following vector
ev<-c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 2.7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3.27, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 370.33, 1375.4,
1394.03, 1423.8, 1360, 1269.77, 1378.8, 1350.37, 1425.97, 1423.6,
1363.4, 1369.87, 1365.5, 1294.97, 1362.27, 1117.67, 1026.97,
1077.4, 1356.83, 565.23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 356.83,
973.5, 0, 240.43, 1232.07, 1440, 1329.67, 1096.87, 1331.37, 1305.03,
1328.03, 1246.03, 1182.3, 1054.53, 723.03, 1171.53, 1263.17,
1200.37, 1054.8, 971.4, 936.4, 968.57, 897.93, 1099.87, 876.43,
1095.47, 1132, 774.4, 1075.13, 982.57, 947.33, 1096.97, 929.83,
1246.9, 1398.2, 1063.83, 1223.73, 1174.37, 1248.5, 1171.63, 1280.57,
1183.33, 1016.23, 1082.1, 795.37, 900.83, 1159.2, 992.5, 967.3,
1440, 804.13, 418.17, 559.57, 563.87, 562.97, 1113.1, 954.87,
883.8, 1207.1, 1046.83, 995.77, 803.93, 1036.63, 946.9, 887.33,
727.97, 733.93, 979.2, 1176.8, 1241.3, 1435.6)
What is the minimum number of elements that when added consecutively (as in the order within the vector) would sum up to lets say 20000
To be more clear i need the following:
Start with ev[1] and add consecutively up to 20000. Record the number of elements you had to add in order to get to 20000 as r[1]. Then start with ev[2] and add till 20000 and so on. Recored the number of elements you had to add till 20000 as r[2]. Do this for the entire length of ev. Then return the min(r)
For example
j<-c(1, 2, 3, 5, 7, 9, 2).
I want the minimum number of elements that when added consecutively would give lets say >20. This should be 3 (5+7+9)
Thanks a lot
Well, I'll give it a shot: This one will find the length of the minimum sequence of numbers
that add up to or above max. It makes no claims to be fast, but it has O(2n) time complexity :-)
I made it return both the start index and the length.
f <- function(x, max=10) {
s <- 0
len <- Inf
start <- 1
j <- 1
for (i in seq_along(x)) {
s <- s + x[i]
while (s >= max) {
if (i-j+1 < len) {
len <- i-j+1
start <- j
}
s <- s - x[j]
j <- j + 1
}
}
list(start=start, length=len)
# uncomment the line below if you don't need the start index...
#len
}
r <- f(ev, 20000) # list(start=245, length=15)
sum(ev[seq(r$start, len=r$length)]) # 20275.42
# Test speed:
x <- sin(1:1e6)
system.time( r <- f(x, 1.9) ) # 1.54 secs
# Compile the function makes it 9x faster...
g <- compiler::cmpfun(f)
system.time( r <- g(x, 1.9) ) # 0.17 secs
library(zoo) # Needed for rollapply
N <- 20000 # The desired sum we want to achieve
j <- 0
for(i in 1:length(ev)){
k <- rollapply(ev, i, sum)
j[i] <- max(k)
if(j[i] >= N){
break
}
}
i # contains how many consecutive elements you need to sum (15)
j[i] # contains the corresponding sum(20275.42)
Currently this doesn't tell you where the specific subset occurs in the vector but another use of rollapply could get you that information.
There are other ways to do it but if you have a really long vector this will break out of the loop so you don't calculate more than you need. The basic idea is to use rollapply to create a vector of the consecutive sums of length k and then find the maximum of that. If this is less than what we desire do the same thing for sums of length k+1. Repeat until we find a sum that is larger than the desired threshold.
Edit:
This appears to be about 100x faster. I haven't compared it to Tommy's answer (which is probably faster than this but this will provide a significant speedup compared to my original method.
Edit 2: Moving the [-n] and removing the suppresswarnings speeds this up quite a bit.
myfun <- function(ev, N){
i <- 1
n <- length(ev)
j <- ev
repeat{
j <- (j[-n] + ev[-c(1:i)])
i <- i+1
n <- n-1
if(max(j) >= N | i > length(ev)){
break;
}
}
return(i)
}
myfun(ev, 20000)
# And stealing the idea from Tommy gives a nice speedup as well
myfuncomp <- compiler:cmpfun(myfun)
myfuncomp(ev, 20000)
myfunc3 <- compiler:cmpfun(myfun, options = list(optimize = 3))
myfunc3(ev, 20000)
library(rbenchmark) # For testing
# If you have Tommy's functions loaded as f and g you can compare
benchmark(f(ev, 20000), g(ev, 20000), myfun(ev, 20000), myfuncomp(ev, 20000), myfunc3(ev, 20000))
you mean something like this?
> sum(ifelse(cumsum(ev)<=200000, 1, 0))
[1] 364
I think this may be a Traveling Salesman Problem in disguise unless you put in some more constraints. You cannot necessarily start at the max ev and go out in either direction since it may be a local non-dense maximum
x=1:length(ev)
plot(x,ev)
lxy <- loess(ev~x )
lines(predict(lxy, x=1:length(y)))
title(main="loess() fit of ev")
But in the region of the most dense values the values are fairly flat.
x=1:length(y); y=c(356.83,
973.5, 0, 240.43, 1232.07, 1440, 1329.67, 1096.87, 1331.37, 1305.03,
1328.03, 1246.03, 1182.3, 1054.53, 723.03, 1171.53, 1263.17,
1200.37, 1054.8, 971.4, 936.4, 968.57, 897.93, 1099.87, 876.43,
1095.47, 1132, 774.4, 1075.13, 982.57, 947.33, 1096.97, 929.83,
1246.9, 1398.2, 1063.83, 1223.73, 1174.37, 1248.5, 1171.63, 1280.57,
1183.33, 1016.23, 1082.1, 795.37, 900.83, 1159.2, 992.5, 967.3,
1440, 804.13, 418.17, 559.57, 563.87, 562.97, 1113.1, 954.87,
883.8, 1207.1, 1046.83, 995.77, 803.93, 1036.63, 946.9, 887.33,
727.97, 733.93, 979.2, 1176.8, 1241.3, 1435.6)
lxyhi <- loess(y~x)
plot(x,y)
lines(predict(lxyhi, x=1:length(y)))

Resources