Find shortest distance between multiple points - r

Imagine a small dataset of xy coordinates. These points are grouped by a variable called indexR; there are 3 groups in total. All xy coordinates are in the same units. The data looks approximately like so:
# A tibble: 61 x 3
   indexR     x     y
    <dbl> <dbl> <dbl>
 1      1   837   924
 2      1   464   661
 3      1   838   132
 4      1   245   882
 5      1  1161   604
 6      1  1185   504
 7      1   853   870
 8      1  1048   859
 9      1  1044   514
10      1   141   938
# ... with 51 more rows
The goal is to determine which 3 points, one from each group, are closest to each other, in the sense of minimizing the sum of the pairwise distances between selected points.
I have attempted this by considering Euclidean distances, as follows. (Credit goes to @Mouad_S, in this thread, and https://gis.stackexchange.com/questions/233373/distance-between-coordinates-in-r)
#dput provided at bottom of this post
> df$dummy = 1
> df %>%
+ full_join(df, c("dummy" = "dummy")) %>%
+ full_join(df, c("dummy" = "dummy")) %>%
+ filter(indexR.x != indexR.y & indexR.x != indexR & indexR.y != indexR) %>%
+ mutate(dist =
+ ((.$x - .$x.x)^2 + (.$y- .$y.x)^2)^.5 +
+ ((.$x - .$x.y)^2 + (.$y- .$y.y)^2)^.5 +
+ ((.$x.x - .$x.y)^2 + (.$y.x- .$y.y)^2)^.5,
+ dist = round(dist, digits = 0)) %>%
+ arrange(dist) %>%
+ filter(dist == min(dist))
# A tibble: 6 x 11
indexR.x x.x y.x dummy indexR.y x.y y.y indexR x y dist
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 638 324 1 2 592 250 3 442 513 664
2 1 638 324 1 3 442 513 2 592 250 664
3 2 592 250 1 1 638 324 3 442 513 664
4 2 592 250 1 3 442 513 1 638 324 664
5 3 442 513 1 1 638 324 2 592 250 664
6 3 442 513 1 2 592 250 1 638 324 664
From this we can identify the three points that are closest together (minimum total distance apart; these are the points highlighted in the original post's figure). However, the challenge comes when extending this so that indexR has 4, 5, ..., n groups: chaining further cross joins quickly becomes impractical, so the problem is finding a more practical or optimised method for making this calculation.
structure(list(indexR = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 3, 3), x = c(836.65, 464.43, 838.12, 244.68, 1160.86,
1184.52, 853.4, 1047.96, 1044.2, 141.06, 561.01, 1110.74, 123.4,
1087.24, 827.83, 100.86, 140.07, 306.5, 267.83, 1118.61, 155.04,
299.52, 543.5, 782.25, 737.1, 1132.14, 659.48, 871.78, 1035.33,
867.81, 192.94, 1167.8, 1099.59, 1097.3, 1089.78, 1166.59, 703.33,
671.64, 346.49, 440.89, 126.38, 638.24, 972.32, 1066.8, 775.68,
591.86, 818.75, 953.63, 1104.98, 1050.47, 722.43, 1022.17, 986.38,
1133.01, 914.27, 725.15, 1151.52, 786.08, 1024.83, 246.52, 441.53
), y = c(923.68, 660.97, 131.61, 882.23, 604.09, 504.05, 870.35,
858.51, 513.5, 937.7, 838.47, 482.69, 473.48, 171.78, 774.99,
792.46, 251.26, 757.95, 317.71, 401.93, 326.32, 725.89, 98.43,
414.01, 510.16, 973.61, 445.33, 504.54, 669.87, 598.75, 225.27,
789.45, 135.31, 935.51, 270.38, 241.19, 595.05, 401.25, 160.98,
778.86, 192.17, 323.76, 361.08, 444.92, 354, 249.57, 301.64,
375.75, 440.03, 428.79, 276.5, 408.84, 381.14, 459.14, 370.26,
304.05, 439.14, 339.91, 435.85, 759.42, 513.37)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -61L), .Names = c("indexR",
"x", "y"))

One possibility would be to formulate the problem of identifying the closest elements, one from each group, as a mixed integer program. We could define decision variables y_i for whether each point i is selected, as well as x_{ij} for whether points i and j are both selected (x_{ij} = y_iy_j). We need to select one element from each group.
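For concreteness, here is a hedged reconstruction of the program that the code below builds (my notation, not necessarily the answer's original formulation), writing d_ij for the Euclidean distance between points i and j:
\begin{aligned}
\min\quad & \textstyle\sum_{i<j} d_{ij}\, x_{ij} \\
\text{s.t.}\quad & x_{ij} \le y_i, \quad x_{ij} \le y_j, \quad x_{ij} \ge y_i + y_j - 1 && \text{for every cross-group pair } (i, j) \\
& \textstyle\sum_{i \in g} y_i = 1 && \text{for every group } g \\
& \textstyle\sum_{i \in g,\; j \in h} x_{ij} = 1 && \text{for every pair of groups } g \ne h \\
& x_{ij},\, y_i \in \{0, 1\}
\end{aligned}
The three inequalities on each x_{ij} are the standard linearisation of the product y_i y_j.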
In practice, you could implement this mixed integer program using the lpSolve package (or one of the other R optimization packages).
opt.closest <- function(df) {
  library(dplyr)
  library(lpSolve)
  # Compute every pair of indices
  pairs <- as.data.frame(t(combn(nrow(df), 2))) %>%
    mutate(G1 = df$indexR[V1], G2 = df$indexR[V2]) %>%
    filter(G1 != G2) %>%
    mutate(dist = sqrt((df$x[V1]-df$x[V2])^2+(df$y[V1]-df$y[V2])^2))
  # Compute a few convenience values
  n <- nrow(df)
  nP <- nrow(pairs)
  groups <- sort(unique(df$indexR))
  nG <- length(groups)
  gpairs <- combn(groups, 2)
  nGP <- ncol(gpairs)
  # Solve the optimization problem
  obj <- c(pairs$dist, rep(0, n))
  constr <- rbind(cbind(diag(nP), -outer(pairs$V1, seq_len(n), "==")),
                  cbind(diag(nP), -outer(pairs$V2, seq_len(n), "==")),
                  cbind(diag(nP), -outer(pairs$V1, seq_len(n), "==") - outer(pairs$V2, seq_len(n), "==")),
                  cbind(matrix(0, nG, nP), outer(groups, df$indexR, "==")),
                  cbind((outer(gpairs[1,], pairs$G1, "==") &
                         outer(gpairs[2,], pairs$G2, "==")) |
                        (outer(gpairs[2,], pairs$G1, "==") &
                         outer(gpairs[1,], pairs$G2, "==")), matrix(0, nGP, n)))
  dir <- rep(c("<=", ">=", "="), c(2*nP, nP, nG+nGP))
  rhs <- rep(c(0, -1, 1), c(2*nP, nP, nG+nGP))
  mod <- lp("min", obj, constr, dir, rhs, all.bin=TRUE)
  which(tail(mod$solution, n) == 1)
}
This can compute the closest 3 points, one from each cluster, in your example dataset:
df[opt.closest(df),]
# A tibble: 3 x 3
# indexR x y
# <dbl> <dbl> <dbl>
# 1 1 638.24 323.76
# 2 2 591.86 249.57
# 3 3 441.53 513.37
It can also compute the best possible solution for datasets with more points and groups. Here are the runtimes for datasets with 7 groups each and 100 and 200 points:
make.dataset <- function(n, nG) {
set.seed(144)
data.frame(indexR = sample(seq_len(nG), n, replace=T), x = rnorm(n), y=rnorm(n))
}
df100 <- make.dataset(100, 7)
system.time(opt.closest(df100))
# user system elapsed
# 11.536 2.656 15.407
df200 <- make.dataset(200, 7)
system.time(opt.closest(df200))
# user system elapsed
# 187.363 86.454 323.167
This is far from instantaneous -- it takes 15 seconds for the 100-point, 7-group dataset and 323 seconds for the 200-point, 7-group dataset. Still, it is much quicker than iterating through all 92 million 7-tuples in the 100-point dataset or all 13.8 billion 7-tuples in the 200-point dataset. You could set a runtime limit with a solver like the one from the Rglpk package to get the best solution obtained within that limit.
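For example, a hedged sketch of handing the same model to Rglpk with a time cap (assuming access to the obj, constr, dir, rhs and n objects built inside opt.closest; if I recall correctly, tm_limit is Rglpk's time-limit control parameter in milliseconds, see ?Rglpk_solve_LP):
library(Rglpk)
dir2 <- ifelse(dir == "=", "==", dir)               # Rglpk writes equalities as "=="
mod <- Rglpk_solve_LP(obj, constr, dir2, rhs,
                      types = rep("B", length(obj)), # all variables binary
                      control = list(tm_limit = 60 * 1000))
which(tail(mod$solution, n) == 1)                    # best points found within the limit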

You cannot afford to enumerate all possible solutions, and I don't see any obvious shortcut.
So I guess you'll have to do a branch and bound optimization approach.
First, guess a reasonably good solution, for example the two closest points with different labels; then keep adding the nearest point with a not-yet-covered label until all labels are covered.
Now do some trivial optimization: for every label, try if there is some point that you can use instead of the current point to improve the result. Stop when you can't find any further improvement.
For this initial guess, compute the distances. This will give you an upper bound, which allows you to stop your search early. You can also compute a lower bound, the sum of all best two-label solutions.
Now you can try to remove points, where the nearest neighbors of each label + the lower bounds for all other labels is already worse than your initial solution. This will hopefully eliminate a lot of points.
Then you can start enumerating solutions (probably begin with the smallest labels first), but stop recursion whenever the current solution + the remaining lower bounds are larger than your best known solution (branch and bound).
You can also try sorting points e.g. by minimum distance to the remaining labels, to hopefully find better bounds fast.
I'd certainly not choose R to implement this...
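That said, the greedy start and local-improvement steps described above are straightforward to sketch in R (a rough, untested sketch assuming the indexR/x/y columns from the question; the branch-and-bound pruning itself is omitted):
greedy.start <- function(df) {
  groups <- sort(unique(df$indexR))
  d <- function(i, j) sqrt((df$x[i] - df$x[j])^2 + (df$y[i] - df$y[j])^2)
  # start from the closest pair of points with different labels
  pr <- t(combn(nrow(df), 2))
  pr <- pr[df$indexR[pr[, 1]] != df$indexR[pr[, 2]], , drop = FALSE]
  sel <- as.integer(pr[which.min(apply(pr, 1, function(p) d(p[1], p[2]))), ])
  # greedily add the nearest point of each label not yet covered
  while (length(sel) < length(groups)) {
    remaining <- which(!(df$indexR %in% df$indexR[sel]))
    costs <- sapply(remaining, function(i) sum(sapply(sel, d, j = i)))
    sel <- c(sel, remaining[which.min(costs)])
  }
  # local improvement: try swapping each selected point for another with the same label
  total <- function(s) sum(apply(combn(s, 2), 2, function(p) d(p[1], p[2])))
  repeat {
    improved <- FALSE
    for (k in seq_along(sel)) {
      for (cand in which(df$indexR == df$indexR[sel[k]])) {
        trial <- sel
        trial[k] <- cand
        if (total(trial) < total(sel)) {
          sel <- trial
          improved <- TRUE
        }
      }
    }
    if (!improved) break
  }
  sel
}
df[greedy.start(df), ] then gives one point per group whose total distance can serve as the initial upper bound.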

You can use cross joins to get all combinations of points, calculate the total distance between the three points in each combination, then take the minimum of that.
df$id <- row.names(df) # to create ID's for the points
df2 <- merge(df, df, by = NULL ) # the first cross join
df3 <- merge(df2, df, by = NULL) # the second cross join
# eliminating rows where the points are of the same indexR
df3 <- df3[df3$indexR.x != df3$indexR.y & df3$indexR.x != df3$indexR &
           df3$indexR.y != df3$indexR, ]
## calculating the total distance
df3$total_distance <- ((df3$x - df3$x.x)^2 + (df3$y - df3$y.x)^2)^.5 +
                      ((df3$x - df3$x.y)^2 + (df3$y - df3$y.y)^2)^.5 +
                      ((df3$x.x - df3$x.y)^2 + (df3$y.x - df3$y.y)^2)^.5
## minimum distance
df3[which.min(df3$total_distance),]
indexR.x x.x y.x id.x indexR.y x.y y.y id.y indexR x y id total_distance
155367 3 441.53 513.37 61 2 591.86 249.57 46 1 638.24 323.76 42 664.3373

I developed a simple algorithm to solve this problem quickly. The first step is to overlay a grid on the entire area of points and assign each point from each group to the cell, or unit square, in which it is located. Next we go to the lower-left corner of the graph and move over one cell and up one cell; this is the starting cell. We then define a region of interest consisting of this cell and all 8 of its neighbours, and test whether at least one point from every group lies within this 9-cell region. If so, the distance from each point of each group represented in the region to every point of every other group is calculated; in other words, all combinations of points in the 9-cell region are used to get a total distance, where paired points for the distance calculation are never from the same group. From these calculations, the combination with the minimum total distance involving a single point from each group is saved as a possible solution. The entire process is then repeated with the central cell moved one column to the right, stopping one cell from the right edge. When the first row is completed, the process moves up one row and starts again one cell in from the left, so every cell has been considered as the central cell by the time the top row is finished. The final solution is the minimum distance found over all the 9-cell regions tested.
The reason we consider a 9-cell region and not just go cell-by-cell is that we could miss closely spaced points from different groups that are located in the corners of cells.
It's important to choose the correct cell or grid size. If the cells are too small then no possible solution will be found because none of the regions will encompass at least one point from each group. If the cells are too large then there will be many points from each group and calculation time will be excessive. Fortunately this optimal cell size can be quickly found through trial and error.
I've run this algorithm multiple times with varying number of groups and number of points in a group. For randomly scattered points in all groups I found that a 15 x 15 grid size works well for a 10 group - 400 point (40 points per group) case. That example runs in under one second.
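For reference, a rough sketch of the grid-scan idea (my own code, not the answer's; it assumes the indexR/x/y columns from the question and a user-chosen cell size):
grid.search <- function(df, cell = 100) {
  groups <- sort(unique(df$indexR))
  cx <- floor(df$x / cell)
  cy <- floor(df$y / cell)
  best <- Inf
  best.idx <- NULL
  for (i in unique(cx)) {
    for (j in unique(cy)) {
      # points falling in the 3 x 3 block of cells centred on cell (i, j)
      in.region <- which(abs(cx - i) <= 1 & abs(cy - j) <= 1)
      if (!all(groups %in% df$indexR[in.region])) next
      # every combination of one candidate point per group within the region
      cand <- lapply(groups, function(g) in.region[df$indexR[in.region] == g])
      combos <- expand.grid(cand)
      tot <- apply(combos, 1, function(idx) sum(dist(cbind(df$x[idx], df$y[idx]))))
      if (min(tot) < best) {
        best <- min(tot)
        best.idx <- as.integer(unlist(combos[which.min(tot), ]))
      }
    }
  }
  df[best.idx, ]
}
As described above, the result depends on choosing a sensible cell size: too small and no region covers every group, too large and the per-region combinations become expensive.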

Estimating Probabilities in Perudo

I am new to coding and using R. I am working on a project to simulate the game Liar's Dice, also known as Perudo, and have some questions about creating the simulation.
Basically, the game consists of two or more players rolling five dice in a cup, turning it over, and making bids on how many of a certain side they think are on the table. You can look at your own dice, but not anyone else's. To make a bid, on your turn you would say "two 5's," which would mean there are at least two dice that landed on 5. Each subsequent bid must either increase the side or the amount. So if you said "two 5's," I could then say "two 6's" or "three 3's" on my turn.
When you believe the last bid is incorrect, you would say "Liar" on your turn, and then everyone reveals their dice. If you were wrong, you lose a die, but if you were right, the last bidder loses a die. This continues until there is only one player left with dice.
First, I decided to create a function called cup() which rolls a cup of five six-sided dice.
cup <- function(sides = 6, dice = 5){
  sample(1:sides, size = dice, replace = TRUE)
}
Next, with a little assistance, I created a new function called cups() which rolls three cups for three players.
cups <- function(players = 3, sides = 6, dice = 5){
  out <- cup(sides, dice)
  for(i in 2:players){
    out <- rbind(out, cup(sides, dice))
  }
  rownames(out) <- paste0("P", 1:players)  # label rows P1, P2, ... for any number of players
  return(out)
}
What I want to accomplish next is to create a table of probabilities of possible dice outcomes. In other words, what's the probability of there being at least two of a side given fifteen dice (five for each player) in play? And then the probability of there being three, four, five, etc. all the way up to fifteen in this case.
My question is how would I go about doing this in R? And what direction should I go in after getting the probabilities in R?
Here is an empirical process for determining the percentage breakdown of outcomes (all the same, four the same, three the same, two the same, none the same) upon rolling five dice:
library(gtools) # package with the permutations function
allcombos <- permutations(6, 5, repeats.allowed = TRUE) # all 6^5 ordered outcomes of five dice
alluniques <- apply(allcombos, 1, unique) # unique faces in each outcome
alllengths <- sapply(alluniques, length) # number of distinct faces in each outcome
alllengths2 <- as.factor(alllengths) # convert to factor to tally each case
allsum <- summary(alllengths2) # counts by number of distinct faces
allsum
   1    2    3    4    5   # number of distinct faces: 1 = all the same, 5 = all different
   6  450 3000 3600  720
totsum <- sum(allsum)
allfrac <- allsum / totsum
allpercent <- allfrac * 100
allpercent
1 2 3 4 5
0.07716049 5.78703704 38.58024691 46.29629630 9.25925926 # percentage breakout
There is no doubt an analytical solution but I don't know what it is. You could use standard probability calculations to estimate specific outcomes among multiple players. E.g. P(at least 1 4-same | 3 players) or run some simulations.
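As one illustration of the simulation route (my own rough sketch, reusing the cups() function from the question), you could estimate the probability of seeing at least two of a given side among 15 dice and compare it with the exact binomial value:
set.seed(1)
sims <- replicate(10000, sum(cups(players = 3) == 6))  # number of 6's in each simulated game
mean(sims >= 2)            # simulated estimate
1 - pbinom(1, 15, 1/6)     # exact value, about 0.740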
Here's likely more than you asked for, but it focuses on the number of sides on the dice, the total number of dice, and the probability of rolling Nrolled or more of a given side:
dicegame <- function(Nsides = 6,
                     Ndice = 5,
                     Nrolled = 1,
                     verbose = FALSE)
{
  total_possible_outcomes <- choose(Nsides + Ndice - 1, Ndice)
  outcomes_matrix <- t(combn(Nsides + Ndice - 1,
                             Ndice,
                             sort)) - matrix(rep(c(0:(Ndice - 1)),
                                                 each = total_possible_outcomes),
                                             nrow = total_possible_outcomes)
  chances <- sum(apply(outcomes_matrix, 1, function(x) sum(x==2)) >= Nrolled) / total_possible_outcomes
  if(verbose) {
    cat(paste("Number of dice",
              Ndice,
              "each with", Nsides, "sides",
              "chances of rolling", Nrolled,
              "\n or more of any one side are:\n"))
  }
  return(chances)
  # return(total_possible_outcomes)
  # return(outcomes_matrix)
}
dicegame(verbose = TRUE)
#> Number of dice 5 each with 6 sides chances of rolling 1
#> or more of any one side are:
#> [1] 0.5
dicegame(6, 15, 10)
#> [1] 0.01625387
Using probability we can show that the chance of getting a given value exactly n times out of N = players * dice dice is the binomial probability
P(X = n) = choose(N, n) * (1/sides)^n * (1 - 1/sides)^(N - n)
We can easily write this into an R function:
prob_get_n <- function(ntimes, players=3, dice=5, sides=6){
  if(missing(ntimes)) ntimes <- 0:(players*dice)
  choose(players*dice, ntimes) * (1-1/sides)^((players*dice)-ntimes) * sides^(-ntimes)
}
Notice that this function is vectorised by construction, i.e. it accepts inputs like 1:2 or c(9, 5).
prob_get_n() -> probs
data.frame(ntimes=1:length(probs)-1, probs=probs,or_more= rev(cumsum(rev(probs))))
ntimes probs or_more
1 0 6.490547e-02 1.000000e+00
2 1 1.947164e-01 9.350945e-01
3 2 2.726030e-01 7.403781e-01
4 3 2.362559e-01 4.677751e-01
5 4 1.417535e-01 2.315192e-01
6 5 6.237156e-02 8.976567e-02
7 6 2.079052e-02 2.739411e-02
8 7 5.346134e-03 6.603585e-03
9 8 1.069227e-03 1.257451e-03
10 9 1.663242e-04 1.882242e-04
11 10 1.995890e-05 2.190005e-05
12 11 1.814445e-06 1.941153e-06
13 12 1.209630e-07 1.267076e-07
14 13 5.582909e-09 5.744548e-09
15 14 1.595117e-10 1.616385e-10
16 15 2.126822e-12 2.126822e-12
Edit
Or we can use R's built-in dbinom function to get the distribution and pbinom to get the cumulative probabilities:
probs <- function(ntimes, players=3, dice=5, sides=6){
  if(missing(ntimes)) ntimes <- 0:(players*dice)
  data.frame(ntimes = ntimes,
             probs = dbinom(ntimes, players*dice, 1/sides),
             or_more = 1 - pbinom(ntimes - 1, players*dice, 1/sides))
}
probs()
ntimes probs or_more
1 0 6.490547e-02 1.000000e+00
2 1 1.947164e-01 9.350945e-01
3 2 2.726030e-01 7.403781e-01
4 3 2.362559e-01 4.677751e-01
5 4 1.417535e-01 2.315192e-01
6 5 6.237156e-02 8.976567e-02
7 6 2.079052e-02 2.739411e-02
8 7 5.346134e-03 6.603585e-03
9 8 1.069227e-03 1.257451e-03
10 9 1.663242e-04 1.882242e-04
11 10 1.995890e-05 2.190005e-05
12 11 1.814445e-06 1.941153e-06
13 12 1.209630e-07 1.267076e-07
14 13 5.582909e-09 5.744548e-09
15 14 1.595117e-10 1.616385e-10
16 15 2.126822e-12 2.126743e-12

Calculating the overall mean, not a mean of means

I feel like this is a really simple question which I should be able to figure out, but I have been trying for a while now without success. I have a dataframe and wish to determine the overall mean rt by cond type and by speaker type, ignoring position. How do I do this?
Simply, three groups of people read sentences ("Speaker"). Each "cond" is a different sentence type ("ExpA", "ExpB", "ExpC", "ExpD"), each made of 5 parts ("Position"). Each part has a corresponding reading time in each sentence type. I want to look at the overall reading time for each cond (all positions together) for each group, e.g. the sum of all position reading times (0, 1, 2, 3, 4) for just FR participants for cond "ExpA", to compare whether they were faster or slower overall in "ExpA" than in "ExpB".
Dataframe:
Speaker: FR, EN, KR
cond (ExpA, ExpB, ExpC, ExpD)
Position (0, 1, 2, 3, 4)
rt: 1000, 1500, 2000, 1500, 1000
How would I do this? I've been able to get the mean rt by position, condition, and speaker using the code below, but when I remove "position", thinking it will then give me the combined mean for each "cond", it gives me only one value, which is too small to be the sum of 5 means; rather, it looks to be a mean of those means.
pcsmeans = ddply(subj.means, .(cond, position, speaker), summarise, sd = sd(mean.rt), mean = mean(mean.rt))
I hope the lack of a proper dataframe isn't off-putting; I don't know how to input one of those here. Many thanks for any help!
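For what it's worth, one hedged sketch of getting totals rather than a mean of means with the same ddply approach (assuming subj.means holds one mean.rt per cond/position/speaker combination) is to drop position from the grouping and sum instead of averaging:
library(plyr)
pcs.totals <- ddply(subj.means, .(cond, speaker), summarise,
                    total.rt = sum(mean.rt),    # sum over the 5 positions
                    grand.mean = mean(mean.rt)) # mean of the position means, for comparison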
It's slightly unclear what you are after, but it looks like you could use dplyr's group_by and summarise:
library(dplyr)
df <- data.frame(Speaker = rep(c("FR", "EN", "KR"), 20),
                 cond = rep(c("ExpA", "ExpB", "ExpC", "ExpD"), 15),
                 Position = rep(c(0, 1, 2, 3, 4), 12),
                 rt = runif(min = 1000, max = 2000, n = 60))
df %>% group_by(Speaker, cond) %>% summarise(mean_rt = mean(rt), overall_rt = sum(rt))
This gives you the mean and sum by Speaker and cond:
# A tibble: 12 x 4
# Groups: Speaker [3]
Speaker cond mean_rt overall_rt
<fct> <fct> <dbl> <dbl>
1 EN ExpA 1690. 8449.
2 EN ExpB 1625. 8127.
3 EN ExpC 1588. 7940.
4 EN ExpD 1475. 7375.
5 FR ExpA 1321. 6603.
6 FR ExpB 1584. 7922.
7 FR ExpC 1493. 7465.
8 FR ExpD 1463. 7315.
9 KR ExpA 1393. 6965.
10 KR ExpB 1540. 7702.
11 KR ExpC 1569. 7847.
12 KR ExpD 1570. 7849.
It is not very clear what your actual issue is. Since you already mentioned that you do not know how to add a sample data.frame, here is one example which I guess fits your problem:
#generate mock df
speaker<-c("FR", "EN", "KR")
exp<-c("ExpA", "ExpB", "ExpC", "ExpD")
position<-c(0, 1, 2, 3, 4)
#rt<- 1000, 1500, 2000, 1500, 1000
data<-expand.grid(speaker,exp,position)
names(data)<-c('speaker','exp','position')
data$rt<-rnorm(n=nrow(data),mean = 1300,sd = 250)
head(data)
speaker exp position rt
FR ExpA 0 1269
EN ExpA 0 859
KR ExpA 0 863
FR ExpB 0 718
EN ExpB 0 956
KR ExpB 0 867
...
From this point on, there are multiple options. My preferred quick and efficient tool is the sqldf package, which introduces SQL-like constructs. SQL is efficient and easy to read:
require(sqldf)
sqldf::sqldf('select count(*) as N, speaker, exp, avg(rt) as mean from data group by speaker, exp')
Obviously, R has a million tools when it comes to solving a problem, but this is my favourite by far. For anything more complex (i.e. custom functions etc.), I'd probably use a for loop that iterates through each speaker and exp combination:
data$identifier <- paste0(data$speaker, data$exp) # helper column
results <- data.frame()
for ( ident in unique(data$identifier) ){
  df <- subset(data, identifier == ident)
  speaker <- unique(df$speaker)
  exp <- unique(df$exp)
  total <- sum( df$rt )               # total rt across all positions (not a mean of means)
  se <- sd( df$rt ) / sqrt( nrow(df) )
  quantileButTransformed <- t(as.data.frame(quantile(df$rt))) # whatever you can think of
  newLine <- data.frame(speaker = speaker, exp = exp, N = nrow(df),
                        total = total, se = se, quantile = quantileButTransformed)
  results <- rbind(results, newLine)
}
Cheers!

Choosing specific digits of values from two datasets

I'm relatively new to R. I'm looking for a way to choose specific values of pfaf from two datasets of points and sites, based on some conditions.
data2 is a subset of data1, but I've only included one value that matches.
data1:
site id strahler pfaf
1331879 1232926 4 4359
1331341 1232926 2 816
1330121 1232926 1 45
1331842 1232926 3 4
1331841 1232926 2 552
1329931 1206877 3 413
1329614 1206877 2 47
1329591 1206877 1 8179
1329517 1206877 1 4463
1331411 1554221 1 912
1331364 1554221 1 92
1329694 1554221 2 9113
1331486 1554221 3 8
I need to get the series (several) of sites from data1 which correspond to each point, based on the pfaf numbers. These pfaf numbers need to follow these rules:
1) The first n digits of data2$pfaf match the first n digits of data1$pfaf exactly, where n ≥ 0, AND
2) The remaining digits of data2$pfaf are less than or equal to the remaining digits of data1$pfaf.
At the same time, the id of data2 and data1 need to be the same for the pfaf values to be compared at all, AND the strahler of the sites has to be less than or equal to the strahler of the points.
data2:
points id strahler pfaf
1331485 1206877 3 821
1329690 1206877 2 47
1329598 1232926 4 46
1329936 1554221 1 962
The correct output would be:
points pfaf_of_site site
1331485 816, 8179 1329614, 1329591
1329690 4463 1329517
1329598 4359, 45, 4 1331879, 1330121, 1331842
1329936 912, 92 1331411, 1331364
Thanks a tonne for the help if someone can do this.
Maybe something like the following function is what you want? Untested, since there is no data2 example.
funMatch <- function(X, Y, n = 1){
  # 'PFAFSTETTER' and 'WSO1_ID' presumably refer to the pfaf and site/point ID columns in the original data
  x <- as.character(X[['PFAFSTETTER']])
  x.n <- substr(x, 1, n)
  x.remaining <- substring(x, n + 1)
  y <- as.character(Y[['PFAFSTETTER']])
  y.n <- substr(y, 1, n)
  y.remaining <- substring(y, n + 1)
  i <- which(y.n %in% x.n & length(y.remaining) < length(x.remaining))
  Y[['WSO1_ID']][i]
}
funMatch(data1, data2, n = 1)
funMatch(data1, data2, n = 2)

Removing negative values and one positive value from R dataframe

I have a dataframe where one column is the amount spent. The amount spent column contains the values for amounts spent and also negative values for any returns. For example:
ID Store Spent
123 A 18.50
123 A -18.50
123 A 18.50
I want to remove each negative value and then one of its positive counterparts - the idea is to keep only fully completed spend amounts so I can look at total spend.
Right now I am thinking something like this - where I have the data frame sorted by spend
if spend < 0 {
take absolute value of spend
if diff between abs(spend) and spend+1 = 0 then both are NA}
I would like to have something like
df[df$spend < 0] <- NA
where I can also set one positive counterpart to NA as well. Any suggestions?
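One hedged way to express that idea directly (my own rough sketch, matching each negative to one positive row with the same ID and the exact same amount):
df$Spent.clean <- df$Spent
for (i in which(df$Spent < 0)) {
  j <- which(df$Spent.clean == -df$Spent[i] & df$ID == df$ID[i])[1]  # one positive counterpart
  if (!is.na(j)) df$Spent.clean[c(i, j)] <- NA
}
sum(df$Spent.clean, na.rm = TRUE)  # total spend from fully completed amounts only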
There should be a simpler solution to this, but here is one way. I also created my own example since the one shared did not have sufficient data points to test:
#Original vector
x <- c(1, 2, -2, 1, -1, -1, 2, 3, -4, 1, 4)
#Count the frequency of negative numbers, keeping all the unique numbers
vals <- table(factor(abs(x[x < 0]), levels = unique(abs(x))))
#Count the frequency of absolute value of original vector
vals1 <- table(abs(x))
#Subtract the frequencies between two vectors
new_val <- vals1 - (vals * 2 )
#Recreate the new vector
as.integer(rep(names(new_val), new_val))
#[1] 1 2 3
If you add a rowid column you can do this with data.table anti-joins.
Here's an example which takes ID into account, not deleting "positive counterparts" unless they have the same ID.
First create more interesting sample data
library(data.table)
df <- fread('
ID Store Spent
123 A 18.50
123 A -18.50
123 A 18.50
123 A -19.50
123 A 19.50
123 A -99.50
124 A -94.50
124 A 99.50
124 A 94.50
124 A 94.50
')
Now remove all the negative values with positive counterparts, and remove those counterparts
negs <- df[Spent < 0][, Spent := -Spent][, rid := rowid(ID, Spent)]
pos <- df[Spent > 0][, rid := rowid(ID, Spent)]
pos[!negs, on = .(ID, Spent, rid), -'rid']
# ID Store Spent rid
# 1: 123 A 18.5 2
# 2: 124 A 99.5 1
# 3: 124 A 94.5 2
And as applied to Ronak's x vector example
x <- c(1, 2, -2, 1, -1, -1, 2, 3, -4, 1, 4)
negs <- data.table(x = -x[x<0])[, rid := rowid(x)]
pos <- data.table(x = x[x>0])[, rid := rowid(x)]
pos[!negs, on = names(pos), -'rid']
# x
# 1: 2
# 2: 3
# 3: 1
I used the following code.
library(dplyr)
store <- rep(LETTERS[1:3], 3)
id <- c(1:4, 1:3, 1:2)
expense <- runif(9, -10, 10)
tibble(store, id, expense) %>%
group_by(store) %>%
summarise(net_expenditure = sum(expense))
to get this output:
# A tibble: 3 x 2
store net_expenditure
<chr> <dbl>
1 A 13.3
2 B 8.17
3 C 16.6
Alternatively, if you wanted the net expenditure per store-id pairing, then you could use this code:
tibble(store, id, expense) %>%
group_by(store, id) %>%
summarise(net_expenditure = sum(expense))
I've approached your question from a slightly different perspective. I'm not sure that my code answers your question, but it might help.

R conditional lookup and sum

I have data on college course completions, with estimated numbers of students from each cohort completing after 1, 2, 3, ... 7 years. I want to use these estimates to calculate the total number of students outputting from each College and Course in any year.
The output of students in a given year will be the sum of the previous 7 cohorts outputting after 1, 2, 3, ... 7 years.
For example, the number of students outputting in 2014 from COLLEGE 1, COURSE A is equal to the sum of:
Output of 2013 cohort (College 1, Course A) after 1 year +
Output of 2012 cohort (College 1, Course A) after 2 years +
Output of 2011 cohort (College 1, Course A) after 3 years +
Output of 2010 cohort (College 1, Course A) after 4 years +
Output of 2009 cohort (College 1, Course A) after 5 years +
Output of 2008 cohort (College 1, Course A) after 6 years +
Output of 2007 cohort (College 1, Course A) after 7 years
So there are two dataframes: a lookup table that contains all the output estimates, and a smaller summary table that I'm trying to modify. I want to update dummy.summary$output with, for each row, the total output based on the above calculation.
The following code will replicate my data pretty well
# Lookup table
dummy.lookup <- data.frame(cohort = rep(1998:2014, each = 210),
                           college = rep(rep(paste("College", 1:6), each = 35), 17),
                           course = rep(rep(paste("Course", LETTERS[1:5]), each = 7), 102),
                           intake = rep(sample(x = 150:300, size = 510, replace = TRUE), each = 7),
                           output.year = rep(1:7, 510),
                           output = sample(x = 10:20, size = 3570, replace = TRUE))
# Summary table to be modified
dummy.summary <- aggregate(x = dummy.lookup["intake"], by = list(dummy.lookup$cohort, dummy.lookup$college, dummy.lookup$course), FUN = mean)
names(dummy.summary)[1:3] <- c("year", "college", "course")
dummy.summary <- dummy.summary[order(dummy.summary$year, dummy.summary$college, dummy.summary$course), ]
dummy.summary$output <- 0
The following code does not work, but shows the approach I've been attempting.
dummy.summary$output <- sapply(dummy.summary$output, function(x){
# empty vector to fill with output values
vec <- c()
# Find relevant output for college + course, from each cohort and exit year
for(j in 1:7){
append(x = vec,
values = dummy.lookup[dummy.lookup$college==dummy.summary[x, "college"] &
dummy.lookup$course==dummy.summary[x, "course"] &
dummy.lookup$cohort==dummy.summary[x, "year"]-j &
dummy.lookup$output.year==j, "output"])
}
# Sum and return total output
sum_vec <- sum(vec)
return(sum_vec)
}
)
I guess it doesn't work because I was hoping to use 'x' in the anonymous function to index particular values of the dummy.summary dataframe. But that clearly isn't happening and is only returning zero for each row, presumably because the starting value of 'x' is zero each time. I don't know if it is possible to access the index position of each value that sapply loops over, and use that to index my summary dataframe.
Is this approach fixable or do I need a completely different approach?
Even if it is fixable, is there a more elegant/faster way to acheive what I'm trying to do?
Thanks in anticipation.
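For what it's worth, the sapply approach does appear fixable: iterate over the row indices of dummy.summary rather than over the values of dummy.summary$output, and keep the value returned by append(), since append() does not modify vec in place (a hedged sketch based on the code above):
dummy.summary$output <- sapply(seq_len(nrow(dummy.summary)), function(i) {
  vec <- numeric(0)
  for (j in 1:7) {
    vec <- append(vec,
                  dummy.lookup[dummy.lookup$college == dummy.summary[i, "college"] &
                               dummy.lookup$course == dummy.summary[i, "course"] &
                               dummy.lookup$cohort == dummy.summary[i, "year"] - j &
                               dummy.lookup$output.year == j, "output"])
  }
  sum(vec)
})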
I've just updated your output.year to output.year2 so that, instead of a value from 1 to 7, it holds the actual calendar year implied by the cohort.
I've realised that the output information you want corresponds to the output year, but the intake information you want corresponds to the cohort. So I calculate them separately and then join the tables. This automatically creates empty (NA, which I convert to 0) output info for 1998.
# fix your random sampling
set.seed(24)
# Lookup table
dummy.lookup <- data.frame(cohort = rep(1998:2014, each = 210),
                           college = rep(rep(paste("College", 1:6), each = 35), 17),
                           course = rep(rep(paste("Course", LETTERS[1:5]), each = 7), 102),
                           intake = rep(sample(x = 150:300, size = 510, replace = TRUE), each = 7),
                           output.year = rep(1:7, 510),
                           output = sample(x = 10:20, size = 3570, replace = TRUE))
dummy.lookup$output[dummy.lookup$yr %in% 1:2] <- 0
library(dplyr)
# create result table for output info
dt_output =
dummy.lookup %>%
mutate(output.year2 = output.year+cohort) %>% # update output.year to get a year value
group_by(output.year2, college, course) %>% # for each output year, college, course
summarise(SumOutput = sum(output)) %>% # calculate sum of output
ungroup() %>%
arrange(college,course,output.year2) %>% # for visualisation purposes
rename(cohort = output.year2) # rename column
# create result for intake info
dt_intake =
dummy.lookup %>%
select(cohort, college, course, intake) %>% # select useful columns
distinct() # keep distinct rows/values
# join info
dt_intake %>%
full_join(dt_output, by=c("cohort","college","course")) %>%
mutate(SumOutput = ifelse(is.na(SumOutput),0,SumOutput)) %>%
arrange(college,course,cohort) %>% # for visualisation purposes
tbl_df() # for printing purposes
# Source: local data frame [720 x 5]
#
# cohort college course intake SumOutput
# (int) (fctr) (fctr) (int) (dbl)
# 1 1998 College 1 Course A 194 0
# 2 1999 College 1 Course A 198 11
# 3 2000 College 1 Course A 223 29
# 4 2001 College 1 Course A 198 45
# 5 2002 College 1 Course A 289 62
# 6 2003 College 1 Course A 163 78
# 7 2004 College 1 Course A 211 74
# 8 2005 College 1 Course A 181 108
# 9 2006 College 1 Course A 277 101
# 10 2007 College 1 Course A 157 109
# .. ... ... ... ... ...
