Number of unique values within a range in data-frame - r

From a data-frame, I want to extract the number of unique values (of X) within a certain range of Y (e.g. for every 0-100, 101-200, 201-300, etc. up to 3000).
Example df
  X   Y
169 183
546  64
154 148
593 203
 60 243
568 370
 85 894
168 169
154 148
 83 897
…
A time-consuming way would be to run the following code for each range:
junk <- subset(df, Y > 0 & Y < 100)
length(unique(junk$X))
But I have to ask the experts - there must be a better way?

You can do it with by() and cut():
data <- data.frame(X=ceiling(rnorm(10000, 500, 10)), Y=runif(10000, 0, 3000))
data$Groups <- cut(data$Y, seq(0, 3000, 100)) # Create a categorical variable for each range
by(data$X, data$Groups, function(x) length(unique(x)))
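If you would rather have a named vector or data frame than a by object, tapply() over the same grouping is equivalent (a sketch reusing the simulated data above; counts is just an illustrative name):
counts <- tapply(data$X, data$Groups, function(x) length(unique(x)))
data.frame(range = names(counts), unique_X = as.vector(counts))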

aggregate() also works (use length(unique(x)) instead of unique(x) if you want the counts rather than the values themselves):
aggregate(DF$X, list(cut(DF$Y, seq(0, 1000, 100))), function(x) unique(x))
# Group.1 x #or length(unique(x))
#1 (0,100] 546
#2 (100,200] 169, 154, 168
#3 (200,300] 593, 60
#4 (300,400] 568
#5 (800,900] 85, 83
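For comparison, a dplyr sketch of the same idea (assuming DF has the X and Y columns from the question) might look like this:
library(dplyr)
DF %>%
  mutate(Group = cut(Y, seq(0, 1000, 100))) %>%
  group_by(Group) %>%
  summarise(unique_X = n_distinct(X))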

You can run a for loop based on the range you want and the size of the data frame, then count the number of levels by converting to a factor:
range <- 100 # based on example
loops <- nrow(df) / range
lvlMatrix <- matrix(nrow = 0, ncol = 2, dimnames = list(NULL, c("range", "unique values")))
for (a in 1:loops) {
  sub <- df[((a - 1) * range + 1):(a * range), ] # +1 so consecutive chunks do not overlap
  lvls <- nlevels(factor(sub$X))
  lvlMatrix <- rbind(lvlMatrix, cbind(paste((a - 1) * range, "-", a * range, sep = ""), lvls))
}
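As an aside, growing lvlMatrix with rbind() inside a loop becomes slow for large data; the same per-chunk counts can be computed without the loop (a sketch reusing the range variable above):
idx <- ceiling(seq_len(nrow(df)) / range) # chunk index: 1 for rows 1-100, 2 for 101-200, ...
sapply(split(df$X, idx), function(x) length(unique(x)))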

Related

If() statement in R

I am not very experienced with if statements and loops in R. Perhaps you can help me solve my problem.
My task is to add +1 to values in df$fz while sum(df$fz) < 450, but only to the rows with the largest z, stopping as soon as sum(df$fz) reaches 450.
Here is my df
ID_PP <- c(3,6, 22, 30, 1234456)
z <- c(12325, 21698, 21725, 8378, 18979)
fz <- c(134, 67, 70, 88, 88)
df <- data.frame(ID_PP,z,fz)
After the mutation, the new column df$new_value should look like: 134 68 71 88 89
At this moment I have this code, but it adds +1 to all values.
if (sum(df$fz) < 450) {
  mutate(df, new_value = fz + 1)
}
I know that I can pick top_n(3, z) and add +1 only to that top, but that is not what I want, because then I would have to pick the top manually after checking sum(df$fz).
From what I understood from @Oksana's question and comments, we can probably do it this way:
library(tidyverse)
# data
vru <- data.frame(
id = c(3, 6, 22, 30, 1234456),
z = c(12325, 21698, 21725, 8378, 18979),
fz = c(134, 67, 70, 88, 88)
)
# solution
vru %>%
  top_n(450 - sum(fz), z) %>%      # subset by top z; if sum(fz) == 450 -> empty subset
  mutate(fz = fz + 1) %>%          # increase fz by 1 for the subset
  bind_rows(
    anti_join(vru, ., by = "id"),  # take rows from vru which are not in the subset
    .                              # take the subset with transformed fz
  ) %>%                            # bind those subsets
  arrange(id)                      # sort rows by id
# output
id z fz
1 3 12325 134
2 6 21698 68
3 22 21725 71
4 30 8378 88
5 1234456 18979 89
The clarifications in the comments helped. Let me know if this works for you. Of course, you can drop the cumsum_fz and leftover columns.
# Making variables to use in the calculation
df <- df %>%
  arrange(fz) %>%
  mutate(cumsum_fz = cumsum(fz),
         leftover = 450 - cumsum_fz)
# The minimum positive leftover is how many values still need +1
min_pos <- min(df$leftover[df$leftover > 0])
# Add 1 to the min_pos smallest fz values and keep the others the same
df$new_value <- c(head(sort(df$fz), min_pos) + 1, tail(sort(df$fz), length(df$fz) - min_pos))
# Checking the sum of the new value
> sum(df$new_value)
[1] 450
>
> df
ID_PP z fz cumsum_fz leftover new_value
1 6 21698 67 67 383 68
2 22 21725 70 137 313 71
3 30 8378 88 225 225 89
4 1234456 18979 88 313 137 88
5 3 12325 134 447 3 134
EDIT:
Because utubun already posted a great tidyverse solution, I am going to translate my first one completely to base (it was a bit sloppy to mix the two anyway). Same logic as above, and using the data OP provided.
> # Using base
> df <- df[order(df$fz), ]
>
> leftover <- 450 - cumsum(df$fz)
> min_pos <- min(leftover[leftover > 0])
> df$new_value <- c(head(sort(df$fz), min_pos) + 1, tail(sort(df$fz), length(df$fz) - min_pos))
>
> sum(df$new_value)
[1] 450
> df
ID_PP z fz new_value
2 6 21698 67 68
3 22 21725 70 71
4 30 8378 88 89
5 1234456 18979 88 88
1 3 12325 134 134

Finding nearest matching points

What I would like to do is, for each red point, find the nearest equivalent blue point on the other side of the abline (e.g. for the point (1,5), find the blue point closest to (5,1)).
Data:
https://1drv.ms/f/s!Asb7WztvacfOuesIq4evh0jjvejZ4Q
Edit: to open the data, use readRDS("path/to/data")
What I have tried is to find the difference between the x and y coordinates, rank them, and then find the minimum value going down the ranks for both x and y. The results are pretty bad. The thing I'm struggling with is finding a way to match on the (x, y) pairs as a whole, rather than on each coordinate separately.
My attempt:
find_nearest <- function(query, subject){
  weight_df <- data.frame(ID = query$ID)
  # find difference of first, then second, rank and find match in both going from top to bottom
  tmp_df <- query
  for(i in 1:nrow(subject)){
    first_order <- order(abs(query$mean_score_n - subject$mean_score_n[i]))
    second_order <- order(abs(query$mean_score_p - subject$mean_score_p[i]))
    tmp_df$order_1[first_order] <- seq(1, nrow(tmp_df))
    tmp_df$order_2[second_order] <- seq(1, nrow(tmp_df))
    weight_df[, i + 1] <- tmp_df$order_1 + tmp_df$order_2
  }
  rownames(weight_df) <- weight_df$ID
  weight_df$ID <- NULL
  print(dim(weight_df))
  nearest_match <- list()
  count <- 1
  subject_ids <- NA
  query_ids <- NA
  while(ncol(weight_df) > 0 & count <= ncol(weight_df)){
    pos <- which(weight_df == min(weight_df, na.rm = TRUE), arr.ind = TRUE)
    if(length(unique(rownames(pos))) > 1){
      for(i in 1:nrow(pos)){ # was for(i in nrow(pos)), which only visited the last row
        # if subject/query already used then mask and find another
        if(subject$ID[pos[i, 2]] %in% subject_ids){
          weight_df[pos[i, 1], pos[i, 2]] <- NA
        } else if(query$ID[pos[i, 1]] %in% query_ids){
          weight_df[pos[i, 1], pos[i, 2]] <- NA
        } else {
          subject_ids <- c(subject_ids, subject$ID[pos[i, 2]])
          query_ids <- c(query_ids, query$ID[pos[i, 1]])
          nearest_match[[count]] <- data.frame(query = query$ID[pos[i, 1]], subject = subject$ID[pos[i, 2]])
          # mask
          weight_df[pos[i, 1], pos[i, 2]] <- NA
          count <- count + 1
        }
      }
    } else if(nrow(pos) > 1){
      # if subject/query already used then mask and find another
      if(subject$ID[pos[1, 2]] %in% subject_ids){
        weight_df[pos[1, 1], pos[1, 2]] <- NA
      } else if(query$ID[pos[1, 1]] %in% query_ids){
        weight_df[pos[1, 1], pos[1, 2]] <- NA
      } else {
        subject_ids <- c(subject_ids, subject$ID[pos[1, 2]]) # was pos[1, 1], a row/column mix-up
        query_ids <- c(query_ids, query$ID[pos[1, 1]])
        nearest_match[[count]] <- data.frame(query = query$ID[pos[1, 1]], subject = subject$ID[pos[1, 2]])
        # mask
        weight_df[pos[1, 1], pos[1, 2]] <- NA
        count <- count + 1
      }
    } else {
      # if subject/query already used then mask and find another
      if(subject$ID[pos[2]] %in% subject_ids){
        weight_df[pos[1], pos[2]] <- NA
      } else if(query$ID[pos[1]] %in% query_ids){
        weight_df[pos[1], pos[2]] <- NA
      } else {
        subject_ids <- c(subject_ids, subject$ID[pos[2]])
        query_ids <- c(query_ids, query$ID[pos[1]])
        nearest_match[[count]] <- data.frame(query = query$ID[pos[1]], subject = subject$ID[pos[2]])
        # mask
        weight_df[pos[1], pos[2]] <- NA
        count <- count + 1
      }
    }
  }
  out <- plyr::ldply(nearest_match, rbind)
  out <- merge(out, data.frame(subject = subject$ID,
                               mean_score_p_n = subject$mean_score_p,
                               mean_score_n_n = subject$mean_score_n), by = "subject", all.x = TRUE)
  out <- merge(out, data.frame(query = query$ID,
                               mean_score_p_p = query$mean_score_p,
                               mean_score_n_p = query$mean_score_n), by = "query", all.x = TRUE)
  return(out)
}
Edit: is this what the solution looks like for you?
ggplot() +
  # color outside aes() so the points are literally red/blue; matrices wrapped in as.data.frame() for ggplot
  geom_point(data = as.data.frame(B[out, ]), aes(x = mean_score_p, y = mean_score_n), color = "red") +
  geom_point(data = as.data.frame(A), aes(x = mean_score_p, y = mean_score_n), color = "blue") +
  geom_abline(intercept = 0, slope = 1)
Let
query <- readRDS("query.dms")
subject <- readRDS("subject.dms")
kA <- nrow(subject)
kB <- nrow(query)
A <- as.matrix(subject[, 2:3])
B <- as.matrix(query[, 2:3])
where we want to find the closest "reverse" point (row) in B to each point in A.
Solution permitting non-unique results
Then, assuming that you are using the Euclidean distance,
D <- as.matrix(dist(rbind(A, B[, 2:1])))[(1 + kA):(kA + kB), 1:kA]
unname(apply(D, 2, which.min))
# [1] 268 183 350 284 21 360 132 287 100 298 58 56 170 70 47 305 353
# [18] 43 266 198 58 215 198 389 412 321 255 181 79 340 292 268 198 54
# [35] 390 38 376 47 19 94 244 18 168 201 160 194 114 247 287 273 182
# [52] 87 94 87 192 63 160 244 101 298 62
are the corresponding row numbers in B. The trick was to switch the coordinates of the points in B by using B[, 2:1].
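To see why the coordinate swap works, here is a minimal toy example (points invented for illustration):
A <- rbind(c(1, 5), c(2, 3))         # points on one side of the line
B <- rbind(c(5.1, 0.9), c(3.2, 1.8)) # points near the reflections of A
# distances from the swapped B points (rows) to the A points (columns)
D <- as.matrix(dist(rbind(A, B[, 2:1])))[3:4, 1:2]
apply(D, 2, which.min)
# [1] 1 2  -- B[1, ] is nearest the reflection of A[1, ], B[2, ] nearest that of A[2, ]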
Solution with unique results
out <- vector("numeric", length = kA)
colnames(D) <- 1:ncol(D)
rownames(D) <- 1:nrow(D)
while(any(out == 0))
  for(i in 1:nrow(D)) {
    aux <- apply(D, 2, which.min)
    if(i %in% aux) {
      win <- which(aux == i)[which.min(D[i, aux == i])]
      out[as.numeric(names(win))] <- as.numeric(rownames(D)[i])
      D <- D[-i, -win, drop = FALSE]
    }
  }
out
# [1] 268 183 350 284 21 360 132 213 100 298 22 56 170 70 128 305 353
# [18] 43 266 198 58 215 294 389 412 321 255 181 79 340 292 20 347 54
# [35] 390 38 376 47 19 94 73 18 168 201 160 194 114 247 287 273 182
# [52] 87 365 158 192 63 211 244 101 68 62
whereas
all(table(out) == 1)
# [1] TRUE
confirms uniqueness. The solution is not the most efficient, but on your dataset it takes only a couple of seconds. It takes some time because it keeps going over all the available points in B, checking whether each one is the closest to any of the points in A. If so, that point in B is assigned to its closest point in A, and both points are eliminated from the distance matrix. The loop runs until every point in A has a match in B.

R - Sum range over lookback period, divided by sum of lookback - Excel to R

I am looking to work out a percentage total over a lookback range in R.
I know how to do this in Excel with the following formula:
=SUM(B2:B4)/SUM(B2:B4,C2:C4)
This sums column B over a range of today looking back 3 lines. It then divides this sum by the total sum of columns B and C, again looking back 3 lines.
I am looking to achieve the same calculation in R to run across my matrix.
The output would look something like this:
adv dec perct
1 69 376
2 113 293
3 270 150 0.355625492
4 74 371 0.359559402
5 308 96 0.513790386
6 236 173 0.491255962
7 252 134 0.663886572
8 287 129 0.639966969
9 219 187 0.627483444
This is a line of code I could perhaps add the lookback range to:
perct <- apply(data.matrix[, c('adv', 'dec')], 1, function(x) { x[1] / (x[1] + x[2]) })
It would work if I could get x[1] to sum over the previous 3-line range, and x[2] to do the same.
Still learning how to apply forward and look back periods within R. So any additional learning on the answer would be appreciated!
Here are some approaches. The first 3 use rollsumr and/or rollapplyr from zoo, and the last one uses only base R.
1) rollsumr Create a matrix with rollsumr whose columns contain the rolling sums, convert that to row proportions and take the "adv" column. Finally assign that to a new column frac in DF. This approach has the shortest code.
library(zoo)
DF$frac <- prop.table(rollsumr(DF, 3, fill = NA), 1)[, "adv"]
giving:
> DF
adv dec frac
1 69 376 NA
2 113 293 NA
3 270 150 0.3556255
4 74 371 0.3595594
5 308 96 0.5137904
6 236 173 0.4912560
7 252 134 0.6638866
8 287 129 0.6399670
9 219 187 0.6274834
1a) This variation is similar except instead of using prop.table we write out the ratio. The code is longer but you may find it clearer.
m <- rollsumr(DF, 3, fill = NA)
DF$frac <- with(as.data.frame(m), adv / (adv + dec))
1b) This is a variation of (1) that is the same except it uses a magrittr pipeline:
library(magrittr)
DF %>% rollsumr(3, fill = NA) %>% prop.table(1) %>% `[`(TRUE, "adv") -> DF$frac
2) rollapplyr We could use rollapplyr with by.column = FALSE like this. The result is the same.
ratio <- function(x) sum(x[, "adv"]) / sum(x)
DF$frac <- rollapplyr(DF, 3, ratio, by.column = FALSE, fill = NA)
3) Yet another variation is to compute the numerator and denominator separately:
DF$frac <- rollsumr(DF$adv, 3, fill = NA) /
rollapplyr(DF, 3, sum, by.column = FALSE, fill = NA)
4) base This uses embed followed by rowSums on each column to get the rolling sums and then uses prop.table as in (1).
DF$frac <- prop.table(sapply(lapply(rbind(NA, NA, DF), embed, 3), rowSums), 1)[, "adv"]
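If embed() is unfamiliar, a quick illustration of what it is doing here (each row of the result is one length-3 window, most recent value first, so rowSums() gives the rolling sums):
embed(1:5, 3)
#      [,1] [,2] [,3]
# [1,]    3    2    1
# [2,]    4    3    2
# [3,]    5    4    3
rowSums(embed(1:5, 3))
# [1] 6 9 12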
Note: The input used in reproducible form is:
Lines <- "adv dec
1 69 376
2 113 293
3 270 150
4 74 371
5 308 96
6 236 173
7 252 134
8 287 129
9 219 187"
DF <- read.table(text = Lines, header = TRUE)
Consider an sapply that loops through the number of rows in order to index two rows back:
DF$pred <- sapply(seq(nrow(DF)), function(i)
  ifelse(i >= 3, sum(DF$adv[(i-2):i]) / (sum(DF$adv[(i-2):i]) + sum(DF$dec[(i-2):i])), NA))
DF
# adv dec pred
# 1 69 376 NA
# 2 113 293 NA
# 3 270 150 0.3556255
# 4 74 371 0.3595594
# 5 308 96 0.5137904
# 6 236 173 0.4912560
# 7 252 134 0.6638866
# 8 287 129 0.6399670
# 9 219 187 0.6274834
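Another base option that avoids both the explicit loop and zoo is to build the rolling sums from cumsum() differences; this is a sketch (roll3 is a hypothetical helper, and it scales linearly with the number of rows):
roll3 <- function(v) {
  cs <- cumsum(v)
  out <- cs - c(0, 0, 0, head(cs, -3)) # sum of each trailing 3-value window
  out[1:2] <- NA                       # first two windows are incomplete
  out
}
DF$pred <- roll3(DF$adv) / (roll3(DF$adv) + roll3(DF$dec)) # same values as above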

Creating data continuously using rnorm until an outlier occurs in R

Sorry for the confusing title, but I wasn't sure how to title what I am trying to do. My objective is to create a dataset of 1000 observations, each of which would be the length of a run. I have created a phase1 dataset, from which a set of control limits is produced.
What I am trying to do now is create a phase2 dataset, most likely using rnorm. I want a repeat loop that continuously creates values in the phase2 dataset until one of those values falls outside the control limits produced from the phase1 dataset. For example, with control limits of 3.0 and -3.0, the phase2 dataset would keep generating observations until, say, observation 398, whose value happens to be 3.45, thus stopping the creation of data. My objective is then to record the number 398.
Furthermore, I want to loop back to the phase1/control-limits portion, create a new set of control limits, and run another phase2, repeating until I have 1000 run lengths recorded. The code I have for the phase1/control limits works fine and looks like this:
nphase1=50
nphase2=1000
varcount=1
meanshift= 0
sigmashift= 1
##### phase1 dataset/ control limits #####
phase1 <- matrix(rnorm(nphase1*varcount, 0, 1), nrow = nphase1, ncol=varcount)
mean_var <- apply(phase1, 2, mean)
std_var <- apply(phase1, 2, sd)
df_var <- data.frame(mean_var, std_var)
Upper_SPC_Limit_Method1 <- with(df_var, mean_var + 3 * std_var)
Lower_SPC_Limit_Method1 <- with(df_var, mean_var - 3 * std_var)
df_control_limits<- data.frame(Upper_SPC_Limit_Method1, Lower_SPC_Limit_Method1)
I have previously created this code in SAS, which might be a better reference for what I am trying to achieve than my explanation:
%macro phase2_dataset (n=,varcount=, meanshift=, sigmashift=, nphase1=,simID=,);
%do z=1 %to &n;
%phase1_dataset (n=&nphase1, varcount=&varcount);
data phase2; set control_limits n=lastobs;
call streaminit(0);
do until (phase2_var1<Lower_SPC_limit_method1_var1 or
phase2_var1>Upper_SPC_limit_method1_var1);
phase2_var1 = rand("normal", &meanshift, &sigmashift);
output;
end;
run;
ods exclude all;
proc means data=phase2;
var phase2_var1;
ods output summary=x;
run;
ods select all;
data run_length; set x;
keep Phase2_var1_n;
run;
proc append base= QA.Phase2_dataset&simID data=Run_length force; run;
%end;
%mend;
I have also been researching using a while loop in place of the repeat loop.
I'm new to R, so any ideas you are able to throw my way are greatly appreciated. Thanks!
Using a while loop indeed seems to be the way to go. Here's what I think you're looking for:
set.seed(10) # Making results reproducible
replicate(100, { # 100 is easier to display here
  phase1 <- matrix(rnorm(nphase1 * varcount, 0, 1), nrow = nphase1, ncol = varcount)
  mean_var <- colMeans(phase1) # Slightly better than apply
  std_var <- apply(phase1, 2, sd)
  df_var <- data.frame(mean_var, std_var)
  Upper_SPC_Limit_Method1 <- with(df_var, mean_var + 3 * std_var)
  Lower_SPC_Limit_Method1 <- with(df_var, mean_var - 3 * std_var)
  df_control_limits <- data.frame(Upper_SPC_Limit_Method1, Lower_SPC_Limit_Method1)
  # Phase 2: draw values until one falls outside the limits
  x <- 0
  count <- 0
  while(x > Lower_SPC_Limit_Method1 && x < Upper_SPC_Limit_Method1) {
    x <- rnorm(1)
    count <- count + 1
  }
  count
})
The result is:
[1] 225 91 97 118 304 275 550 58 115 6 218 63 176 100 308 844 90 2758
[19] 161 311 1462 717 2446 74 175 91 331 210 118 1517 420 32 39 201 350 89
[37] 64 385 212 4 72 730 151 7 1159 65 36 333 97 306 531 1502 26 18
[55] 67 329 75 532 64 427 39 352 283 483 19 9 2 1018 137 160 223 98
[73] 15 182 98 41 25 1136 405 474 1025 1331 159 70 84 129 233 2 41 66
[91] 1 23 8 325 10 455 363 351 108 3
If performance becomes a problem, it may be worth exploring improvements such as generating more numbers with rnorm() at a time, counting how many are needed before one exceeds the limits, and drawing again only if none do.
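For example, a rough sketch of that chunked idea (run_length is a hypothetical helper; it draws chunk values at a time and returns the index of the first one outside the limits):
run_length <- function(lower, upper, chunk = 1000) {
  total <- 0
  repeat {
    x <- rnorm(chunk)
    hit <- which(x < lower | x > upper) # positions of out-of-control values
    if (length(hit) > 0) return(total + hit[1])
    total <- total + chunk
  }
}
# e.g. run_length(Lower_SPC_Limit_Method1, Upper_SPC_Limit_Method1)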

2D irregular aggregation of a matrix

I'm trying to bin a symmetric matrix with irregular intervals in R but am not sure how to proceed. My ideas are:
Reshape the matrix to long format, aggregate and cast it back?
Bin as-is in both dimensions (somehow... tapply, aggregate?)
Keep the regular binning but for each of my (larger) irregular bins, replace all inner values with their sum?
Here's an example of what I'm trying to do:
set.seed(42)
# symmetric matrix
a <- matrix(rpois(1e4, 2), 100)
a[upper.tri(a)] <- t(a)[upper.tri(a)]
image(x=1:100, y=1:100, a, asp=1, frame=F, axes=F)
# vector of irregular breaks for binning
breaks <- c(12, 14, 25, 60, 71, 89)
# white line show the desired bins
abline(h=breaks-.5, lwd=2, col="white")
abline(v=breaks-.5, lwd=2, col="white")
(The aim being that each rectangle drawn above be filled according to the sum of the values within it.) I'd appreciate any pointers on how best to approach this.
This answer provides a great starting point using tapply:
library(reshape2) # for melt()
b <- melt(a)
bb <- with(b, tapply(value,
                     list(
                       y = cut(Var1, breaks = c(0, breaks, Inf), include.lowest = TRUE),
                       x = cut(Var2, breaks = c(0, breaks, Inf), include.lowest = TRUE)
                     ),
                     sum))
bb
# x
# y [0,12] (12,14] (14,25] (25,60] (60,71] (71,89] (89,Inf]
# [0,12] 297 48 260 825 242 416 246
# (12,14] 48 3 43 141 46 59 33
# (14,25] 260 43 261 794 250 369 240
# (25,60] 825 141 794 2545 730 1303 778
# (60,71] 242 46 250 730 193 394 225
# (71,89] 416 59 369 1303 394 597 369
# (89,Inf] 246 33 240 778 225 369 230
These can then be plotted as rectangular bins on a base plot with rect(), i.e.:
library("reshape2")
library("magrittr")
bsq <- melt(bb)
# convert range notation to numerics
getNum <- . %>%
# rm brackets
gsub("\\[|\\(|\\]|\\)", "", .) %>%
# split digits and convert
strsplit(",") %>%
unlist %>% as.numeric
y <- t(sapply(bsq[,1], getNum))
x <- t(sapply(bsq[,2], getNum))
# normalise bin intensity by area
bsq$size <- (y[,2] - y[,1]) * (x[,2] - x[,1])
bsq$norm <- bsq$value / bsq$size
# draw rectangles on top of empty plot
plot(1:100, 1:100, type = "n", frame = FALSE, axes = FALSE)
rect(ybottom = y[, 1], ytop = y[, 2],
     xleft = x[, 1], xright = x[, 2],
     col = rgb(colorRamp(c("white", "steelblue4"))(bsq$norm / max(bsq$norm)),
               alpha = 255 * (bsq$norm / max(bsq$norm)), max = 255),
     border = "white")
