Sum certain rows given 2 constraints in R - r

I am trying to write a conditional statement with the following constraints. Below is an example data frame illustrating the problem I am running into.
Row <- c(1,2,3,4,5,6,7)
La <- c(51.25,51.25,51.75,53.25,53.25,54.25,54.25)
Lo <- c(128.25,127.75,127.25,119.75,119.25,118.75,118.25)
Y <- c(5,10,2,4,5,7,9)
Cl <- c("EF","EF","EF","EF",NA,NA,"CE")
d <- data.frame(Row,La,Lo,Y,Cl)
Row La Lo Y Cl
1 1 51.25 128.25 5 EF
2 2 51.25 127.75 10 EF
3 3 51.75 127.25 2 EF
4 4 53.25 119.75 4 EF
5 5 53.25 119.25 5 NA
6 6 54.25 118.75 7 NA
7 7 55.25 118.25 9 CE
Where "Cl" is NA, I would like to add that row's "Y" value to the row whose "La" and "Lo" values are closest (a difference of 1.00 or less) and then drop the NA row. In effect, I want to remove the NA rows from the data frame without losing their "Y" values, instead adding each value to its closest neighbor.
I would like the return data frame to look like this:
Row2 <- c(1,2,3,4,7)
La2 <- c(51.25,51.25,51.75,53.25,55.25)
Lo2 <- c(128.25,127.75,127.25,119.75,118.25)
Y2 <- c(5,10,2,9,16)
Cl2 <- c("EF","EF","EF","EF","CE")
d2 <- data.frame(Row2,La2,Lo2,Y2,Cl2)
Row2 La2 Lo2 Y2 Cl2
1 1 51.25 128.25 5 EF
2 2 51.25 127.75 10 EF
3 3 51.75 127.25 2 EF
4 4 53.25 119.75 9 EF
5 7 55.25 118.25 16 CE
Recent edit: if an NA row is as close to one row in terms of its Lo value as it is to another row in terms of its La value, join by the La value. If two rows are equally close in both Lo and La, join by the smaller La value.
Thank you for the help!

Here is a method to use if you can build a distance matrix m between all the (La, Lo) rows in your data. I use the output of dist, which is Euclidean distance. The row with the lowest distance is selected, or the earliest such row if the lowest distance is shared by more than one row.
w <- which(is.na(d$Cl))                   # rows where Cl is NA
m <- as.matrix(dist(d[c('La', 'Lo')]))    # Euclidean distances between all (La, Lo) pairs
m[row(m) %in% w] <- NA                    # NA rows cannot themselves be chosen as a nearest neighbour
# for each NA row, assign it the row number of its nearest non-NA row
# (drop = FALSE keeps the matrix structure even if there is only a single NA row)
d$g <- replace(seq(nrow(d)), w, apply(m[, w, drop = FALSE], 2, which.min))
library(dplyr)
d %>%
  group_by(g) %>%
  summarise(La = La[!is.na(Cl)],
            Lo = Lo[!is.na(Cl)],
            Y  = sum(Y),
            Cl = Cl[!is.na(Cl)]) %>%
  select(-g)
# # A tibble: 5 x 4
# La Lo Y Cl
# <dbl> <dbl> <dbl> <fct>
# 1 51.2 128. 5 EF
# 2 51.2 128. 10 EF
# 3 51.8 127. 2 EF
# 4 53.2 120. 9 EF
# 5 54.2 118. 16 CE
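The recent edit asks for a different tie-break than "earliest row wins". Here is a sketch of one way to express that preference, assuming "join by La" means prefer the candidate whose La is closest, then fall back to Lo closeness, then take the smaller La (d and w are the objects defined above):
cand <- setdiff(seq_len(nrow(d)), w)            # only non-NA rows can be neighbours
nearest <- sapply(w, function(i) {
  ord <- order(abs(d$La[cand] - d$La[i]),       # closest La first
               abs(d$Lo[cand] - d$Lo[i]),       # then closest Lo
               d$La[cand])                      # then the smaller La
  cand[ord[1]]
})
d$g <- replace(seq(nrow(d)), w, nearest)        # then summarise exactly as above
On the example data this assigns row 5 to row 4 and row 6 to row 7, matching the expected output.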

Related

R: How to simply compare values of columns in 2 data frames

I am comparing two data frames: FU and FO
Here are short samples of what they look like
"Model_ID" "FU_Lin_Period" "FU_Growth_rate"
2 0.72127 0.0093333
3 0.69281 0.015857
4 0.66735 0.021103
5 0.64414 0.024205
6 0.62288 0.026568
7 0.60318 0.027749
8 0.58472 0.028161
9 0.56734 0.028008
10 0.55085 0.027309
11 0.53522 0.026068
12 0.52029 0.024684
13 0.50603 0.022866
14 0.49237 0.020991
15 0.47928 0.018773
"Model_ID" "FO_Lin_Period" "FO_Growth_rate"
7 0.44398 0.008868
8 0.43114 0.01674
9 0.41896 0.023248
10 0.40728 0.028641
11 0.39615 0.032192
12 0.38543 0.03543
13 0.37517 0.03692
14 0.36525 0.038427
15 0.35573 0.038195
As you can tell, they do not all have the same Model_IDs.
Basically, what I want to do is go through every Model_ID in the two tables, compare whether FU or FO's growth rate is larger for a given model ID, and...
if FU's is larger (or FU exists for the model number and FO does not), place the model number in a vector called selected_FU
if FO's is larger (or FO exists for the model number and FU does not), place the model number in a vector called selected_FO
Is there a way to do this without using loops?
data.table alternative using similar logic to the tidyverse answer.
Replace NAs with -Inf, compare the two FU/FO_Growth_rate variables, flag which group had the larger value, and select the Model_IDs into the requested vectors.
library(data.table)
setDT(FU)
setDT(FO)
out <- merge(FU, FO, by = "Model_ID", all = TRUE)[,
  "gr_sel" := c("FO", "FU")[(nafill(FU_Growth_rate, fill = -Inf) >
                             nafill(FO_Growth_rate, fill = -Inf)) + 1]
]
selected_FU <- out[gr_sel == "FU", Model_ID]
selected_FO <- out[gr_sel == "FO", Model_ID]
Data used:
FU <- read.table(text="Model_ID FU_Lin_Period FU_Growth_rate\n2 0.72127 0.0093333\n3 0.69281 0.015857\n4 0.66735 0.021103\n5 0.64414 0.024205\n6 0.62288 0.026568\n7 0.60318 0.027749\n8 0.58472 0.028161\n9 0.56734 0.028008\n10 0.55085 0.027309\n11 0.53522 0.026068\n12 0.52029 0.024684\n13 0.50603 0.022866\n14 0.49237 0.020991\n15 0.47928 0.018773", header=TRUE)
FO <- read.table(text="Model_ID FO_Lin_Period FO_Growth_rate\n7 0.44398 0.008868\n8 0.43114 0.01674\n9 0.41896 0.023248\n10 0.40728 0.028641\n11 0.39615 0.032192\n12 0.38543 0.03543\n13 0.37517 0.03692\n14 0.36525 0.038427\n15 0.35573 0.038195", header=TRUE)
With dplyr, tidyr, and readr:
library(dplyr)
library(tidyr)
library(readr)
FU <- read_table2("test.FU.LINA.table")
FO <- read_table2("test.FO.LINA.table")
df_compared <-
full_join(FU, FO, by = "model_id") %>%
replace_na(list(fo_growth_rate = -1, fu_growth_rate = -1)) %>%
mutate(select_fufo = if_else(fu_growth_rate >= fo_growth_rate, true = "fu", false = "fo"))
df_compared
# A tibble: 6,166 x 6
model_id fu_lin_period fu_growth_rate fo_lin_period fo_growth_rate select_fufo
<dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 2 0.721 0.00933 NA -1 fu
2 3 0.693 0.0159 NA -1 fu
3 4 0.667 0.0211 NA -1 fu
4 5 0.644 0.0242 NA -1 fu
5 6 0.623 0.0266 NA -1 fu
6 7 0.603 0.0277 0.444 0.00887 fu
7 8 0.585 0.0282 0.431 0.0167 fu
8 9 0.567 0.0280 0.419 0.0232 fu
9 10 0.551 0.0273 0.407 0.0286 fo
10 11 0.535 0.0261 0.396 0.0322 fo
# ... with 6,156 more rows
selected_fu <- df_compared %>% filter(select_fufo == "fu") %>% .$model_id
selected_fo <- df_compared %>% filter(select_fufo == "fo") %>% .$model_id
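For completeness, the same merge-and-compare logic in base R, using the FU/FO objects and column names from the question (a sketch; as in the dplyr answer, ties on growth rate go to FU):
m <- merge(FU, FO, by = "Model_ID", all = TRUE)             # full join on Model_ID
fu_gr <- ifelse(is.na(m$FU_Growth_rate), -Inf, m$FU_Growth_rate)
fo_gr <- ifelse(is.na(m$FO_Growth_rate), -Inf, m$FO_Growth_rate)
selected_FU <- m$Model_ID[fu_gr >= fo_gr]
selected_FO <- m$Model_ID[fo_gr > fu_gr]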

Get column mean for every block of n rows based on a condition

I have this dataframe
r2 distance
1 33.64 67866
2 8.50 77229
3 15.07 109119
4 24.35 142279
5 7.74 143393
6 8.21 177670
7 12.26 216440
8 12.66 253751
9 26.31 282556
10 39.08 320816
I need to calculate the mean of column r2 for every block of rows where the difference between two values in the distance column is equal to or less than 100000.
For this example the desired output would be:
mean_r2 diff_of_distance
1 17.86 75527 ## mean of rows 1 to 5; distance 5 - distance 1
2 13.91 66164 ## mean of rows 2 to 5; distance 5 - distance 2
3 13.84 68551 ## mean of rows 3 to 6; distance 6 - distance 3
4 13.14 74161 ## mean of rows 4 to 7; distance 7 - distance 4
5 9.40 73047 ## mean of rows 5 to 7; distance 7 - distance 5
6 11.04 76081 ## mean of rows 6 to 8; distance 8 - distance 6
and so on.
Edit 1: I have more than 100,000 rows.
Thanks.
Loop through each value of distance, subtract it from the values in the distance vector, and test whether the result is less than 100000. This creates a boolean vector; summing it identifies the index at which the distance difference becomes greater than 100000 (i.e. the bool turns FALSE). Use this index to identify your block, then take the mean of r2 in each block.
To speed up the code, define your vector types and lengths up front (to avoid "growing" vectors on each iteration).
means <- vector("numeric", length = nrow(df))
rows <- vector("numeric", length = nrow(df))
distance_diff <- vector("numeric", length = nrow(df))
for (i in seq_along(df$distance)) {
  dis_val <- df$distance[i]                  # the ith distance value
  bools <- (df$distance - dis_val) < 100000  # TRUE where the difference from row i is less than 100000
  block_range <- sum(bools)                  # the sum of bools is the last row where the difference is still < 100000
  rows[i] <- paste(as.character(i), "-", as.character(block_range))
  means[i] <- mean(df$r2[i:block_range])     # mean of r2 over rows i to block_range
  distance_diff[i] <- df$distance[block_range] - dis_val  # distance spanned by the block
}
data.frame(mean_r2 = means, rows= rows, diff_of_distance=distance_diff)
mean_r2 rows diff_of_distance
1 17.860000 1 - 5 75527
2 13.915000 2 - 5 66164
3 13.842500 3 - 6 68551
4 13.140000 4 - 7 74161
5 9.403333 5 - 7 73047
6 11.043333 6 - 8 76081
7 17.076667 7 - 9 66116
8 26.016667 8 - 10 67065
9 32.695000 9 - 10 38260
10 39.080000 10 - 10 0
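Given the edit about having more than 100,000 rows, the block endpoints can also be found without comparing against the whole vector inside a loop. Here is a sketch using findInterval(), which assumes the distance column is sorted in ascending order (as in the example):
# for each row, index of the last row whose distance is within 100000 of it
upper <- findInterval(df$distance + 100000, df$distance)
means <- vapply(seq_len(nrow(df)), function(i) mean(df$r2[i:upper[i]]), numeric(1))
data.frame(mean_r2 = means,
           rows = paste(seq_len(nrow(df)), "-", upper),
           diff_of_distance = df$distance[upper] - df$distance)
Note that findInterval() applies an "equal to or less than" cutoff (matching the question's wording), whereas the loop above uses a strict <; for the example data the results are identical.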
You can try:
# your data
d <- read.table(text="r2 distance
1 33.64 67866
2 8.50 77229
3 15.07 109119
4 24.35 142279
5 7.74 143393
6 8.21 177670
7 12.26 216440
8 12.66 253751
9 26.31 282556
10 39.08 320816", header=T)
library(tidyverse) #dplyr_0.7.2
d %>%
  mutate(index = 1:n()) %>% # add row index
  group_by(index) %>% # group by this index
  # calculate difference and find max row where diff < 100000
  mutate(max_row = max(which(.$distance - distance < 100000, arr.ind = T))) %>%
  # calculate mean
  mutate(mean_r2 = mean(.$r2[index:max_row])) %>%
  # calculate the difference
  mutate(diff_of_distance = .$distance[max_row] - .$distance[index]) %>%
  # unite the columns
  unite(rows, index, max_row, sep = "-")
# A tibble: 10 x 5
r2 distance rows mean_r2 diff_of_distance
* <dbl> <int> <chr> <dbl> <int>
1 33.64 67866 1-5 17.860000 75527
2 8.50 77229 2-5 13.915000 66164
3 15.07 109119 3-6 13.842500 68551
4 24.35 142279 4-7 13.140000 74161
5 7.74 143393 5-7 9.403333 73047
6 8.21 177670 6-8 11.043333 76081
7 12.26 216440 7-9 17.076667 66116
8 12.66 253751 8-10 26.016667 67065
9 26.31 282556 9-10 32.695000 38260
10 39.08 320816 10-10 39.080000 0
This works because group_by() subsets the data frame, so inside mutate() you can access each group's own distance value and compute the difference against the complete vector: .$distance accesses the complete column regardless of the group_by().

Incremental change from predictor variable in R

Sample Data
A B C D E
1 2016 94.49433733 2 81.28
5 2016 95.38104534 4 139.6944
7 2016 95.43885385 1 69.11
8 2016 94.91936704 1 7.23
9 2016 95.21859776 3 152.31
10 2016 95.15797535 1 86.32
11 2016 95.1830432 2 38.24
13 2016 94.01256633 2 33.3
Given the sample data and using R, I want to build a sequence that gives me the incremental impact from my predictor variable (C).
Expected Table (increment by 0.05):
C ANSWER
85
85.05
85.1
85.15
85.2
85.25
85.3
85.35
I am looking to understand, for every incremental change (increase) in C, what happens to D.
Here is what I tried with transform and tapply:
transform(df, volumen=unlist(tapply(C, D, function(x) c(0, diff(x)))))
You can fit a linear model and evaluate it over a sequence of C values:
fit <- lm(D ~ C, data = my_sample_data) #Fits a linear model
my_sequence <- seq(from = 85, to = 85.35, by = 0.05 ) # first column
result <- fit$coefficients[1] + my_sequence * fit$coefficients[2] #2nd column
df <- data.frame(C = my_sequence, ANSWER = result) #Makes a table
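Equivalently, predict() evaluates the fitted model over the new C values without handling the coefficients by hand (a sketch using the fit and my_sequence defined above):
result <- predict(fit, newdata = data.frame(C = my_sequence)) # same as intercept + slope * my_sequence
df <- data.frame(C = my_sequence, ANSWER = result)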

When a variable switches from 1 to 2, delete some data from the other variables and average what's left?

I am analysing some data and need help.
Basically, I have a dataset that looks like this:
date <- seq(as.Date("2017-04-01"),as.Date("2017-05-09"),length.out=40)
switch <- c(rep(1:2,each=10),rep(1:2,each=10))
O2 <- runif(40,min=21.02,max=21.06)
CO2 <- runif(40,min=0.076,max=0.080)
test.data <- data.frame(date,switch,O2,CO2)
As can be seen, there's a switch column that alternates between 1 and 2 every 10 data points. I want to write code that does the following: whenever the switch column changes its value (from 1 to 2, or 2 to 1), delete the first 5 rows of data after the switch (i.e. keep the last 5 data points for all 4 variables), average the remaining O2 and CO2 values, and put them in 2 new columns (avg.O2 and avg.CO2) before the next switch. Then repeat this process until the end.
It's quite easy to do manually on paper or excel, but my real dataset would comprise thousands of data points and I would like to use R to do it automatically for me. So anyone has any ideas that could help me?
Please find my edits, which should work for both regular and irregular switching:
date <- seq(as.Date("2017-04-01"),as.Date("2017-05-09"),length.out=40)
switch <- c(rep(1:2,each=10),rep(1:2,each=10))
O2 <- runif(40,min=21.02,max=21.06)
CO2 <- runif(40,min=0.076,max=0.080)
test.data <- data.frame(date,switch,O2,CO2)
CleanMachineData <- function(Data, SwitchData, UnreliableRows = 5){
# First, we can properly turn your switch column into a grouping column (1,2,1,2)->(1,2,3,4)
grouplength <- rle(Data[,"switch"])$lengths
# mapply lets us input vector arguments into typically one/first-element only argument functions.
# In this case we create a sequence of lengths (output is a list/vector)
grouping <- mapply(seq, grouplength)
# Here we want it to become a single vector representing groups
groups <- mapply(rep, 1:length(grouplength), each = grouplength)
# if frequency was irregular, it will be a list, if regular it will be a matrix
# convert either into a vector by doing as follows:
if (is.list(groups)) {
groups <- unlist(groups)
} else {
groups <- as.vector(groups)
}
Data$group <- groups
#
# vector of the first row of each new switch (except the starting 0)
switchRow <- c(0,which(abs(diff(SwitchData)) == 1))+1
# I use "as.vector" to turn the matrix output of mapply into a sequence of numbers.
# "ToRemove" will have all the row numbers to get rid of from your original data, except for what happens before (in this case) row 10
ToRemove <- c(1:UnreliableRows, as.vector(mapply(seq, switchRow, switchRow+(UnreliableRows)-1)))
# Concatenate the first UnreliableRows rows (1:5 here) with ToRemove using c(), then drop them with "-"
Keep <- seq(nrow(Data))[-c(1:UnreliableRows, ToRemove)] # row numbers that survive (not used below; newdat is built directly from ToRemove)
# Create the new data, (in case you don't know: data[<ROW>,<COLUMN>])
newdat <- Data[-ToRemove,]
# print the results
newdat
}
dat <- CleanMachineData(test.data, test.data$switch, 5)
dat
date switch O2 CO2 group
6 2017-04-05 1 21.03922 0.07648886 1
7 2017-04-06 1 21.04071 0.07747368 1
8 2017-04-07 1 21.05742 0.07946615 1
9 2017-04-08 1 21.04673 0.07782362 1
10 2017-04-09 1 21.04966 0.07936446 1
16 2017-04-15 2 21.02526 0.07833825 2
17 2017-04-16 2 21.04511 0.07747774 2
18 2017-04-17 2 21.03165 0.07662803 2
19 2017-04-18 2 21.03252 0.07960098 2
20 2017-04-19 2 21.04032 0.07892145 2
26 2017-04-25 1 21.03691 0.07691438 3
27 2017-04-26 1 21.05846 0.07857017 3
28 2017-04-27 1 21.04128 0.07891908 3
29 2017-04-28 1 21.03837 0.07817021 3
30 2017-04-29 1 21.02334 0.07917546 3
36 2017-05-05 2 21.02890 0.07723042 4
37 2017-05-06 2 21.04606 0.07979641 4
38 2017-05-07 2 21.03822 0.07985775 4
39 2017-05-08 2 21.04136 0.07781525 4
40 2017-05-09 2 21.05375 0.07941123 4
aggregate(cbind(O2,CO2) ~ group, dat, mean)
group O2 CO2
1 1 21.04675 0.07812336
2 2 21.03497 0.07819329
3 3 21.03967 0.07834986
4 4 21.04166 0.07882221
# crazier, irregular switching
test.data2 <- test.data
test.data2$switch <- unlist(mapply(rep, 1:2, times = 1, each = c(10,8,10,5,3,10)))[1:20]
dat2 <- CleanMachineData(test.data2, test.data2$switch, 5)
dat2
date switch O2 CO2 group
6 2017-04-05 1 21.03922 0.07648886 1
7 2017-04-06 1 21.04071 0.07747368 1
8 2017-04-07 1 21.05742 0.07946615 1
9 2017-04-08 1 21.04673 0.07782362 1
10 2017-04-09 1 21.04966 0.07936446 1
16 2017-04-15 2 21.02526 0.07833825 2
17 2017-04-16 2 21.04511 0.07747774 2
18 2017-04-17 2 21.03165 0.07662803 2
24 2017-04-23 1 21.05658 0.07669662 3
25 2017-04-24 1 21.04452 0.07983165 3
26 2017-04-25 1 21.03691 0.07691438 3
27 2017-04-26 1 21.05846 0.07857017 3
28 2017-04-27 1 21.04128 0.07891908 3
29 2017-04-28 1 21.03837 0.07817021 3
30 2017-04-29 1 21.02334 0.07917546 3
36 2017-05-05 2 21.02890 0.07723042 4
37 2017-05-06 2 21.04606 0.07979641 4
38 2017-05-07 2 21.03822 0.07985775 4
# You can try a range of values for UnreliableRows with the following
lapply(5:7, function(x) {
dat <- CleanMachineData(test.data2, test.data2$switch, x)
list(data = dat, means = aggregate(cbind(O2,CO2)~group, dat, mean))
})
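If you also want those group means attached back onto the kept rows as the avg.O2 / avg.CO2 columns the question asks for, here is one sketch using the dat and aggregate() call from above:
means <- aggregate(cbind(O2, CO2) ~ group, dat, mean)   # per-group means
names(means)[2:3] <- c("avg.O2", "avg.CO2")             # rename to the requested column names
merge(dat, means, by = "group")                         # attach them to every kept row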
Use
test.data[rep(c(FALSE, TRUE), each=5),]
to always select the last five rows from each group of 10 rows.
Then you can use aggregate:
d2 <- test.data[rep(c(FALSE, TRUE), each=5),]
aggregate(cbind(O2, CO2) ~ 1, data=d2, FUN=mean)
If you want the average for every 5-rows-group:
aggregate(cbind(O2, CO2) ~ gl(k=5, n=nrow(d2)/5L), data=d2, FUN=mean)
Here is a generalization for an arbitrary number of rows in test.data:
stay <- rep(c(FALSE, TRUE), each=5, length.out=nrow(test.data))
d2 <- test.data[stay,]
group <- gl(k=5, n=nrow(d2)/5L+1L, length=nrow(d2))
aggregate(cbind(O2, CO2) ~ group, data=d2, FUN=mean)
Here is a variant for mixing the data with the averages:
group <- gl(k=10, n=nrow(test.data)/10L+1L, length=nrow(test.data))
L <- split(test.data, group)
mySummary <- function(x) {
if (nrow(x) <= 5) return(NULL)
x <- x[-(1:5),]
d.avg <- aggregate(cbind(O2, CO2) ~ 1, data=x, FUN=mean)
rbind(x, cbind(date=NA, switch=-1, d.avg))
}
lapply(L, mySummary) # as list of dataframes
do.call(rbind, lapply(L, mySummary)) # as one dataframe
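A dplyr sketch of the same idea, not part of the original answers: build a run id from the switch column, drop the first 5 rows of each run, and summarise. This handles runs of any length, not just blocks of 10:
library(dplyr)
test.data %>%
  mutate(group = cumsum(switch != lag(switch, default = first(switch))) + 1) %>% # run id
  group_by(group) %>%
  filter(row_number() > 5) %>%                      # drop the first 5 rows after each switch
  summarise(avg.O2 = mean(O2), avg.CO2 = mean(CO2))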

Selecting the top n rows for each unique value of a column in a data frame in R

I have a data frame with 3 columns: a, b, c. There are multiple rows for each unique value of column a, and I want to select the top 5 rows for each unique value of column a. Column c holds the values, and the data frame is already sorted by it in descending order, so that is not a problem. Can anyone suggest how I can do this in R?
Stealing @ptocquin's example, here's how you can use the base function by. You can flatten the result using do.call (see below).
> by(data = data, INDICES = data$a, FUN = function(x) head(x, 5))
# or by(data = data, INDICES = data$a, FUN = head, 5)
data$a: 1
a b c
21 1 0.1188552 1.6389895
41 1 1.0182033 1.4811359
61 1 -0.8795879 0.7784072
81 1 0.6485745 0.7734652
31 1 1.5102255 0.7107957
------------------------------------------------------------
data$a: 2
a b c
15 2 -1.09704040 1.1710693
85 2 0.42914795 0.8826820
65 2 -1.01480957 0.6736782
45 2 -0.07982711 0.3693384
35 2 -0.67643885 -0.2170767
------------------------------------------------------------
A similar thing could be achieved by splitting your data.frame based on a and then using lapply to step through each element, subsetting the first n rows.
split.data <- split(data, data$a)
subsetted.data <- lapply(split.data, FUN = function(x) head(x, 5)) # or ..., FUN = head, 5) like above
flatten.data <- do.call("rbind", subsetted.data)
head(flatten.data)
a b c
1.21 1 0.11885516 1.63898947
1.41 1 1.01820329 1.48113594
1.61 1 -0.87958790 0.77840718
1.81 1 0.64857445 0.77346517
1.31 1 1.51022545 0.71079568
2.15 2 -1.09704040 1.17106930
2.85 2 0.42914795 0.88268205
2.65 2 -1.01480957 0.67367823
2.45 2 -0.07982711 0.36933837
2.35 2 -0.67643885 -0.21707668
Here is my try :
library(plyr)
data <- data.frame(a=rep(sample(1:20,10),10),b=rnorm(100),c=rnorm(100))
data <- data[rev(order(data$c)),]
head(data, 15)
a b c
28 6 1.69611039 1.720081
91 11 1.62656460 1.651574
70 9 -1.17808386 1.641954
6 15 1.23420550 1.603140
23 7 0.70854914 1.588352
51 11 -1.41234359 1.540738
19 10 2.83730734 1.522825
49 10 0.39313579 1.370831
80 9 -0.59445323 1.327825
59 10 -0.55538404 1.214901
18 6 0.08445888 1.152266
86 15 0.53027267 1.066034
69 10 -1.89077464 1.037447
62 1 -0.43599566 1.026505
3 7 0.78544009 1.014770
result <- ddply(data, .(a), "head", 5)
head(result, 15)
a b c
1 1 -0.43599566 1.02650544
2 1 -1.55113486 0.36380251
3 1 0.68608364 0.30911430
4 1 -0.85406406 0.05555500
5 1 -1.83894595 -0.11850847
6 5 -1.79715809 0.77760033
7 5 0.82814909 0.22401278
8 5 -1.52726859 0.06745849
9 5 0.51655092 -0.02737905
10 5 -0.44004646 -0.28106808
11 6 1.69611039 1.72008079
12 6 0.08445888 1.15226601
13 6 -1.99465060 0.82214319
14 6 0.43855489 0.76221979
15 6 -2.15251353 0.64417757
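A dplyr sketch, not part of the original answers: since the data frame is already sorted by c in descending order, grouping by a and taking the first 5 rows per group gives the same result (using the data object from the answers above):
library(dplyr)
data %>%
  group_by(a) %>%
  slice(1:5) %>%   # first 5 rows of each group; out-of-range indices are ignored for smaller groups
  ungroup()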
