I have data that looks like this:
library(dplyr)
Data <- tibble(
  ID = c("Code001", "Code001", "Code001", "Code002", "Code002", "Code002",
         "Code002", "Code002", "Code003", "Code003", "Code003", "Code003"),
  Value = c(107, 107, 107, 346, 346, 346, 346, 346, 123, 123, 123, 123)
)
I need to work out the average value per group, per row. However, the values need to be rounded (no decimal places), and within each group the rounded values must still sum to the group's Value.
So solutions like this won't work:
Data %>%
  add_count(ID) %>%
  group_by(ID) %>%
  mutate(Prop_Value_1 = Value / n,
         Prop_Value_2 = round(Value / n))
Is there a solution that can produce an output like this:
Data %>%
  mutate(Prop_Value = c(35, 36, 36, 69, 69, 69, 69, 70, 30, 31, 31, 31))
You can use ceiling() and then row_number() to get there:
Data %>%
  group_by(ID) %>%
  mutate(count = n(),
         ceil_avg = ceiling(Value / count)) %>%
  mutate(sum_ceil_avg = sum(ceil_avg),
         diff_sum = sum_ceil_avg - Value,  # units over-allocated by ceiling
         rn = row_number()) %>%
  mutate(new_avg = ifelse(rn <= diff_sum,
                          ceil_avg - 1,    # take one unit back from the first diff_sum rows
                          ceil_avg))
# A tibble: 12 × 8
# Groups: ID [3]
ID Value count ceil_avg sum_ceil_avg diff_sum rn new_avg
<chr> <dbl> <int> <dbl> <dbl> <dbl> <int> <dbl>
1 Code001 107 3 36 108 1 1 35
2 Code001 107 3 36 108 1 2 36
3 Code001 107 3 36 108 1 3 36
4 Code002 346 5 70 350 4 1 69
5 Code002 346 5 70 350 4 2 69
6 Code002 346 5 70 350 4 3 69
7 Code002 346 5 70 350 4 4 69
8 Code002 346 5 70 350 4 5 70
9 Code003 123 4 31 124 1 1 30
10 Code003 123 4 31 124 1 2 31
11 Code003 123 4 31 124 1 3 31
12 Code003 123 4 31 124 1 4 31
Another solution is to use integer division:
Data %>%
  group_by(ID) %>%
  mutate(Prop_Value = ifelse(row_number() <= Value %% n(),
                             Value %/% n() + 1,  # the first (Value mod n) rows absorb the remainder
                             Value %/% n()))
# A tibble: 12 × 3
# Groups: ID [3]
ID Value Prop_Value
<chr> <dbl> <dbl>
1 Code001 107 36
2 Code001 107 36
3 Code001 107 35
4 Code002 346 70
5 Code002 346 69
6 Code002 346 69
7 Code002 346 69
8 Code002 346 69
9 Code003 123 31
10 Code003 123 31
11 Code003 123 31
12 Code003 123 30
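As a quick sanity check (a sketch reusing the pipeline above), you can confirm that each group's rounded values sum back to its Value:
Data %>%
  group_by(ID) %>%
  mutate(Prop_Value = ifelse(row_number() <= Value %% n(),
                             Value %/% n() + 1,
                             Value %/% n())) %>%
  # compare the allocated total with the group's target Value
  summarise(target = first(Value), allocated = sum(Prop_Value))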
I have two data frames: one contains names and ranges of limits (only a few hundred rows, 1,000 at most), which need to be assigned to a "measurements" data frame that can consist of millions of rows (or tens of millions).
Currently I am doing a left_join and then filtering on value to assign a specific limit to each measurement. This, however, is quite inefficient and costs a lot of resources. For larger data frames, the code cannot even run.
Any ideas for a more efficient solution would be helpful.
library(dplyr)
## this one has only a few hundred rows
df_limits <- read.table(text="Title station_id limit_from limit_to
Level_3_Low 1 0 70
Level_2_Low 1 70 90
Level_1_Low 1 90 100
Optimal 1 100 110
Level_1_High 1 110 130
Level_2_High 1 130 150
Level_3_High 1 150 180
Level_3_Low 2 0 70
Level_2_Low 2 70 90
Level_1_Low 2 90 100
Optimal 2 100 110
Level_1_High 2 110 130
Level_2_High 2 130 150
Level_3_High 2 150 180
Level_3_Low 3 0 70
Level_2_Low 3 70 90
Level_1_Low 3 90 100
Optimal 3 100 110
Level_1_High 3 110 130
Level_2_High 3 130 150
Level_3_High 3 150 180
",header = TRUE, stringsAsFactors = TRUE)
# this DF has millions of rows
df_measurements <- read.table(text="measurement_id station_id value
12121534 1 172
12121618 1 87
12121703 1 9
12121709 2 80
12121760 2 80
12121813 2 115
12121881 3 67
12121907 3 100
12121920 3 108
12121979 1 102
12121995 1 53
12122022 1 77
12122065 2 158
12122107 2 144
12122113 2 5
12122135 3 100
12122187 3 136
12122267 3 130
12122359 1 105
12122366 1 126
12122398 1 143
",header = TRUE, stringsAsFactors = TRUE)
df_results <- left_join(df_measurements, df_limits, by = "station_id") %>%
  filter((value >= limit_from & value < limit_to) | is.na(Title)) %>%
  select(names(df_measurements), Title)
Another data.table solution, using non-equi joins:
library(data.table)
setDT(df_measurements)
setDT(df_limits)
df_limits[df_measurements, .(station_id, measurement_id, value, Title),
          on = .(station_id = station_id, limit_from < value, limit_to >= value)]
station_id measurement_id value Title
1: 1 12121534 172 Level_3_High
2: 1 12121618 87 Level_2_Low
3: 1 12121703 9 Level_3_Low
4: 2 12121709 80 Level_2_Low
5: 2 12121760 80 Level_2_Low
6: 2 12121813 115 Level_1_High
7: 3 12121881 67 Level_3_Low
8: 3 12121907 100 Level_1_Low
9: 3 12121920 108 Optimal
10: 1 12121979 102 Optimal
11: 1 12121995 53 Level_3_Low
12: 1 12122022 77 Level_2_Low
13: 2 12122065 158 Level_3_High
14: 2 12122107 144 Level_2_High
15: 2 12122113 5 Level_3_Low
16: 3 12122135 100 Level_1_Low
17: 3 12122187 136 Level_2_High
18: 3 12122267 130 Level_1_High
19: 1 12122359 105 Optimal
20: 1 12122366 126 Level_1_High
21: 1 12122398 143 Level_2_High
A simple base R option (no additional packages needed) using subset + merge. Note that merge() first pairs every measurement with every limit row of its station before subset() filters, so it can be memory-hungry on very large tables:
subset(
merge(
df_measurements,
df_limits,
all = TRUE
),
limit_from < value & limit_to >= value
)
gives
station_id measurement_id value Title limit_from limit_to
7 1 12121534 172 Level_3_High 150 180
9 1 12121618 87 Level_2_Low 70 90
15 1 12121703 9 Level_3_Low 0 70
23 1 12122022 77 Level_2_Low 70 90
34 1 12122398 143 Level_2_High 130 150
39 1 12121979 102 Optimal 100 110
43 1 12121995 53 Level_3_Low 0 70
54 1 12122366 126 Level_1_High 110 130
60 1 12122359 105 Optimal 100 110
65 2 12121760 80 Level_2_Low 70 90
75 2 12121813 115 Level_1_High 110 130
79 2 12121709 80 Level_2_Low 70 90
91 2 12122065 158 Level_3_High 150 180
97 2 12122107 144 Level_2_High 130 150
99 2 12122113 5 Level_3_Low 0 70
108 3 12121907 100 Level_1_Low 90 100
116 3 12121920 108 Optimal 100 110
124 3 12122267 130 Level_1_High 110 130
127 3 12121881 67 Level_3_Low 0 70
136 3 12122135 100 Level_1_Low 90 100
146 3 12122187 136 Level_2_High 130 150
Another option is using dplyr together with findInterval():
df_measurements %>%
group_by(station_id) %>%
mutate(Title = with(
df_limits,
Title[
findInterval(
value,
unique(unlist(cbind(limit_from, limit_to)[station_id == first(.$station_id)])),
left.open = TRUE
)
]
)) %>%
ungroup()
which gives
# A tibble: 21 x 4
measurement_id station_id value Title
<int> <int> <int> <fct>
1 12121534 1 172 Level_3_High
2 12121618 1 87 Level_2_Low
3 12121703 1 9 Level_3_Low
4 12121709 2 80 Level_2_Low
5 12121760 2 80 Level_2_Low
6 12121813 2 115 Level_1_High
7 12121881 3 67 Level_3_Low
8 12121907 3 100 Level_1_Low
9 12121920 3 108 Optimal
10 12121979 1 102 Optimal
# ... with 11 more rows
Benchmarking
f_TIC1 <- function() {
subset(
merge(
df_measurements,
df_limits,
all = TRUE
),
limit_from < value & limit_to >= value
)
}
f_TIC2 <- function() {
df_measurements %>%
group_by(station_id) %>%
mutate(Title = with(
df_limits,
Title[
findInterval(
value,
unique(unlist(cbind(limit_from, limit_to)[station_id == first(station_id)])),
left.open = TRUE
)
]
)) %>%
ungroup()
}
dt_limits <- as.data.table(df_limits)
dt_measurements <- as.data.table(df_measurements)
f_Waldi <- function() {
dt_limits[
dt_measurements,
.(station_id, measurement_id, value, Title),
on = .(station_id, limit_from < value, limit_to >= value)
]
}
f_TimTeaFan <- function() {
  setkey(dt_limits, station_id, limit_from, limit_to)
  foverlaps(dt_measurements[, value2 := value],
            dt_limits,
            by.x = c("station_id", "value", "value2"),
            type = "within")[
    value < limit_to,
    .(measurement_id, station_id, value, Title)
  ]
}
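The relative timings below presumably come from a microbenchmark call along these lines (a sketch; the exact invocation was not shown):
library(microbenchmark)
# 100 iterations each; results scaled relative to the fastest expression
microbenchmark(f_TIC1(), f_TIC2(), f_Waldi(), f_TimTeaFan(),
               times = 100, unit = "relative")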
you will see that:
Unit: relative
expr min lq mean median uq max neval
f_TIC1() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 100
f_TIC2() 4.848639 4.909985 4.895588 4.942616 5.124704 2.580819 100
f_Waldi() 3.182027 3.010615 3.069916 3.114160 3.397845 1.698386 100
f_TimTeaFan() 5.523778 5.112872 5.226145 5.112407 5.745671 2.446987 100
Here is one way to do it. The problematic part was the condition value < limit_to: foverlaps checks for value <= limit_to, which produces double matches, so here we apply the filter condition after the overlap join and then select the desired columns. Note that the result is not in the same order as the df_results generated with dplyr.
library(data.table)
dt_limits <- as.data.table(df_limits)
dt_measurements <- as.data.table(df_measurements)
setkey(dt_limits, station_id, limit_from, limit_to)
dt_results <- foverlaps(dt_measurements[, value2 := value],
                        dt_limits,
                        by.x = c("station_id", "value", "value2"),
                        type = "within")[value < limit_to,
                                         .(measurement_id, station_id, value, Title)]
dt_results[]
#> measurement_id station_id value Title
#> 1: 12121534 1 172 Level_3_High
#> 2: 12121618 1 87 Level_2_Low
#> 3: 12121703 1 9 Level_3_Low
#> 4: 12121709 2 80 Level_2_Low
#> 5: 12121760 2 80 Level_2_Low
#> 6: 12121813 2 115 Level_1_High
#> 7: 12121881 3 67 Level_3_Low
#> 8: 12121907 3 100 Optimal
#> 9: 12121920 3 108 Optimal
#> 10: 12121979 1 102 Optimal
#> 11: 12121995 1 53 Level_3_Low
#> 12: 12122022 1 77 Level_2_Low
#> 13: 12122065 2 158 Level_3_High
#> 14: 12122107 2 144 Level_2_High
#> 15: 12122113 2 5 Level_3_Low
#> 16: 12122135 3 100 Optimal
#> 17: 12122187 3 136 Level_2_High
#> 18: 12122267 3 130 Level_2_High
#> 19: 12122359 1 105 Optimal
#> 20: 12122366 1 126 Level_1_High
#> 21: 12122398 1 143 Level_2_High
#> measurement_id station_id value Title
Created on 2021-08-09 by the reprex package (v0.3.0)
I have a data frame df:
library(tidyverse)
t <- c(103,104,108,120,127,129,140,142,150,151,160,177,178,183,186,187,191,194,198,199)
w <- c(1,1,1,-1,-1,-1,-1,-1,1,1,-1,-1,1,1,1,-1,1,1,-1,-1)
df <- data_frame(t, w)
> dput(df)
structure(list(t = c(103, 104, 108, 120, 127, 129, 140, 142,
150, 151, 160, 177, 178, 183, 186, 187, 191, 194, 198, 199),
w = c(1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, 1, 1, 1,
-1, 1, 1, -1, -1)), .Names = c("t", "w"), row.names = c(NA,
-20L), class = c("tbl_df", "tbl", "data.frame"))
> df
# A tibble: 20 x 2
t w
<dbl> <dbl>
1 103 1.00
2 104 1.00
3 108 1.00
4 120 -1.00
5 127 -1.00
6 129 -1.00
7 140 -1.00
8 142 -1.00
9 150 1.00
10 151 1.00
11 160 -1.00
12 177 -1.00
13 178 1.00
14 183 1.00
15 186 1.00
16 187 -1.00
17 191 1.00
18 194 1.00
19 198 -1.00
20 199 -1.00
Now, if the value in w is larger than zero, find the nearest previous negative w and assign the difference between the corresponding t values to a new column d; otherwise, d is equal to zero. That is, the desired output should look like this:
t w d
103 1.00 NA (there is no previous w < 0)
104 1.00 NA (there is no previous w < 0)
108 1.00 NA (there is no previous w < 0)
120 -1.00 0
127 -1.00 0
129 -1.00 0
140 -1.00 0
142 -1.00 0
150 1.00 8 = 150 - 142
151 1.00 9 = 151 - 142
160 -1.00 0
177 -1.00 0
178 1.00 1 = 178 - 177
183 1.00 6 = 183 - 177
186 1.00 9 = 186 - 177
187 -1.00 0
191 1.00 4 = 191 - 187
194 1.00 7 = 194 - 187
198 -1.00 0
199 -1.00 0
(The NAs above might be zero as well.)
Since yesterday I have been trying to attack this problem using findInterval(), which(), etc., but without success. Another way I was thinking about is to somehow introduce a variable shift into the lag() function...
Ideally, I would like to have a tidyverse-like solution.
Any help would be very much appreciated.
Thank you in advance!
Using data.table (since tidyverse currently has no non-equi joins):
library(data.table)
DT = data.table(df)
DT[, v := 0]
DT[w > 0, v :=
     DT[w < 0][.SD, on = .(t < t), mult = "last", i.t - x.t]
]
t w v
1: 103 1 NA
2: 104 1 NA
3: 108 1 NA
4: 120 -1 0
5: 127 -1 0
6: 129 -1 0
7: 140 -1 0
8: 142 -1 0
9: 150 1 8
10: 151 1 9
11: 160 -1 0
12: 177 -1 0
13: 178 1 1
14: 183 1 6
15: 186 1 9
16: 187 -1 0
17: 191 1 4
18: 194 1 7
19: 198 -1 0
20: 199 -1 0
It initializes the new column to 0, then replaces it on the subset of rows where w > 0. The replacement joins the subset of the data where w > 0 (available as .SD) against the part of the table where w < 0, DT[w < 0]. The join syntax is x[i, on=, j], where in this case...
x = DT[w < 0]
i = .SD = DT[w > 0]
The join uses each row of i to look up rows in x based on the rules in on=. When multiple matches are found, we take only the last (mult = "last").
j is what we use the join to do; here it calculates the difference between two columns. To disambiguate columns from the two tables, we use the prefixes x.* and i.*.
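A minimal standalone illustration of this kind of "last match" non-equi join, on toy tables that are not from the question:
library(data.table)
x <- data.table(t = c(10, 20, 30))  # plays the role of DT[w < 0]
i <- data.table(t = c(25, 35))      # plays the role of .SD (the w > 0 rows)
# for each row of i, match the rows of x with x.t < i.t and keep only the last:
# 25 matches 10 and 20 (keep 20), 35 matches all three (keep 30)
x[i, on = .(t < t), mult = "last", .(i.t, x.t, diff = i.t - x.t)]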
Using cummax. I'm not sure if this generalizes, but it works for the example:
DT[, v := t - cummax(t*(w < 0))]
DT[cumsum(w < 0) == 0, v := NA]
I guess this requires that the t column is sorted in increasing order.
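To see why it works, here is a minimal trace of the trick on toy values (with t increasing):
t <- c(103, 120, 150)
w <- c(1, -1, 1)
t * (w < 0)              # 0 120 0   : keep t only where w is negative
cummax(t * (w < 0))      # 0 120 120 : carry the last negative-w t forward
t - cummax(t * (w < 0))  # 103 0 30  : row 1 predates any negative w, hence the NA fix-up above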
A tidyverse way:
First, make an intermediate column (t2) that is NA where w is positive and equal to t where w is negative:
df <- mutate(df, t2 = case_when(w > 0 ~ as.numeric(NA), TRUE ~ t))
# fill the NAs in t2 so that, for each row, t2 is the value of t when w was last negative
df <- fill(df, t2)
#> df
# A tibble: 20 x 3
# t w t2
# <dbl> <dbl> <dbl>
# 1 103 1 NA
# 2 104 1 NA
# 3 108 1 NA
# 4 120 -1 120
# 5 127 -1 127
# 6 129 -1 129
# 7 140 -1 140
# 8 142 -1 142
# 9 150 1 142
#10 151 1 142
#11 160 -1 160
#12 177 -1 177
#13 178 1 177
#14 183 1 177
#15 186 1 177
#16 187 -1 187
#17 191 1 187
#18 194 1 187
#19 198 -1 198
#20 199 -1 199
Then subtract t2 from t
df$d <- with(df, t - t2)
#> df
# A tibble: 20 x 4
# t w t2 d
# <dbl> <dbl> <dbl> <dbl>
# 1 103 1 NA NA
# 2 104 1 NA NA
# 3 108 1 NA NA
# 4 120 -1 120 0
# 5 127 -1 127 0
# 6 129 -1 129 0
# 7 140 -1 140 0
# 8 142 -1 142 0
# 9 150 1 142 8
#10 151 1 142 9
#11 160 -1 160 0
#12 177 -1 177 0
#13 178 1 177 1
#14 183 1 177 6
#15 186 1 177 9
#16 187 -1 187 0
#17 191 1 187 4
#18 194 1 187 7
#19 198 -1 198 0
#20 199 -1 199 0
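The same steps also work as a single pipe (a sketch; if_else() with NA_real_ stands in for the case_when()):
df %>%
  mutate(t2 = if_else(w > 0, NA_real_, t)) %>%
  fill(t2) %>%               # carry the last negative-w t forward
  mutate(d = t - t2)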
I'm new to R so this question might be quite basic.
There is a column in my data which goes like 4 4 4 4 7 7 7 13 13 13 13 13 13 13 4 4 7 7 7 13 13 13 13 13 13 13 13 4 4.....
One cycle of 4...7...13... is considered one complete run, and I want to assign a Run Number (1, 2, 3, ...) to each run.
The number of times that each value (4, 7, 13) repeats is not fixed, and the total number of rows in a run is not fixed either. The total number of runs is unknown (but typically ranging from 60-90). The order of (4, 7, 13) is fixed.
I have attached my current code here. It works fine, but it takes a minute or two when there are a few million rows of data. I'm aware that growing vectors in a for loop is really not recommended in R, so I would like to ask if anyone has a more elegant solution to this.
Sample data can be generated with the code below; the desired output (shown further down) is what my current solution produces.
# Generates sample data
df <- data.frame(Temp = c(sample(50:250, 30)),
                 Pres = c(sample(500:1000, 30)),
                 Message = c(rep(4, 3), rep(7, 2), rep(13, 6),
                             rep(4, 4), rep(7, 1), rep(13, 7),
                             rep(4, 3), rep(7, 4)))
Current Solution
prev_val <- 0
Rcount <- 1
Run_Count <- c()
for (val in df$Message) {
  delta <- prev_val - val
  if (delta == 9)  # a 13 -> 4 transition (13 - 4 = 9) starts a new run
    Rcount <- Rcount + 1
  prev_val <- val
  Run_Count <- append(Run_Count, Rcount)  # grows the vector each iteration: slow
}
df$Run <- Run_Count
The desired output (columns: Temp, Pres, Message, Run):
226 704 4 1
138 709 4 1
136 684 4 1
57 817 7 1
187 927 7 1
190 780 13 1
152 825 13 1
126 766 13 1
202 855 13 1
214 757 13 1
172 922 13 1
50 975 4 2
159 712 4 2
212 802 4 2
181 777 4 2
102 933 7 2
165 753 13 2
67 962 13 2
119 631 13 2
The data frame will later be split by the Run Number, but after being categorized according to the value, i.e. (a sketch of this split follows the illustration):
... 4 1
... 4 1
... 4 1
... 4 1
... 4 2
... 4 2
... 4 2
... 4 3
.....
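For reference, that categorize-then-split step could be sketched like this, assuming a Run column as produced by the answers below:
# one piece per (Message value, Run number) combination
pieces <- split(df, list(df$Message, df$Run), drop = TRUE)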
I am not sure if this is an improvement, but it uses the rle() run-length encoding function to determine the length of each repeat in each run.
df <- data.frame(Temp = c(sample(50:250, 30)), Pres = c(sample(500:1000, 30)),
Message = c(rep(4, 3), rep(7, 2), rep(13, 6), rep(4, 4), rep(7, 1), rep(13, 7), rep(4, 3), rep(7, 4)))
rleout <- rle(df$Message)
# find the number of runs and create the numbering
runcounts <- ceiling(length(rleout$lengths) / 3)
runs <- rep(1:runcounts, each = 3)
# need to trim the run numbers for cases where there is not a
# full sequence, as in the test case
rleout$values <- runs[1:length(rleout$lengths)]
# create the new column
df$out <- inverse.rle(rleout)
I'm sure someone can come along and demonstrate a better and faster method using data.table.
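For what it's worth, a minimal data.table sketch of that idea (assuming, as stated in the question, that Message only ever decreases at a run boundary):
library(data.table)
setDT(df)
# a new run starts wherever Message drops (the 13 -> 4 transition)
df[, Run := cumsum(c(TRUE, diff(Message) < 0))]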
You can easily use:
df$runID <- cumsum(c(-1,diff(df$Message)) < 0)
# Temp Pres Message runID
# 1 174 910 4 1
# 2 181 612 4 1
# 3 208 645 4 1
# 4 89 601 7 1
# 5 172 812 7 1
# 6 213 672 13 1
# 7 137 848 13 1
# 8 153 833 13 1
# 9 127 591 13 1
# 10 243 907 13 1
# 11 146 599 13 1
# 12 151 567 4 2
# 13 139 855 4 2
# 14 147 793 4 2
# 15 227 533 4 2
# 16 241 959 7 2
# 17 206 948 13 2
# 18 236 875 13 2
# 19 133 537 13 2
# 20 70 688 13 2
# 21 218 528 13 2
# 22 244 927 13 2
# 23 161 697 13 2
# 24 177 572 4 3
# 25 179 911 4 3
# 26 192 559 4 3
# 27 60 771 7 3
# 28 245 682 7 3
# 29 196 614 7 3
# 30 171 536 7 3
I have a data frame like this:
df <- data.frame(x = c(7, 5, 4), y = c(100, 100, 100), w = c(170, 170, 170), z = c(132, 720, 1256))
I create a new column using mapply:
set.seed(123)
library(truncnorm)
df$res <- mapply(rtruncnorm, df$x, df$y, df$w, df$z, 25)
So, I got:
> df
#x y w z res
#1 7 100 170 132 117.9881, 126.2456, 133.7627, 135.2322, 143.5229, 100.3735, 114.8287
#2 5 100 170 720 168.8581, 169.4955, 169.6461, 169.8998, 169.0343
#3 4 100 170 1256 169.7245, 167.6744, 169.7025, 169.4441
#dput(df)
df <- structure(list(x = c(7, 5, 4), y = c(100, 100, 100), w = c(170,
170, 170), z = c(132, 720, 1256), res = list(c(117.988108836195,
126.245562762918, 133.762709785614, 135.232193379024, 143.52290514973,
100.373469134837, 114.828678702662), c(168.858147661715, 169.495493758985,
169.646123183828, 169.899849943838, 169.034333943479), c(169.724470294466,
167.674371713068, 169.70250974042, 169.444134892323))), .Names = c("x",
"y", "w", "z", "res"), row.names = c(NA, -3L), class = "data.frame")
But what I really need is to repeat each row of the df data frame according to the length of its df$res entry, as follows:
> df2
# x y w z res
#1 7 100 170 132 117.9881
#2 7 100 170 132 126.2456
#3 7 100 170 132 133.7627
#4 7 100 170 132 135.2322
#5 7 100 170 132 143.5229
#6 7 100 170 132 100.3735
#7 7 100 170 132 114.8287
#8 5 100 170 720 168.8581
#9 5 100 170 720 169.4955
#10 5 100 170 720 169.6461
#11 5 100 170 720 169.8998
#12 5 100 170 720 169.0343
#13 4 100 170 1256 169.7245
#14 4 100 170 1256 167.6744
#15 4 100 170 1256 169.7025
#16 4 100 170 1256 169.4441
How do I achieve this efficiently? I need to apply this to a big data frame.
df <- data.frame(x=c(7,5,4),y=c(100,100,100),w=c(170,170,170),z=c(132,720,1256))
set.seed(123)
l <- mapply(rtruncnorm, df$x, df$y, df$w, df$z, 25)

# repeat each row once per generated value, then bind the flattened results
cbind.data.frame(df[rep(seq_along(l), lengths(l)), ],
                 res = unlist(l))
# x y w z res
# 1 7 100 170 132 117.9881
# 1.1 7 100 170 132 126.2456
# 1.2 7 100 170 132 133.7627
# 1.3 7 100 170 132 135.2322
# 1.4 7 100 170 132 143.5229
# 1.5 7 100 170 132 100.3735
# 1.6 7 100 170 132 114.8287
# 2 5 100 170 720 168.8581
# 2.1 5 100 170 720 169.4955
# 2.2 5 100 170 720 169.6461
# 2.3 5 100 170 720 169.8998
# 2.4 5 100 170 720 169.0343
# 3 4 100 170 1256 169.7245
# 3.1 4 100 170 1256 167.6744
# 3.2 4 100 170 1256 169.7025
# 3.3 4 100 170 1256 169.4441
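For completeness, tidyr can do the same expansion of a list column (a sketch; it assumes df$res holds the list created by the mapply call in the question):
library(tidyr)
unnest(df, cols = res)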
Try this based on your given df (note that going through paste0()/strsplit() coerces all columns to character):
df$res <- sapply(df$res, paste0, collapse=",")
do.call(rbind, apply(df, 1, function(x) do.call(expand.grid, strsplit(x, ","))))
# x y w z res
# 1 7 100 170 132 117.988108836195
# 2 7 100 170 132 126.245562762918
# 3 7 100 170 132 133.762709785614
# 4 7 100 170 132 135.232193379024
# 5 7 100 170 132 143.52290514973
# 6 7 100 170 132 100.373469134837
# 7 7 100 170 132 114.828678702662
# 8 5 100 170 720 168.858147661715
# 9 5 100 170 720 169.495493758985
# 10 5 100 170 720 169.646123183828
# 11 5 100 170 720 169.899849943838
# 12 5 100 170 720 169.034333943479
# 13 4 100 170 1256 169.724470294466
# 14 4 100 170 1256 167.674371713068
# 15 4 100 170 1256 169.70250974042
# 16 4 100 170 1256 169.444134892323