Is there a way to replace NAs in R using horizontal order?

I have the following data frame:
df <-structure(list(time = c("12:00:00", "12:05:00", "12:10:00", "12:15:00",
"12:20:00", "12:25:00", "12:30:00", "12:35:00", "12:40:00", "12:45:00",
"12:50:00", "12:55:00", "13:00:00", "13:05:00", "13:10:00", "13:15:00",
"13:20:00", "13:25:00"), speedA = c(60L, 75L, 65L, 45L, 12L,
15L, 20L, 45L, 65L, 60L, 60L, 30L, 35L, 45L, 25L, 15L, 10L, 5L
), speedB = c(50L, 30L, NA, 40L, NA, NA, 18L, NA, NA, NA, 15L,
10L, 25L, NA, NA, 12L, NA, NA), speedC = c(60L, 25L, NA, NA,
30L, 15L, 50L, 60L, NA, 35L, 34L, NA, 15L, 64L, 10L, 7L, 60L,
60L), speedD = c(NA, 10L, 60L, NA, 50L, 55L, 45L, 35L, NA, NA,
45L, 60L, 35L, 34L, 36L, 39L, 48L, 47L)), class = "data.frame", row.names = c(NA,
-18L))
I want to replace each NA by linearly interpolating across the values in the same row, i.e. horizontally across the speed columns.
The expected result:
df2<- structure(list(time = c("12:00:00", "12:05:00", "12:10:00", "12:15:00",
"12:20:00", "12:25:00", "12:30:00", "12:35:00", "12:40:00", "12:45:00",
"12:50:00", "12:55:00", "13:00:00", "13:05:00", "13:10:00", "13:15:00",
"13:20:00", "13:25:00"), speedA = c(60L, 75L, 65L, 45L, 12L,
15L, 20L, 45L, 65L, 60L, 60L, 30L, 35L, 45L, 25L, 15L, 10L, 5L
), speedB = c(50, 30, 63.33333, 40, 21, 15, 18, 52.5, 65, 47.5,
15, 10, 25, 54.5, 17.5, 12, 35, 32.5), speedC = c(60, 25, 61.66667,
40, 30, 15, 50, 60, 65, 35, 34, 35, 15, 64, 10, 7, 60, 60), speedD = c(60L,
10L, 60L, 40L, 50L, 55L, 45L, 35L, 65L, 35L, 45L, 60L, 35L, 34L,
36L, 39L, 48L, 47L)), class = "data.frame", row.names = c(NA,
-18L))
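For example, in the 12:10:00 row the only non-NA speeds are speedA = 65 and speedD = 60; treating the four speed columns as equally spaced positions 1-4, linear interpolation gives speedB ≈ 63.33 and speedC ≈ 61.67. A quick check with base R's approx() (a small sketch, not part of the question):
approx(x = c(1, 4), y = c(65, 60), xout = 1:4)$y
# 65.00000 63.33333 61.66667 60.00000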

We can use zoo::na.approx to interpolate the values. For values that cannot be interpolated (trailing NAs in a row) we use tidyr::fill to carry the last available value forward.
library(dplyr)
library(tidyr)

df %>%
  pivot_longer(cols = -time) %>%
  group_by(time) %>%
  mutate(value = zoo::na.approx(value, na.rm = FALSE)) %>%
  fill(value) %>%
  pivot_wider()
# time speedA speedB speedC speedD
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 12:00:00 60 50 60 60
# 2 12:05:00 75 30 25 10
# 3 12:10:00 65 63.333 61.667 60
# 4 12:15:00 45 40 40 40
# 5 12:20:00 12 21 30 50
# 6 12:25:00 15 15 15 55
# 7 12:30:00 20 18 50 45
# 8 12:35:00 45 52.5 60 35
# 9 12:40:00 65 65 65 65
#10 12:45:00 60 47.5 35 35
#11 12:50:00 60 15 34 45
#12 12:55:00 30 10 35 60
#13 13:00:00 35 25 15 35
#14 13:05:00 45 54.5 64 34
#15 13:10:00 25 17.5 10 36
#16 13:15:00 15 12 7 39
#17 13:20:00 10 35 60 48
#18 13:25:00 5 32.5 60 47

You can use zoo::na.approx() row-wise with c_across().
library(dplyr)
library(tidyr)
library(zoo)
df %>%
  rowwise() %>%
  mutate(speed = list(na.locf(na.approx(c_across(-time), na.rm = FALSE))), .keep = "unused") %>%
  unnest_wider(speed, names_sep = "")
# # A tibble: 18 x 5
# time speed1 speed2 speed3 speed4
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 12:00:00 60 50 60 60
# 2 12:05:00 75 30 25 10
# 3 12:10:00 65 63.3 61.7 60
# 4 12:15:00 45 40 40 40
# 5 12:20:00 12 21 30 50
# 6 12:25:00 15 15 15 55
# 7 12:30:00 20 18 50 45
# 8 12:35:00 45 52.5 60 35
# 9 12:40:00 65 65 65 65
# 10 12:45:00 60 47.5 35 35
# 11 12:50:00 60 15 34 45
# 12 12:55:00 30 10 35 60
# 13 13:00:00 35 25 15 35
# 14 13:05:00 45 54.5 64 34
# 15 13:10:00 25 17.5 10 36
# 16 13:15:00 15 12 7 39
# 17 13:20:00 10 35 60 48
# 18 13:25:00 5 32.5 60 47
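For completeness, a base R sketch of the same row-wise idea with apply() (not from the original answers; it assumes the zoo package and that the leading speedA column is never NA):
library(zoo)
# Interpolate each row across the speed columns, then carry the last value forward for trailing NAs
df[-1] <- t(apply(df[-1], 1, function(x) na.locf(na.approx(x, na.rm = FALSE), na.rm = FALSE)))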

Related

How to subtract the specified percentage between observations to perform complex arithmetic operations in R

I have 2 datasets
d1=structure(list(mdm = c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L), perc = c(50L, 60L, 70L, 80L, 85L, 90L, 95L,
96L, 97L, 98L, 99L, 100L, 101L, 102L, 103L, 104L, 105L, 110L,
115L, 120L, 130L, 140L, 150L, 50L, 60L, 70L, 80L, 85L, 90L, 95L,
96L, 97L, 98L, 99L, 100L, 101L, 102L, 103L, 104L, 105L, 110L,
115L, 120L, 130L, 140L, 150L), price = c(38.9, 46.68, 54.46,
62.24, 66.13, 70.02, 73.91, 74.688, 75.466, 76.244, 77.022, 77.8,
78.578, 79.356, 80.134, 80.912, 81.69, 85.58, 89.47, 93.36, 101.14,
108.92, 116.7, 38.9, 46.68, 54.46, 62.24, 66.13, 70.02, 73.91,
74.688, 75.466, 76.244, 77.022, 77.8, 78.578, 79.356, 80.134,
80.912, 81.69, 85.58, 89.47, 93.36, 101.14, 108.92, 116.7), count = c(175,
160, 145, 130, 122.5, 115, 107.5, 106, 104.5, 103, 101.5, 100,
98.5, 97, 95.5, 94, 92.5, 85, 77.5, 70, 55, 40, 25, 175, 160,
145, 130, 122.5, 115, 107.5, 106, 104.5, 103, 101.5, 100, 98.5,
97, 95.5, 94, 92.5, 85, 77.5, 70, 55, 40, 25), profit = c(2607.5,
3628.8, 4416.7, 4971.2, 5160.925, 5292.3, 5365.325, 5372.928,
5378.197, 5381.132, 5381.733, 5380, 5375.933, 5369.532, 5360.797,
5349.728, 5336.325, 5234.3, 5073.925, 4855.2, 4242.7, 3396.8,
2317.5, 2432.5, 3468.8, 4271.7, 4841.2, 5038.425, 5177.3, 5257.825,
5266.928, 5273.697, 5278.132, 5280.233, 5280, 5277.433, 5272.532,
5265.297, 5255.728, 5243.825, 5149.3, 4996.425, 4785.2, 4187.7,
3356.8, 2292.5)), class = "data.frame", row.names = c(NA, -46L
))
The second dataset gives the percentage values by which the price and count in d1 should be reduced or increased; d2 also contains the cost column:
d2=structure(list(mdm = 7:8, elast = c(1.5, 1.5), cost = 24:25), class = "data.frame", row.names = c(NA,
-2L))
I'm having some trouble with this arithmetic and need help; I'll try to describe the question in more detail.
For each mdm group, the row where perc = 100 is the starting point, and the percentages in the perc column are applied to its price. (100 is always the start value.)
For example, for mdm = 7 the row with perc = 100 has price = 77.8.
The next perc value is 99, i.e. 1 less, so I subtract 1 percent from 77.8 and get 77.022; perc = 85 means subtracting 15% from the starting 77.8, which gives 66.13; perc = 50 means subtracting 50 percent from the starting point.
In the same way I add percentages upwards: perc = 101 means 1 percent above the starting price of 77.8, i.e. 78.578. This is how the price column is formed, and so on.
Further, the row with perc = 100 also has a value in the count column; in this example it happens to be 100 as well (but this is not always the case).
For the count column I take the elast value for each mdm group from the d2 dataset and multiply it by the difference between 100 and perc. For example, where perc = 99 for mdm = 7, elast = 1.5 is multiplied by 1, giving 100 + 1*1.5 = 101.5; where perc = 70, the difference is 100 - 70 = 30, so 30*1.5 = 45 and 100 + 45 = 145, and so on.
The last step uses the cost column of d2 for each mdm. From the already formed price column we subtract the cost value; for example, for mdm = 7, cost = 24, so 38.9 - 24 = 14.9, and this is multiplied by the value in the count column, here 175. This creates a new column profit = 14.9*175 = 2607.5.
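To make these rules concrete, a quick sanity check in R for mdm = 7, perc = 50 (base price 77.8, base count 100, elast 1.5, cost 24; all numbers taken from the data above):
price  <- 77.8 * 50 / 100         # 38.9
count  <- 100 + (100 - 50) * 1.5  # 175
profit <- (price - 24) * count    # (38.9 - 24) * 175 = 2607.5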
In this reproducible example the price, count and profit columns are already filled in by me as a sample, so d1 above is in fact the desired output. The initial data looks like this:
mdm perc price count
1 7 50 NA NA
2 7 60 NA NA
3 7 70 NA NA
4 7 80 NA NA
5 7 85 NA NA
6 7 90 NA NA
7 7 95 NA NA
8 7 96 NA NA
9 7 97 NA NA
10 7 98 NA NA
11 7 99 NA NA
**12 7 100 77.8 100**
13 7 101 NA NA
14 7 102 NA NA
15 7 103 NA NA
16 7 104 NA NA
17 7 105 NA NA
18 7 110 NA NA
19 7 115 NA NA
20 7 120 NA NA
21 7 130 NA NA
22 7 140 NA NA
23 7 150 NA NA
24 8 50 NA NA
25 8 60 NA NA
26 8 70 NA NA
27 8 80 NA NA
28 8 85 NA NA
29 8 90 NA NA
30 8 95 NA NA
31 8 96 NA NA
32 8 97 NA NA
33 8 98 NA NA
34 8 99 NA NA
**35 8 100 77.8 100**
36 8 101 NA NA
37 8 102 NA NA
38 8 103 NA NA
39 8 104 NA NA
40 8 105 NA NA
41 8 110 NA NA
42 8 115 NA NA
43 8 120 NA NA
44 8 130 NA NA
45 8 140 NA NA
46 8 150 NA NA
Thanks for any valuable help.
Using data.table:
library(data.table)
setDT(d1)[d2, c("price", "count", "cost") :=
            .((price[perc == 100]/100)*perc,
              count[perc == 100] + (elast * count[perc == 100] - perc),
              i.cost), on = .(mdm)]
d1[, last_step := (price - cost) * count]
Output:
> head(d1)
mdm perc price count profit cost last_step
1: 7 50 38.90 200 2607.500 24 2980.00
2: 7 60 46.68 190 3628.800 24 4309.20
3: 7 70 54.46 180 4416.700 24 5482.80
4: 7 80 62.24 170 4971.200 24 6500.80
5: 7 85 66.13 165 5160.925 24 6951.45
6: 7 90 70.02 160 5292.300 24 7363.20
Here is one way:
First we join both data frames, then we translate the rules exactly as you describe them into code (your detailed description makes that straightforward :-).
The trickiest part is fixing the price value at 100%, which here is price[perc == 100]. The rest follows directly from your explanation:
library(dplyr)
d1 %>%
  left_join(d2, by = "mdm") %>%
  group_by(mdm) %>%
  mutate(price = (price[perc == 100]/100) * perc,
         count = count[perc == 100] + (100 - perc) * elast,
         last_step = (price - cost) * count)
mdm perc price count elast cost last_step
<int> <int> <dbl> <dbl> <dbl> <int> <dbl>
1 7 50 38.9 175 1.5 24 2607.
2 7 60 46.7 160 1.5 24 3629.
3 7 70 54.5 145 1.5 24 4417.
4 7 80 62.2 130 1.5 24 4971.
5 7 85 66.1 122. 1.5 24 5161.
6 7 90 70.0 115 1.5 24 5292.
7 7 95 73.9 108. 1.5 24 5365.
8 7 96 74.7 106 1.5 24 5373.
9 7 97 75.5 104. 1.5 24 5378.
10 7 98 76.2 103 1.5 24 5381.
# … with 36 more rows
# ℹ Use `print(n = ...)` to see more rows
You should be able to produce d1 from the original frame as follows:
d1 %>%
  group_by(mdm) %>%
  mutate(price = price[perc == 100] * (1 - (100 - perc)/100)) %>%
  ungroup() %>%
  inner_join(d2, by = "mdm") %>%
  mutate(count = count[perc == 100] + (100 - perc) * elast,
         profit = count * (price - cost)) %>%
  select(-c(elast, cost))
Input:
d1 = structure(list(mdm = c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L), perc = c(50L, 60L, 70L, 80L, 85L, 90L, 95L,
96L, 97L, 98L, 99L, 100L, 101L, 102L, 103L, 104L, 105L, 110L,
115L, 120L, 130L, 140L, 150L, 50L, 60L, 70L, 80L, 85L, 90L, 95L,
96L, 97L, 98L, 99L, 100L, 101L, 102L, 103L, 104L, 105L, 110L,
115L, 120L, 130L, 140L, 150L), price = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 77.8, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 77.8,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), count = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 100, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 100, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA,
-46L))
Output:
mdm perc price count profit
1 7 50 38.900 175.0 2607.500
2 7 60 46.680 160.0 3628.800
3 7 70 54.460 145.0 4416.700
4 7 80 62.240 130.0 4971.200
5 7 85 66.130 122.5 5160.925
6 7 90 70.020 115.0 5292.300
7 7 95 73.910 107.5 5365.325
8 7 96 74.688 106.0 5372.928
9 7 97 75.466 104.5 5378.197
10 7 98 76.244 103.0 5381.132
11 7 99 77.022 101.5 5381.733
12 7 100 77.800 100.0 5380.000
13 7 101 78.578 98.5 5375.933
14 7 102 79.356 97.0 5369.532
15 7 103 80.134 95.5 5360.797
16 7 104 80.912 94.0 5349.728
17 7 105 81.690 92.5 5336.325
18 7 110 85.580 85.0 5234.300
19 7 115 89.470 77.5 5073.925
20 7 120 93.360 70.0 4855.200
21 7 130 101.140 55.0 4242.700
22 7 140 108.920 40.0 3396.800
23 7 150 116.700 25.0 2317.500
24 8 50 38.900 175.0 2432.500
25 8 60 46.680 160.0 3468.800
26 8 70 54.460 145.0 4271.700
27 8 80 62.240 130.0 4841.200
28 8 85 66.130 122.5 5038.425
29 8 90 70.020 115.0 5177.300
30 8 95 73.910 107.5 5257.825
31 8 96 74.688 106.0 5266.928
32 8 97 75.466 104.5 5273.697
33 8 98 76.244 103.0 5278.132
34 8 99 77.022 101.5 5280.233
35 8 100 77.800 100.0 5280.000
36 8 101 78.578 98.5 5277.433
37 8 102 79.356 97.0 5272.532
38 8 103 80.134 95.5 5265.297
39 8 104 80.912 94.0 5255.728
40 8 105 81.690 92.5 5243.825
41 8 110 85.580 85.0 5149.300
42 8 115 89.470 77.5 4996.425
43 8 120 93.360 70.0 4785.200
44 8 130 101.140 55.0 4187.700
45 8 140 108.920 40.0 3356.800
46 8 150 116.700 25.0 2292.500

Merge 2 data frames using common date, plus 2 rows before and n-1 rows after

So I need to merge 2 data frames:
The first data frame contains dates in YYYY-mm-dd format and event lengths:
datetime length
2003-06-03 1
2003-06-07 1
2003-06-13 1
2003-06-17 3
2003-06-28 5
2003-07-10 1
2003-07-23 1
...
The second data frame contains dates in the same format and discharge data:
datetime q
2003-05-29 36.2
2003-05-30 34.6
2003-05-31 33.1
2003-06-01 30.7
2003-06-02 30.0
2003-06-03 153.0
2003-06-04 69.0
...
The second data frame is much larger.
I want to merge/join only the following rows of the second data frame to the first:
all rows that have the same date as the first frame (I know this can be done with left_join(df1, df2, by = c("datetime"))),
two rows before that row, and
n-1 rows after that row, where n is the "length" value of that row in the first data frame.
I would like to identify the rows belonging to the same event as well.
Ideally I would have the following output (notice the event from 2003-06-17):
EventDatesNancy length q event#
2003-06-03 1 153.0 1
2003-06-07 1 120.0 2
2003-06-13 1 45.3 3
2003-06-15 na 110.0 4
2003-06-16 na 53.1 4
2003-06-17 3 78.0 4
2003-06-18 na 167.0 4
2003-06-19 na 145.0 4
...
I hope this makes clear what I am trying to do.
This might be one approach using tidyverse and fuzzyjoin.
First, indicate event numbers in your first data.frame. Add two columns for the start and end dates (the start date is 2 days before the event date, and the end date is length - 1 days after it).
Then, you can use fuzzy_inner_join to get the selected rows from the second data.frame. Here, you want the rows whose datetime in the second data.frame falls on or after the start date and on or before the end date from the first data.frame.
library(tidyverse)
library(fuzzyjoin)

df1$event <- seq_len(nrow(df1))
df1$start_date <- df1$datetime - 2
df1$end_date <- df1$datetime + df1$length - 1

fuzzy_inner_join(
  df1,
  df2,
  by = c("start_date" = "datetime", "end_date" = "datetime"),
  match_fun = c(`<=`, `>=`)
) %>%
  select(datetime.y, length, q, event)
I tried this out with some made up data:
R> df1
datetime length
1 2003-06-03 1
2 2003-06-12 1
3 2003-06-21 1
4 2003-06-30 3
5 2003-07-09 5
6 2003-07-18 1
7 2003-07-27 1
8 2003-08-05 2
9 2003-08-14 1
10 2003-08-23 1
11 2003-09-01 3
R> df2
datetime q
1 2003-06-03 44
2 2003-06-04 52
3 2003-06-05 34
4 2003-06-06 20
5 2003-06-07 57
6 2003-06-08 67
7 2003-06-09 63
8 2003-06-10 51
9 2003-06-11 56
10 2003-06-12 37
11 2003-06-13 16
12 2003-06-14 54
13 2003-06-15 46
14 2003-06-16 6
15 2003-06-17 32
16 2003-06-18 91
17 2003-06-19 61
18 2003-06-20 42
19 2003-06-21 28
20 2003-06-22 98
21 2003-06-23 77
22 2003-06-24 81
23 2003-06-25 13
24 2003-06-26 15
25 2003-06-27 73
26 2003-06-28 38
27 2003-06-29 27
28 2003-06-30 49
29 2003-07-01 10
30 2003-07-02 89
31 2003-07-03 9
32 2003-07-04 80
33 2003-07-05 68
34 2003-07-06 26
35 2003-07-07 31
36 2003-07-08 29
37 2003-07-09 84
38 2003-07-10 60
39 2003-07-11 19
40 2003-07-12 97
41 2003-07-13 35
42 2003-07-14 47
43 2003-07-15 70
This will give the following output:
datetime.y length q event
1 2003-06-03 1 44 1
2 2003-06-10 1 51 2
3 2003-06-11 1 56 2
4 2003-06-12 1 37 2
5 2003-06-19 1 61 3
6 2003-06-20 1 42 3
7 2003-06-21 1 28 3
8 2003-06-28 3 38 4
9 2003-06-29 3 27 4
10 2003-06-30 3 49 4
11 2003-07-01 3 10 4
12 2003-07-02 3 89 4
13 2003-07-07 5 31 5
14 2003-07-08 5 29 5
15 2003-07-09 5 84 5
16 2003-07-10 5 60 5
17 2003-07-11 5 19 5
18 2003-07-12 5 97 5
19 2003-07-13 5 35 5
If the output desired is different than above, please let me know what should be different so that I can correct it.
Data
df1 <- structure(list(datetime = structure(c(12206, 12215, 12224, 12233,
12242, 12251, 12260, 12269, 12278, 12287, 12296), class = "Date"),
length = c(1, 1, 1, 3, 5, 1, 1, 2, 1, 1, 3), event = 1:11,
start_date = structure(c(12204, 12213, 12222, 12231, 12240,
12249, 12258, 12267, 12276, 12285, 12294), class = "Date"),
end_date = structure(c(12206, 12215, 12224, 12235, 12246,
12251, 12260, 12270, 12278, 12287, 12298), class = "Date")), row.names = c(NA,
-11L), class = "data.frame")
df2 <- structure(list(datetime = structure(c(12206, 12207, 12208, 12209,
12210, 12211, 12212, 12213, 12214, 12215, 12216, 12217, 12218,
12219, 12220, 12221, 12222, 12223, 12224, 12225, 12226, 12227,
12228, 12229, 12230, 12231, 12232, 12233, 12234, 12235, 12236,
12237, 12238, 12239, 12240, 12241, 12242, 12243, 12244, 12245,
12246, 12247, 12248), class = "Date"), q = c(44L, 52L, 34L, 20L,
57L, 67L, 63L, 51L, 56L, 37L, 16L, 54L, 46L, 6L, 32L, 91L, 61L,
42L, 28L, 98L, 77L, 81L, 13L, 15L, 73L, 38L, 27L, 49L, 10L, 89L,
9L, 80L, 68L, 26L, 31L, 29L, 84L, 60L, 19L, 97L, 35L, 47L, 70L
)), class = "data.frame", row.names = c(NA, -43L))
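If you are on dplyr >= 1.1.0, a non-equi join with join_by() is an alternative to fuzzyjoin for the same window condition (a hedged sketch, not part of the original answer; it reuses the start_date/end_date columns created above):
library(dplyr)
inner_join(df1, df2,
           by = join_by(start_date <= datetime, end_date >= datetime)) %>%
  select(datetime = datetime.y, length, q, event)
Because both frames have a datetime column, the retained copies are suffixed datetime.x/datetime.y, hence the rename in select().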

How to apply function to specific columns based upon column name?

I am working with a wide data set resembling the following:
I am looking to write a function that I can iterate over sets of columns that share a name pattern but differ in their numeric suffix. For the sake of simplicity, the function itself will just take the mean of two columns.
avg <- function(data, scorecol, distcol) {
  ScoreDistanceAvg <- (scorecol + distcol)/2
  data$ScoreDistanceAvg <- ScoreDistanceAvg
  return(data)
}
avg(data = dat, scorecol = dat$ScoreGame0, distcol = dat$DistanceGame0)
How can I apply the new function to sets of columns with repeated names but different numbers? That is, how could I create a column that takes the mean of ScoreGame0 and DistanceGame0, then create a column that takes the mean of ScoreGame5 and DistanceGame5, and so on? This would be the final output:
Of course, I could just run the function multiple times, but since my full data set is much larger, how could I automate this process? I imagine it involves apply, but I'm not sure how to use apply with a repeated pattern like that. Additionally, I imagine it may involve rewriting the function to better automate the naming of columns.
Data:
structure(list(Player = c("Lebron James", "Lebron James", "Lebron James",
"Lebron James", "Lebron James", "Lebron James", "Lebron James",
"Lebron James", "Lebron James", "Lebron James", "Lebron James",
"Lebron James", "Steph Curry", "Steph Curry", "Steph Curry",
"Steph Curry", "Steph Curry", "Steph Curry", "Steph Curry", "Steph Curry",
"Steph Curry", "Steph Curry", "Steph Curry", "Steph Curry"),
Game = c(0L, 1L, 2L, 3L, 4L, 5L, 0L, 1L, 2L, 3L, 4L, 5L,
0L, 1L, 2L, 3L, 4L, 5L, 0L, 1L, 2L, 3L, 4L, 5L), ScoreGame0 = c(32L,
32L, 32L, 32L, 32L, 32L, 44L, 44L, 44L, 44L, 44L, 44L, 45L,
45L, 45L, 45L, 45L, 45L, 76L, 76L, 76L, 76L, 76L, 76L), ScoreGame5 = c(27L,
27L, 27L, 27L, 27L, 27L, 12L, 12L, 12L, 12L, 12L, 12L, 76L,
76L, 76L, 76L, 76L, 76L, 32L, 32L, 32L, 32L, 32L, 32L), DistanceGame0 = c(12L,
12L, 12L, 12L, 12L, 12L, 79L, 79L, 79L, 79L, 79L, 79L, 18L,
18L, 18L, 18L, 18L, 18L, 88L, 88L, 88L, 88L, 88L, 88L), DistanceGame5 = c(13L,
13L, 13L, 13L, 13L, 13L, 34L, 34L, 34L, 34L, 34L, 34L, 42L,
42L, 42L, 42L, 42L, 42L, 54L, 54L, 54L, 54L, 54L, 54L)), class = "data.frame", row.names = c(NA,
-24L))
Rewrite your function slightly and use it in mapply, grep-ing over the column names. Wrapping the grep() results in sort() makes this even safer, since it pairs the two column sets in the same suffix order.
avg <- function(scorecol, distcol) {
  (scorecol + distcol)/2
}
mapply(avg,
       dat[sort(grep('ScoreGame', names(dat)))],
       dat[sort(grep('DistanceGame', names(dat)))])
# ScoreGame0 ScoreGame5
# [1,] 22.0 20
# [2,] 22.0 20
# [3,] 22.0 20
# [4,] 22.0 20
# [5,] 22.0 20
# [6,] 22.0 20
# [7,] 61.5 23
# [8,] 61.5 23
# [9,] 61.5 23
# [10,] 61.5 23
# [11,] 61.5 23
# [12,] 61.5 23
# [13,] 31.5 59
# [14,] 31.5 59
# [15,] 31.5 59
# [16,] 31.5 59
# [17,] 31.5 59
# [18,] 31.5 59
# [19,] 82.0 43
# [20,] 82.0 43
# [21,] 82.0 43
# [22,] 82.0 43
# [23,] 82.0 43
# [24,] 82.0 43
To see what grep does try
grep('DistanceGame', names(dat), value=TRUE)
# [1] "DistanceGame0" "DistanceGame5"
In base R:
cols_used <- names(dat[, -(1:2)])
f <- sub("[^0-9]+", 'ScoreDistance', cols_used)
data.frame(lapply(split.default(dat[cols_used], f), rowMeans))
ScoreDistance0 ScoreDistance5
1 22.0 20
2 22.0 20
3 22.0 20
4 22.0 20
5 22.0 20
6 22.0 20
7 61.5 23
8 61.5 23
9 61.5 23
10 61.5 23
11 61.5 23
12 61.5 23
13 31.5 59
14 31.5 59
15 31.5 59
16 31.5 59
17 31.5 59
18 31.5 59
19 82.0 43
20 82.0 43
21 82.0 43
22 82.0 43
23 82.0 43
24 82.0 43
Using tidyverse:
Here's a solution with a for loop and readr:
library(readr)

game_num <- names(dat) |>
  readr::parse_number() |>
  na.omit()

for (i in unique(game_num)) {
  avg <- paste0("ScoreDistanceAvg", i)
  score <- paste0("ScoreGame", i)
  distance <- paste0("DistanceGame", i)
  dat[[avg]] <- (dat[[score]] + dat[[distance]])/2
}
Which gives:
Player Game ScoreGame0 ScoreGame5 DistanceGame0 DistanceGame5 ScoreDistanceAvg0 ScoreDistanceAvg5
1 Lebron James 0 32 27 12 13 22.0 20
2 Lebron James 1 32 27 12 13 22.0 20
3 Lebron James 2 32 27 12 13 22.0 20
4 Lebron James 3 32 27 12 13 22.0 20
5 Lebron James 4 32 27 12 13 22.0 20
6 Lebron James 5 32 27 12 13 22.0 20
7 Lebron James 0 44 12 79 34 61.5 23
8 Lebron James 1 44 12 79 34 61.5 23
9 Lebron James 2 44 12 79 34 61.5 23
10 Lebron James 3 44 12 79 34 61.5 23
11 Lebron James 4 44 12 79 34 61.5 23
12 Lebron James 5 44 12 79 34 61.5 23
13 Steph Curry 0 45 76 18 42 31.5 59
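A purrr-based sketch in the same spirit (not from the original answer; it reuses the parse_number() idea above to find the game-number suffixes):
library(dplyr)
library(purrr)
library(readr)
nums <- unique(na.omit(parse_number(names(dat))))
# Build one averaged column per suffix, named ScoreDistanceAvg<suffix>, then bind them to dat
avg_cols <- map_dfc(set_names(nums, paste0("ScoreDistanceAvg", nums)),
                    ~ (dat[[paste0("ScoreGame", .x)]] + dat[[paste0("DistanceGame", .x)]]) / 2)
dat <- bind_cols(dat, avg_cols)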

How to move selected matrix rows to top of matrix based on a selection vector of row names

I have a matrix that has been ordered by rowSums(). I now want to take a selected few of these rows, by passing a character vector of row names, and move them back to the top of the matrix while keeping the moved rows in the same order as they appear in the selection vector.
I've tried to do this with various combinations of subset() or just straight index selection, but I can never get the resulting matrix in the order I want, if it works at all. I feel like there has to be a more straightforward way to do this.
Let's say I have a matrix mat ordered by rowSums():
sam1 sam2 sam3 sam4 sam5
sig1 1 2 3 4 5
sig2 6 7 8 9 10
sig3 11 12 13 14 15
sig4a 16 17 18 19 20
sig4b 21 22 23 24 25
sig4c 26 27 28 29 30
sig5 31 32 33 34 35
sig6 36 37 38 39 40
sig7a 41 42 43 44 45
aig7b 46 47 48 49 50
And I want to take a select number of rows I'm interested in:
select = c('sig6','sig4a','sig2')
And move them back to the top of the matrix, keeping them in the order of the select vector and leaving the remaining unselected rows below them, to get a new matrix:
sam1 sam2 sam3 sam4 sam5
sig6 36 37 38 39 40 *
sig4a 16 17 18 19 20 *
sig2 6 7 8 9 10 *
sig1 1 2 3 4 5
sig3 11 12 13 14 15
sig4b 21 22 23 24 25
sig4c 26 27 28 29 30
sig5 31 32 33 34 35
sig7a 41 42 43 44 45
aig7b 46 47 48 49 50
Is there a straightforward way to do this that doesn't involve making intermediate matrices or complicated workarounds? It seems like there should be, but I haven't been able to find a solution. Maybe I am overlooking something.
An option is to index with the vector of selected row names first, followed by the remaining row names obtained with setdiff:
mat[c(select, setdiff(row.names(mat), select)),]
#. sam1 sam2 sam3 sam4 sam5
#sig6 36 37 38 39 40
#sig4a 16 17 18 19 20
#sig2 6 7 8 9 10
#sig1 1 2 3 4 5
#sig3 11 12 13 14 15
#sig4b 21 22 23 24 25
#sig4c 26 27 28 29 30
#sig5 31 32 33 34 35
#sig7a 41 42 43 44 45
#aig7b 46 47 48 49 50
data
mat <- structure(c(1L, 6L, 11L, 16L, 21L, 26L, 31L, 36L, 41L, 46L, 2L,
7L, 12L, 17L, 22L, 27L, 32L, 37L, 42L, 47L, 3L, 8L, 13L, 18L,
23L, 28L, 33L, 38L, 43L, 48L, 4L, 9L, 14L, 19L, 24L, 29L, 34L,
39L, 44L, 49L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L
), .Dim = c(10L, 5L), .Dimnames = list(c("sig1", "sig2", "sig3",
"sig4a", "sig4b", "sig4c", "sig5", "sig6", "sig7a", "aig7b"),
c("sam1", "sam2", "sam3", "sam4", "sam5")))

R programming - data frame manoeuvre

Suppose I have the following dataframe:
dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
4: 50 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
5: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
6: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
lc wc li yd yr nF factdcx
1: 1 3 TRUE 1 2010 2 24
2: 1 3 TRUE 1 2010 8 41
3: 2 3 TRUE 1 2010 0 48
4: 2 3 TRUE 1 2010 0 50
5: 2 3 TRUE 1 2010 0 52
6: 3 3 FALSE 1 2010 0 57
I'd like to turn it into a new dataframe like the following:
dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
lc wc li yd yr nF factdcx
1: 1 3 TRUE 1 2010 2 24
2: 1 3 TRUE 1 2010 8 41
3: 2 3 TRUE 1 2010 0 (sum of nF for 48 and 50, factdcx) 48
4: 2 3 TRUE 1 2010 0 52
5: 3 3 FALSE 1 2010 0 57
How can I do it? (The actual data frame, abc, is much larger, but the idea is to combine the categories 48 and 50 into a single new category, say '48', summing their nF values.)
Many thanks!
> dput(head(abc1))
structure(list(dc = c(24L, 41L, 48L, 50L, 52L, 57L), tmin = c(-1L,
-3L, 0L, 0L, 3L, -2L), tmax = c(4L, 5L, 5L, 5L, 5L, 5L), cint = c(5L,
8L, 5L, 5L, 2L, 7L), wcmin = c(-5L, -8L, -4L, -4L, -3L, -6L),
wcmax = c(-2L, -3L, 0L, 0L, 1L, -1L), wsmin = c(20L, 15L,
30L, 30L, 20L, 25L), wsmax = c(25L, 20L, 35L, 35L, 25L, 30L
), gsmin = c(35L, 35L, 45L, 45L, 35L, 35L), gsmax = c(40L,
40L, 50L, 50L, 40L, 40L), wd = c(90L, 90L, 45L, 45L, 45L,
315L), rmin = c(11.8, 10, 7.3, 7.3, 6.7, 4.4), rmax = c(26.6,
23.5, 19, 19, 17.4, 13.8), cir = c(14.8, 13.5, 11.7, 11.7,
10.7, 9.4), lr = c(3L, 3L, 6L, 6L, 6L, 7L), lc = c(1L, 1L,
2L, 2L, 2L, 3L), wc = c(3L, 3L, 3L, 3L, 3L, 3L), li = c(TRUE,
TRUE, TRUE, TRUE, TRUE, FALSE), yd = c(1L, 1L, 1L, 1L, 1L,
1L), yr = c(2010L, 2010L, 2010L, 2010L, 2010L, 2010L), nF = c(2L,
8L, 0L, 0L, 0L, 0L), factdcx = structure(1:6, .Label = c("24",
"41", "48", "50", "52", "57", "70"), class = "factor")), .Names = c("dc",
"tmin", "tmax", "cint", "wcmin", "wcmax", "wsmin", "wsmax", "gsmin",
"gsmax", "wd", "rmin", "rmax", "cir", "lr", "lc", "wc", "li",
"yd", "yr", "nF", "factdcx"), class = c("data.table", "data.frame"
), row.names = c(NA, -6L))
Still got a problem, sir/madam:
> head(abc1)   # with the updated abc1
dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
6: 70 -2 3 5 -4 -1 20 25 30 35 360 3.6 10.2 6.6 7
lc wc li yd yr nF factdcx
1: 1 3 TRUE 1 2010 2 24
2: 1 3 TRUE 1 2010 8 41
3: 2 3 TRUE 1 2010 57 48
4: 2 3 TRUE 1 2010 0 52
5: 3 3 FALSE 1 2010 0 57
6: 3 2 TRUE 1 2010 1 70
The sum of nF was incorrect, it should be zero.
Try
library(data.table)
unique(setDT(df1)[, factdcx := as.character(factdcx)][
  factdcx %chin% c('48', '50'),
  c('dc', 'factdcx', 'nF') := list('48', '48', sum(nF))])
# dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
#1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
#2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
#3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
#4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
#5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
# lc wc li yd yr nF factdcx
#1: 1 3 TRUE 1 2010 2 24
#2: 1 3 TRUE 1 2010 8 41
#3: 2 3 TRUE 1 2010 0 48
#4: 2 3 TRUE 1 2010 0 52
#5: 3 3 FALSE 1 2010 0 57
For abc1,
res1 <- unique(setDT(abc1)[, factdcx := as.character(factdcx)][
  factdcx %chin% c('48', '50'),
  c('dc', 'factdcx', 'nF') := list(48, '48', sum(nF))])
res1
# dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
#1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
#2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
#3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
#4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
#5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
# lc wc li yd yr nF factdcx
#1: 1 3 TRUE 1 2010 2 24
#2: 1 3 TRUE 1 2010 8 41
#3: 2 3 TRUE 1 2010 0 48
#4: 2 3 TRUE 1 2010 0 52
#5: 3 3 FALSE 1 2010 0 57
data
df1 <- structure(list(dc = structure(1:6, .Label = c("24", "41",
"48",
"50", "52", "57"), class = "factor"), tmin = c(-1L, -3L, 0L,
0L, 3L, -2L), tmax = c(4L, 5L, 5L, 5L, 5L, 5L), cint = c(5L,
8L, 5L, 5L, 2L, 7L), wcmin = c(-5L, -8L, -4L, -4L, -3L, -6L),
wcmax = c(-2L, -3L, 0L, 0L, 1L, -1L), wsmin = c(20L, 15L,
30L, 30L, 20L, 25L), wsmax = c(25L, 20L, 35L, 35L, 25L, 30L
), gsmin = c(35L, 35L, 45L, 45L, 35L, 35L), gsmax = c(40L,
40L, 50L, 50L, 40L, 40L), wd = c(90L, 90L, 45L, 45L, 45L,
315L), rmin = c(11.8, 10, 7.3, 7.3, 6.7, 4.4), rmax = c(26.6,
23.5, 19, 19, 17.4, 13.8), cir = c(14.8, 13.5, 11.7, 11.7,
10.7, 9.4), lr = c(3L, 3L, 6L, 6L, 6L, 7L), lc = c(1L, 1L,
2L, 2L, 2L, 3L), wc = c(3L, 3L, 3L, 3L, 3L, 3L), li = c(TRUE,
TRUE, TRUE, TRUE, TRUE, FALSE), yd = c(1L, 1L, 1L, 1L, 1L,
1L), yr = c(2010L, 2010L, 2010L, 2010L, 2010L, 2010L), nF = c(2L,
8L, 0L, 0L, 0L, 0L), factdcx = structure(1:6, .Label = c("24",
"41", "48", "50", "52", "57"), class = "factor")), .Names = c("dc",
"tmin", "tmax", "cint", "wcmin", "wcmax", "wsmin", "wsmax", "gsmin",
"gsmax", "wd", "rmin", "rmax", "cir", "lr", "lc", "wc", "li",
"yd", "yr", "nF", "factdcx"), row.names = c("1:", "2:", "3:",
"4:", "5:", "6:"), class = "data.frame")
