How to summarise all columns using group_by and summarise? (R)

I'm trying to tidy my daily activity data (accelerometer data). I would like to sum the repeated rows of each day for all columns. I have 32 rows (some are repeated) and 90 columns (data of one subject).
# Example of my data with 32 rows and 14 columns
df <- data.frame(LbNr = c(22002,22002,22002,22002,22002,22002,22002,22002,22002,22002,22002,22002,22002,22002,22002,22002,22002,22002,22002,22002,22002,22002,22002,22002,22002,22002,22002,22002,22002,22002,22002,22002),
Type = c("A2. Working" ,"A1. NonWorking" ,"A1. NonWorking" ,"A4. SleepWeek" ,"A1. NonWorking" ,"A2. Working" ,"A1. NonWorking" ,"A4. SleepWeek" ,"A4. SleepWeek" ,"A1. NonWorking" ,"A2. Working" ,"A1. NonWorking" ,"A1. NonWorking" ,"A4. SleepWeek" ,"A1. NonWorking" ,"A2. Working" ,"A1. NonWorking" ,"A4. SleepWeek" ,"A4. SleepWeek" ,"A1. NonWorking" ,"A2. Working" ,"A1. NonWorking" ,"A1. NonWorking" ,"C4. SleepWeekend" ,"C0. Leisure" ,"C0. Leisure" ,"C4. SleepWeekend" ,"C0. Leisure" ,"C4. SleepWeekend" ,"C4. SleepWeekend" ,"A1. NonWorking" ,"A2. Working"),
Weekday = c(1,1,2,2,2,2,2,2,3,3,3,3,4,4,4,4,4,4,5,5,5,5,6,6,6,7,7,7,7,1,1,1),
Time = c(0.66667,5.66667,0.35,6.15,1.5,9.83333,6.05,0.11667,6.83333,1.33333,9.83333,6,0.03333,7.2,6.43333,5,5.23333,0.1,6.41667,0.96667,11.01667,5.6,0.43333,7.9,15.66667,0.03333,7.91667,15.61667,0.43333,6.33333,0.66667,6.83333),
lie = c(0.00583,0.37778,0.03556,4.84389,0.05444,0.05972,0.67639,0.0125,5.68806,0.02333,0.65278,0.23889,0.00917,7.2,0.45472,0.38333,0.29694,0.08,5.48694,0.01889,0.01028,0.12139,0.01694,6.96028,0.24472,0.00333,6.93639,0.11833,0.41861,5.74889,0.00861,0.07333),
sit = c(0.31194,4.36167,0.14417,1.30611,0.45083,6.64111,4.14306,0.10417,1.14528,0.51167,5.79417,3.11833,0,0,2.23944,2.79722,3.66583,0.00472,0.92972,0.29917,6.76806,4.21056,0.30222,0.92194,9.77694,0.00417,0.91833,12.02972,0.01472,0.58444,0.15806,5.58694),
stand = c(0.13389,0.47111,0.09139,0,0.67278,1.63667,0.51806,0,0,0.46417,1.81917,1.57472,0.01889,0,1.88917,0.88639,0.63028,0.00667,0,0.3975,1.83417,0.72528,0.05889,0.00667,2.33944,0.01361,0.03639,1.78139,0,0,0.25472,0.41167),
move = c(0.09056,0.34444,0.05167,0,0.21611,0.59472,0.34306,0,0,0.21333,0.525,0.72806,0.00528,0,0.76583,0.39194,0.41861,0.00667,0,0.14056,1.04694,0.36944,0.03778,0.00806,2.44583,0.00944,0.02083,0.93083,0,0,0.15417,0.235),
walk = c(0.11528,0.10722,0.02722,0,0.10583,0.84194,0.35639,0,0,0.11694,1.00806,0.33167,0,0,1.04611,0.51389,0.20833,0,0,0.09333,1.28528,0.16083,0.0175,0.00306,0.79972,0.00278,0.00472,0.65306,0,0,0.08139,0.49528),
run = c(0,0.00111,0,0,0,0.00167,0.00194,0,0,0,0.00083,0.00083,0,0,0.00333,0.0025,0.00083,0,0,0.00139,0.00472,0,0,0,0.00194,0,0,0.08694,0,0,0,0.00111),
stairs = c(0.00917,0.00333,0,0,0,0.0575,0.01111,0,0,0.00389,0.03333,0.0075,0,0,0.03472,0.02472,0.00472,0.00194,0,0.00583,0.06722,0.0125,0,0,0.05806,0,0,0.01639,0,0,0.00417,0.03),
cycle = c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00778,0,0,0.01,0,0,0,0,0,0,0,0,0,0,0.00556,0),
WalkSlow = c(0.01222,0.02056,0.00389,0,0.03056,0.17417,0.03361,0,0,0.01889,0.35889,0.07778,0,0,0.07528,0.04222,0.03417,0,0,0.02444,0.13722,0.03361,0.00417,0,0.14,0,0.00056,0.08056,0,0,0.02278,0.08278),
WalkFast = c(0.10278,0.08639,0.02278,0,0.07417,0.66,0.32194,0,0,0.0975,0.64583,0.25139,0,0,0.97083,0.46861,0.17222,0,0,0.06861,1.14694,0.12667,0.01306,0.00278,0.65444,0.00194,0.0025,0.56944,0,0,0.0575,0.41))
I have tried a few approaches, but almost all of them failed. The code below is the best I could get, but it's very long. I'm wondering whether there is a shorter way to do it.
# LbNr = subjects' id
# Weekday = 1 Monday.... 7 Sunday
# Type = activities: A1. NonWorking, A2. Working, A4. SleepWeek, C0. Leisure, C4. SleepWeekend
# code
library(dplyr)

# In the full 90-column data the range is lie:IncTrunkWalk and there is also a row column;
# the example data only go up to WalkFast
df %>% select(LbNr, Type, Weekday, Time, lie:WalkFast) %>%
  group_by(LbNr, Type, Weekday) %>%
  summarise(n = n(), Time = sum(Time), lie = sum(lie), sit = sum(sit), stand = sum(stand),
            move = sum(move), walk = sum(walk), run = sum(run), stairs = sum(stairs),
            cycle = sum(cycle), WalkSlow = sum(WalkSlow),
            WalkFast = sum(WalkFast)) %>%
  arrange(Weekday) %>% filter(Weekday %in% 3:7)
I also ran into another problem with this code. On Saturday ("6"), because of how I concatenate the time, Saturday can pick up activities that started on Friday (see the example below), so "A1. NonWorking" or "A4. SleepWeek" rows sometimes appear there, depending on the volunteer. I would like those rows to be summed into "C0. Leisure". If possible, I would like to do it all in one pipeline.
# LbNr Type Weekday n Time lie sit
# <dbl> <fct> <dbl> <int> <dbl> <dbl> <dbl>
#8 22002 A2. Working 5 1 11.0 0.0103 6.77
#9 22002 A4. SleepWeek 5 1 6.42 5.49 0.930
#10 22002 A1. NonWorking 6 1 0.433 0.0169 0.302
#11 22002 C0. Leisure 6 1 15.7 0.245 9.78
#12 22002 C4. SleepWeekend 6 1 7.9 6.96 0.922
#13 22002 C0. Leisure 7 2 15.6 0.122 12.0
#I would like to get something like this.
# LbNr Type Weekday n Time lie sit
# <dbl> <fct> <dbl> <int> <dbl> <dbl> <dbl>
#8 22002 A2. Working 5 1 11.0 0.0103 6.77
#9 22002 A4. SleepWeek 5 1 6.42 5.49 0.930
#10 22002 C0. Leisure 6 1 16.133 0.2619 10.082
#11 22002 C4. SleepWeekend 6 1 7.9 6.96 0.922
#12 22002 C0. Leisure 7 2 15.6 0.122 12.0
For the first problem, I expect to get shorter code. Moreover, if possible, I would like better code for summing the different activities on Saturday.
Thanks in advance,
Luiz

It's hard to answer your question without a better example (i.e., you can dput() your data to give us a sample). But here is a solution to your last issue: "For the first problem, I expect to get a table with the sum of repeated rows for all columns. Moreover, if it was possible, I would expect to get a better code for the sum of different activities on Saturday."
# create toy data of 3 different IDs, 3 different types, and repeated days
df <- data.frame(id = sample(1:3, 100, replace = TRUE),
                 type = sample(letters[1:3], 100, replace = TRUE),
                 day = sample(1:7, 100, replace = TRUE),
                 matrix(runif(300), nrow = 100),
                 stringsAsFactors = FALSE)
library(tidyverse)

# gather data, summarise each activity column by ID, type and day,
# and keep Saturday (day == 6)
df %>% gather(k, v, -id, -type, -day) %>%
  group_by(id, type, day, k) %>%
  summarise(sum = sum(v)) %>%
  filter(day == 6) %>%
  spread(k, sum)
# A tibble: 8 x 6
# Groups: id, type, day [8]
id type day X1 X2 X3
<int> <chr> <int> <dbl> <dbl> <dbl>
1 1 a 6 1.85 3.26 2.09
2 1 b 6 0.604 0.583 0.586
3 1 c 6 0.163 0.663 0.624
4 2 a 6 0.185 0.952 0.349
5 2 b 6 1.16 0.832 0.974
6 2 c 6 0.906 1.62 0.853
7 3 b 6 0.671 1.39 0.887
8 3 c 6 0.449 0.150 0.647
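Note that gather() and spread() have since been superseded in tidyr; the same reshape can be written with pivot_longer() and pivot_wider(). A quick sketch on the same toy data:
df %>%
  pivot_longer(-c(id, type, day), names_to = "k", values_to = "v") %>%
  group_by(id, type, day, k) %>%
  summarise(sum = sum(v), .groups = "drop") %>%
  filter(day == 6) %>%
  pivot_wider(names_from = k, values_from = sum)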
UPDATE
Here is an updated solution with the new data provided.
df %>% group_by(LbNr, Type, Weekday) %>% summarise_all(sum)
# A tibble: 20 x 14
# Groups: LbNr, Type [5]
LbNr Type Weekday Time lie sit stand move walk run stairs cycle
<dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 22002 A1. ~ 1 6.33 0.386 4.52e+0 0.726 0.499 0.189 0.00111 0.0075 0.00556
2 22002 A1. ~ 2 7.9 0.766 4.74e+0 1.28 0.611 0.489 0.00194 0.0111 0
3 22002 A1. ~ 3 7.33 0.262 3.63e+0 2.04 0.941 0.449 0.00083 0.0114 0
4 22002 A1. ~ 4 11.7 0.761 5.91e+0 2.54 1.19 1.25 0.00416 0.0394 0.00778
5 22002 A1. ~ 5 6.57 0.140 4.51e+0 1.12 0.51 0.254 0.00139 0.0183 0.01
6 22002 A1. ~ 6 0.433 0.0169 3.02e-1 0.0589 0.0378 0.0175 0 0 0
7 22002 A2. ~ 1 7.5 0.0792 5.90e+0 0.546 0.326 0.611 0.00111 0.0392 0
8 22002 A2. ~ 2 9.83 0.0597 6.64e+0 1.64 0.595 0.842 0.00167 0.0575 0
9 22002 A2. ~ 3 9.83 0.653 5.79e+0 1.82 0.525 1.01 0.00083 0.0333 0
10 22002 A2. ~ 4 5 0.383 2.80e+0 0.886 0.392 0.514 0.0025 0.0247 0
11 22002 A2. ~ 5 11.0 0.0103 6.77e+0 1.83 1.05 1.29 0.00472 0.0672 0
12 22002 A4. ~ 2 6.27 4.86 1.41e+0 0 0 0 0 0 0
13 22002 A4. ~ 3 6.83 5.69 1.15e+0 0 0 0 0 0 0
14 22002 A4. ~ 4 7.3 7.28 4.72e-3 0.00667 0.00667 0 0 0.00194 0
15 22002 A4. ~ 5 6.42 5.49 9.30e-1 0 0 0 0 0 0
16 22002 C0. ~ 6 15.7 0.245 9.78e+0 2.34 2.45 0.800 0.00194 0.0581 0
17 22002 C0. ~ 7 15.6 0.122 1.20e+1 1.80 0.940 0.656 0.0869 0.0164 0
18 22002 C4. ~ 1 6.33 5.75 5.84e-1 0 0 0 0 0 0
19 22002 C4. ~ 6 7.9 6.96 9.22e-1 0.00667 0.00806 0.00306 0 0 0
20 22002 C4. ~ 7 8.35 7.36 9.33e-1 0.0364 0.0208 0.00472 0 0 0
# ... with 2 more variables: WalkSlow <dbl>, WalkFast <dbl>
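As an aside, summarise_all() has since been superseded; with dplyr >= 1.0 the same one-liner can be sketched with across():
df %>%
  group_by(LbNr, Type, Weekday) %>%
  summarise(across(everything(), sum), .groups = "drop")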
I think this answers your first question about wanting 'small code'. I still don't understand your second question about "I would expect to get a better code for the sum of different activities on Saturday." Does this mean that you want to sum across the different activities (lie, sit, etc.) for Saturday only? Or do you want to sum across different types (A2, C0, etc.) of activities?
df %>% group_by(LbNr, Type, Weekday) %>% summarise_all(sum) %>%
  filter(Weekday == 6)
# A tibble: 3 x 14
# Groups: LbNr, Type [3]
LbNr Type Weekday Time lie sit stand move walk run stairs cycle WalkSlow
<dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 22002 A1. ~ 6 0.433 0.0169 0.302 0.0589 0.0378 0.0175 0 0 0 0.00417
2 22002 C0. ~ 6 15.7 0.245 9.78 2.34 2.45 0.800 0.00194 0.0581 0 0.14
3 22002 C4. ~ 6 7.9 6.96 0.922 0.00667 0.00806 0.00306 0 0 0 0
# ... with 1 more variable: WalkFast <dbl>
# summarise across different activities, for each column, on Saturday only
df %>% group_by(LbNr, Type, Weekday) %>% summarise_all(sum) %>%
  filter(Weekday == 6) %>% group_by(LbNr) %>% select(-Type, -Weekday) %>%
  summarise_all(sum)
# A tibble: 1 x 12
LbNr Time lie sit stand move walk run stairs cycle WalkSlow WalkFast
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 22002 24 7.22 11.0 2.41 2.49 0.820 0.00194 0.0581 0 0.144 0.670
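If instead you meant the desired output shown in the question, folding Saturday's spill-over "A1. NonWorking"/"A4. SleepWeek" rows into "C0. Leisure" before summing, one possible approach is to recode Type first. This is only a sketch, and it assumes every such row on Weekday 6 should count as leisure:
df %>%
  mutate(Type = ifelse(Weekday == 6 & Type %in% c("A1. NonWorking", "A4. SleepWeek"),
                       "C0. Leisure", as.character(Type))) %>%
  group_by(LbNr, Type, Weekday) %>%
  summarise_all(sum)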

Related

Make multiple new columns (ideally tidyverse) by applying mutate across a vector?

I am trying to simulate a dataset for a linear regression as part of some Bayesian statistics.
Obviously the overall formula is
Y = A + BX
I have simulated a variety of values of A and B using
A <- rnorm(10,0,1)
B <- rnorm(10,0,1)
#10 Random draws from a normal distribution for the values of each of A and B
I set up a list of possible values of X:
library(tidyverse)
stuff <- tibble(x = seq(130, 170, 10)) %>%
  # Make a table of possible values of X between 130 and 170 in intervals of 10
  mutate(Y = A + B*x)
  # Make a new column which is A plus B * each value of X
This works fine when I have only one value in A & B (i.e. if I do A <- rnorm(1,0,1)), but obviously it doesn't work when the length of A & B is greater than 1.
What I am trying to figure out how to do is something like
mutate(Y[i] = A[i] + B[i]*x)
resulting in 10 new columns, Y1 to Y10.
Any suggestions welcomed
Here's how I would do what I think you want. I'd start long and then convert to wide...
library(tidyverse)
set.seed(123)
df <- tibble() %>%
  expand(
    nesting(
      ID = 1:10,
      A = rnorm(10, 0, 1),
      B = rnorm(10, 0, 1)
    ),
    X = seq(130, 170, 10)
  ) %>%
  mutate(Y = A + B*X)
df
# A tibble: 50 × 5
ID A B X Y
<int> <dbl> <dbl> <dbl> <dbl>
1 1 -1.07 0.426 130 54.4
2 1 -1.07 0.426 140 58.6
3 1 -1.07 0.426 150 62.9
4 1 -1.07 0.426 160 67.2
5 1 -1.07 0.426 170 71.4
6 2 -0.218 -0.295 130 -38.6
7 2 -0.218 -0.295 140 -41.5
8 2 -0.218 -0.295 150 -44.5
9 2 -0.218 -0.295 160 -47.4
10 2 -0.218 -0.295 170 -50.4
# … with 40 more rows
Now, pivot to wide...
df %>%
  pivot_wider(
    names_from = ID,
    values_from = Y,
    names_prefix = "Y",
    id_cols = X
  )
# A tibble: 5 × 11
X Y1 Y2 Y3 Y4 Y5 Y6 Y7 Y8 Y9 Y10
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 130 54.4 -38.6 115. 113. 106. 87.8 72.8 -7.90 -40.9 -48.2
2 140 58.6 -41.5 124. 122. 114. 94.7 78.4 -8.51 -44.0 -52.0
3 150 62.9 -44.5 133. 131. 123. 102. 83.9 -9.13 -47.0 -55.8
4 160 67.2 -47.4 142. 140. 131. 108. 89.5 -9.75 -50.1 -59.6
5 170 71.4 -50.4 151. 149. 139. 115. 95.0 -10.4 -53.2 -63.4
At this point you've lost A & B, because you'd need another 10 columns to store the original A's and another 10 to store the original B's.
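If you did want to keep them, pivot_wider() can widen several value columns at once. A sketch (this would produce columns A_1 to A_10, B_1 to B_10 and Y_1 to Y_10):
df %>%
  pivot_wider(
    names_from = ID,
    values_from = c(A, B, Y),
    id_cols = X
  )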
Personally, I'd probably stick with the long format, because that's most likely going to make your future workflow easier. And I get to keep the A's and B's.

Average a multiple number of rows for every column, multiple times

Here I have a snippet of my dataset. The rows indicate different days of the year.
The Substations represent individuals; there are over 500 individuals.
The 10-minute time periods run all the way through 24 hours.
I need to find an average value for each 10-minute interval for each individual in this dataset. This should result in a single row for each individual substation, with the respective average value for each time interval.
I have tried:
meanbygroup <- stationgroup %>%
  group_by(Substation) %>%
  summarise(means = colMeans(tenminintervals[sapply(tenminintervals, is.numeric)]))
But this averages the entire column and I am left with the same average values for each individual substation.
So for each individual substation, I need an average for each individual time interval.
Please help!
Try using summarize(across()), like this:
df %>%
  group_by(Substation) %>%
  summarize(across(everything(), ~mean(.x, na.rm = TRUE)))
Output:
Substation `00:00` `00:10` `00:20`
<chr> <dbl> <dbl> <dbl>
1 A -0.233 0.110 -0.106
2 B 0.203 -0.0997 -0.128
3 C -0.0733 0.196 -0.0205
4 D 0.0905 -0.0449 -0.0529
5 E 0.401 0.152 -0.0957
6 F 0.0368 0.120 -0.0787
7 G 0.0323 -0.0792 -0.278
8 H 0.132 -0.0766 0.157
9 I -0.0693 0.0578 0.0732
10 J 0.0776 -0.176 -0.0192
# … with 16 more rows
Input:
library(tidyverse)

set.seed(123)
df = bind_cols(
  tibble(Substation = sample(LETTERS, size = 1000, replace = TRUE)),
  as_tibble(setNames(lapply(1:3, function(x) rnorm(1000)), c("00:00", "00:10", "00:20")))
) %>% arrange(Substation)
# A tibble: 1,000 × 4
Substation `00:00` `00:10` `00:20`
<chr> <dbl> <dbl> <dbl>
1 A 0.121 -1.94 0.137
2 A -0.322 1.05 0.416
3 A -0.158 -1.40 0.192
4 A -1.85 1.69 -0.0922
5 A -1.16 -0.455 0.754
6 A 1.95 1.06 0.732
7 A -0.132 0.655 -1.84
8 A 1.08 -0.329 -0.130
9 A -1.21 2.82 -0.0571
10 A -1.04 0.237 -0.328
# … with 990 more rows
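If the real data contain non-numeric columns besides Substation, everything() would produce NAs (with warnings) for them; a safer sketch restricts the mean to numeric columns with where(is.numeric):
df %>%
  group_by(Substation) %>%
  summarize(across(where(is.numeric), ~mean(.x, na.rm = TRUE)))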

Summing up Certain Sequences of a Dataframe in R

I have several data frames of daily rates of different regions by age-groups:
Date 0-14 Rate 15-29 Rate 30-44 Rate 45-64 Rate 65-79 Rate 80+ Rate
2020-23-12 0 33.54 45.68 88.88 96.13 41.28
2020-24-12 0 25.14 35.28 66.14 90.28 38.41
It begins on Wednesday (2020-23-12) and I have data from then on up to date.
I want to obtain weekly row sums of rates from each Wednesday to Tuesday.
There should be a clever way to combine the aggregate, seq and rowsum functions to do this in a few lines; otherwise I'll end up with something much too long.
I created some minimal data: three weeks with some arbitrary columns of numeric values (no missing values). You can use tidyverse verbs to sum over columns, create groups per week and sum the row sums by week:
# Minimal data
MWE <- data.frame(date = c(outer(as.Date("12/23/20", "%m/%d/%y"), 0:20, `+`)),
                  column1 = runif(21, 0, 1),
                  column2 = runif(21, 0, 1))

library(tidyverse)

MWE %>%
  # Calculate row sums everywhere
  mutate(sum = rowSums(across(where(is.numeric)))) %>%
  # Create week groups
  group_by(week = ceiling(row_number()/7)) %>%
  # Sum over all row sums per group
  summarise(rowSums_by_week = sum(sum))
# Intermediate result, after mutate() and group_by():
# Groups: week [3]
date column1 column2 sum week
<date> <dbl> <dbl> <dbl> <dbl>
1 2020-12-23 0.449 0.759 1.21 1
2 2020-12-24 0.423 0.0956 0.519 1
3 2020-12-25 0.974 0.592 1.57 1
4 2020-12-26 0.798 0.250 1.05 1
5 2020-12-27 0.870 0.487 1.36 1
6 2020-12-28 0.952 0.345 1.30 1
7 2020-12-29 0.349 0.817 1.17 1
8 2020-12-30 0.227 0.727 0.954 2
9 2020-12-31 0.292 0.209 0.501 2
10 2021-01-01 0.678 0.276 0.954 2
# ... with 11 more rows
# Final result, after summarise():
# A tibble: 3 x 2
week rowSums_by_week
<dbl> <dbl>
1 1 8.16
2 2 6.02
3 3 6.82
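Grouping by row_number() assumes exactly one row per day with no gaps. If days could be missing, a sketch that derives the week from the date itself (still assuming the series starts on the first Wednesday) is safer:
MWE %>%
  mutate(sum = rowSums(across(where(is.numeric)))) %>%
  group_by(week = as.integer(date - min(date)) %/% 7 + 1) %>%
  summarise(rowSums_by_week = sum(sum))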

regression by group and retain all the columns in R

I am doing a linear regression by group and want to extract the residuals of the regression
library(dplyr)
set.seed(124)
dat <- data.frame(ID = sample(111:503, 18576, replace = T),
ID2 = sample(11:50, 18576, replace = T),
ID3 = sample(1:14, 18576, replace = T),
yearRef = sample(1998:2014, 18576, replace = T),
value = rnorm(18576))
library(broom)

resid <- dat %>%
  group_by(ID3) %>%
  do(augment(lm(value ~ yearRef, data = .))) %>%
  ungroup()
How do I retain ID and ID2 as well in resid? At the moment, only ID3 is retained in the final data frame.
Use group_split(), then loop through each group with map_dfr(), binding ID and ID2 to the augment() output using bind_cols():
library(dplyr)
library(purrr)
library(broom)

dat %>% group_split(ID3) %>%
  map_dfr(~bind_cols(select(.x, ID, ID2), augment(lm(value ~ yearRef, data = .x))), .id = "ID3")
# A tibble: 18,576 x 12
ID3 ID ID2 value yearRef .fitted .se.fit .resid .hat .sigma .cooksd
<chr> <int> <int> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 196 16 -0.385 2009 -0.0406 0.0308 -0.344 1.00e-3 0.973 6.27e-5
2 1 372 47 -0.793 2012 -0.0676 0.0414 -0.726 1.81e-3 0.973 5.05e-4
3 1 470 15 -0.496 2011 -0.0586 0.0374 -0.438 1.48e-3 0.973 1.50e-4
4 1 242 40 -1.13 2010 -0.0496 0.0338 -1.08 1.21e-3 0.973 7.54e-4
5 1 471 34 1.28 2006 -0.0135 0.0262 1.29 7.26e-4 0.972 6.39e-4
6 1 434 35 -1.09 1998 0.0586 0.0496 -1.15 2.61e-3 0.973 1.82e-3
7 1 467 45 -0.0663 2011 -0.0586 0.0374 -0.00769 1.48e-3 0.973 4.64e-8
8 1 334 27 -1.37 2003 0.0135 0.0305 -1.38 9.86e-4 0.972 9.92e-4
9 1 186 25 -0.0195 2003 0.0135 0.0305 -0.0331 9.86e-4 0.973 5.71e-7
10 1 114 34 1.09 2014 -0.0857 0.0500 1.18 2.64e-3 0.973 1.94e-3
# ... with 18,566 more rows, and 1 more variable: .std.resid <dbl>
Taking the "many models" approach, you can nest the data on ID3 and use purrr::map to create a list-column of the broom::augment data frames. The data list-column has all the original columns aside from ID3; map into that and select just the ones you want. Here I'm assuming you want to keep any column that starts with "ID", but you can change this. Then unnest both the data and the augment data frames.
library(dplyr)
library(tidyr)
dat %>%
  group_by(ID3) %>%
  nest() %>%
  mutate(aug = purrr::map(data, ~broom::augment(lm(value ~ yearRef, data = .))),
         data = purrr::map(data, select, starts_with("ID"))) %>%
  unnest(c(data, aug))
#> # A tibble: 18,576 x 12
#> # Groups: ID3 [14]
#> ID3 ID ID2 value yearRef .fitted .se.fit .resid .hat .sigma
#> <int> <int> <int> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 11 431 15 0.619 2002 0.0326 0.0346 0.586 1.21e-3 0.995
#> 2 11 500 21 -0.432 2000 0.0299 0.0424 -0.462 1.82e-3 0.995
#> 3 11 392 28 -0.246 1998 0.0273 0.0515 -0.273 2.67e-3 0.995
#> 4 11 292 40 -0.425 1998 0.0273 0.0515 -0.452 2.67e-3 0.995
#> 5 11 175 36 -0.258 1999 0.0286 0.0468 -0.287 2.22e-3 0.995
#> 6 11 419 23 3.13 2005 0.0365 0.0273 3.09 7.54e-4 0.992
#> 7 11 329 17 -0.0414 2007 0.0391 0.0274 -0.0806 7.57e-4 0.995
#> 8 11 284 23 -0.450 2006 0.0378 0.0268 -0.488 7.25e-4 0.995
#> 9 11 136 28 -0.129 2006 0.0378 0.0268 -0.167 7.25e-4 0.995
#> 10 11 118 17 -1.55 2013 0.0470 0.0470 -1.60 2.24e-3 0.995
#> # … with 18,566 more rows, and 2 more variables: .cooksd <dbl>,
#> # .std.resid <dbl>
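As a further variant, with dplyr >= 1.1 the same idea can be sketched with nest_by() and reframe(); this is an assumption-laden sketch, not the answerer's code and not tested against the output above:
library(dplyr)
library(broom)

dat %>%
  nest_by(ID3) %>%   # one row per ID3, with the rest of the columns in a data list-column
  reframe(bind_cols(select(data, ID, ID2),
                    augment(lm(value ~ yearRef, data = data))))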

interpolate data by a few columns

I have a large data frame with meteorological conditions at different locations (column radar_id), time (column date) and heights (column hgt).
I need to interpolate the data of each parameter (temp,u,v...) to a specific height (500 m above the ground for each radar- altitude_500 column) separately for each location (radar_id) and date.
I tried running the approx command in dplyr pipes and by splitting the data frame, but it didn't work for me...
Example of part of my data frame:
head(example)
radar_id date temp u v hgt W wind_ang temp_diff tw altitude_500
<chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Dagan 2014-03-02 18.8 -6.00 4.80 77 7.68 129. 5. -3.33 547
2 Dagan 2014-03-02 17.6 -2.40 9.30 742 9.60 166. 6 -9.20 547
3 Dagan 2014-03-02 16.2 3.10 15.4 1463 15.7 -169. 5.80 -10.4 547
4 Dagan 2014-03-03 16.2 0.900 -0.500 96 1.03 -60.9 -2.6 -0.971 547
5 Dagan 2014-03-03 13.0 3.10 -0.500 754 3.14 -80.8 -4.6 -2.39 547
6 Dagan 2014-03-03 10.8 8.10 4.10 1462 9.08 -117. -5.30 -5.01 547
I want to get a column with the y values from the approx command for each parameter (the x values are the height, hgt) at a specific height (given by the altitude_500 column), after the data frame is grouped by radar_id and date.
Here's a dplyr solution. First, I define the data.
# Data
df <- read.table(text = "radar_id date temp u v hgt W wind_ang temp_diff tw altitude_500
1 Dagan 2014-03-02 18.8 -6.00 4.80 77 7.68 129. 5. -3.33 547
2 Dagan 2014-03-02 17.6 -2.40 9.30 742 9.60 166. 6 -9.20 547
3 Dagan 2014-03-02 16.2 3.10 15.4 1463 15.7 -169. 5.80 -10.4 547
4 Dagan 2014-03-03 16.2 0.900 -0.500 96 1.03 -60.9 -2.6 -0.971 547
5 Dagan 2014-03-03 13.0 3.10 -0.500 754 3.14 -80.8 -4.6 -2.39 547
6 Dagan 2014-03-03 10.8 8.10 4.10 1462 9.08 -117. -5.30 -5.01 547")
Then, I load the dplyr package.
# Load library
library(dplyr)
Finally, I group by both radar_id and date and perform a linear interpolation using approx to get the value at altitude_500 m for each column (except the grouping variables and hgt).
# Group then summarise
df %>%
  group_by(radar_id, date) %>%
  summarise_at(vars(-hgt), ~approx(hgt, ., xout = first(altitude_500))$y)
#> # A tibble: 2 x 10
#> # Groups: radar_id [1]
#> radar_id date temp u v W wind_ang temp_diff tw
#> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Dagan 2014~ 18.0 -3.46 7.98 9.04 155. 5.71 -7.48
#> 2 Dagan 2014~ 14.0 2.41 -0.5 2.48 -74.5 -3.97 -1.94
#> # ... with 1 more variable: altitude_500 <dbl>
Created on 2019-08-21 by the reprex package (v0.3.0)
This assumes that there is only one value of altitude_500 for each radar_id/date pair.
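As an aside, summarise_at() has since been superseded; with current dplyr the same interpolation can be sketched with across(), under the same one-altitude_500-per-group assumption:
df %>%
  group_by(radar_id, date) %>%
  summarise(across(-hgt, ~approx(hgt, .x, xout = first(altitude_500))$y),
            .groups = "drop")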
