Pivot longer with multiple variables in columns - r

My data looks like this:
# A tibble: 120 x 5
age death_rate_male life_exp_male death_rate_fem life_exp_fem
<dbl> <dbl> <dbl> <dbl> <dbl>
1 0 0.00630 76.0 0.00523 81.0
2 1 0.000426 75.4 0.000342 80.4
3 2 0.00029 74.5 0.000209 79.4
4 3 0.000229 73.5 0.000162 78.4
5 4 0.000162 72.5 0.000143 77.4
6 5 0.000146 71.5 0.000125 76.5
7 6 0.000136 70.5 0.000113 75.5
8 7 0.000127 69.6 0.000104 74.5
9 8 0.000115 68.6 0.000097 73.5
10 9 0.000103 67.6 0.000093 72.5
# ... with 110 more rows
>
I'm trying to create a tidy table where the variables are age, gender, life expectancy, and death rate.
I managed to do this by splitting the data frame into two (one containing life expectancy, the other death rate), tidying both with pivot_longer(), and then appending the two tables.
Is there a way to do this more elegantly, with a single pivot_longer() command? Thank you in advance.

We can use names_pattern (where we capture as a group based on the pattern)
library(dplyr)
library(tidyr)
df1 %>%
pivot_longer(cols = -age, names_to = c( '.value', 'grp'),
names_pattern = "^(\\w+_\\w+)_(\\w+)")
# A tibble: 20 x 4
# age grp death_rate life_exp
# <int> <chr> <dbl> <dbl>
# 1 0 male 0.0063 76
# 2 0 fem 0.00523 81
# 3 1 male 0.000426 75.4
# 4 1 fem 0.000342 80.4
# 5 2 male 0.00029 74.5
# 6 2 fem 0.000209 79.4
# 7 3 male 0.000229 73.5
# 8 3 fem 0.000162 78.4
# 9 4 male 0.000162 72.5
#10 4 fem 0.000143 77.4
#11 5 male 0.000146 71.5
#12 5 fem 0.000125 76.5
#13 6 male 0.000136 70.5
#14 6 fem 0.000113 75.5
#15 7 male 0.000127 69.6
#16 7 fem 0.000104 74.5
#17 8 male 0.000115 68.6
#18 8 fem 0.000097 73.5
#19 9 male 0.000103 67.6
#20 9 fem 0.000093 72.5
or names_sep (specify the pattern here it is underscore followed by no character that is an underscore until the end)
df1 %>%
pivot_longer(cols = -age, names_to = c( '.value', 'grp'),
names_sep = "_(?=[^_]+$)")
data
df1 <- structure(list(age = 0:9, death_rate_male = c(0.0063, 0.000426,
0.00029, 0.000229, 0.000162, 0.000146, 0.000136, 0.000127, 0.000115,
0.000103), life_exp_male = c(76, 75.4, 74.5, 73.5, 72.5, 71.5,
70.5, 69.6, 68.6, 67.6), death_rate_fem = c(0.00523, 0.000342,
0.000209, 0.000162, 0.000143, 0.000125, 0.000113, 0.000104, 9.7e-05,
9.3e-05), life_exp_fem = c(81, 80.4, 79.4, 78.4, 77.4, 76.5,
75.5, 74.5, 73.5, 72.5)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10"))

Borrowing the data from akrun, here is a base R option using reshape
reshape(
setNames(df, gsub("(.*)_(\\w+)", "\\1\\.\\2", names(df))),
direction = "long",
varying = -1
)
such that
age time death_rate life_exp id
1.male 0 male 0.006300 76.0 1
2.male 1 male 0.000426 75.4 2
3.male 2 male 0.000290 74.5 3
4.male 3 male 0.000229 73.5 4
5.male 4 male 0.000162 72.5 5
6.male 5 male 0.000146 71.5 6
7.male 6 male 0.000136 70.5 7
8.male 7 male 0.000127 69.6 8
9.male 8 male 0.000115 68.6 9
10.male 9 male 0.000103 67.6 10
1.fem 0 fem 0.005230 81.0 1
2.fem 1 fem 0.000342 80.4 2
3.fem 2 fem 0.000209 79.4 3
4.fem 3 fem 0.000162 78.4 4
5.fem 4 fem 0.000143 77.4 5
6.fem 5 fem 0.000125 76.5 6
7.fem 6 fem 0.000113 75.5 7
8.fem 7 fem 0.000104 74.5 8
9.fem 8 fem 0.000097 73.5 9
10.fem 9 fem 0.000093 72.5 10

Related

Choose lowest value in each row R [duplicate]

This question already has an answer here:
Dplyr mutate minimum column name
(1 answer)
Closed 12 months ago.
I have the df below.
Postcode A B C D E
1 4251 45.8 55.1 70.2 79.5 102
2 4254 41.7 51.0 66.1 75.3 97.9
3 4255 45.5 48.7 63.9 73.1 95.6
4 4681 114 100 96.4 102 125
I want the minimum value of each row where postcode is not a number but a character.
expected output:
Postcode Dis Location
1 4251 45.8 A
2 4254 41.7 A
3 4255 45.5 A
4 4681 96.4 C
library(dplyr)
library(tidyr)
df |>
pivot_longer(cols = A:E, names_to = "Location", values_to = "Dis") |>
group_by(Postcode) |>
filter(Dis == min(Dis))
+ # A tibble: 4 × 3
# Groups: Postcode [4]
Postcode Location Dis
<chr> <chr> <dbl>
1 4251 A 45.8
2 4254 A 41.7
3 4255 A 45.5
4 4681 C 96.4

Loop to sum weekly rolling average

I am new to coding. I have a data set of daily stream flow averages over 20 years. Following is an example:
DATE FLOW
1 10/1/2001 88.2
2 10/2/2001 77.6
3 10/3/2001 68.4
4 10/4/2001 61.5
5 10/5/2001 55.3
6 10/6/2001 52.5
7 10/7/2001 49.7
8 10/8/2001 46.7
9 10/9/2001 43.3
10 10/10/2001 41.3
11 10/11/2001 39.3
12 10/12/2001 37.7
13 10/13/2001 35.8
14 10/14/2001 34.1
15 10/15/2001 39.8
I need to create a loop summing the previous 6 days as well as the current day (rolling weekly average), and print it to an array for the designated water year. I have already created an aggregate function to separate yearly average daily means into their designated water years.
# Separating dates into specific water years
wtr_yr <- function(dates, start_month=9)
# Convert dates into POSIXlt
POSIDATE = as.POSIXlt(NEW_DATE)
# Year offset
offset = ifelse(POSIDATE$mon >= start_month - 1, 1, 0)
# Water year
adj.year = POSIDATE$year + 1900 + offset
# Aggregating the water year function to take the mean
mean.FLOW=aggregate(data_set$FLOW,list(adj.year), mean)
It seems that it can be done much more easily.
But first I need to prepare a bit more data.
library(tidyverse)
library(lubridate)
df = tibble(
DATE = seq(mdy("1/1/2010"), mdy("12/31/2022"), 1),
FLOW = rnorm(length(DATE), 40, 10)
)
output
# A tibble: 4,748 x 2
DATE FLOW
<date> <dbl>
1 2010-01-01 34.4
2 2010-01-02 37.7
3 2010-01-03 55.6
4 2010-01-04 40.7
5 2010-01-05 41.3
6 2010-01-06 57.2
7 2010-01-07 44.6
8 2010-01-08 27.3
9 2010-01-09 33.1
10 2010-01-10 35.5
# ... with 4,738 more rows
Now let's do the aggregation by year and week number
df %>%
group_by(year(DATE), week(DATE)) %>%
summarise(mean = mean(FLOW))
output
# A tibble: 689 x 3
# Groups: year(DATE) [13]
`year(DATE)` `week(DATE)` mean
<dbl> <dbl> <dbl>
1 2010 1 44.5
2 2010 2 39.6
3 2010 3 38.5
4 2010 4 35.3
5 2010 5 44.1
6 2010 6 39.4
7 2010 7 41.3
8 2010 8 43.9
9 2010 9 38.5
10 2010 10 42.4
# ... with 679 more rows
Note, for the function week, the first week starts on January 1st. If you want to number the weeks according to the ISO 8601 standard, use the isoweek function. Alternatively, you can also use an epiweek compatible with the US CDC.
df %>%
group_by(year(DATE), isoweek(DATE)) %>%
summarise(mean = mean(FLOW))
output
# A tibble: 681 x 3
# Groups: year(DATE) [13]
`year(DATE)` `isoweek(DATE)` mean
<dbl> <dbl> <dbl>
1 2010 1 40.0
2 2010 2 45.5
3 2010 3 33.2
4 2010 4 38.9
5 2010 5 45.0
6 2010 6 40.7
7 2010 7 38.5
8 2010 8 42.5
9 2010 9 37.1
10 2010 10 42.4
# ... with 671 more rows
If you want to better understand how these functions work, please follow the code below
df %>%
mutate(
w1 = week(DATE),
w2 = isoweek(DATE),
w3 = epiweek(DATE)
)
output
# A tibble: 4,748 x 5
DATE FLOW w1 w2 w3
<date> <dbl> <dbl> <dbl> <dbl>
1 2010-01-01 34.4 1 53 52
2 2010-01-02 37.7 1 53 52
3 2010-01-03 55.6 1 53 1
4 2010-01-04 40.7 1 1 1
5 2010-01-05 41.3 1 1 1
6 2010-01-06 57.2 1 1 1
7 2010-01-07 44.6 1 1 1
8 2010-01-08 27.3 2 1 1
9 2010-01-09 33.1 2 1 1
10 2010-01-10 35.5 2 1 2
# ... with 4,738 more rows

Looping linear regression output in a data frame in r

I have a dataset below in which I want to do linear regression for each country and state and then cbind the predicted values in the dataset:
Final data frame after adding three more columns:
I have done it for one country and one area but want to do it for each country and area and put the predicted, upper and lower limit values back in the data set by cbind:
data <- data.frame(country = c("US","US","US","US","US","US","US","US","US","US","UK","UK","UK","UK","UK"),
Area = c("G","G","G","G","G","I","I","I","I","I","A","A","A","A","A"),
week = c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5),amount = c(12,23,34,32,12,12,34,45,65,45,45,34,23,43,43))
data_1 <- data[(data$country=="US" & data$Area=="G"),]
model <- lm(amount ~ week, data = data_1)
pre <- predict(model,newdata = data_1,interval = "prediction",level = 0.95)
pre
How can I loop this for other combination of country and Area?
...and a Base R solution:
data <- data.frame(country = c("US","US","US","US","US","US","US","US","US","US","UK","UK","UK","UK","UK"),
Area = c("G","G","G","G","G","I","I","I","I","I","A","A","A","A","A"),
week = c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5),amount = c(12,23,34,32,12,12,34,45,65,45,45,34,23,43,43))
splitVar <- paste0(data$country,"-",data$Area)
dfList <- split(data,splitVar)
result <- do.call(rbind,lapply(dfList,function(x){
model <- lm(amount ~ week, data = x)
cbind(x,predict(model,newdata = x,interval = "prediction",level = 0.95))
}))
result
...the results:
country Area week amount fit lwr upr
UK-A.11 UK A 1 45 36.6 -6.0463638 79.24636
UK-A.12 UK A 2 34 37.1 -1.3409128 75.54091
UK-A.13 UK A 3 23 37.6 0.6671656 74.53283
UK-A.14 UK A 4 43 38.1 -0.3409128 76.54091
UK-A.15 UK A 5 43 38.6 -4.0463638 81.24636
US-G.1 US G 1 12 20.8 -27.6791493 69.27915
US-G.2 US G 2 23 21.7 -21.9985147 65.39851
US-G.3 US G 3 34 22.6 -19.3841749 64.58417
US-G.4 US G 4 32 23.5 -20.1985147 67.19851
US-G.5 US G 5 12 24.4 -24.0791493 72.87915
US-I.6 US I 1 12 20.8 -33.8985900 75.49859
US-I.7 US I 2 34 30.5 -18.8046427 79.80464
US-I.8 US I 3 45 40.2 -7.1703685 87.57037
US-I.9 US I 4 65 49.9 0.5953573 99.20464
US-I.10 US I 5 45 59.6 4.9014100 114.29859
We can also use function augment from package broom to get your desired information:
library(purrr)
library(broom)
data %>%
group_by(country, Area) %>%
nest() %>%
mutate(models = map(data, ~ lm(amount ~ week, data = .)),
aug = map(models, ~ augment(.x, interval = "prediction"))) %>%
unnest(aug) %>%
select(country, Area, amount, week, .fitted, .lower, .upper)
# A tibble: 15 x 7
# Groups: country, Area [3]
country Area amount week .fitted .lower .upper
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 US G 12 1 20.8 -27.7 69.3
2 US G 23 2 21.7 -22.0 65.4
3 US G 34 3 22.6 -19.4 64.6
4 US G 32 4 23.5 -20.2 67.2
5 US G 12 5 24.4 -24.1 72.9
6 US I 12 1 20.8 -33.9 75.5
7 US I 34 2 30.5 -18.8 79.8
8 US I 45 3 40.2 -7.17 87.6
9 US I 65 4 49.9 0.595 99.2
10 US I 45 5 59.6 4.90 114.
11 UK A 45 1 36.6 -6.05 79.2
12 UK A 34 2 37.1 -1.34 75.5
13 UK A 23 3 37.6 0.667 74.5
14 UK A 43 4 38.1 -0.341 76.5
15 UK A 43 5 38.6 -4.05 81.2
Here is a tidyverse way to do this for every combination of country and Area.
library(tidyverse)
data %>%
group_by(country, Area) %>%
nest() %>%
mutate(model = map(data, ~ lm(amount ~ week, data = .x)),
result = map2(model, data, ~data.frame(predict(.x, newdata = .y,
interval = "prediction",level = 0.95)))) %>%
ungroup %>%
select(-model) %>%
unnest(c(data, result))
# country Area week amount fit lwr upr
# <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 US G 1 12 20.8 -27.7 69.3
# 2 US G 2 23 21.7 -22.0 65.4
# 3 US G 3 34 22.6 -19.4 64.6
# 4 US G 4 32 23.5 -20.2 67.2
# 5 US G 5 12 24.4 -24.1 72.9
# 6 US I 1 12 20.8 -33.9 75.5
# 7 US I 2 34 30.5 -18.8 79.8
# 8 US I 3 45 40.2 -7.17 87.6
# 9 US I 4 65 49.9 0.595 99.2
#10 US I 5 45 59.6 4.90 114.
#11 UK A 1 45 36.6 -6.05 79.2
#12 UK A 2 34 37.1 -1.34 75.5
#13 UK A 3 23 37.6 0.667 74.5
#14 UK A 4 43 38.1 -0.341 76.5
#15 UK A 5 43 38.6 -4.05 81.2
And one more:
library(tidyverse)
data %>%
mutate(CountryArea=paste0(country,Area) %>% factor %>% fct_inorder) %>%
split(.$CountryArea) %>%
map(~lm(amount~week, data=.)) %>%
map(predict, interval = "prediction",level = 0.95) %>%
reduce(rbind) %>%
cbind(data, .)
country Area week amount fit lwr upr
1 US G 1 12 20.8 -27.6791493 69.27915
2 US G 2 23 21.7 -21.9985147 65.39851
3 US G 3 34 22.6 -19.3841749 64.58417
4 US G 4 32 23.5 -20.1985147 67.19851
5 US G 5 12 24.4 -24.0791493 72.87915
6 US I 1 12 20.8 -33.8985900 75.49859
7 US I 2 34 30.5 -18.8046427 79.80464
8 US I 3 45 40.2 -7.1703685 87.57037
9 US I 4 65 49.9 0.5953573 99.20464
10 US I 5 45 59.6 4.9014100 114.29859
11 UK A 1 45 36.6 -6.0463638 79.24636
12 UK A 2 34 37.1 -1.3409128 75.54091
13 UK A 3 23 37.6 0.6671656 74.53283
14 UK A 4 43 38.1 -0.3409128 76.54091
15 UK A 5 43 38.6 -4.0463638 81.24636

Add new column to state data frame based on other column data [duplicate]

This question already has answers here:
Categorize numeric variable into group/ bins/ breaks
(4 answers)
Closed 1 year ago.
I am attempting to add a new column to the state sample data frame in R. I am hoping for this column to cluster the ID of states into broader categories (1-4). My code is close to what I am looking for but I am not getting it quite right.. I know I could enter each state ID line by line but is there a a quicker way? Thank you!
library(tidyverse)
#Add column to denote each state
States=state.x77
States=data.frame(States)
States <- tibble::rowid_to_column(States, "ID")
States
#Create new variable for state buckets
States <- States %>%
mutate(WAGE_BUCKET=case_when(ID <= c(1,12) ~ '1',
ID <= c(13,24) ~ '2',
ID <= c(25,37) ~ '3',
ID <= c(38,50) ~ '4',
TRUE ~ 'NA'))
View(States) #It is not grouping the states in the way I want/I am still getting some NA values but unsure why!
You can use cut or findInterval if all of your groups will be using contiguous ID values:
findInterval(States$ID, c(0, 12, 24, 37, 51))
# [1] 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4
If you want to make it a bit more verbose, you can use dplyr::between in your case_when:
States %>%
mutate(
WAGE_BUCKET = case_when(
between(ID, 1, 12) ~ "1",
between(ID, 13, 24) ~ "2",
between(ID, 25, 37) ~ "3",
between(ID, 38, 50) ~ "4",
TRUE ~ NA_character_)
)
# ID Population Income Illiteracy Life Exp Murder HS Grad Frost Area WAGE_BUCKET
# 1 1 3615 3624 2.1 69.05 15.1 41.3 20 50708 1
# 2 2 365 6315 1.5 69.31 11.3 66.7 152 566432 1
# 3 3 2212 4530 1.8 70.55 7.8 58.1 15 113417 1
# 4 4 2110 3378 1.9 70.66 10.1 39.9 65 51945 1
# 5 5 21198 5114 1.1 71.71 10.3 62.6 20 156361 1
# 6 6 2541 4884 0.7 72.06 6.8 63.9 166 103766 1
# 7 7 3100 5348 1.1 72.48 3.1 56.0 139 4862 1
# 8 8 579 4809 0.9 70.06 6.2 54.6 103 1982 1
# 9 9 8277 4815 1.3 70.66 10.7 52.6 11 54090 1
# 10 10 4931 4091 2.0 68.54 13.9 40.6 60 58073 1
# 11 11 868 4963 1.9 73.60 6.2 61.9 0 6425 1
# 12 12 813 4119 0.6 71.87 5.3 59.5 126 82677 1
# 13 13 11197 5107 0.9 70.14 10.3 52.6 127 55748 2
# 14 14 5313 4458 0.7 70.88 7.1 52.9 122 36097 2
# 15 15 2861 4628 0.5 72.56 2.3 59.0 140 55941 2
# 16 16 2280 4669 0.6 72.58 4.5 59.9 114 81787 2
# 17 17 3387 3712 1.6 70.10 10.6 38.5 95 39650 2
# 18 18 3806 3545 2.8 68.76 13.2 42.2 12 44930 2
# 19 19 1058 3694 0.7 70.39 2.7 54.7 161 30920 2
# 20 20 4122 5299 0.9 70.22 8.5 52.3 101 9891 2
# 21 21 5814 4755 1.1 71.83 3.3 58.5 103 7826 2
# 22 22 9111 4751 0.9 70.63 11.1 52.8 125 56817 2
# 23 23 3921 4675 0.6 72.96 2.3 57.6 160 79289 2
# 24 24 2341 3098 2.4 68.09 12.5 41.0 50 47296 2
# 25 25 4767 4254 0.8 70.69 9.3 48.8 108 68995 3
# 26 26 746 4347 0.6 70.56 5.0 59.2 155 145587 3
# 27 27 1544 4508 0.6 72.60 2.9 59.3 139 76483 3
# 28 28 590 5149 0.5 69.03 11.5 65.2 188 109889 3
# 29 29 812 4281 0.7 71.23 3.3 57.6 174 9027 3
# 30 30 7333 5237 1.1 70.93 5.2 52.5 115 7521 3
# 31 31 1144 3601 2.2 70.32 9.7 55.2 120 121412 3
# 32 32 18076 4903 1.4 70.55 10.9 52.7 82 47831 3
# 33 33 5441 3875 1.8 69.21 11.1 38.5 80 48798 3
# 34 34 637 5087 0.8 72.78 1.4 50.3 186 69273 3
# 35 35 10735 4561 0.8 70.82 7.4 53.2 124 40975 3
# 36 36 2715 3983 1.1 71.42 6.4 51.6 82 68782 3
# 37 37 2284 4660 0.6 72.13 4.2 60.0 44 96184 3
# 38 38 11860 4449 1.0 70.43 6.1 50.2 126 44966 4
# 39 39 931 4558 1.3 71.90 2.4 46.4 127 1049 4
# 40 40 2816 3635 2.3 67.96 11.6 37.8 65 30225 4
# 41 41 681 4167 0.5 72.08 1.7 53.3 172 75955 4
# 42 42 4173 3821 1.7 70.11 11.0 41.8 70 41328 4
# 43 43 12237 4188 2.2 70.90 12.2 47.4 35 262134 4
# 44 44 1203 4022 0.6 72.90 4.5 67.3 137 82096 4
# 45 45 472 3907 0.6 71.64 5.5 57.1 168 9267 4
# 46 46 4981 4701 1.4 70.08 9.5 47.8 85 39780 4
# 47 47 3559 4864 0.6 71.72 4.3 63.5 32 66570 4
# 48 48 1799 3617 1.4 69.48 6.7 41.6 100 24070 4
# 49 49 4589 4468 0.7 72.48 3.0 54.5 149 54464 4
# 50 50 376 4566 0.6 70.29 6.9 62.9 173 97203 4
It is a vector of length > 1. The comparison operators works on a single vector. We could use between
library(dplyr)
States <- States %>%
mutate(WAGE_BUCKET=case_when(between(ID, 1, 12) ~ '1',
between(ID, 13,24) ~ '2',
between(ID, 25,37) ~ '3',
between(ID, 38,50) ~ '4',
TRUE ~ NA_character_))
Or another option is to use & with > and <=
States %>%
mutate(WAGE_BUCKET=case_when(ID >= 1 & ID <=12 ~ '1',
ID >= 13 & ID <= 24) ~ '2',
ID >= 25 & ID <= 37 ~ '3',
ID >= 38 & ID <= 50 ~ '4',
TRUE ~ NA_character))
Or may be the OP meant to use %in%
States %>%
mutate(WAGE_BUCKET=case_when(ID %in% c(1,12) ~ '1',
ID %in% c(13,24) ~ '2',
ID %in% c(25,37) ~ '3',
ID %in% c(38,50) ~ '4',
TRUE ~ NA_character_))

How to calculate difference between data in different rows? [closed]

Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 6 years ago.
Improve this question
I have got monthly data in this format
PrecipMM Date
122.7 2004-01-01
54.2 2005-01-01
31.9 2006-01-01
100.5 2007-01-01
144.9 2008-01-01
96.4 2009-01-01
75.3 2010-01-01
94.8 2011-01-01
67.6 2012-01-01
93.0 2013-01-01
184.6 2014-01-01
101.0 2015-01-01
149.3 2016-01-01
50.2 2004-02-01
46.2 2005-02-01
57.7 2006-02-01
I want to calculate all of the difference of precipMM in same month of different years.
My dream output is like this:
PrecipMM Date PrecipMM_diff
122.7 2004-01-01 NA
54.2 2005-01-01 -68.5
31.9 2006-01-01 -22.3
100.5 2007-01-01 68.6
144.9 2008-01-01 44.4
96.4 2009-01-01 -48.5
75.3 2010-01-01 -21.2
94.8 2011-01-01 19.5
67.6 2012-01-01 -27.2
93.0 2013-01-01 25.4
184.6 2014-01-01 91.6
101.0 2015-01-01 -83.6
149.3 2016-01-01 48.3
50.2 2004-02-01 NA
46.2 2005-02-01 -4.0
57.7 2006-02-01 11.5
I think diff() can do this but I have no idea how.
I think you can do this with lag combined with group_by from dplyr. Here's how:
library(dplyr)
library(lubridate) # makes dealing with dates easier
# Load your example data
df <- structure(list(PrecipMM = c(4.4, 66.7, 48.2, 60.9, 108.1, 109.2,
101.7, 38.1, 53.8, 71.9, 75.4, 67.1, 92.7, 115.3, 68.9, 38.9),
Date = structure(5:20, .Label = c("101.7", "108.1", "109.2",
"115.3", "1766-01-01", "1766-02-01", "1766-03-01", "1766-04-01",
"1766-05-01", "1766-06-01", "1766-07-01", "1766-08-01", "1766-09-01",
"1766-10-01", "1766-11-01", "1766-12-01", "1767-01-01", "1767-02-01",
"1767-03-01", "1767-04-01", "38.1", "38.9", "4.4", "48.2",
"53.8", "60.9", "66.7", "67.1", "68.9", "71.9", "75.4", "92.7"
), class = "factor")), class = "data.frame", row.names = c(NA,
-16L), .Names = c("PrecipMM", "Date"))
results <- df %>%
mutate(years = year(Date), months = month(Date)) %>%
group_by(months) %>%
arrange(years) %>%
mutate(lagged.rain = lag(PrecipMM), rain.diff = PrecipMM - lagged.rain)
results
# Source: local data frame [16 x 6]
# Groups: months [12]
#
# PrecipMM Date years months lagged.rain rain.diff
# (dbl) (fctr) (dbl) (dbl) (dbl) (dbl)
# 1 4.4 1766-01-01 1766 1 NA NA
# 2 92.7 1767-01-01 1767 1 4.4 88.3
# 3 66.7 1766-02-01 1766 2 NA NA
# 4 115.3 1767-02-01 1767 2 66.7 48.6
# 5 48.2 1766-03-01 1766 3 NA NA
# 6 68.9 1767-03-01 1767 3 48.2 20.7
# 7 60.9 1766-04-01 1766 4 NA NA
# 8 38.9 1767-04-01 1767 4 60.9 -22.0
# 9 108.1 1766-05-01 1766 5 NA NA
# 10 109.2 1766-06-01 1766 6 NA NA
# 11 101.7 1766-07-01 1766 7 NA NA
# 12 38.1 1766-08-01 1766 8 NA NA
# 13 53.8 1766-09-01 1766 9 NA NA
# 14 71.9 1766-10-01 1766 10 NA NA
# 15 75.4 1766-11-01 1766 11 NA NA
# 16 67.1 1766-12-01 1766 12 NA NA

Resources