Replace duplicate values with NA in time series data using dplyr - R

My data seems a bit different from that in other similar posts.
box_num date x y
1-Q 2018-11-18 20.2 8
1-Q 2018-11-25 21.23 7.2
1-Q 2018-12-2 21.23 23
98-L 2018-11-25 0.134 9.3
98-L 2018-12-2 0.134 4
76-GI 2018-12-2 22.734 4.562
76-GI 2018-12-9 28 4.562
Here I would like to replace the repeated values with NA in both the x and y columns.
The code I have tried using dplyr:
(1)
df <- df %>% group_by(box_num) %>% arrange(box_num, date) %>%
  mutate(df$x[duplicated(df$x),] <- NA)
This creates a new column of all NAs instead of just replacing the repeated values with NA.
(2)
df <- df %>% group_by(box_num) %>% arrange(box_num, date) %>%
  distinct(x, .keep_all = TRUE)
The second one only keeps the rows that are not duplicated, so we lose the time series.
Desired Output :
box_num date x y
1-Q 2018-11-18 20.2 8
1-Q 2018-11-25 21.23 7.2
1-Q 2018-12-2 NA 23
98-L 2018-11-25 0.134 9.3
98-L 2018-12-2 NA 4
76-GI 2018-12-2 22.734 4.562
76-GI 2018-12-9 28 NA

Using dplyr, we can group_by box_num and then use mutate_at on the x and y columns to replace the duplicated values with NA.
library(dplyr)
df %>%
  group_by(box_num) %>%
  mutate_at(vars(x:y), funs(replace(., duplicated(.), NA)))
# box_num date x y
# <fct> <fct> <dbl> <dbl>
#1 1-Q 2018-11-18 20.2 8
#2 1-Q 2018-11-25 21.2 7.2
#3 1-Q 2018-12-2 NA 23
#4 98-L 2018-11-25 0.134 9.3
#5 98-L 2018-12-2 NA 4
#6 76-GI 2018-12-2 22.7 4.56
#7 76-GI 2018-12-9 28 NA
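Note: funs() and the scoped mutate_at() verb have since been superseded; with a recent dplyr (1.0 or later) an equivalent sketch using across() would be:
df %>%
  group_by(box_num) %>%
  mutate(across(x:y, ~ replace(.x, duplicated(.x), NA)))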
A base R option (which might not be the best in this case) would be:
cols <- c("x", "y")
df[cols] <- sapply(df[cols], function(x)
  ave(x, df$box_num, FUN = function(x) replace(x, duplicated(x), NA)))

Here is an option with data.table. Convert the 'data.frame' to a 'data.table' (setDT(df1)), specify the columns of interest in .SDcols, replace the duplicated elements in those columns with NA, and update the columns by assigning (:=) the output back to them.
library(data.table)
setDT(df1)[, c('x', 'y') := lapply(.SD, function(x)
   replace(x, duplicated(x), NA)), by = box_num, .SDcols = x:y]
df1
# box_num date x y
#1: 1-Q 2018-11-18 20.200 8.000
#2: 1-Q 2018-11-25 21.230 7.200
#3: 1-Q 2018-12-2 NA 23.000
#4: 98-L 2018-11-25 0.134 9.300
#5: 98-L 2018-12-2 NA 4.000
#6: 76-GI 2018-12-2 22.734 4.562
#7: 76-GI 2018-12-9 28.000 NA
data
df1 <- structure(list(box_num = c("1-Q", "1-Q", "1-Q", "98-L", "98-L",
"76-GI", "76-GI"), date = c("2018-11-18", "2018-11-25", "2018-12-2",
"2018-11-25", "2018-12-2", "2018-12-2", "2018-12-9"), x = c(20.2,
21.23, 20.2, 0.134, 0.134, 22.734, 28), y = c(8, 7.2, 23, 9.3,
4, 4.562, 4.562)), class = "data.frame",
row.names = c(NA, -7L))

Related

Error in `mutate` function when working with tibbles of different data points

I think I'm stuck on a pretty simple thing, although I couldn't figure it out myself.
So, I have the following tibbles:
library(tidyverse)
library(tidyquant)
library(timetk)
library(tibbletime)
symbols_low <- c("TAP", "VLO", "AFL", "ABC", "QCOM", "SWKS", "LRCX","HST","NFLX", "NKE", "AAPL", "SPY")
start_date <- "2016-01-01"
returns_low <- symbols_low %>%
  tq_get(get = 'stock.prices', from = start_date, collapse = "monthly", complete_cases = TRUE) %>%
  select(date, symbol, adjusted) %>%
  group_by(symbol) %>%
  as_tbl_time(index = date) %>%
  as_period(period = 'monthly', side = 'end') %>%
  mutate(monthly.returns = log(adjusted) - log(lag(adjusted))) %>%
  replace_na(list(monthly.returns = 0)) %>%
  mutate(growth = cumprod(1 + monthly.returns)) %>%
  as_tibble()
wage_growth <-
  tq_get(
    x = 'A576RC1',
    get = 'economic.data',
    collapse = 'monthly',
    from = start_date) %>%
  select(date, price) %>%
  mutate(monthly_increase = (price - lag(price)) / lag(price)) %>%
  replace_na(list(monthly_increase = 0)) %>%
  mutate(wage_index = cumprod(1 + monthly_increase)) %>%
  mutate(wage_lag_3 = lag(monthly_increase, 3)) %>%
  mutate(wage_lag_6 = lag(monthly_increase, 6)) %>%
  dplyr::filter(date <= "2021-07-01")
I tried to create a new tibble returns_wages_low, but got the following error message.
returns_wages_low <-
  returns_low %>%
  dplyr::filter(date <= "2021-06-01") %>%
  mutate(wage_change = wage_growth$monthly_increase,
         wage_index = wage_growth$wage_index,
         wage_lag_3 = wage_growth$wage_lag_3,
         wage_lag_6 = wage_growth$wage_lag_6) %>%
  as_tibble()
Error: Problem with `mutate()` column `wage_change`.
ℹ `wage_change = wage_growth$monthly_increase`.
ℹ `wage_change` must be size 780 or 1, not 65.
The problem may be related to the returns_low tibble, which holds the data in long format and has more data points than the wage_growth tibble: returns_low contains 67 data points for each of the 12 tickers selected (804 in total), while wage_growth contains only 65 data points. I suspect the source of the error is the inconsistency between these tibbles.
I would appreciate your input on solving the issue. Thanks to all in advance.
Here we need a left_join instead of extracting the column with $, as the two datasets clearly do not have the same dimensions (and the dimensions change again after filtering):
library(dplyr)
library(tibbletime)
library(tidyquant)
returns_low %>%
  dplyr::filter(date <= "2021-06-01") %>%
  left_join(wage_growth) %>%
  rename(wage_change = monthly_increase)
Output:
# A tibble: 780 x 10
date symbol adjusted monthly.returns growth price wage_change wage_index wage_lag_3 wage_lag_6
<date> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2016-01-29 TAP 81.2 0 1 NA NA NA NA NA
2 2016-02-29 TAP 76.6 -0.0593 0.941 NA NA NA NA NA
3 2016-03-31 TAP 86.8 0.125 1.06 NA NA NA NA NA
4 2016-04-29 TAP 86.3 -0.00573 1.05 NA NA NA NA NA
5 2016-05-31 TAP 89.5 0.0364 1.09 NA NA NA NA NA
6 2016-06-30 TAP 91.6 0.0235 1.12 NA NA NA NA NA
7 2016-07-29 TAP 92.5 0.0101 1.13 NA NA NA NA NA
8 2016-08-31 TAP 93.1 0.00560 1.13 NA NA NA NA NA
9 2016-09-30 TAP 99.9 0.0706 1.21 NA NA NA NA NA
10 2016-10-31 TAP 94.4 -0.0561 1.15 NA NA NA NA NA
# … with 770 more rows
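Since date is the only column the two tibbles share, the join key can also be spelled out explicitly to silence the join message (a small variation on the code above; by = "date" is assumed here):
returns_low %>%
  dplyr::filter(date <= "2021-06-01") %>%
  left_join(wage_growth, by = "date") %>%
  rename(wage_change = monthly_increase)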

Reference the previous non-zero row, find the difference and divide by nrows

I must be asking the question terribly because I can't find what I'm looking for!
I have a large Excel file that looks like this for every day of the month:
Date      Well1
1/1/16    10
1/2/16    NA
1/3/16    NA
1/4/16    NA
1/5/16    20
1/6/16    NA
1/7/16    25
1/8/16    NA
1/9/16    NA
1/10/16   35
etc       NA
I want to make a new column that has the difference between the non-zero rows, divided by the number of rows between each non-zero row. I'm aiming for something like this:
Date      Well1   Adjusted
1/1/16    10      =(20-10)/4 = 2.5
1/2/16    NA      1.25
1/3/16    NA      1.25
1/4/16    NA      1.25
1/5/16    20      =(25-20)/2 = 2.5
1/6/16    NA      2.5
1/7/16    25      =(35-25)/3 = 3.3
1/8/16    NA      3.3
1/9/16    NA      3.3
1/10/16   35      etc
etc       NA      etc
I'm thinking I should use lead or lag, but the number of steps differs between each non-zero row (so I'm not sure what to use for n in the lead/lag function). I've used group_by so that each month stands alone, and I've also attempted case_when and ifelse. Mostly I need ideas on translating the Excel logic into a workable R approach.
With some diff-ing and repeating of values, you should be able to get there.
dat$Date <- as.Date(dat$Date, format="%m/%d/%y")
nas <- is.na(dat$Well1)
dat$adj <- with(dat[!nas,],
diff(Well1) / as.numeric(diff(Date), units="days")
)[cumsum(!nas)]
# Date Well1 adj
#1 2016-01-01 10 2.5
#2 2016-01-02 NA 2.5
#3 2016-01-03 NA 2.5
#4 2016-01-04 NA 2.5
#5 2016-01-05 20 2.5
#6 2016-01-06 NA 2.5
#7 2016-01-07 25 5.0
#8 2016-01-08 NA 5.0
#9 2016-01-09 NA 5.0
#10 2016-01-10 40 NA
dat being used is:
dat <- read.table(text="Date Well1
1/1/16 10
1/2/16 NA
1/3/16 NA
1/4/16 NA
1/5/16 20
1/6/16 NA
1/7/16 25
1/8/16 NA
1/9/16 NA
1/10/16 40", header=TRUE, stringsAsFactors=FALSE)
Base R in the same vein as @thelatemail but with the transformations all in one expression:
nas <- is.na(dat$Well1)
res <- within(dat, {
  Date <- as.Date(Date, "%m/%d/%y")
  Adjusted <- (diff(Well1[!nas]) /
    as.numeric(diff(Date[!nas]), units = "days"))[cumsum(!nas)]
})
Data:
dat <- read.table(text="Date Well1
1/1/16 10
1/2/16 NA
1/3/16 NA
1/4/16 NA
1/5/16 20
1/6/16 NA
1/7/16 25
1/8/16 NA
1/9/16 NA
1/10/16 40", header=TRUE, stringsAsFactors=FALSE)
Maybe this will work:
library(dplyr)
dat %>%
  # // remove the rows with NA
  na.omit %>%
  # // create a new column with the lead values of Well1
  transmute(Date, Well2 = lead(Well1)) %>%
  # // join with the original data
  right_join(dat %>%
               mutate(rn = row_number())) %>%
  # // restore the original row order
  arrange(rn) %>%
  # // create a grouping column based on the NA values
  group_by(grp = cumsum(!is.na(Well1))) %>%
  # // subtract the first Well1 value from the first Well2 value and divide
  # // by the number of rows, n(), in the group
  mutate(Adjusted = (first(Well2) - first(Well1)) / n()) %>%
  ungroup %>%
  select(-grp, -Well2)
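As a quick sanity check of the grouping idea on the sample dat (a sketch, not part of the original answer):
cumsum(!is.na(dat$Well1))
# [1] 1 1 1 1 2 2 3 3 3 4
Each run of rows that starts at a non-NA Well1 value gets its own group number, which is what the division by n() relies on.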

Apply a rolling mean to a data frame by an index

I would like to calculate a rolling mean on data in a single data frame by multiple ids. See my example dataset below.
date <- as.Date(c("2015-02-01", "2015-02-02", "2015-02-03", "2015-02-04",
"2015-02-05", "2015-02-06", "2015-02-07", "2015-02-08",
"2015-02-09", "2015-02-10", "2015-02-01", "2015-02-02",
"2015-02-03", "2015-02-04", "2015-02-05", "2015-02-06",
"2015-02-07", "2015-02-08", "2015-02-09", "2015-02-10"))
index <- c("a","a","a","a","a","a","a","a","a","a",
"b","b","b","b","b","b","b","b","b","b")
x <- runif(20,1,100)
y <- runif(20,50,150)
z <- runif(20,100,200)
df <- data.frame(date, index, x, y, z)
I would like to calculate the rolling mean for x, y and z, by a and then by b.
I tried the following, but I am getting an error.
test <- tapply(df, df$index, FUN = rollmean(df, 5, fill=NA))
The error:
Error in xu[k:n] - xu[c(1, seq_len(n - k))] :
non-numeric argument to binary operator
It seems like there is an issue with the fact that index is a character, but I need it in order to calculate the means...
1) ave Try ave rather than tapply, and make sure it is applied only over the columns of interest, i.e. columns 3, 4 and 5.
library(zoo)
roll <- function(x) rollmean(x, 5, fill = NA)
cbind(df[1:2], lapply(df[3:5], function(x) ave(x, df$index, FUN = roll)))
giving:
date index x y z
1 2015-02-01 a NA NA NA
2 2015-02-02 a NA NA NA
3 2015-02-03 a 66.50522 127.45650 129.8472
4 2015-02-04 a 61.71320 123.83633 129.7673
5 2015-02-05 a 56.56125 120.86158 126.1371
6 2015-02-06 a 66.13340 119.93428 127.1819
7 2015-02-07 a 59.56807 105.83208 125.1244
8 2015-02-08 a 49.98779 95.66024 139.2321
9 2015-02-09 a NA NA NA
10 2015-02-10 a NA NA NA
11 2015-02-01 b NA NA NA
12 2015-02-02 b NA NA NA
13 2015-02-03 b 55.71327 117.52219 139.3961
14 2015-02-04 b 54.58450 107.81763 142.6101
15 2015-02-05 b 50.48102 104.94084 136.3167
16 2015-02-06 b 37.89790 95.45489 135.4044
17 2015-02-07 b 33.05259 85.90916 150.8673
18 2015-02-08 b 49.91385 90.04940 147.1376
19 2015-02-09 b NA NA NA
20 2015-02-10 b NA NA NA
2) by Another way is to use by. roll2 handles one group, by applies it to each group producing a by list, and do.call("rbind", ...) puts it back together.
roll2 <- function(x) cbind(x[1:2], rollmean(x[3:5], 5, fill = NA))
do.call("rbind", by(df, df$index, roll2))
giving:
date index x y z
a.1 2015-02-01 a NA NA NA
a.2 2015-02-02 a NA NA NA
a.3 2015-02-03 a 66.50522 127.45650 129.8472
a.4 2015-02-04 a 61.71320 123.83633 129.7673
a.5 2015-02-05 a 56.56125 120.86158 126.1371
a.6 2015-02-06 a 66.13340 119.93428 127.1819
a.7 2015-02-07 a 59.56807 105.83208 125.1244
a.8 2015-02-08 a 49.98779 95.66024 139.2321
a.9 2015-02-09 a NA NA NA
a.10 2015-02-10 a NA NA NA
b.11 2015-02-01 b NA NA NA
b.12 2015-02-02 b NA NA NA
b.13 2015-02-03 b 55.71327 117.52219 139.3961
b.14 2015-02-04 b 54.58450 107.81763 142.6101
b.15 2015-02-05 b 50.48102 104.94084 136.3167
b.16 2015-02-06 b 37.89790 95.45489 135.4044
b.17 2015-02-07 b 33.05259 85.90916 150.8673
b.18 2015-02-08 b 49.91385 90.04940 147.1376
b.19 2015-02-09 b NA NA NA
b.20 2015-02-10 b NA NA NA
3) wide form Another approach is to convert df from long form to wide form, in which case a plain rollmean will do it.
rollmean(read.zoo(df, split = 2), 5, fill = NA)
giving:
x.a y.a z.a x.b y.b z.b
2015-02-01 NA NA NA NA NA NA
2015-02-02 NA NA NA NA NA NA
2015-02-03 66.50522 127.45650 129.8472 55.71327 117.52219 139.3961
2015-02-04 61.71320 123.83633 129.7673 54.58450 107.81763 142.6101
2015-02-05 56.56125 120.86158 126.1371 50.48102 104.94084 136.3167
2015-02-06 66.13340 119.93428 127.1819 37.89790 95.45489 135.4044
2015-02-07 59.56807 105.83208 125.1244 33.05259 85.90916 150.8673
2015-02-08 49.98779 95.66024 139.2321 49.91385 90.04940 147.1376
2015-02-09 NA NA NA NA NA NA
2015-02-10 NA NA NA NA NA NA
This works because the dates are the same for both groups. If the dates were different, the reshaping could introduce NAs, and rollmean cannot handle those. In that case use:
rollapply(read.zoo(df, split = 2), 5, mean, fill = NA)
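If the mismatched dates also leave NA values inside the merged series, the extra argument passed through rollapply to mean can skip them (a hedged variant of the line above):
rollapply(read.zoo(df, split = 2), 5, mean, na.rm = TRUE, fill = NA)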
Note: Since the input uses random numbers in its definition, to make it reproducible we must call set.seed first. We used this:
set.seed(123)
date <- as.Date(c("2015-02-01", "2015-02-02", "2015-02-03", "2015-02-04",
"2015-02-05", "2015-02-06", "2015-02-07", "2015-02-08",
"2015-02-09", "2015-02-10", "2015-02-01", "2015-02-02",
"2015-02-03", "2015-02-04", "2015-02-05", "2015-02-06",
"2015-02-07", "2015-02-08", "2015-02-09", "2015-02-10"))
index <- c("a","a","a","a","a","a","a","a","a","a",
"b","b","b","b","b","b","b","b","b","b")
x <- runif(20,1,100)
y <- runif(20,50,150)
z <- runif(20,100,200)
This ought to do the trick using the dplyr and zoo libraries:
library(dplyr)
library(zoo)
df %>%
  group_by(index) %>%
  mutate(x_mean = rollmean(x, 5, fill = NA),
         y_mean = rollmean(y, 5, fill = NA),
         z_mean = rollmean(z, 5, fill = NA))
You could probably tidy this up more using mutate_each or some other form of mutate.
You can also change the arguments within rollmean to fit your needs, such as align = "right" or na.pad = TRUE.
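For what it's worth, mutate_each has since been deprecated; with a current dplyr (1.0 or later) the same tidying could be done with across(), e.g. this sketch (assuming zoo is loaded):
df %>%
  group_by(index) %>%
  mutate(across(x:z, ~ rollmean(.x, 5, fill = NA), .names = "{.col}_mean"))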

How to convert daywise(daily) data to monthly data using R? [duplicate]

This question already has answers here:
Aggregate Daily Data to Month/Year intervals
(9 answers)
Closed 7 years ago.
I have daily interest-rate data covering 15 years, from 01-01-2000 to 01-01-2015.
I want to convert this data to monthly data, keeping only month and year.
I want to take the mean of the values of all the days in a month and make it one value for that month.
How can I do this in R?
> str(mibid)
'data.frame': 4263 obs. of 6 variables:
$ Days: int 1 2 3 4 5 6 7 8 9 10 ...
$ Date: Date, format: "2000-01-03" "2000-01-04" "2000-01-05" "2000-01-06" ...
$ BID : num 8.82 8.82 8.88 8.79 8.78 8.8 8.81 8.82 8.86 8.78 ...
$ I.S : num 0.092 0.0819 0.0779 0.0801 0.074 0.0766 0.0628 0.0887 0.0759 0.073 ...
$ BOR : num 9.46 9.5 9.52 9.36 9.33 9.37 9.42 9.39 9.4 9.33 ...
$ R.S : num 0.0822 0.0817 0.0828 0.0732 0.084 0.0919 0.0757 0.0725 0.0719 0.0564 ...
> head(mibid)
Days Date BID I.S BOR R.S
1 1 2000-01-03 8.82 0.0920 9.46 0.0822
2 2 2000-01-04 8.82 0.0819 9.50 0.0817
3 3 2000-01-05 8.88 0.0779 9.52 0.0828
4 4 2000-01-06 8.79 0.0801 9.36 0.0732
5 5 2000-01-07 8.78 0.0740 9.33 0.0840
6 6 2000-01-08 8.80 0.0766 9.37 0.0919
>
I'd do this with xts:
set.seed(21)
mibid <- data.frame(Date=Sys.Date()-100:1,
BID=rnorm(100, 8, 0.1), I.S=rnorm(100, 0.08, 0.01),
BOR=rnorm(100, 9, 0.1), R.S=rnorm(100, 0.08, 0.01))
require(xts)
# convert to xts
xmibid <- xts(mibid[,-1], mibid[,1])
# aggregate
agg_xmibid <- apply.monthly(xmibid, colMeans)
# convert back to data.frame
agg_mibid <- data.frame(Date=index(agg_xmibid), agg_xmibid, row.names=NULL)
head(agg_mibid)
# Date BID I.S BOR R.S
# 1 2015-04-30 8.079301 0.07189111 9.074807 0.06819096
# 2 2015-05-31 7.987479 0.07888328 8.999055 0.08090253
# 3 2015-06-30 8.043845 0.07885779 9.018338 0.07847999
# 4 2015-07-31 7.990822 0.07799489 8.980492 0.08162038
# 5 2015-08-07 8.000414 0.08535749 9.044867 0.07755017
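If only the month and year are wanted in the result (as in the question), the month-end dates can simply be reformatted afterwards; a small follow-up on the objects above:
agg_mibid$Date <- format(agg_mibid$Date, "%Y-%m")
head(agg_mibid)
# Date is now a character column like "2015-04", "2015-05", ...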
A small example of how this might be done using dplyr and lubridate:
set.seed(321)
dat <- data.frame(day=seq.Date(as.Date("2010-01-01"), length.out=200, by="day"),
x = rnorm(200),
y = rexp(200))
head(dat)
day x y
1 2010-01-01 1.7049032 2.6286754
2 2010-01-02 -0.7120386 0.3916089
3 2010-01-03 -0.2779849 0.1815379
4 2010-01-04 -0.1196490 0.1234461
5 2010-01-05 -0.1239606 2.2237404
6 2010-01-06 0.2681838 0.3217511
require(dplyr)
require(lubridate)
dat %>%
  mutate(year = year(day),
         monthnum = month(day),
         month = month(day, label = TRUE)) %>%
  group_by(year, month) %>%
  arrange(year, monthnum) %>%
  select(-monthnum) %>%
  summarise(x = mean(x),
            y = mean(y))
Source: local data frame [7 x 4]
Groups: year
year month x y
1 2010 Jan 0.02958633 0.9387509
2 2010 Feb 0.07711820 1.0985411
3 2010 Mar -0.06429982 1.2395438
4 2010 Apr -0.01787658 1.3627864
5 2010 May 0.19131861 1.1802712
6 2010 Jun -0.04894075 0.8224855
7 2010 Jul -0.22410057 1.1749863
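A slightly shorter variant of the same idea groups on the month directly with lubridate's floor_date; a sketch, assuming dplyr >= 1.0 for across():
dat %>%
  group_by(month = floor_date(day, "month")) %>%
  summarise(across(c(x, y), mean))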
Another option is using data.table, which has several very convenient datetime functions. Using the data of @SamThomas:
library(data.table)
setDT(dat)[, lapply(.SD, mean), by=.(year(day), month(day))]
this gives:
year month x y
1: 2010 1 0.02958633 0.9387509
2: 2010 2 0.07711820 1.0985411
3: 2010 3 -0.06429982 1.2395438
4: 2010 4 -0.01787658 1.3627864
5: 2010 5 0.19131861 1.1802712
6: 2010 6 -0.04894075 0.8224855
7: 2010 7 -0.22410057 1.1749863
On the data of @JoshuaUlrich:
setDT(mibid)[, lapply(.SD, mean), by=.(year(Date), month(Date))]
gives:
year month BID I.S BOR R.S
1: 2015 5 7.997178 0.07794925 8.999625 0.08062426
2: 2015 6 8.034805 0.07940600 9.019823 0.07823314
3: 2015 7 7.989371 0.07822263 8.996015 0.08195401
4: 2015 8 8.010541 0.08364351 8.982793 0.07748399
If you want the names of the months instead of numbers, you will have to include [, Date := as.IDate(Date)] after the setDT() part and use months instead of month:
setDT(mibid)[, Date:=as.IDate(Date)][, lapply(.SD, mean), by=.(year(Date), months(Date))]
Note: Especially on larger datasets, data.table will probably be (a lot) faster than the other two solutions.

Create several new derived variables from existing variables in data.frame

In R I have a data.frame with several variables that have been measured monthly over several years. I would like to derive the monthly average (across all years) for each variable. Ideally these new variables would all live together in a new data.frame (carrying over the ID); below I am simply adding the new variables to the existing data.frame. The only way I know how to do this at the moment (below) seems quite laborious, and I was hoping there is a smarter way to do this in R that does not require typing out each month and variable as I did below.
# Example data.frame with only two years, two month, and two variables
# In the real data set there are always 12 months per year
# and there are at least four variables
df<- structure(list(ID = 1:4, ABC.M1Y2001 = c(10, 12.3, 45, 89), ABC.M2Y2001 = c(11.1,
34, 67.7, -15.6), ABC.M1Y2002 = c(-11.1, 9, 34, 56.5), ABC.M2Y2002 = c(12L,
13L, 11L, 21L), DEF.M1Y2001 = c(14L, 14L, 14L, 16L), DEF.M2Y2001 = c(15L,
15L, 15L, 12L), DEF.M1Y2002 = c(5, 12, 23.5, 34), DEF.M2Y2002 = c(6L,
34L, 61L, 56L)), .Names = c("ID", "ABC.M1Y2001", "ABC.M2Y2001","ABC.M1Y2002",
"ABC.M2Y2002", "DEF.M1Y2001", "DEF.M2Y2001", "DEF.M1Y2002",
"DEF.M2Y2002"), class = "data.frame", row.names = c(NA, -4L))
# list variable to average for ABC Month 1 across years
ABC.M1.names <- c("ABC.M1Y2001", "ABC.M1Y2002")
df <- transform(df, ABC.M1 = rowMeans(df[,ABC.M1.names], na.rm = TRUE))
# list variable to average for ABC Month 2 across years
ABC.M2.names <- c("ABC.M2Y2001", "ABC.M2Y2002")
df <- transform(df, ABC.M2 = rowMeans(df[,ABC.M2.names], na.rm = TRUE))
# and so forth for ABC
# ...
# list variables to average for DEF Month 1 across years
DEF.M1.names <- c("DEF.M1Y2001", "DEF.M1Y2002")
df <- transform(df, DEF.M1 = rowMeans(df[,DEF.M1.names], na.rm = TRUE))
# and so forth for DEF
# ...
Here's a solution using data.table development version v1.8.11 (which has melt and cast methods implemented for data.table):
require(data.table)
require(reshape2) # melt/cast builds on S3 generic from reshape2
dt <- data.table(df) # where df is your data.frame
dcast.data.table(
  melt(dt, id = "ID")[, sum(value)/.N, by = list(ID, gsub("Y.*$", "", variable))],
  ID ~ gsub)
ID ABC.M1 ABC.M2 DEF.M1 DEF.M2
1: 1 -0.55 11.55 9.50 10.5
2: 2 10.65 23.50 13.00 24.5
3: 3 39.50 39.35 18.75 38.0
4: 4 72.75 2.70 25.00 34.0
You can just cbind this to your original data.
Note that sum is a primitive whereas mean is an S3 generic. Therefore, using sum(.)/length(.) is better (because, if there are many groups, dispatching the right method of mean for every group can be quite time-consuming). .N is a special variable in data.table that directly gives you the length of the group.
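For example, the reshaped result could be stored and column-bound back like this (a sketch; monthly_means is a name introduced here, not in the answer above):
monthly_means <- dcast.data.table(
  melt(dt, id = "ID")[, sum(value)/.N, by = list(ID, gsub("Y.*$", "", variable))],
  ID ~ gsub)
cbind(df, monthly_means[, setdiff(names(monthly_means), "ID"), with = FALSE])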
Here is a solution using reshape2 that is more automated when you have lots of data and uses regular expressions to extract the variable name and the month. This solution will give you a nice summary table.
# Load required package
require(reshape2)
# Melt your wide data into long format
mdf <- melt(df , id = "ID" )
# Extract relevant variable names from the variable colum
mdf$Month <- gsub( "^.*\\.(M[0-9]{1,2}).*$" , "\\1" , mdf$variable )
mdf$Var <- gsub( "^(.*)\\..*" , "\\1" , mdf$variable )
# Aggregate by month and variable
dcast( mdf , Var ~ Month , mean )
# Var M1 M2
#1 ABC 30.5875 19.275
#2 DEF 16.5625 26.750
Or to be compatible with the other solutions, and return the table by ID as well...
dcast( mdf , ID ~ Var + Month , mean )
# ID ABC_M1 ABC_M2 DEF_M1 DEF_M2
#1 1 -0.55 11.55 9.50 10.5
#2 2 10.65 23.50 13.00 24.5
#3 3 39.50 39.35 18.75 38.0
#4 4 72.75 2.70 25.00 34.0
This is pretty straightforward in base R.
mean.names <- split(names(df)[-1], gsub('Y[0-9]{4}$', '', names(df)[-1]))
means <- lapply(mean.names, function(x) rowMeans(df[, x], na.rm = TRUE))
data.frame(df, means)
This gives you your original data.frame with the following four columns at the end:
ABC.M1 ABC.M2 DEF.M1 DEF.M2
1 -0.55 11.55 9.50 10.5
2 10.65 23.50 13.00 24.5
3 39.50 39.35 18.75 38.0
4 72.75 2.70 25.00 34.0
You can use Reshape from the {splitstackshape} package and then use the plyr package, data.table, or base R to compute the means.
library(splitstackshape) # Reshape
library(plyr) # ddply
kk <- Reshape(df, id.vars = "ID", var.stubs = c("ABC.M1", "ABC.M2", "DEF.M1", "DEF.M2"), sep = "")
> kk
ID AE DB time ABC.M1 ABC.M2 DEF.M1 DEF.M2
1 1 NA NA 1 10.0 11.1 14.0 15
2 2 NA NA 1 12.3 34.0 14.0 15
3 3 NA NA 1 45.0 67.7 14.0 15
4 4 NA NA 1 89.0 -15.6 16.0 12
5 1 NA NA 2 -11.1 12.0 5.0 6
6 2 NA NA 2 9.0 13.0 12.0 34
7 3 NA NA 2 34.0 11.0 23.5 61
8 4 NA NA 2 56.5 21.0 34.0 56
ddply(kk[,c(1,5:8)],.(ID),colwise(mean))
ID ABC.M1 ABC.M2 DEF.M1 DEF.M2
1 1 -0.55 11.55 9.50 10.5
2 2 10.65 23.50 13.00 24.5
3 3 39.50 39.35 18.75 38.0
4 4 72.75 2.70 25.00 34.0
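The data.table route mentioned above would look much the same on kk (a sketch, assuming kk from the Reshape step and a reasonably recent data.table for the column range in .SDcols):
library(data.table)
setDT(kk)[, lapply(.SD, mean), by = ID, .SDcols = ABC.M1:DEF.M2]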
