percent change over several years - r

I am trying to figure out the total percent change of all 5 variables from 2011 to 2015. The following function gives me the percent change by year, but I am aiming for the overall percentage. How would one rewrite this in R?
# lagged percent change: (x[t+lag] - x[t]) / x[t], padded with NA at the end
pcchange <- function(x, lag = 1) c(diff(x, lag), rep(NA, lag)) / x
> ssample
year 100 100F 100I 100X
1 2011 6632 6 472 11
2 2012 6783 6 513 11
3 2013 7346 7 672 6
4 2014 8017 9 682 10
5 2015 8996 3 815 11
> dput(ssample)
structure(list(year = c(2011, 2012, 2013, 2014, 2015), `100` = c(6632L,
6783L, 7346L, 8017L, 8996L), `100F` = c(6L, 6L, 7L, 9L, 3L),
`100I` = c(472L, 513L, 672L, 682L, 815L), `100X` = c(11L,
11L, 6L, 10L, 11L)), class = "data.frame", .Names = c("year",
"100", "100F", "100I", "100X"), row.names = c(NA, -5L))

Keeping it simple, try
subset(ssample, year == 2015, -1) / subset(ssample, year == 2011, -1) * 100
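Note that this gives the 2015 values as a percentage of the 2011 values; if you want the percent change itself, subtract 1 from the ratio first. A sketch of the same idea:
# percent change from 2011 to 2015 for each column
(subset(ssample, year == 2015, -1) / subset(ssample, year == 2011, -1) - 1) * 100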

Using ROC function from quantmod package for simple return calculation
require(quantmod)
apply(ssample[,-1],2,function(x) ROC(x,type="discrete"))
# 100 100F 100I 100X
#[1,] NA NA NA NA
#[2,] 0.02276840 0.0000000 0.08686441 0.0000000
#[3,] 0.08300162 0.1666667 0.30994152 -0.4545455
#[4,] 0.09134223 0.2857143 0.01488095 0.6666667
#[5,] 0.12211550 -0.6666667 0.19501466 0.1000000
As you can see, the percentage change varies widely over the years. I suppose what you really need is the compound annual growth rate (CAGR), i.e. the average return over the period, defined as the geometric mean of the returns:
annual.growth.rate <- function(a, period_length, m = 1){
  FinalValue <- tail(a, 1)
  InitialValue <- head(a, 1)
  cagr <- ((FinalValue / InitialValue)^(1 / (period_length / m))) - 1
  return(cagr)
}
num_of_years <- tail(ssample$year,1)-head(ssample$year,1)
apply(ssample[,-1],2,function(x) annual.growth.rate(a=x,period_length = num_of_years ,m = 1))
# 100 100F 100I 100X
#0.07919825 -0.15910358 0.14631481 0.00000000
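As a quick sanity check for the first column, compounding the CAGR over the four annual periods should recover the total growth ratio (values approximate):
(1 + 0.07919825)^4   # ~1.3565
8996 / 6632          # ~1.3565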
The packages xts, quantmod and PerformanceAnalytics come in very handy for time series analysis.

Here is one possibility for the sample data:
totalChange <- sapply(ssample[ssample$year %in% range(ssample$year), -1],
                      function(x) pcchange(x))[1, ]
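For the sample data this returns the total fractional change from 2011 to 2015 for each column (multiply by 100 for percentages); the values should be roughly:
totalChange
#       100      100F      100I      100X
# 0.3564535 -0.5000000 0.7266949 0.0000000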


Can't remove NAs

I am trying to delete rows in my dataset which contain NAs, but none of the functions work. What could be the reason?
Here is a sample of my code:
Site_cov<- read.csv("site_cov.csv")
colnames(Site_cov)<- c("Point", "Basal", "Short.Saps", "Tall.Saps")
head(Site_cov)
Point Basal Short.Saps Tall.Saps
1 DEL001 Na 2 0
2 DEL002 Na 1 6
3 DEL003 Na 0 5
4 DEL004 10 21 22
Here, I thought that upper- and lower-case NAs could be a problem, so this is what I ran:
Site_cov$Basal<-toupper(Site_cov$Basal)
Site_cov$Short.Saps<-toupper(Site_cov$Short.Saps)
Site_cov$Tall.Saps<-toupper(Site_cov$Tall.Saps)
Then I try to delete the NAs:
Site_cov_NA <- Site_cov[complete.cases(Site_cov[ , c("Point", "Basal", "Short.Saps", "Tall.Saps")]), ]
But the NAs are still there:
head(Site_cov_NA)
Point Basal Short.Saps Tall.Saps
1 DEL001 NA 2 0
2 DEL002 NA 1 6
3 DEL003 NA 0 5
4 DEL004 10 21 22
5 DEL005 60 8 17
6 DEL006 80 17 13
Obviously you have 'Na' strings that are fake NAs. Replace them with real ones, then your code should work.
dat <- replace(dat, dat == 'Na', NA)
dat[complete.cases(dat[, c("Point", "Basal", "Short.Saps", "Tall.Saps")]), ]
# Point Basal Short.Saps Tall.Saps
# 4 DEL004 10 21 22
Data:
dat <- structure(list(Point = c("DEL001", "DEL002", "DEL003", "DEL004"
), Basal = c("Na", "Na", "Na", "10"), Short.Saps = c(2L, 1L,
0L, 21L), Tall.Saps = c(0L, 6L, 5L, 22L)), class = "data.frame", row.names = c("1",
"2", "3", "4"))
Try the complete.cases() function (https://stat.ethz.ch/R-manual/R-patched/library/stats/html/complete.cases.html)
try <- data.frame("a"=c(1,3,NA,NA), "b"=c(3,5,2,3))
try1<-try[complete.cases(try),]
try1
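na.omit() gives the same result here, assuming you want to drop rows with an NA in any column:
na.omit(try)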

How to replace a column in R by a modified column, dependent on filtered values? (removing outliers in panel data)

I have a panel dataset that goes like this:
year  id  treatment_year  time_to_treatment  outcome
2000   1            2011                -11        2
2002   1            2011                -10        3
2004   2            2015                 -9       22
and so on and so forth. I am trying to deal with the outliers by winsorizing. The end goal is to make a scatterplot with time_to_treatment on the X axis and outcome on the Y.
I would like to replace the outcomes for each time_to_treatment by its winsorized outcomes, i.e. replace all extreme values with the 5% and 95% quantile values.
So far this is what I have tried, but it doesn't work:
for(i in range(dataset$time_to_treatment)){
  dplyr::filter(dataset, time_to_treatment == i)$outcome <-
    DescTools::Winsorize(dplyr::filter(dataset, time_to_treatment == i)$outcome)
}
I get the error - Error in filter(dataset, time_to_treatment == i) <- *vtmp* :
could not find function "filter<-"
Would anyone be able to suggest a better way?
Thanks.
My actual data, where conflicts = outcome, commission = year of treatment, and CD_MUN = id.
The time period indicator of interest is time_to_t:
# Groups:   year, CD_MUN, type [6]
   type   CD_MUN  year time_to_t conflicts commission
   <chr>   <dbl> <dbl>     <dbl>     <int>      <dbl>
 1 manif 1100023  2000       -11         1       2011
 2 manif 1100189  2000        -3         2       2003
 3 manif 1100205  2000        -9         5       2009
 4 manif 1500602  2000        -4         1       2004
 5 manif 3111002  2000       -11         2       2011
 6 manif 3147006  2000       -10         1       2010
Assuming, "time periods" refer to 'commission' column, you may use ave.
transform(dat, conflicts_w=ave(conflicts, commission, FUN=DescTools::Winsorize))
# type CD_MUN year time_to_t conflicts commission conflicts_w
# 1 manif 1100023 2000 -11 1 2011 1.05
# 2 manif 1100189 2000 -3 2 2003 2.00
# 3 manif 1100205 2000 -9 5 2009 5.00
# 4 manif 1500602 2000 -4 1 2004 1.00
# 5 manif 3111002 2000 -11 2 2011 1.95
# 6 manif 3147006 2000 -10 1 2010 1.00
Data:
dat <- structure(list(type = c("manif", "manif", "manif", "manif", "manif",
"manif"), CD_MUN = c(1100023L, 1100189L, 1100205L, 1500602L,
3111002L, 3147006L), year = c(2000L, 2000L, 2000L, 2000L, 2000L,
2000L), time_to_t = c(-11L, -3L, -9L, -4L, -11L, -10L), conflicts = c(1L,
2L, 5L, 1L, 2L, 1L), commission = c(2011L, 2003L, 2009L, 2004L,
2011L, 2010L)), class = "data.frame", row.names = c(NA, -6L))
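If you prefer to stay within dplyr (as in your attempt), a grouped mutate should give the same result as the ave version above; a sketch, assuming DescTools is installed:
library(dplyr)
dat %>%
  group_by(commission) %>%                                     # one group per treatment year
  mutate(conflicts_w = DescTools::Winsorize(conflicts)) %>%    # winsorize within each group
  ungroup()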
For a start you may use this:
# The data
set.seed(123)
df <- data.frame(
  time_to_treatment = seq(-15, 0, 1),
  outcome = sample(1:30, 16, replace = TRUE)
)
# A solution without Winsorize, based solely on dplyr
library(dplyr)
df %>%
  mutate(outcome05 = quantile(outcome, probs = 0.05),                    # 5% quantile
         outcome95 = quantile(outcome, probs = 0.95),                    # 95% quantile
         outcome = ifelse(outcome <= outcome05, outcome05, outcome),     # replace low values
         outcome = ifelse(outcome >= outcome95, outcome95, outcome)) %>% # replace high values
  select(-c(outcome05, outcome95))
You may adapt this to your exact problem.
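If the clipping should happen separately for each time_to_treatment value (as in your loop), the same idea works per group; a sketch using pmin()/pmax() to clip:
df %>%
  group_by(time_to_treatment) %>%
  mutate(outcome = pmin(pmax(outcome, quantile(outcome, 0.05)),  # clip below the 5% quantile
                        quantile(outcome, 0.95))) %>%            # and above the 95% quantile
  ungroup()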

Months to integer R

This is part of the dataframe I am working on. The first column represents the year, the second the month, and the third one the number of observations for that month of that year.
2005 07 2
2005 10 4
2005 12 2
2006 01 4
2006 02 1
2006 07 2
2006 08 1
2006 10 3
I have observations from 2000 to 2018. I would like to run a kernel regression on this data, so I need to create a continuous integer index from a date-class vector. For instance, Jan 2000 would be 1, Jan 2001 would be 13, Jan 2002 would be 25, and so on. With that I will be able to run the kernel regression. Later on, I need to translate that back (1 would be Jan 2000, 2 would be Feb 2000, and so on) to plot my model.
Just use a little algebra:
df$cont <- (df$year - 2000L) * 12L + df$month
You could go backward with modulus and integer division.
df$year <- (df$cont - 1L) %/% 12 + 2000L
df$month <- (df$cont - 1L) %% 12 + 1L  # subtract 1 first so December maps back to 12 and the year stays correct
Here, %% is the modulus operator and %/% is the integer division operator. See ?"%%" for an explanation of these and other arithmetic operators.
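A quick round trip with a few illustrative months (values chosen just for checking) confirms the formulas:
# forward: Jan 2000, Dec 2001, Feb 2002
cont <- (c(2000L, 2001L, 2002L) - 2000L) * 12L + c(1L, 12L, 2L)
cont
# [1]  1 24 26
# backward
(cont - 1L) %/% 12 + 2000L   # 2000 2001 2002
(cont - 1L) %% 12 + 1L       # 1 12 2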
What you can do is something like the following. First create a dates data.frame with expand.grid so we have all the year/month combinations from 2000-01 to 2018-12. Next, put it in the correct order and add an order column so that 2000-01 gets 1 and 2018-12 gets 228. If you merge this with your original table you get the result below; you can then remove the columns you don't need. And because you have a dates table, you can translate the year and month back from the order column.
dates <- expand.grid(year = seq(2000, 2018), month = seq(1, 12))
dates <- dates[order(dates$year, dates$month), ]
dates$order <- seq_along(dates$year)
merge(df, dates, by.x = c("year", "month"), by.y = c("year", "month"))
year month obs order
1 2005 10 4 70
2 2005 12 2 72
3 2005 7 2 67
4 2006 1 4 73
5 2006 10 3 82
6 2006 2 1 74
7 2006 7 2 79
8 2006 8 1 80
data:
df <- structure(list(year = c(2005L, 2005L, 2005L, 2006L, 2006L, 2006L, 2006L, 2006L),
month = c(7L, 10L, 12L, 1L, 2L, 7L, 8L, 10L),
obs = c(2L, 4L, 2L, 4L, 1L, 2L, 1L, 3L)),
class = "data.frame",
row.names = c(NA, -8L))
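To translate order numbers back into year and month (e.g. for plotting), you can look them up in the same dates table; a sketch:
# look up year/month for a few order values (1 = Jan 2000, 228 = Dec 2018)
dates[match(c(1, 13, 228), dates$order), c("year", "month")]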
An option is to use the yearmon class from the zoo package and then calculate the difference in months from Jan 2000 as the difference between yearmon values.
library(zoo)
# +1 has been added to the difference so that Jan 2000 is treated as 1
df$slNum = (as.yearmon(paste0(df$year, df$month),"%Y%m")-as.yearmon("200001","%Y%m"))*12+1
# year month obs slNum
# 1 2005 7 2 67
# 2 2005 10 4 70
# 3 2005 12 2 72
# 4 2006 1 4 73
# 5 2006 2 1 74
# 6 2006 7 2 79
# 7 2006 8 1 80
# 8 2006 10 3 82
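Going back the other way, yearmon arithmetic works in units of years, so adding (slNum - 1)/12 to Jan 2000 recovers the month; a sketch:
# translate the running month number back to a yearmon
as.yearmon("200001", "%Y%m") + (df$slNum - 1) / 12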
Data:
df <- read.table(text =
"year month obs
2005 07 2
2005 10 4
2005 12 2
2006 01 4
2006 02 1
2006 07 2
2006 08 1
2006 10 3",
header = TRUE, stringsAsFactors = FALSE)

Partitioning data to determine (ordered) time between observations

I am not 100% sure how to formulate my question because I don't know the formal name for what I am trying to do with my dataset. Based on previous questions, there appears to be some way to address it, but I am unable to make the logical jump from their problem to my own.
I have attached a sample of my data here.
The first thing I did with my data was add a column indicating which species (sps) are predators (coded as 1) and which species are prey (coded as 0).
#specify which are predators and prey
d1 = d1 %>%
  group_by(sps) %>%  # grouped by species
  mutate(pp = ifelse(sps %in% c("MUXX", "MUVI", "MEME"), 1, 0))  # predators as 1, prey as 0
My data is structured as such:
head(d1) #visualize the first few lines of the data
# A tibble: 6 x 8
# Groups: sps [4]
ID date km culv.id type sps time pp
<int> <fctr> <dbl> <fctr> <fctr> <fctr> <fctr> <dbl>
1 2012-06-19 80 A DCC MICRO 2:19 0
2 2012-06-21 80 A DCC MUXX 23:23 1
3 2012-07-15 80 A DCC MAMO 11:38 0
4 2012-07-20 80 A DCC MICRO 22:19 0
5 2012-07-29 80 A DCC MICRO 23:03 0
6 2012-08-07 80 A DCC PRLO 2:04 0
Here is also the output for dput(head(d1)):
structure(list(ID = c(1L, 2L, 3L, 4L, 5L, 8L), date = c("2012-06-19", "2012-06-21", "2012-07-15", "2012-07-20", "2012-07-29", "2012-08-07" ), km = c(80L, 80L, 80L, 80L, 80L, 80L), culv.id = c("A", "A", "A", "A", "A", "A"), type = c("DCC", "DCC", "DCC", "DCC", "DCC", "DCC"), sps = c("MICRO", "MUXX", "MAMO", "MICRO", "MICRO", "PRLO" ), time = c("2:19", "23:23", "11:38", "22:19", "23:03", "2:04" ), pp = c(0, 1, 0, 0, 0, 0)), .Names = c("ID", "date", "km", "culv.id", "type", "sps", "time", "pp"), row.names = c(NA, 6L ), class = "data.frame")
I also converted the time and date using the following code:
d1$datetime=strftime(paste(d1$date,d1$time),'%Y-%m-%d %H:%M',usetz=FALSE) #converting the date/time into a new format
The (most) relevant columns are date, time, and pp (where 1 = predator species and 0 = prey species).
I am now trying to figure out how to extract the following information (average +/- std):
average time between prey-prey observations
average time between prey-predator observations
average time between predator-predator observations
average time between predator-prey observations
To put one of these examples (#2) into words:
What is the average time between when a prey species (pp = 0) is first seen followed by a predator species (pp = 1)?
I am trying to figure out how to do this for my dataset overall first. I think that once I figure out how to do that, it should be fairly straightforward to restrict the data.
Here is a data.table (and lubridate) version that might address your problem:
Using a selection of your posted data (posted at bottom), with a slight modification to your datetime creation so that the format works with data.table:
d1$datetime <- as.POSIXct(strptime(paste(d1$date,d1$time),'%Y-%m-%d %H:%M'))
Convert to a data table:
d1 <- as.data.table(d1)
Calculate the time difference from each observation to the next observation of the same class (pp equal), to the next predator after a prey sighting (pp greater), and to the next prey after a predator sighting (pp less):
d1$class.class <- d1[d1, difftime(x.datetime, i.datetime, units = "days"),
on = .(datetime > datetime, pp == pp), mult = "first"]
d1$prey.pred <-d1[d1, x.datetime - i.datetime,
on = .(datetime > datetime, pp > pp ), mult = "first"]
d1$pred.prey <- d1[d1, x.datetime - i.datetime,
on = .(datetime > datetime, pp < pp), mult = "first"]
Gives you a column for each:
> head(d1[, 7:ncol(d1)])
time pp datetime class.class prey.pred pred.prey
1: 2:19 0 2012-06-19 02:19:00 26.388194 days 2.877778 days NA days
2: 23:23 1 2012-06-21 23:23:00 74.177083 days NA days 23.51042 days
3: 11:38 0 2012-07-15 11:38:00 5.445139 days 50.666667 days NA days
4: 22:19 0 2012-07-20 22:19:00 9.030556 days 45.221528 days NA days
5: 23:03 0 2012-07-29 23:03:00 8.125694 days 36.190972 days NA days
6: 2:04 0 2012-08-07 02:04:00 1.911111 days 28.065278 days NA days
And you can get summary statistics as you like:
d1[by = sps,, .(mean.same.class = mean(class.class, na.rm = TRUE),
sd.same.class = sd(class.class, na.rm = TRUE),
mean.prey.pred = mean(prey.pred, na.rm = TRUE),
sd.prey.pred = sd(prey.pred, na.rm = TRUE),
mean.pred.prey = mean(pred.prey, na.rm = TRUE),
sd.pred.prey = sd(pred.prey, na.rm = TRUE))]
sps mean.same.class sd.same.class mean.prey.pred sd.prey.pred mean.pred.prey sd.pred.prey
1: MICRO 7.886237 days 8.0547631 18.80733 days 15.504646 NaN days NA
2: MUXX 42.073611 days 45.4011658 NaN days NA 13.01366 days 9.315697
3: MAMO 5.445139 days NA 50.66667 days NA NaN days NA
4: PRLO 2.475694 days 0.7984414 26.62708 days 2.033914 NaN days NA
5: LEAM 2.897222 days NA 10.11597 days NA NaN days NA
Libraries: data.table, lubridate
Data:
> dput(d1)
structure(list(ID = c(1L, 2L, 3L, 4L, 5L, 8L, 9L, 10L, 11L, 12L,
13L, 14L, 15200L, 15201L, 15199L, 15177L, 15178L, 15204L, 15205L
), date = c("2012-06-19", "2012-06-21", "2012-07-15", "2012-07-20",
"2012-07-29", "2012-08-07", "2012-08-08", "2012-08-09", "2012-08-13",
"2012-08-13", "2012-08-25", "2012-08-27", "2012-09-04", "2012-09-09",
"2012-09-11", "2012-09-14", "2012-09-23", "2012-09-26", "2012-09-27"
), km = c(80L, 80L, 80L, 80L, 80L, 80L, 80L, 80L, 80L, 80L, 80L,
80L, 80L, 80L, 80L, 80L, 80L, 80L, 80L), culv.id = c("A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A"), type = c("DCC", "DCC", "DCC", "DCC", "DCC",
"DCC", "DCC", "DCC", "DCC", "DCC", "DCC", "DCC", "DCC", "DCC",
"DCC", "DCC", "DCC", "DCC", "DCC"), sps = c("MICRO", "MUXX",
"MAMO", "MICRO", "MICRO", "PRLO", "MICRO", "PRLO", "MICRO", "MICRO",
"LEAM", "MICRO", "MUXX", "MICRO", "MICRO", "MUXX", "MICRO", "MICRO",
"MICRO"), time = c("2:19", "23:23", "11:38", "22:19", "23:03",
"2:04", "23:56", "23:06", "0:04", "0:46", "0:51", "22:23", "3:38",
"21:08", "0:40", "2:55", "22:09", "20:46", "3:20"), pp = c(0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0)), class = "data.frame", .Names = c("ID",
"date", "km", "culv.id", "type", "sps", "time", "pp"), row.names = c(NA,
-19L))
Edit:
I'm not really sure about mixing tidyverse and data.table ideologies, but you could potentially do what you described in comments using do. For example, make a modified version of df:
d1 <- as.data.table(d1)
d1$datetime <- as.POSIXct(strptime(paste(d1$date,d1$time),'%Y-%m-%d %H:%M'))
d1Mod <- d1
d1Mod$km[10:nrow(d1Mod)] <- 90
Then, define the data.table bit as a function:
foo <- function(df_) {
df_$class.class <- df_[df_, difftime(x.datetime, i.datetime, units = "days"),
on = .(datetime > datetime, pp == pp), mult = "first"]
df_$prey.pred <-df_[df_, x.datetime - i.datetime,
on = .(datetime > datetime, pp > pp ), mult = "first"]
df_$pred.prey <- df_[df_, x.datetime - i.datetime,
on = .(datetime > datetime, pp < pp), mult = "first"]
return(df_)
}
Running d1 %>% group_by(km) %>% do(foo(as.data.table(.))) gets you the same output as in the original answer above (since all km values are 80). If you run it on the modified d1Mod you get an output that looks like it has been grouped by km:
> d1Mod %>%
+ group_by(km) %>%
+ do(foo(as.data.table(.)))
# A tibble: 19 x 12
# Groups: km [2]
ID date km culv.id type sps time pp datetime class.class prey.pred pred.prey
<int> <chr> <dbl> <chr> <chr> <chr> <chr> <dbl> <dttm> <time> <time> <time>
1 1 2012-06-19 80 A DCC MICRO 2:19 0 2012-06-19 02:19:00 26.3881944 days 2.877778 days NA days
2 2 2012-06-21 80 A DCC MUXX 23:23 1 2012-06-21 23:23:00 NA days NA days 23.510417 days
3 3 2012-07-15 80 A DCC MAMO 11:38 0 2012-07-15 11:38:00 5.4451389 days NA days NA days
4 4 2012-07-20 80 A DCC MICRO 22:19 0 2012-07-20 22:19:00 9.0305556 days NA days NA days
5 5 2012-07-29 80 A DCC MICRO 23:03 0 2012-07-29 23:03:00 8.1256944 days NA days NA days
6 8 2012-08-07 80 A DCC PRLO 2:04 0 2012-08-07 02:04:00 1.9111111 days NA days NA days
7 9 2012-08-08 80 A DCC MICRO 23:56 0 2012-08-08 23:56:00 0.9652778 days NA days NA days
8 10 2012-08-09 80 A DCC PRLO 23:06 0 2012-08-09 23:06:00 3.0402778 days NA days NA days
9 11 2012-08-13 80 A DCC MICRO 0:04 0 2012-08-13 00:04:00 NA days NA days NA days
10 12 2012-08-13 90 A DCC MICRO 0:46 0 2012-08-13 00:46:00 12.0034722 days 22.119444 days NA days
11 13 2012-08-25 90 A DCC LEAM 0:51 0 2012-08-25 00:51:00 2.8972222 days 10.115972 days NA days
12 14 2012-08-27 90 A DCC MICRO 22:23 0 2012-08-27 22:23:00 12.9479167 days 7.218750 days NA days
13 15200 2012-09-04 90 A DCC MUXX 3:38 1 2012-09-04 03:38:00 9.9701389 days NA days 5.729167 days
14 15201 2012-09-09 90 A DCC MICRO 21:08 0 2012-09-09 21:08:00 1.1472222 days 4.240972 days NA days
15 15199 2012-09-11 90 A DCC MICRO 0:40 0 2012-09-11 00:40:00 12.8951389 days 3.093750 days NA days
16 15177 2012-09-14 90 A DCC MUXX 2:55 1 2012-09-14 02:55:00 NA days NA days 9.801389 days
17 15178 2012-09-23 90 A DCC MICRO 22:09 0 2012-09-23 22:09:00 2.9423611 days NA days NA days
18 15204 2012-09-26 90 A DCC MICRO 20:46 0 2012-09-26 20:46:00 0.2736111 days NA days NA days
19 15205 2012-09-27 90 A DCC MICRO 3:20 0 2012-09-27 03:20:00 NA days NA days NA days
However, you'll have to do some checking to make sure that the calculations are actually doing what you need; I don't have example output or actual km/year info to check these results against (read: I don't know what I'm looking at!).
Note also that I think arrange is irrelevant for the operations here, considering that the datetime gets sorted in the function.
I'll use the piece from the comments as an example:
d1 = structure(list(ID = c(1L, 2L, 3L, 4L, 5L, 8L), date = c("2012-06-19", "2012-06-21", "2012-07-15", "2012-07-20", "2012-07-29", "2012-08-07" ), km = c(80L, 80L, 80L, 80L, 80L, 80L), culv.id = c("A", "A", "A", "A", "A", "A"), type = c("DCC", "DCC", "DCC", "DCC", "DCC", "DCC"), sps = c("MICRO", "MUXX", "MAMO", "MICRO", "MICRO", "PRLO" ), time = c("2:19", "23:23", "11:38", "22:19", "23:03", "2:04" ), pp = c(0, 1, 0, 0, 0, 0)), .Names = c("ID", "date", "km", "culv.id", "type", "sps", "time", "pp"), row.names = c(NA, 6L ), class = "data.frame")
We add the datetime column just as you specified:
d1$datetime=strftime(paste(d1$date,d1$time),'%Y-%m-%d %H:%M',usetz=FALSE)
First, add a column indicating which prey/predator sequence occurred and the time between observations (we remove the first row because there is no information about the previous observation). Note that timedif is a numeric value giving the number of days.
d1 = d1 %>% mutate(prev = lag(pp))
d1 = d1 %>% mutate(timedif = as.numeric(as.POSIXct(datetime) - lag(as.POSIXct(datetime))))
d1 = d1[2:nrow(d1),] %>% mutate(seque = as.factor(paste0(pp,prev)))
At this point, your table looks like
> d1
ID date km culv.id type sps time pp datetime prev timedif seque
1 2 2012-06-21 80 A DCC MUXX 23:23 1 2012-06-21 23:23 0 2.877778 10
2 3 2012-07-15 80 A DCC MAMO 11:38 0 2012-07-15 11:38 1 23.510417 01
3 4 2012-07-20 80 A DCC MICRO 22:19 0 2012-07-20 22:19 0 5.445139 00
4 5 2012-07-29 80 A DCC MICRO 23:03 0 2012-07-29 23:03 0 9.030556 00
5 8 2012-08-07 80 A DCC PRLO 2:04 0 2012-08-07 02:04 0 8.125694 00
After that, just take the desired statistics for each group using
avg = d1 %>% group_by(seque) %>% summarise(mean(timedif))
sdevs = d1 %>% group_by(seque) %>% summarise(sd(timedif))
We obtain
>avg
# A tibble: 3 x 2
seque `mean(timedif)`
<fctr> <dbl>
1 00 7.533796
2 01 23.510417
3 10 2.877778
> sdevs
# A tibble: 3 x 2
seque `sd(timedif)`
<fctr> <dbl>
1 00 1.864554
2 01 NA
3 10 NA
Note that the standard deviation is not computed because we only have one observation in the sample dataset for these categories.
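Both statistics can also be computed in a single summarise() call, which avoids grouping twice; a sketch:
# mean and sd of the gaps in one pass
d1 %>%
  group_by(seque) %>%
  summarise(mean_timedif = mean(timedif), sd_timedif = sd(timedif))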

R - Adding numbers within a data frame cell together

I have a data frame in which the values are stored as characters. However, many values contain two numbers that need to be added together. Example:
2014 Q1 Sales 2014 Q2 Sales 2014 Q3 Sales 2014 Q4 Sales
Product 1 3+6 2+10 8 13+2
Product 2 6 4+0 <NA> 5
Product 3 <NA> 5+9 3+1 11
Is there a way to go through the whole data frame and replace all cells containing characters like "3+6" with new values equal to their sum? I assume this would involve coercing the characters to numeric or integers, but I don't know how that would be possible for values with the + sign in them. I would like the example data frame to end up looking like this:
2014 Q1 Sales 2014 Q2 Sales 2014 Q3 Sales 2014 Q4 Sales
Product 1 9 12 8 15
Product 2 6 4 <NA> 5
Product 3 <NA> 14 4 11
Here's an easier example:
dat <- data.frame(a=c("3+6", "10"), b=c("12", NA), c=c("3+4", "5+6"))
dat
## a b c
## 1 3+6 12 3+4
## 2 10 <NA> 5+6
apply(dat, 1:2, function(x) eval(parse(text=x)))
## a b c
## [1,] 9 12 7
## [2,] 10 NA 11
Using R itself to do the computation with eval and parse does the trick.
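If you'd rather not use eval(parse(...)) (it will evaluate whatever happens to be in a cell, not just sums), a strsplit()-based version is a common alternative; a sketch with a hypothetical helper sum_cell():
# split each cell on "+" and sum the pieces; NA cells stay NA
sum_cell <- function(x) {
  if (is.na(x)) return(NA_real_)
  sum(as.numeric(strsplit(x, "+", fixed = TRUE)[[1]]))
}
apply(dat, 1:2, sum_cell)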
Here is one option with gsubfn, without using eval(parse(...)). We convert the data.frame to a matrix (as.matrix(dat)), match the first run of digits ([0-9]+) and capture it as a group with parentheses, followed by +, followed by a second captured run of digits, and replace the whole match with the sum of the two captures after converting them to numeric. The output can be assigned back to the original dataset to keep the same structure as 'dat'.
library(gsubfn)
dat[] <- as.numeric(gsubfn('([0-9]+)\\+([0-9]+)',
                           ~ as.numeric(x) + as.numeric(y), as.matrix(dat)))
dat
# 2014 Q1 Sales 2014 Q2 Sales 2014 Q3 Sales 2014 Q4 Sales
#Product 1 9 12 8 15
#Product 2 6 4 NA 5
#Product 3 NA 14 4 11
Or we can loop the columns with lapply and perform the replacement with gsubfn for each of the columns.
dat[] <- lapply(dat, function(x) as.numeric(gsubfn('([0-9]+)\\+([0-9]+)',
~as.numeric(x)+as.numeric(y), as.character(x))))
Data:
dat <- structure(list(`2014 Q1 Sales` = structure(c(1L, 2L, NA), .Label = c("3+6",
"6"), class = "factor"), `2014 Q2 Sales` = structure(1:3, .Label = c("2+10",
"4+0", "5+9"), class = "factor"), `2014 Q3 Sales` = structure(c(2L,
NA, 1L), .Label = c("3+1", "8"), class = "factor"), `2014 Q4 Sales` = structure(c(2L,
3L, 1L), .Label = c("11", "13+2", "5"), class = "factor")), .Names = c("2014 Q1 Sales",
"2014 Q2 Sales", "2014 Q3 Sales", "2014 Q4 Sales"), class = "data.frame", row.names = c("Product 1",
"Product 2", "Product 3"))
