I need to aggregate the previous 5 years of the N_C variable in each row.
For example: year 2017 - Sum_Five_Years = 10(2017)+21(2015)+14(2014)+16(2013) = 61
Data:
library(dplyr)
# Example data: one row per record, with the per-year total (Sum_Year) repeated
# on every row of that year. All values are quoted, i.e. stored as text, so
# they must be converted before doing arithmetic on them.
# The single 2017 record is 10, matching the printed table and the worked
# example (10 + 21 + 14 + 16 = 61).
DF <- data.frame(
  company = rep("DEL MAR PHARM", 16),
  year = c(
    "2017", "2015", "2015", "2015", "2013", "2012", "2012", "2012",
    "2010", "2010", "2015", "2014", "2014", "2013", "2013", "2012"
  ),
  N_C = c(
    "10", "7", "5", "4", "3", "24", "52", "99",
    "43", "37", "5", "7", "7", "4", "9", "20"
  ),
  Sum_Year = c(
    "10", "21", "21", "21", "16", "195", "195", "195",
    "80", "80", "21", "14", "14", "16", "16", "195"
  )
)
# Sort the rows by year (ascending) so the table reads chronologically.
DF <- arrange(DF, year)
company year N_C Sum_Year
1 DEL MAR PHARM 2010 43 80
2 DEL MAR PHARM 2010 37 80
3 DEL MAR PHARM 2012 24 195
4 DEL MAR PHARM 2012 52 195
5 DEL MAR PHARM 2012 99 195
6 DEL MAR PHARM 2012 20 195
7 DEL MAR PHARM 2013 3 16
8 DEL MAR PHARM 2013 4 16
9 DEL MAR PHARM 2013 9 16
10 DEL MAR PHARM 2014 7 14
11 DEL MAR PHARM 2014 7 14
12 DEL MAR PHARM 2015 7 21
13 DEL MAR PHARM 2015 5 21
14 DEL MAR PHARM 2015 4 21
15 DEL MAR PHARM 2015 5 21
16 DEL MAR PHARM 2017 10 10
Expected Outcome
# Expected result, one value per row of the year-sorted DF. Assign a plain
# vector: wrapping it in cbind() would store a one-column matrix in the column.
DF$Sum_Five_Year <- c(
  "80", "80", "275", "275", "275", "275", "291", "291",
  "291", "305", "305", "246", "246", "246", "246", "61"
)
> DF
company year N_C Sum_Year Sum_Five_Year
1 DEL MAR PHARM 2010 43 80 80
2 DEL MAR PHARM 2010 37 80 80
3 DEL MAR PHARM 2012 24 195 275
4 DEL MAR PHARM 2012 52 195 275
5 DEL MAR PHARM 2012 99 195 275
6 DEL MAR PHARM 2012 20 195 275
7 DEL MAR PHARM 2013 3 16 291
8 DEL MAR PHARM 2013 4 16 291
9 DEL MAR PHARM 2013 9 16 291
10 DEL MAR PHARM 2014 7 14 305
11 DEL MAR PHARM 2014 7 14 305
12 DEL MAR PHARM 2015 7 21 246
13 DEL MAR PHARM 2015 5 21 246
14 DEL MAR PHARM 2015 4 21 246
15 DEL MAR PHARM 2015 5 21 246
16 DEL MAR PHARM 2017 10 10 61
I have tried the following code but it does not work:
library(data.table)
# Turn DF into a data.table so the `[i, j, by]` syntax below can be used.
setDT(DF)
# Attempt: add Sum_Five_Year as the sum of N_C within each company and each
# bin returned by cut().
# NOTE(review): this cannot produce a rolling 5-year total, because
#  - `breaks = c(5)` is a single number, which cut() documents as "the number
#    of intervals into which x is to be cut": it yields 5 fixed bins over the
#    whole range of years, not a window ending at each row's own year;
#  - `year` and `N_C` are created from quoted strings in the example data,
#    while cut() expects a numeric vector and sum() cannot add text, so both
#    columns would have to be converted first.
DF[, `:=` (Sum_Five_Year= sum(N_C)), by= list(company,cut(year, breaks = c(5), right = F))]
Any suggestion would be very appreciated :)
With no additional packages, you could use sapply.
The code below assumes that Sum_Year has already been created. You could apply the following directly to your example:
# The example data stores every column as text. Convert once up front so that
# sum() works and both sides of the join below have the same `year` type.
DF_typed <- DF %>%
  mutate(
    year = as.integer(as.character(year)),
    N_C = as.numeric(as.character(N_C)),
    Sum_Year = as.numeric(as.character(Sum_Year))
  )

DF_typed %>%
  # one row per company/year, carrying that year's total
  distinct(company, year, Sum_Year) %>%
  group_by(company) %>%
  mutate(
    # For each year x, add up the yearly totals of the years x - 4, ..., x.
    # vapply() is the type-stable form of sapply(): always one number per year.
    Sum_Five_Year = vapply(
      year,
      function(x) sum(Sum_Year[between(year, x - 5 + 1, x)]),
      numeric(1)
    )
  ) %>%
  # bring back the individual N_C rows
  left_join(DF_typed %>% select(-Sum_Year), by = c("company", "year"))
Output:
# A tibble: 16 x 5
# Groups: company [?]
company year Sum_Year Sum_Five_Year N_C
<chr> <int> <int> <int> <int>
1 DELMARPHARM 2010 80 80 43
2 DELMARPHARM 2010 80 80 37
3 DELMARPHARM 2012 195 275 24
4 DELMARPHARM 2012 195 275 52
5 DELMARPHARM 2012 195 275 99
6 DELMARPHARM 2012 195 275 20
7 DELMARPHARM 2013 16 291 3
8 DELMARPHARM 2013 16 291 4
9 DELMARPHARM 2013 16 291 9
10 DELMARPHARM 2014 14 305 7
11 DELMARPHARM 2014 14 305 7
12 DELMARPHARM 2015 21 246 7
13 DELMARPHARM 2015 21 246 5
14 DELMARPHARM 2015 21 246 4
15 DELMARPHARM 2015 21 246 5
16 DELMARPHARM 2017 10 61 10
Otherwise you can do:
# Convert the text columns first, so that the years group and sort as numbers
# and both sides of the join below have the same `year` type.
DF_typed <- DF %>%
  mutate(
    year = as.integer(as.character(year)),
    N_C = as.numeric(as.character(N_C))
  )

DF_typed %>%
  group_by(company, year) %>%
  # yearly total; "drop_last" keeps the result grouped by company only
  summarise(Sum_Year = sum(N_C), .groups = "drop_last") %>%
  mutate(
    # For each year x, add up the yearly totals of the years x - 4, ..., x.
    # vapply() is the type-stable form of sapply(): always one number per year.
    Sum_Five_Year = vapply(
      year,
      function(x) sum(Sum_Year[between(year, x - 5 + 1, x)]),
      numeric(1)
    )
  ) %>%
  # bring back the individual N_C rows
  left_join(DF_typed %>% select(-Sum_Year), by = c("company", "year"))
If you'd like to get rid of the duplicated format, just leave out the join at the end:
DF %>%
  # convert the text columns first, so the years group and sort as numbers
  mutate(
    year = as.integer(as.character(year)),
    N_C = as.numeric(as.character(N_C))
  ) %>%
  group_by(company, year) %>%
  # yearly total; "drop_last" keeps the result grouped by company only
  summarise(Sum_Year = sum(N_C), .groups = "drop_last") %>%
  mutate(
    # For each year x, add up the yearly totals of the years x - 4, ..., x.
    # vapply() is the type-stable form of sapply(): always one number per year.
    Sum_Five_Year = vapply(
      year,
      function(x) sum(Sum_Year[between(year, x - 5 + 1, x)]),
      numeric(1)
    )
  )
Output:
# A tibble: 6 x 4
# Groups: company [1]
company year Sum_Year Sum_Five_Year
<chr> <int> <dbl> <dbl>
1 DELMARPHARM 2010 80 80
2 DELMARPHARM 2012 195 275
3 DELMARPHARM 2013 16 291
4 DELMARPHARM 2014 14 305
5 DELMARPHARM 2015 21 246
6 DELMARPHARM 2017 10 61
Related
Let's say I have the following dataset. And, I want to change the range of values starting from 20010001-20010010 to 2001-2010.
How can I do this?
Sample data (df):
# Sample data (dput output), assigned to `df` so the snippets below can be run
# as-is. `x` holds the codes to recode; `y` is a text label.
df <- structure(list(x = c(20010001, 20010001, 20010002, 20010002,
20010003, 20010003, 20010004, 20010004, 20010005, 20010005, 20010006,
20010006, 20010007, 20010007, 20010008, 20010008, 20010009, 20010009,
20010010, 20010010, 20, 2, 19, 18, 17, 16, 15, 14965, 14964
), y = c("2001", "ORIG", "2001", "ORIG", "2001", "ORIG", "2001",
"ORIG", "2001", "ORIG", "2001", "ORIG", "2001", "ORIG", "2001",
"ORIG", "2001", "ORIG", "2001", "ORIG", "2020", "2020", "2020",
"2020", "2020", "2020", "2020", "2022", "2022")), class = "data.frame", row.names = c(NA, -29L))
Code:
library(tidyverse)
# To change a single value at a time
df["1", "x"] <- 2010
# Now how to do it for a range of values without having to do it one by one?
Another possible solution.
EXPLANATION
Regex demo
library(tidyverse)
df %>%
  # Match "2001" followed by one or more zeros, but only where exactly two
  # digits remain before the end of the string (the lookahead), and replace
  # that prefix with "20": "20010001" -> "2001", "20010010" -> "2010".
  # Values that do not start this way are left untouched.
  mutate(z = str_replace(x, "2001[0]+(?=\\d{2}$)", "20")) %>%
  # str_replace() returns text; convert columns back to their natural types,
  # keeping genuine text columns as character instead of factors.
  type.convert(as.is = TRUE)
#> x y z
#> 1 20010001 2001 2001
#> 2 20010001 ORIG 2001
#> 3 20010002 2001 2002
#> 4 20010002 ORIG 2002
#> 5 20010003 2001 2003
#> 6 20010003 ORIG 2003
#> 7 20010004 2001 2004
#> 8 20010004 ORIG 2004
#> 9 20010005 2001 2005
#> 10 20010005 ORIG 2005
#> 11 20010006 2001 2006
#> 12 20010006 ORIG 2006
#> 13 20010007 2001 2007
#> 14 20010007 ORIG 2007
#> 15 20010008 2001 2008
#> 16 20010008 ORIG 2008
#> 17 20010009 2001 2009
#> 18 20010009 ORIG 2009
#> 19 20010010 2001 2010
#> 20 20010010 ORIG 2010
#> 21 20 2020 20
#> 22 2 2020 2
#> 23 19 2020 19
#> 24 18 2020 18
#> 25 17 2020 17
#> 26 16 2020 16
#> 27 15 2020 15
#> 28 14965 2022 14965
#> 29 14964 2022 14964
Perhaps there's more to it than this ...
library(dplyr)
# Shift only the values inside [20010001, 20010010] down by 20008000
# (20010001 -> 2001, ..., 20010010 -> 2010); every other value is kept as is.
mutate(
  df,
  x2 = if_else(x >= 20010001 & x <= 20010010, x - 20008000, x)
)
# x y x2
# 1 20010001 2001 2001
# 2 20010001 ORIG 2001
# 3 20010002 2001 2002
# 4 20010002 ORIG 2002
# 5 20010003 2001 2003
# 6 20010003 ORIG 2003
# 7 20010004 2001 2004
# 8 20010004 ORIG 2004
# 9 20010005 2001 2005
# 10 20010005 ORIG 2005
# 11 20010006 2001 2006
# 12 20010006 ORIG 2006
# 13 20010007 2001 2007
# 14 20010007 ORIG 2007
# 15 20010008 2001 2008
# 16 20010008 ORIG 2008
# 17 20010009 2001 2009
# 18 20010009 ORIG 2009
# 19 20010010 2001 2010
# 20 20010010 ORIG 2010
# 21 20 2020 20
# 22 2 2020 2
# 23 19 2020 19
# 24 18 2020 18
# 25 17 2020 17
# 26 16 2020 16
# 27 15 2020 15
# 28 14965 2022 14965
# 29 14964 2022 14964
Here is an alternative approach using stringr package:
The fun part here is that the string handling is done with functions from the stringr package. str_c is equivalent to paste0, and str_sub is roughly equivalent to substr() -> I find it easier to use in certain places, like extracting characters counted from the last position. And that's it.
If x has 8 or more characters, we extract its first character and its last 3 characters and paste them together. Otherwise (for example when x has only 2 characters), x is left as it is:
library(dplyr)
library(stringr)
df %>%
  mutate(
    # For codes of 8 or more characters, keep the first character and the last
    # three ("20010001" -> "2" + "001" = "2001"); shorter values are kept.
    # The pasted pieces are text, so convert the result back to numbers.
    x = as.numeric(ifelse(
      str_length(x) >= 8,
      str_c(str_sub(x, 1, 1), str_sub(x, -3, -1)),
      x
    ))
  )
x y
1 2001 2001
2 2001 ORIG
3 2002 2001
4 2002 ORIG
5 2003 2001
6 2003 ORIG
7 2004 2001
8 2004 ORIG
9 2005 2001
10 2005 ORIG
11 2006 2001
12 2006 ORIG
13 2007 2001
14 2007 ORIG
15 2008 2001
16 2008 ORIG
17 2009 2001
18 2009 ORIG
19 2010 2001
20 2010 ORIG
21 20 2020
22 2 2020
23 19 2020
24 18 2020
25 17 2020
26 16 2020
27 15 2020
28 14965 2022
29 14964 2022
I have grouped data that I want to convert to ungrouped data.
# Grouped data: `n` is the number of individuals for each year/Age combination.
year <- rep(c(2014, 2015), each = 4)
Age <- rep(c(22, 23, 24, 25), times = 2)
n <- c(1, 1, 3, 2, 0, 2, 3, 1)
mydata <- data.frame(year, Age, n)
I would like to have a dataset like the one below created from the previous one.
year Age
1 2014 22
2 2014 23
3 2014 24
4 2014 24
5 2014 24
6 2014 25
7 2014 25
8 2015 23
9 2015 23
10 2015 24
11 2015 24
12 2015 24
13 2015 25
Try
# Repeat each row index n times and subset with it: a row with n = 3 appears
# three times, a row with n = 0 is dropped. seq_len() (rather than 1:nrow())
# also behaves correctly when mydata has no rows.
mydata[rep(seq_len(nrow(mydata)), times = mydata$n), ]
year Age n
1 2014 22 1
2 2014 23 1
3 2014 24 3
3.1 2014 24 3
3.2 2014 24 3
4 2014 25 2
4.1 2014 25 2
6 2015 23 2
6.1 2015 23 2
7 2015 24 3
7.1 2015 24 3
7.2 2015 24 3
8 2015 25 1
Here's a tidyverse solution:
library(tidyverse)
# Duplicate every row according to its count in `n`; uncount() then removes
# the `n` column from the result.
uncount(mydata, weights = n)
which gives:
year Age
1 2014 22
2 2014 23
3 2014 24
4 2014 24
5 2014 24
6 2014 25
7 2014 25
8 2015 23
9 2015 23
10 2015 24
11 2015 24
12 2015 24
13 2015 25
You can also use tidyr syntax for this:
library(tidyr)
# Rebuild the grouped example data (n = number of individuals per year/Age) ...
year <- rep(c(2014, 2015), each = 4)
Age <- rep(c(22, 23, 24, 25), times = 2)
n <- c(1, 1, 3, 2, 0, 2, 3, 1)
mydata <- data.frame(year, Age, n)
# ... and expand it to one row per individual.
uncount(mydata, weights = n)
#> year Age
#> 1 2014 22
#> 2 2014 23
#> 3 2014 24
#> 4 2014 24
#> 5 2014 24
#> 6 2014 25
#> 7 2014 25
#> 8 2015 23
#> 9 2015 23
#> 10 2015 24
#> 11 2015 24
#> 12 2015 24
#> 13 2015 25
But of course you shouldn't use tidyr just because it is tidyr :) An alternate view of the Tidyverse "dialect" of the R language, and its promotion by RStudio.
We can use tidyr::complete
library(tidyr)
library(dplyr)
mydata %>%
  # Drop zero-count rows first: complete() keeps every original row, so a
  # group with n = 0 would otherwise survive as one spurious row.
  filter(n > 0) %>%
  group_by(year, Age) %>%
  # within each year/Age group, create the rows n = 1, 2, ..., n
  complete(n = seq_len(n)) %>%
  select(-n) %>%
  ungroup()
# A tibble: 13 × 2
year Age
<dbl> <dbl>
1 2014 22
2 2014 23
3 2014 24
4 2014 24
5 2014 24
6 2014 25
7 2014 25
8 2015 23
9 2015 23
10 2015 24
11 2015 24
12 2015 24
13 2015 25
I would like to summarize a table using dplyr.
Here is how I would like to proceed:
I have a data.frame like this:
year region week site species gps_clutch
2017 sud 18 6 au 337
2017 sud 20 10 au 352
2017 sud 22 10 au 352
2017 sud 24 10 au 352
2017 sud 18 6 aio 337
2017 sud 20 6 aio 352
2017 sud 22 6 au 352
2018 sud 20 6 au 337
2018 sud 20 10 au 352
2018 sud 22 10 au 352
2018 sud 22 10 aio 352
2018 sud 22 6 au 352
2017 nor 19 5 au 337
2017 nor 21 2 au 352
2017 nor 23 5 au 352
2017 nor 25 2 au 352
2017 nor 19 5 aio 337
2017 nor 25 5 aio 352
2017 nor 19 5 au 337
2018 nor 21 2 aio 352
2018 nor 23 5 aio 352
2018 nor 25 2 au 352
2018 nor 23 5 aio 337
2018 nor 23 5 au 352
I would like to count the number of "gps_clutch" for each year, region, site and week, and expand this to all the possible weeks recorded for each region. To explain: in the region "sud" I sampled weeks 18, 20, 22, 24 and in the region "nor" weeks 19, 21, 23, 25. I would like to convert implicit missing values to "0", but only for the weeks (nested in regions) that have been sampled. I do not want to expand in a way that would give me a row for week 19 in region "sud", because this region was not sampled that specific week.
this code works well to expand the grid as I would like:
# Within each region, build every combination of the year, site, species and
# week values that occur in that region.
expand(group_by(dat, region), year, site, species, week)
The following code also works to get the count values, but it does not expand the grid as I wish (I only get the list of weeks for which I did observe something in each year, not the total number of weeks sampled across both years). This means that if in "sud" "2017" I only have records for weeks 20 and 22, the grid will not get expanded to weeks 18 and 24:
field_subsetnord %>%
  group_by(year, region, site, species, week) %>%
  # one row per observed combination, with the number of records in it
  summarise(count_clutch = length(gps_clutch)) %>%
  # fill combinations that are missing with a count of 0; the column is
  # `site` (as in group_by() above), not `sites`
  complete(week, nesting(year, site, species), fill = list(count_clutch = 0))
this is the table I would like to get at the end:
year region week site species count
2017 sud 18 6 au 1
2017 sud 20 6 au 0
2017 sud 22 6 au 1
2017 sud 24 6 au 0
2017 sud 18 6 aio 1
2017 sud 20 6 aio 1
2017 sud 22 6 aio 0
2017 sud 24 6 aio 0
2017 sud 18 10 au 0
2017 sud 20 10 au 1
2017 sud 22 10 au 1
2017 sud 24 10 au 1
2017 sud 18 10 aio 0
2017 sud 20 10 aio 0
2017 sud 22 10 aio 0
2017 sud 24 10 aio 0
2018 sud 18 6 au 0
2018 sud 20 6 au 1
2018 sud 22 6 au 1
2018 sud 24 6 au 0
2018 sud 18 6 aio 0
2018 sud 20 6 aio 0
2018 sud 22 6 aio 0
2018 sud 24 6 aio 0
2018 sud 18 10 au 0
2018 sud 20 10 au 1
2018 sud 22 10 au 1
2018 sud 24 10 au 0
2018 sud 18 10 aio 0
2018 sud 20 10 aio 0
2018 sud 22 10 aio 1
2018 sud 24 10 aio 0
and so on for the region "nor"...
any suggestions to mix these two codes would be appreciated :)
You are so close with your two approaches. Essentially they just need to be combined to get what you're after. :)
Group by region and then complete() the dataset first, then regroup by all variables and summarise(). Since the gps_clutch will now have missing values in it, you can sum up the non-missing values (via !is.na) in the summarise() statement to count the clutches.
# Step 1: within each region, add a row for every year/site/species/week
# combination of the values seen in that region. Rows created here have a
# missing gps_clutch.
dat_completed <- dat %>%
  group_by(region) %>%
  complete(year, site, species, week)

# Step 2: count the non-missing gps_clutch values in each combination, so the
# combinations that were only added in step 1 get a count of 0.
dat_completed %>%
  group_by(year, region, site, species, week) %>%
  summarise(count_clutch = sum(!is.na(gps_clutch)))
# A tibble: 64 x 6
# Groups: year, region, site, species [16]
year region site species week count_clutch
<int> <fct> <int> <fct> <int> <int>
1 2017 nor 2 aio 19 0
2 2017 nor 2 aio 21 0
3 2017 nor 2 aio 23 0
4 2017 nor 2 aio 25 0
5 2017 nor 2 au 19 0
6 2017 nor 2 au 21 1
7 2017 nor 2 au 23 0
8 2017 nor 2 au 25 1
9 2017 nor 5 aio 19 1
10 2017 nor 5 aio 21 0
# ... with 54 more rows
I am a newbie in R. I have a dataset with Year and Month columns. Active is the number of stores in the enterprise network. Termination is the number of stores that left the network. With these two variables, I can calculate the turnover. My turnover is Termination / ((Active + Termination)) / (number of days in the month). Example: Jan. 2013, Active = 5936, Termination = 100, Turnover = 1,75%
My question is with my dataset in attachment how can I calculate the active number and the termination number until 12-2015 ?
Is it possible to have a view of the scenario?
Dataset:
Year Month Active Termination To (%)
2013 1 5936 100 1,75%
2013 2 6182 190 3,21%
2013 3 6501 117 1,91%
2013 4 6675 92 1,43%
2013 5 6749 111 1,67%
2013 6 6719 145 2,20%
2013 7 6814 121 1,83%
2013 8 6854 90 1,34%
2013 9 6972 99 1,45%
2013 10 7320 99 1,42%
2013 11 7606 98 1,33%
2013 12 7976 155 1,99%
2014 1 7934 87 1,11%
2014 2 8079 127 1,61%
2014 3 8198 125 1,56%
2014 4 8135 154 1,91%
2014 5 8113 136 1,70%
2014 6 8095 173 2,17%
2014 7 8131 220 2,76%
2014 8 7950 135 1,72%
2014 9 7978 108 1,38%
2014 10 8117 199 2,51%
2014 11 8269 117 1,45%
2014 12 8471 177 2,11%
2015 1 8472 132 1,59%
2015 2 8591 117 1,39%
2015 3 8691 161 1,90%
2015 4 8647 126 1,48%
2015 5 8623 123 1,45%
2015 6 8739 177 2,07%
2015 7 8740 218 2,55%
2015 8 8548 35 0,41%
I have this data.frame:
# Ten years (2000-2009) of monthly counts. `count` is drawn at random without
# a fixed seed, so the numbers differ on every run.
counts <- data.frame(
  year = rep(2000:2009, each = 12),
  month = rep(month.abb, times = 10),
  count = sample(1:500, 120, replace = TRUE)
)
First 20 rows of data:
head(counts, 20)
year month count
1 2000 Jan 14
2 2000 Feb 182
3 2000 Mar 462
4 2000 Apr 395
5 2000 May 107
6 2000 Jun 127
7 2000 Jul 371
8 2000 Aug 158
9 2000 Sep 147
10 2000 Oct 41
11 2000 Nov 141
12 2000 Dec 27
13 2001 Jan 72
14 2001 Feb 7
15 2001 Mar 40
16 2001 Apr 351
17 2001 May 342
18 2001 Jun 81
19 2001 Jul 442
20 2001 Aug 389
Let's say I try to calculate the standard deviation of these data using the usual R code:
library(plyr)
# Split `counts` by month and compute the standard deviation of `count`
# within each month.
ddply(counts, "month", summarise, s.d. = sd(count))
month s.d.
1 Apr 145.3018
2 Aug 140.9949
3 Dec 173.9406
4 Feb 127.5296
5 Jan 148.2661
6 Jul 162.4893
7 Jun 133.4383
8 Mar 125.8425
9 May 168.9517
10 Nov 93.1370
11 Oct 167.9436
12 Sep 166.8740
This gives the standard deviation around the mean of each month. How can I get R to output standard deviation around maximum value of each month?
you want: "max of values per month and the average from this maximum value" [which is not the same as the standard deviation].
# Ten years (2000-2009) of random monthly counts (no seed, so values vary).
counts <- data.frame(
  year = rep(2000:2009, each = 12),
  month = rep(month.abb, times = 10),
  count = sample(1:500, 120, replace = TRUE)
)
library(data.table)
counts <- data.table(counts)
# Per month: the average difference between each count and that month's
# maximum (always <= 0, since no count exceeds the maximum).
counts[, mean(count - max(count)), by = month]
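This question is highly vague. If you want to calculate the standard deviation of the differences to the maximum, you can use the code below. Note, however, that subtracting a constant (here the monthly maximum) from every value does not change the standard deviation, so the result is the same as sd(count) for each month: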
> library(plyr)
> ddply(counts, "month", summarise, sd = sd(count - max(count)))
month sd
1 Apr 182.5071
2 Aug 114.3068
3 Dec 117.1049
4 Feb 184.4638
5 Jan 138.1755
6 Jul 167.0677
7 Jun 100.8841
8 Mar 144.8724
9 May 173.3452
10 Nov 132.0204
11 Oct 127.4645
12 Sep 152.2162