Suppose I have a data.frame as follows:
# Toy panel in wide format: 9 firms, 3 countries, and two measures (var1,
# var2) stored as var<k>_<time> columns for times 10, 11 and 12.
data <- data.frame(
  firm = LETTERS[1:9],
  industry = 1:9,
  country = rep(c("USA", "CAN", "DEU"), times = 3),
  # 54 normal draws laid out column by column, i.e. nine per measure column,
  # in the same column order as the names below.
  matrix(
    rnorm(9 * 6),
    nrow = 9,
    dimnames = list(
      NULL,
      c("var1_10", "var1_11", "var1_12", "var2_10", "var2_11", "var2_12")
    )
  )
)
Which looks like this:
head(data)
firm industry country var1_10 var1_11 var1_12 var2_10 var2_11 var2_12
A 1 USA 0.006080107 1.7089981 0.384306433 -0.2814963 -0.31852115 0.4879907
B 2 CAN 0.447786736 -0.6414333 0.683906020 -0.7500779 -0.72770473 -0.1499627
C 3 DEU 1.265955776 -1.6834242 -0.685028075 0.7192065 -0.02291059 -0.2322860
D 4 USA 0.874346857 0.6339960 -0.005798694 1.0982600 -1.57901079 -0.0510445
E 5 CAN 0.692382607 -0.4461135 -0.432249529 1.7461789 -0.49300818 1.1987289
F 6 DEU -1.098814463 0.7868190 2.281716591 -1.0006592 0.95612690 1.0244039
And I would like to have var1 and var2 in long format, but having firm and country as categories. What I mean is something like this:
firm country time var1 var2
1 A USA 10 0.6157731 1.05564854
2 A USA 11 0.2560421 0.42902183
3 D CAN 10 0.7278390 -1.81995641
4 D CAN 11 1.3241109 -0.69197609
5 B DEU 10 0.1471585 -1.93182825
6 B DEU 11 -0.5985394 1.20967201
7 E USA 10 2.1925299 -0.27900005
8 E USA 11 2.3271128 -1.09578323
9 C CAN 10 1.1348696 -0.10218604
10 C CAN 11 -0.1908846 0.35702296
11 F DEU 10 0.4748446 -0.88230257
12 F DEU 11 -0.5454749 -0.05664779
You can use the new tidyr 1.0.0 pivot_longer() and pivot_wider() functions.
@yutannihilation has an excellent presentation explaining these new functions: A Graphical Introduction to tidyr's pivot_*()
library(tidyr)
# Reproducible wide-format example: the seed is fixed first, then the six
# measure columns are drawn in column order (var1_10 first, var2_12 last).
set.seed(2019)
data <- data.frame(
  firm = LETTERS[1:9],
  industry = 1:9,
  country = rep(c("USA", "CAN", "DEU"), times = 3),
  # Column-major fill: draws 1-9 land in var1_10, draws 10-18 in var1_11, ...
  matrix(
    rnorm(9 * 6),
    nrow = 9,
    dimnames = list(
      NULL,
      c("var1_10", "var1_11", "var1_12", "var2_10", "var2_11", "var2_12")
    )
  )
)
data
#> firm industry country var1_10 var1_11 var1_12 var2_10
#> 1 A 1 USA 0.7385227 -0.3191793 -0.3271264 0.04062997
#> 2 B 2 CAN -0.5147605 -0.2379111 -2.2632252 2.63601650
#> 3 C 3 DEU -1.6401813 1.6186229 0.2855605 -1.61599923
#> 4 D 4 USA 0.9160368 -1.1176011 0.9684286 -0.93455930
#> 5 E 5 CAN -1.2674820 0.2340028 0.8673066 0.63038569
#> 6 F 6 DEU 0.7382478 0.3161516 1.3781350 0.76075998
#> 7 G 7 USA -0.7826228 0.3707686 -0.8082596 -0.51162277
#> 8 H 8 CAN 0.5092959 0.8775886 -0.5121532 1.00190750
#> 9 I 9 DEU -1.4899391 -1.7683235 -1.8039718 -0.38339219
#> var2_11 var2_12
#> 1 -0.47713729 0.20612698
#> 2 0.25420771 0.86320623
#> 3 -1.16349174 0.13977752
#> 4 -0.43793937 -0.22809479
#> 5 -1.72413573 -0.31982812
#> 6 1.72514669 -0.05294738
#> 7 0.09215510 -0.23639840
#> 8 0.07311485 -0.33796351
#> 9 0.64014783 -0.75548467
Create a long table format first
# Stack every var<k>_<time> column into rows. Each old column name is split
# at "_": the first piece goes to `var`, the second to `time`, and the cell
# contents go to `value`.
data_longer <- pivot_longer(
  data,
  cols = starts_with("var"),
  names_to = c("var", "time"),
  names_sep = "_",
  values_to = "value"
)
data_longer
#> # A tibble: 54 x 6
#> firm industry country var time value
#> <fct> <int> <fct> <chr> <chr> <dbl>
#> 1 A 1 USA var1 10 0.739
#> 2 A 1 USA var1 11 -0.319
#> 3 A 1 USA var1 12 -0.327
#> 4 A 1 USA var2 10 0.0406
#> 5 A 1 USA var2 11 -0.477
#> 6 A 1 USA var2 12 0.206
#> 7 B 2 CAN var1 10 -0.515
#> 8 B 2 CAN var1 11 -0.238
#> 9 B 2 CAN var1 12 -2.26
#> 10 B 2 CAN var2 10 2.64
#> # ... with 44 more rows
Then reshape to the desired wide format
# Spread the `var` key back into columns so that var1 and var2 sit side by
# side, one row per firm/industry/country/time combination.
data_wider <- pivot_wider(
  data_longer,
  names_from = var,
  values_from = value
)
data_wider
#> # A tibble: 27 x 6
#> firm industry country time var1 var2
#> <fct> <int> <fct> <chr> <dbl> <dbl>
#> 1 A 1 USA 10 0.739 0.0406
#> 2 A 1 USA 11 -0.319 -0.477
#> 3 A 1 USA 12 -0.327 0.206
#> 4 B 2 CAN 10 -0.515 2.64
#> 5 B 2 CAN 11 -0.238 0.254
#> 6 B 2 CAN 12 -2.26 0.863
#> 7 C 3 DEU 10 -1.64 -1.62
#> 8 C 3 DEU 11 1.62 -1.16
#> 9 C 3 DEU 12 0.286 0.140
#> 10 D 4 USA 10 0.916 -0.935
#> # ... with 17 more rows
Created on 2019-10-05 by the reprex package (v0.3.0)
Related
I am trying to find a simple way to pivot_longer a dataframe that has multiple columns containing different data for each case. Using multiple names in names_to doesn't seem to solve the problem.
Here is a worked example:
#create the dataframe:
library('dplyr')
# Ten cases with, for each of two years, a uniform random value (X<year>)
# and a flag (flag.<year>). The seed is fixed before the first draw.
set.seed(11)
x <- data.frame(
  case = 1:10,
  X1990 = runif(10),
  flag.1990 = rep(c("a", "b"), times = 5),
  X2000 = runif(10),
  flag.2000 = rep(c("c", "d"), times = 5)
)
> x
case X1990 flag.1990 X2000 flag.2000
1 1 0.2772497942 a 0.1751129 c
2 2 0.0005183129 b 0.4407503 d
3 3 0.5106083730 a 0.9071830 c
4 4 0.0140479084 b 0.8510419 d
5 5 0.0646897766 a 0.7339875 c
6 6 0.9548492255 b 0.5736857 d
7 7 0.0864958912 a 0.4817655 c
8 8 0.2899750092 b 0.3306110 d
9 9 0.8806991728 a 0.1576602 c
10 10 0.1232162013 b 0.4801341 d
Obviously I cannot just pivot_longer using cols = -case, as that would combine the year and flag data. If I try using a character vector in names_to (from section 6.1.3 of https://dcl-wrangle.stanford.edu/pivot-advanced.html):
# Attempt from the question: rename the value columns to value.<year>, then
# pivot every column except `case`, splitting the old column names on
# names_sep into the two names given in names_to.
x %>%
setNames(c('case','value.1990', 'flag.1990', 'value.2000', 'flag.2000')) %>%
pivot_longer(cols = -case,
names_to = c('value', 'flag'),
# NOTE(review): tidyr documents a character names_sep as a regular expression,
# so '.' presumably matches any character rather than a literal dot ('\\.'
# would) -- confirm against the pivot_longer() documentation.
names_sep = '.',
values_to = 'value')
Things don't work, because the flag data isn't in the variable name.
The only way I can think to solve this is to break the dataframe into two data frames, pivot them and then join them. For example:
# Two-step workaround: pivot the values and the flags separately, then join
# the two long tables on (case, year).

# Yearly values: one row per case and year.
temp1 <- x %>%
  select(case, X1990, X2000) %>% # by name, so column order does not matter
  pivot_longer(
    cols = c(X1990, X2000),
    names_to = "year",
    values_to = "value"
  ) %>%
  # Strip only the leading "X" so that `year` can serve as a join key.
  mutate(year = sub("^X", "", year))

# Yearly flags: one row per case and year.
temp2 <- x %>%
  select(case, flag.1990, flag.2000) %>% # by name, as above
  pivot_longer(
    cols = c(flag.1990, flag.2000),
    names_to = "flag.year",
    values_to = "flag"
  ) %>%
  # The dot is escaped: a bare "." in a regex matches any character.
  mutate(year = sub("^flag\\.", "", flag.year)) %>%
  select(-flag.year) # the combined name is no longer needed

# Full join of values and flags on the shared (case, year) key.
final <- full_join(temp1, temp2, by = c("case", "year"))
> final
# A tibble: 20 x 4
case flag year value
<int> <chr> <chr> <dbl>
1 1 a 1990 0.277
2 1 c 2000 0.175
3 2 b 1990 0.000518
4 2 d 2000 0.441
5 3 a 1990 0.511
6 3 c 2000 0.907
7 4 b 1990 0.0140
8 4 d 2000 0.851
9 5 a 1990 0.0647
10 5 c 2000 0.734
11 6 b 1990 0.955
12 6 d 2000 0.574
13 7 a 1990 0.0865
14 7 c 2000 0.482
15 8 b 1990 0.290
16 8 d 2000 0.331
17 9 a 1990 0.881
18 9 c 2000 0.158
19 10 b 1990 0.123
20 10 d 2000 0.480
I assume there is a quicker way to do this. Am I just misreading the documentation on using multiple names in names_to? Any ideas?
In this case one has to use names_to combined with names_pattern:
library(dplyr)
library(tidyr)
> head(x,3)
case X1990 flag.1990 X2000 flag.2000
1 1 0.2772497942 a 0.1751129 c
2 2 0.0005183129 b 0.4407503 d
3 3 0.5106083730 a 0.9071830 c
> pivot_longer(
    x,
    cols = -case,
    # ".value": the text captured by the first group names the output column
    # (here "X" and "flag"); the second group is stored in `year`.
    names_to = c(".value", "year"),
    # Group 1: leading run of non-dot characters; then any dots;
    # group 2: four digits.
    names_pattern = "([^\\.]*)\\.*(\\d{4})"
  )
# A tibble: 20 x 4
case year X flag
<int> <chr> <dbl> <chr>
1 1 1990 0.277 a
2 1 2000 0.175 c
3 2 1990 0.000518 b
4 2 2000 0.441 d
5 3 1990 0.511 a
6 3 2000 0.907 c
7 4 1990 0.0140 b
8 4 2000 0.851 d
9 5 1990 0.0647 a
10 5 2000 0.734 c
11 6 1990 0.955 b
12 6 2000 0.574 d
13 7 1990 0.0865 a
14 7 2000 0.482 c
15 8 1990 0.290 b
16 8 2000 0.331 d
17 9 1990 0.881 a
18 9 2000 0.158 c
19 10 1990 0.123 b
20 10 2000 0.480 d
I'd like to preserve the proper yearly index names as I recast my data from wide to long.
# Wide table: one row per country, one gdp_<year> and one unemp_<year>
# column for each of 1990-1992.
dt <- data.table(
  country = c(1, 2, 3, 4, 5),
  gdp_1990 = rnorm(5),
  gdp_1991 = rnorm(5),
  gdp_1992 = rnorm(5),
  unemp_1990 = rnorm(5),
  unemp_1991 = rnorm(5),
  unemp_1992 = rnorm(5)
)
# Melt both column families at once; argument names are written out in full
# instead of relying on partial matching.
melt(
  dt,
  id.vars = "country",
  measure.vars = patterns(gdp = "^gdp_", unemp = "^unemp_"),
  variable.name = "year"
)
Desired Output:
country year gdp unemp
1: 1 1990 0.856957066 -1.42947033
2: 2 1990 -1.765995901 1.38170009
3: 3 1990 -0.298302521 -0.54070574
4: 4 1990 -0.919421829 -0.17552704
5: 5 1990 -0.189133135 1.18923546
6: 1 1991 -1.248963381 -0.10467153
7: 2 1991 -0.800931881 0.03589986
Actual Output:
country year gdp unemp
1: 1 1 0.856957066 -1.42947033
2: 2 1 -1.765995901 1.38170009
3: 3 1 -0.298302521 -0.54070574
4: 4 1 -0.919421829 -0.17552704
5: 5 1 -0.189133135 1.18923546
6: 1 2 -1.248963381 -0.10467153
7: 2 2 -0.800931881 0.03589986
With data.table (dev version - 1.14.3) we can use measure with sep as documented in ?measure
measure(..., sep, pattern, cols, multiple.keyword="value.name")
library(data.table)
melt(dt, measure.vars = measure(value.name, year, sep = "_"))
-output
country year gdp unemp
<num> <char> <num> <num>
1: 1 1990 -1.275041172 -0.75524345
2: 2 1990 1.979629503 -1.14636877
3: 3 1990 0.062272176 1.16928396
4: 4 1990 -0.210106506 -0.66517069
5: 5 1990 -1.089511759 -1.79322014
6: 1 1991 0.460566878 0.61720109
7: 2 1991 0.183378182 -0.01628616
8: 3 1991 -0.647174381 1.14346303
9: 4 1991 0.008846161 0.05223651
10: 5 1991 -0.039701540 1.40848433
11: 1 1992 0.328204416 1.44638191
12: 2 1992 -1.359373393 1.33391755
13: 3 1992 -0.538430362 -0.26828537
14: 4 1992 0.424461192 -0.32107074
15: 5 1992 -0.338010393 -0.19920506
Using tidyr::pivot_longer we can use names_sep = "_" to split the names into the variable and year. In names_to, use the special string ".value" to specify that you want multiple columns created from the gdp and unemp columns:
tidyr::pivot_longer(dt, -1, names_sep = "_", names_to = c(".value", "year"))
#> # A tibble: 15 x 4
#> country year gdp unemp
#> <dbl> <chr> <dbl> <dbl>
#> 1 1 1990 -0.324 -1.12
#> 2 1 1991 0.307 -1.64
#> 3 1 1992 -0.0569 -1.49
#> 4 2 1990 0.0602 -0.751
#> 5 2 1991 -1.54 0.450
#> 6 2 1992 -1.91 -1.08
#> 7 3 1990 -0.589 2.09
#> 8 3 1991 -0.301 -0.0186
#> 9 3 1992 1.18 1.00
#> 10 4 1990 0.531 0.0174
#> 11 4 1991 -0.528 -0.318
#> 12 4 1992 -1.66 -0.621
#> 13 5 1990 -1.52 -1.29
#> 14 5 1991 -0.652 -0.929
#> 15 5 1992 -0.464 -1.38
I am trying to find a simple way to pivot_longer a dataframe that has multiple columns containing different data for each case. Using multiple names in names_to doesn't seem to solve the problem.
Here is a worked example:
#create the dataframe:
library('dplyr')
# Ten cases with, for each of two years, a uniform random value (X<year>)
# and a flag (flag.<year>). The seed is fixed before the first draw.
set.seed(11)
x <- data.frame(
  case = 1:10,
  X1990 = runif(10),
  flag.1990 = rep(c("a", "b"), times = 5),
  X2000 = runif(10),
  flag.2000 = rep(c("c", "d"), times = 5)
)
> x
case X1990 flag.1990 X2000 flag.2000
1 1 0.2772497942 a 0.1751129 c
2 2 0.0005183129 b 0.4407503 d
3 3 0.5106083730 a 0.9071830 c
4 4 0.0140479084 b 0.8510419 d
5 5 0.0646897766 a 0.7339875 c
6 6 0.9548492255 b 0.5736857 d
7 7 0.0864958912 a 0.4817655 c
8 8 0.2899750092 b 0.3306110 d
9 9 0.8806991728 a 0.1576602 c
10 10 0.1232162013 b 0.4801341 d
Obviously I cannot just pivot_longer using cols = -case, as that would combine the year and flag data. If I try using a character vector in names_to (from section 6.1.3 of https://dcl-wrangle.stanford.edu/pivot-advanced.html):
# Attempt from the question: rename the value columns to value.<year>, then
# pivot every column except `case`, splitting the old column names on
# names_sep into the two names given in names_to.
x %>%
setNames(c('case','value.1990', 'flag.1990', 'value.2000', 'flag.2000')) %>%
pivot_longer(cols = -case,
names_to = c('value', 'flag'),
# NOTE(review): tidyr documents a character names_sep as a regular expression,
# so '.' presumably matches any character rather than a literal dot ('\\.'
# would) -- confirm against the pivot_longer() documentation.
names_sep = '.',
values_to = 'value')
Things don't work, because the flag data isn't in the variable name.
The only way I can think to solve this is to break the dataframe into two data frames, pivot them and then join them. For example:
# Two-step workaround: pivot the values and the flags separately, then join
# the two long tables on (case, year).

# Yearly values: one row per case and year.
temp1 <- x %>%
  select(case, X1990, X2000) %>% # by name, so column order does not matter
  pivot_longer(
    cols = c(X1990, X2000),
    names_to = "year",
    values_to = "value"
  ) %>%
  # Strip only the leading "X" so that `year` can serve as a join key.
  mutate(year = sub("^X", "", year))

# Yearly flags: one row per case and year.
temp2 <- x %>%
  select(case, flag.1990, flag.2000) %>% # by name, as above
  pivot_longer(
    cols = c(flag.1990, flag.2000),
    names_to = "flag.year",
    values_to = "flag"
  ) %>%
  # The dot is escaped: a bare "." in a regex matches any character.
  mutate(year = sub("^flag\\.", "", flag.year)) %>%
  select(-flag.year) # the combined name is no longer needed

# Full join of values and flags on the shared (case, year) key.
final <- full_join(temp1, temp2, by = c("case", "year"))
> final
# A tibble: 20 x 4
case flag year value
<int> <chr> <chr> <dbl>
1 1 a 1990 0.277
2 1 c 2000 0.175
3 2 b 1990 0.000518
4 2 d 2000 0.441
5 3 a 1990 0.511
6 3 c 2000 0.907
7 4 b 1990 0.0140
8 4 d 2000 0.851
9 5 a 1990 0.0647
10 5 c 2000 0.734
11 6 b 1990 0.955
12 6 d 2000 0.574
13 7 a 1990 0.0865
14 7 c 2000 0.482
15 8 b 1990 0.290
16 8 d 2000 0.331
17 9 a 1990 0.881
18 9 c 2000 0.158
19 10 b 1990 0.123
20 10 d 2000 0.480
I assume there is a quicker way to do this. Am I just misreading the documentation on using multiple names in names_to? Any ideas?
In this case one has to use names_to combined with names_pattern:
library(dplyr)
library(tidyr)
> head(x,3)
case X1990 flag.1990 X2000 flag.2000
1 1 0.2772497942 a 0.1751129 c
2 2 0.0005183129 b 0.4407503 d
3 3 0.5106083730 a 0.9071830 c
> pivot_longer(
    x,
    cols = -case,
    # ".value": the text captured by the first group names the output column
    # (here "X" and "flag"); the second group is stored in `year`.
    names_to = c(".value", "year"),
    # Group 1: leading run of non-dot characters; then any dots;
    # group 2: four digits.
    names_pattern = "([^\\.]*)\\.*(\\d{4})"
  )
# A tibble: 20 x 4
case year X flag
<int> <chr> <dbl> <chr>
1 1 1990 0.277 a
2 1 2000 0.175 c
3 2 1990 0.000518 b
4 2 2000 0.441 d
5 3 1990 0.511 a
6 3 2000 0.907 c
7 4 1990 0.0140 b
8 4 2000 0.851 d
9 5 1990 0.0647 a
10 5 2000 0.734 c
11 6 1990 0.955 b
12 6 2000 0.574 d
13 7 1990 0.0865 a
14 7 2000 0.482 c
15 8 1990 0.290 b
16 8 2000 0.331 d
17 9 1990 0.881 a
18 9 2000 0.158 c
19 10 1990 0.123 b
20 10 2000 0.480 d
I have the following data set:
Name Year VarA VarB Data.1 Data.2
A 2016 L H 100 101
A 2017 L H 105 99
A 2018 L H 103 105
A 2016 L A 90 95
A 2017 L A 99 92
A 2018 L A 102 101
I want to add a lagged variable by the grouping: Name, VarA, VarB so that my data would look like:
Name Year VarA VarB Data.1 Data.2 Lg1.Data.1 Lg2.Data.1
A 2016 L H 100 101 NA NA
A 2017 L H 105 99 100 NA
A 2018 L H 103 105 105 100
A 2016 L A 90 95 NA NA
A 2017 L A 99 92 90 NA
A 2018 L A 102 101 99 90
I found the following link, which is helpful: debugging: function to create multiple lags for multiple columns (dplyr)
And am using the following code:
# Attempt from the question: add lag-1 and lag-2 copies of columns 5:6.
df <- df %>%
# NOTE(review): the grouping is by Name only, so all VarA/VarB combinations
# within one Name are handled as a single group by the do() step below.
group_by(Name) %>%
arrange(Name, VarA, VarB, Year) %>%
# Per group: bind the shifted columns (shifts of 1 and 2 applied to columns
# 5:6) next to the original rows. Presumably shift() is data.table::shift --
# confirm.
do(data.frame(., setNames(shift(.[,c(5:6)], 1:2), c(seq(1:8)))))
However, the lag is offsetting all data associated with Name, instead of the grouping I want, so only the 2018 rows are accurately lagged.
Name Year VarA VarB Data.1 Data.2 Lg1.Data.1 Lg2.Data.1
A 2016 L H 100 101 NA NA
A 2017 L H 105 99 100 NA
A 2018 L H 103 105 105 100
A 2016 L A 90 95 103 105
A 2017 L A 99 92 90 103
A 2018 L A 102 101 99 90
How do I get the lag to reset for each new grouping combination (e.g. Name / VarA / VarB)?
dplyr::lag lets you set the distance you want to lag by. You can group by whatever variables you want—in this case, Name, VarA, and VarB—before making your lagged variables.
library(dplyr)
# Lag Data.1 by one and two years within each Name/VarA/VarB combination.
df %>%
  group_by(Name, VarA, VarB) %>%
  mutate(
    # order_by = Year makes each lag follow year order inside its group even
    # when the rows are not already sorted by Year; row order is unchanged.
    Lg1.Data.1 = lag(Data.1, n = 1, order_by = Year),
    Lg2.Data.1 = lag(Data.1, n = 2, order_by = Year)
  )
#> # A tibble: 6 x 8
#> # Groups: Name, VarA, VarB [2]
#> Name Year VarA VarB Data.1 Data.2 Lg1.Data.1 Lg2.Data.1
#> <chr> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 A 2016 L H 100 101 NA NA
#> 2 A 2017 L H 105 99 100 NA
#> 3 A 2018 L H 103 105 105 100
#> 4 A 2016 L A 90 95 NA NA
#> 5 A 2017 L A 99 92 90 NA
#> 6 A 2018 L A 102 101 99 90
If you want a version that scales to more lags, you can use some non-standard evaluation to create new lagged columns dynamically. I'll do this with purrr::map to iterate over a set of n values to lag by, make a list of data frames with the new columns added, then join all the data frames together. There are probably better NSE ways to do this, so hopefully someone can improve upon it.
I'm making up some new data, just to have a wider range of years to illustrate. Inside mutate, you can create column names with quo_name.
library(dplyr)
library(purrr)
# Example data: two Name/VarA/VarB groups ("H" and "A"), five years each.
set.seed(127)
df <- tibble(
  Name = "A",
  Year = rep(2016:2020, 2),
  VarA = "L",
  VarB = rep(c("H", "A"), each = 5),
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  Data.1 = sample(1:10, 10, replace = TRUE),
  Data.2 = sample(1:10, 10, replace = TRUE)
)

# One data frame per lag distance i = 1..4, each adding a single column named
# "Lag<i>" that holds Data.1 lagged by i within its Name/VarA/VarB group.
df_list <- purrr::map(1:4, function(i) {
  df %>%
    group_by(Name, VarA, VarB) %>%
    mutate(
      # order_by = Year keeps the lag in year order even if the rows of a
      # group are not sorted by Year.
      !!quo_name(paste0("Lag", i)) := dplyr::lag(Data.1, n = i, order_by = Year)
    )
})
You don't need to save this list—I'm just doing it to show an example of one of the data frames. You could instead go straight into reduce.
df_list[[3]]
#> # A tibble: 10 x 7
#> # Groups: Name, VarA, VarB [2]
#> Name Year VarA VarB Data.1 Data.2 Lag3
#> <chr> <int> <chr> <chr> <int> <int> <int>
#> 1 A 2016 L H 3 9 NA
#> 2 A 2017 L H 1 4 NA
#> 3 A 2018 L H 3 8 NA
#> 4 A 2019 L H 2 2 3
#> 5 A 2020 L H 4 5 1
#> 6 A 2016 L A 8 4 NA
#> 7 A 2017 L A 6 8 NA
#> 8 A 2018 L A 3 2 NA
#> 9 A 2019 L A 8 6 8
#> 10 A 2020 L A 9 1 6
Then use purrr::reduce to join all the data frames in the list. Since there are columns that are the same in each of the data frames, and those are the ones you want to join by, you can get away with not specifying join-by columns in inner_join.
reduce(df_list, inner_join)
#> Joining, by = c("Name", "Year", "VarA", "VarB", "Data.1", "Data.2")
#> Joining, by = c("Name", "Year", "VarA", "VarB", "Data.1", "Data.2")
#> Joining, by = c("Name", "Year", "VarA", "VarB", "Data.1", "Data.2")
#> # A tibble: 10 x 10
#> # Groups: Name, VarA, VarB [?]
#> Name Year VarA VarB Data.1 Data.2 Lag1 Lag2 Lag3 Lag4
#> <chr> <int> <chr> <chr> <int> <int> <int> <int> <int> <int>
#> 1 A 2016 L H 3 9 NA NA NA NA
#> 2 A 2017 L H 1 4 3 NA NA NA
#> 3 A 2018 L H 3 8 1 3 NA NA
#> 4 A 2019 L H 2 2 3 1 3 NA
#> 5 A 2020 L H 4 5 2 3 1 3
#> 6 A 2016 L A 8 4 NA NA NA NA
#> 7 A 2017 L A 6 8 8 NA NA NA
#> 8 A 2018 L A 3 2 6 8 NA NA
#> 9 A 2019 L A 8 6 3 6 8 NA
#> 10 A 2020 L A 9 1 8 3 6 8
Created on 2018-12-07 by the reprex package (v0.2.1)
I want to repeat the value within each group (year), which is equal to the value of the first category "A".
For example. My data frame is:
# Every category/year combination (3 categories x 6 years), plus one uniform
# random value per row.
data <- expand.grid(
  category = LETTERS[1:3],
  year = 2000:2005
)
data[["value"]] <- runif(nrow(data))
I tried to do the following, however, it does not repeat the value three times
test<-data %>% group_by(year) %>% mutate(value2 =value[category == "A"])
test
# A tibble: 18 x 4
# Groups: year [6]
category year value value2
<fct> <int> <dbl> <dbl>
1 A 2000 0.783 0.783
2 B 2000 0.351 0.467
3 C 2000 0.296 0.895
4 A 2001 0.467 0.102
5 B 2001 0.168 0.546
6 C 2001 0.459 0.447
7 A 2002 0.895 0.783
I need the following result:
1 A 2000 0.783 0.783
2 B 2000 0.351 0.783
3 C 2000 0.296 0.783
4 A 2001 0.467 0.467
5 B 2001 0.168 0.467
6 C 2001 0.459 0.467
Edit: After a comment suggesting that this might be related to a package conflict, I am adding the list of packages that I load beforehand:
# Install any listed package that is not installed yet, then attach them all.
#
# Order matters: packages are attached in list order, and a package attached
# later masks same-named functions of those attached earlier. plyr is
# therefore listed BEFORE dplyr; with plyr attached after dplyr, verbs such as
# mutate() and summarise() can resolve to the plyr versions, which do not
# operate group-wise on a group_by() result.
list.of.packages <- c(
  "stringr", "timeDate", "bizdays",
  "lubridate", "readxl", "plyr", "dplyr",
  "rootSolve", "RODBC", "glue",
  "ggplot2", "gridExtra", "bdscale", "gtools", "scales", "shiny",
  "leaflet", "data.table", "plotly"
)
new.packages <- list.of.packages[
  !(list.of.packages %in% installed.packages()[, "Package"])
]
if (length(new.packages) > 0) install.packages(new.packages)
#========== Libraries to be loaded ===============
# library() stops with an error when a package cannot be attached; require()
# only returns FALSE, which lapply() would collect and nobody would check.
invisible(lapply(list.of.packages, library, character.only = TRUE))
#------
here it is little R freak
> data %>% group_by(year) %>%
+ mutate(value_tmp = if_else(category == "A", value, NA_real_),
+ value2 = mean(value_tmp, na.rm = TRUE))
# A tibble: 18 x 5
# Groups: year [6]
category year value value_tmp value2
<fct> <int> <dbl> <dbl> <dbl>
1 A 2000 0.01818495 0.01818495 0.01818495
2 B 2000 0.5649932 NA 0.01818495
3 C 2000 0.5483291 NA 0.01818495
4 A 2001 0.9175864 0.9175864 0.9175864
5 B 2001 0.2415837 NA 0.9175864
6 C 2001 0.2250608 NA 0.9175864
7 A 2002 0.6037224 0.6037224 0.6037224
8 B 2002 0.8712926 NA 0.6037224
9 C 2002 0.6293625 NA 0.6037224
10 A 2003 0.8126948 0.8126948 0.8126948
11 B 2003 0.7540445 NA 0.8126948
12 C 2003 0.02220114 NA 0.8126948
13 A 2004 0.3961279 0.3961279 0.3961279
14 B 2004 0.3638186 NA 0.3961279
15 C 2004 0.8682010 NA 0.3961279
16 A 2005 0.04196315 0.04196315 0.04196315
17 B 2005 0.4879482 NA 0.04196315
18 C 2005 0.8605212 NA 0.04196315
I obtained the desired result by slightly modifying Noobie's answer and using fill() from tidyr (part of the tidyverse):
# For every year, copy the value of category "A" onto all rows of that year.
# The result stays grouped by year.
test <- data %>%
  group_by(year) %>%
  # Keep the value on the "A" row only; every other row becomes NA.
  mutate(value_tmp = if_else(category == "A", value, NA_real_)) %>%
  # "downup" fills the NAs below AND above the "A" row, so the result does not
  # depend on "A" being the first row of its year group.
  fill(value_tmp, .direction = "downup")