preserve index names when melting - r

I'd like to preserve the proper yearly index names as I recast my data from wide to long.
dt = data.table(country = c(1,2,3,4,5), gdp_1990 = rnorm(5), gdp_1991 = rnorm(5), gdp_1992 = rnorm(5),
unemp_1990 = rnorm(5), unemp_1991 = rnorm(5), unemp_1992 = rnorm(5))
melt(dt, id = 'country', measure = patterns(gdp = '^gdp_', unemp = '^unemp_'), variable.name = 'year')
Desired Output:
country year gdp unemp
1: 1 1990 0.856957066 -1.42947033
2: 2 1990 -1.765995901 1.38170009
3: 3 1990 -0.298302521 -0.54070574
4: 4 1990 -0.919421829 -0.17552704
5: 5 1990 -0.189133135 1.18923546
6: 1 1991 -1.248963381 -0.10467153
7: 2 1991 -0.800931881 0.03589986
Actual Output:
country year gdp unemp
1: 1 1 0.856957066 -1.42947033
2: 2 1 -1.765995901 1.38170009
3: 3 1 -0.298302521 -0.54070574
4: 4 1 -0.919421829 -0.17552704
5: 5 1 -0.189133135 1.18923546
6: 1 2 -1.248963381 -0.10467153
7: 2 2 -0.800931881 0.03589986

With data.table (dev version - 1.14.3) we can use measure with sep as documented in ?measure
measure(..., sep, pattern, cols, multiple.keyword="value.name")
library(data.table)
melt(dt, measure.vars = measure(value.name, year, sep = "_"))
-output
country year gdp unemp
<num> <char> <num> <num>
1: 1 1990 -1.275041172 -0.75524345
2: 2 1990 1.979629503 -1.14636877
3: 3 1990 0.062272176 1.16928396
4: 4 1990 -0.210106506 -0.66517069
5: 5 1990 -1.089511759 -1.79322014
6: 1 1991 0.460566878 0.61720109
7: 2 1991 0.183378182 -0.01628616
8: 3 1991 -0.647174381 1.14346303
9: 4 1991 0.008846161 0.05223651
10: 5 1991 -0.039701540 1.40848433
11: 1 1992 0.328204416 1.44638191
12: 2 1992 -1.359373393 1.33391755
13: 3 1992 -0.538430362 -0.26828537
14: 4 1992 0.424461192 -0.32107074
15: 5 1992 -0.338010393 -0.19920506

Using tidyr::pivot_longer we can use names_sep = "_" to split the names into the variable and year. In names_to, use the special string ".value" to specify that you want multiple columns created from the gdp and unemp columns:
tidyr::pivot_longer(dt, -1, names_sep = "_", names_to = c(".value", "year"))
#> # A tibble: 15 x 4
#> country year gdp unemp
#> <dbl> <chr> <dbl> <dbl>
#> 1 1 1990 -0.324 -1.12
#> 2 1 1991 0.307 -1.64
#> 3 1 1992 -0.0569 -1.49
#> 4 2 1990 0.0602 -0.751
#> 5 2 1991 -1.54 0.450
#> 6 2 1992 -1.91 -1.08
#> 7 3 1990 -0.589 2.09
#> 8 3 1991 -0.301 -0.0186
#> 9 3 1992 1.18 1.00
#> 10 4 1990 0.531 0.0174
#> 11 4 1991 -0.528 -0.318
#> 12 4 1992 -1.66 -0.621
#> 13 5 1990 -1.52 -1.29
#> 14 5 1991 -0.652 -0.929
#> 15 5 1992 -0.464 -1.38

Related

Is it possible to purrr::map the function by using the elements within the same dataframe in r?

x = list(data.frame(age = c(1:4),period = c(2000:2003)),
data.frame(age = c(5:8),period = c(1998:2001)),
data.frame(age = c(11:19),period = c(1990:1998)))
map2(x, x$period, ~cbind(.x, difference = .y-.x$age))
result:
> map2(x, x$period, ~cbind(.x, difference = .y-.x$age))
list()
Is it possible to map the function by using the elements within the same dataframe?
In your context x$period is NULL since x is the list of dataframes and it has no attribute "period". I think you want to access the period column within each unnamed dataframe in the list. I would just use map which will pass along each dataframe in the list, which you can then manipulate in the function to access each column without having to explicitly pass it.
library(purrr)
library(dplyr)
x = list(data.frame(age = c(1:4),period = c(2000:2003)),
data.frame(age = c(5:8),period = c(1998:2001)),
data.frame(age = c(11:19),period = c(1990:1998)))
#Original attempt
result <- map2(x, x$period, ~cbind(.x, difference = .y-.x$age))
result
#> list()
#My solution
result2 <- map(x, function(df) cbind(df, difference = df$period - df$age))
result2
#> [[1]]
#> age period difference
#> 1 1 2000 1999
#> 2 2 2001 1999
#> 3 3 2002 1999
#> 4 4 2003 1999
#>
#> [[2]]
#> age period difference
#> 1 5 1998 1993
#> 2 6 1999 1993
#> 3 7 2000 1993
#> 4 8 2001 1993
#>
#> [[3]]
#> age period difference
#> 1 11 1990 1979
#> 2 12 1991 1979
#> 3 13 1992 1979
#> 4 14 1993 1979
#> 5 15 1994 1979
#> 6 16 1995 1979
#> 7 17 1996 1979
#> 8 18 1997 1979
#> 9 19 1998 1979
#A more readable solution using dplyr
result3 <- map(x, function(df) df %>% mutate(difference = period - age))
result3
#> [[1]]
#> age period difference
#> 1 1 2000 1999
#> 2 2 2001 1999
#> 3 3 2002 1999
#> 4 4 2003 1999
#>
#> [[2]]
#> age period difference
#> 1 5 1998 1993
#> 2 6 1999 1993
#> 3 7 2000 1993
#> 4 8 2001 1993
#>
#> [[3]]
#> age period difference
#> 1 11 1990 1979
#> 2 12 1991 1979
#> 3 13 1992 1979
#> 4 14 1993 1979
#> 5 15 1994 1979
#> 6 16 1995 1979
#> 7 17 1996 1979
#> 8 18 1997 1979
#> 9 19 1998 1979
Created on 2023-02-02 with reprex v2.0.2

R: Cumulative Mean Excluding Current Value?

I am working with the R programming language.
I have a dataset that looks something like this:
id = c(1,1,1,1,2,2,2)
year = c(2010,2011,2012,2013, 2012, 2013, 2014)
var = rnorm(7,7,7)
my_data = data.frame(id, year,var)
id year var
1 1 2010 12.186300
2 1 2011 19.069836
3 1 2012 7.456078
4 1 2013 14.875019
5 2 2012 20.827933
6 2 2013 5.029625
7 2 2014 -2.260658
For each "group" within the ID column - at each row, I want to take the CUMULATIVE MEAN of the "var" column but EXCLUDE the value of "var" within that row (i.e. most recent).
As an example:
row 1: NA
row 2: 12.186300/1
row 3: (12.186300 + 19.069836)/2
row 4: (12.186300 + 19.069836 + 7.456078)/3
row 5: NA
row 6: 20.827933
row 7: (20.827933 + 5.029625)/2
I found this post here (Cumsum excluding current value) which (I think) shows how to do this for the "cumulative sum" - I tried to apply the logic here to my question:
transform(my_data, cmean = ave(var, id, FUN = cummean) - var)
id year var cmean
1 1 2010 12.186300 0.000000
2 1 2011 19.069836 -3.441768
3 1 2012 7.456078 5.447994
4 1 2013 14.875019 -1.478211
5 2 2012 20.827933 0.000000
6 2 2013 5.029625 7.899154
7 2 2014 -2.260658 10.126291
The code appears to have run - but I don't think I have done this correctly (i.e. the numbers produced don't match up with the numbers I had anticipated).
I then tried an answer provided here (Compute mean excluding current value):
my_data %>%
group_by(id) %>%
mutate(avg = (sum(var) - var)/(n() - 1))
# A tibble: 7 x 4
# Groups: id [2]
id year var avg
<dbl> <dbl> <dbl> <dbl>
1 1 2010 12.2 13.8
2 1 2011 19.1 11.5
3 1 2012 7.46 15.4
4 1 2013 14.9 12.9
5 2 2012 20.8 1.38
6 2 2013 5.03 9.28
But it is still not working.
Can someone please show me what I am doing wrong and what I can do to fix this problem?
Thanks!
df %>%
group_by(id)%>%
mutate(avg = lag(cummean(var)))
# A tibble: 7 × 4
# Groups: id [2]
id year var avg
<int> <int> <dbl> <dbl>
1 1 2010 12.2 NA
2 1 2011 19.1 12.2
3 1 2012 7.46 15.6
4 1 2013 14.9 12.9
5 2 2012 20.8 NA
6 2 2013 5.03 20.8
7 2 2014 -2.26 12.9
With the help of some intermediate variables you can do it like so:
library(dplyr)
df <- read.table(text = "
id year var
1 1 2010 12.186300
2 1 2011 19.069836
3 1 2012 7.456078
4 1 2013 14.875019
5 2 2012 20.827933
6 2 2013 5.029625
7 2 2014 -2.260658", header=T)
df |>
group_by(id) |>
#mutate(avg =lag(cummean(var)))
mutate(id_g = row_number()) |>
mutate(ms = cumsum(var)) |>
mutate(cm = ms/id_g,
cm = ifelse(ms == cm, NA, cm)) |>
select(-id_g, -ms)
#> # A tibble: 7 × 4
#> # Groups: id [2]
#> id year var cm
#> <int> <int> <dbl> <dbl>
#> 1 1 2010 12.2 NA
#> 2 1 2011 19.1 15.6
#> 3 1 2012 7.46 12.9
#> 4 1 2013 14.9 13.4
#> 5 2 2012 20.8 NA
#> 6 2 2013 5.03 12.9
#> 7 2 2014 -2.26 7.87

How to find the annual evolution rate for each firm in my data table?

So I have a data table of 5000 firms, each firm is assigned a numerical value ("id") which is 1 for the first firm, 2 for the second ...
Here is my table with only the profit variable :
|id | year | profit
|:----| :----| :----|
|1 |2001 |-0.4
|1 |2002 |-0.89
|2 |2001 |1.89
|2 |2002 |2.79
Each firm is expressed twice, one line specifies the data in 2001 and the second in 2002 (the "id" value being the same on both lines because it is the same firm one year apart).
How to calculate the annual rate of change of each firm ("id") between 2001 and 2002 ?
I'm really new to R and I don't see where to start? Separate the 2001 and 2002 data?
I did this :
years <- sort(unique(group$year))
years
And I also found this on the internet but with no success :
library(dplyr)
res <-
group %>%
arrange(id,year) %>%
group_by(id) %>%
mutate(evol_rate = ("group$year$2002" / lag("group$year$2001") - 1) * 100) %>%
ungroup()
Thank you very much
From what you've written, I take it that you want to calculate the formula for ROC for the profit values of 2001 and 2002:
ROC=(current_value​/previous_value − 1) ∗ 100
To accomplish this, I suggest tidyr::pivot_wider() which reshapes your dataframe from long to wide format (see: https://r4ds.had.co.nz/tidy-data.html#pivoting).
Code:
require(tidyr)
require(dplyr)
id <- sort(rep(seq(1,250, 1), 2))
year <- rep(seq(2001, 2002, 1), 500)
value <- sample(500:2000, 500)
df <- data.frame(id, year, value)
head(df, 10)
#> id year value
#> 1 1 2001 856
#> 2 1 2002 1850
#> 3 2 2001 1687
#> 4 2 2002 1902
#> 5 3 2001 1728
#> 6 3 2002 1773
#> 7 4 2001 691
#> 8 4 2002 1691
#> 9 5 2001 1368
#> 10 5 2002 893
df_wide <- df %>%
pivot_wider(names_from = year,
names_prefix = "profit_",
values_from = value,
values_fn = mean)
res <- df_wide %>%
mutate(evol_rate = (profit_2002/profit_2001-1)*100) %>%
round(2)
head(res, 10)
#> # A tibble: 10 x 4
#> id profit_2001 profit_2002 evol_rate
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 856 1850 116.
#> 2 2 1687 1902 12.7
#> 3 3 1728 1773 2.6
#> 4 4 691 1691 145.
#> 5 5 1368 893 -34.7
#> 6 6 883 516 -41.6
#> 7 7 1280 1649 28.8
#> 8 8 1579 1383 -12.4
#> 9 9 1907 1626 -14.7
#> 10 10 1227 1134 -7.58
If you want to do it without reshaping your data into a wide format you can use
library(tidyverse)
id <- sort(rep(seq(1,250, 1), 2))
year <- rep(seq(2001, 2002, 1), 500)
value <- sample(500:2000, 500)
df <- data.frame(id, year, value)
df %>% head(n = 10)
#> id year value
#> 1 1 2001 1173
#> 2 1 2002 1648
#> 3 2 2001 1560
#> 4 2 2002 1091
#> 5 3 2001 1736
#> 6 3 2002 667
#> 7 4 2001 1840
#> 8 4 2002 1202
#> 9 5 2001 1597
#> 10 5 2002 1797
new_df <- df %>%
group_by(id) %>%
mutate(ROC = ((value / lag(value) - 1) * 100))
new_df %>% head(n = 10)
#> # A tibble: 10 × 4
#> # Groups: id [5]
#> id year value ROC
#> <dbl> <dbl> <int> <dbl>
#> 1 1 2001 1173 NA
#> 2 1 2002 1648 40.5
#> 3 2 2001 1560 NA
#> 4 2 2002 1091 -30.1
#> 5 3 2001 1736 NA
#> 6 3 2002 667 -61.6
#> 7 4 2001 1840 NA
#> 8 4 2002 1202 -34.7
#> 9 5 2001 1597 NA
#> 10 5 2002 1797 12.5
This groups the data by id and then uses lag to compare the current year to the year prior

How to reshape this data.frame having two variables as identifiers?

Suppose I have a data.frame as follows:
data = data.frame(firm = LETTERS[seq( from = 1, to = 9)],
industry = seq(1,9),
country = c("USA", "CAN", "DEU"),
var1_10 = rnorm(9),
var1_11 = rnorm(9),
var1_12 = rnorm(9),
var2_10 = rnorm(9),
var2_11 = rnorm(9),
var2_12 = rnorm(9))
Which looks like this:
head(data)
firm industry country var1_10 var1_11 var1_12 var2_10 var2_11 var2_12
A 1 USA 0.006080107 1.7089981 0.384306433 -0.2814963 -0.31852115 0.4879907
B 2 CAN 0.447786736 -0.6414333 0.683906020 -0.7500779 -0.72770473 -0.1499627
C 3 DEU 1.265955776 -1.6834242 -0.685028075 0.7192065 -0.02291059 -0.2322860
D 4 USA 0.874346857 0.6339960 -0.005798694 1.0982600 -1.57901079 -0.0510445
E 5 CAN 0.692382607 -0.4461135 -0.432249529 1.7461789 -0.49300818 1.1987289
F 6 DEU -1.098814463 0.7868190 2.281716591 -1.0006592 0.95612690 1.0244039
And I would like to have var1 and var2 in long format, but having firm and country as categories. What I mean is something like this:
firm country time var1 var2
1 A USA 10 0.6157731 1.05564854
2 A USA 11 0.2560421 0.42902183
3 D CAN 10 0.7278390 -1.81995641
4 D CAN 11 1.3241109 -0.69197609
5 B DEU 10 0.1471585 -1.93182825
6 B DEU 11 -0.5985394 1.20967201
7 E USA 10 2.1925299 -0.27900005
8 E USA 11 2.3271128 -1.09578323
9 C CAN 10 1.1348696 -0.10218604
10 C CAN 11 -0.1908846 0.35702296
11 F DEU 10 0.4748446 -0.88230257
12 F DEU 11 -0.5454749 -0.05664779
You can use the new tidyr 1.0.0 pivot_longer() and pivot_wider() functions.
#yutannihilation has an excellent presentation explaining these new functions: A Graphical Introduction to tidyr's pivot_*()
library(tidyr)
set.seed(2019)
data = data.frame(firm = LETTERS[seq( from = 1, to = 9)],
industry = seq(1,9),
country = c("USA", "CAN", "DEU"),
var1_10 = rnorm(9),
var1_11 = rnorm(9),
var1_12 = rnorm(9),
var2_10 = rnorm(9),
var2_11 = rnorm(9),
var2_12 = rnorm(9))
data
#> firm industry country var1_10 var1_11 var1_12 var2_10
#> 1 A 1 USA 0.7385227 -0.3191793 -0.3271264 0.04062997
#> 2 B 2 CAN -0.5147605 -0.2379111 -2.2632252 2.63601650
#> 3 C 3 DEU -1.6401813 1.6186229 0.2855605 -1.61599923
#> 4 D 4 USA 0.9160368 -1.1176011 0.9684286 -0.93455930
#> 5 E 5 CAN -1.2674820 0.2340028 0.8673066 0.63038569
#> 6 F 6 DEU 0.7382478 0.3161516 1.3781350 0.76075998
#> 7 G 7 USA -0.7826228 0.3707686 -0.8082596 -0.51162277
#> 8 H 8 CAN 0.5092959 0.8775886 -0.5121532 1.00190750
#> 9 I 9 DEU -1.4899391 -1.7683235 -1.8039718 -0.38339219
#> var2_11 var2_12
#> 1 -0.47713729 0.20612698
#> 2 0.25420771 0.86320623
#> 3 -1.16349174 0.13977752
#> 4 -0.43793937 -0.22809479
#> 5 -1.72413573 -0.31982812
#> 6 1.72514669 -0.05294738
#> 7 0.09215510 -0.23639840
#> 8 0.07311485 -0.33796351
#> 9 0.64014783 -0.75548467
Create a long table format first
data_longer <- data %>%
pivot_longer(
cols = starts_with("var"),
names_to = c("var", "time"),
names_sep = "_",
values_to = "value"
)
data_longer
#> # A tibble: 54 x 6
#> firm industry country var time value
#> <fct> <int> <fct> <chr> <chr> <dbl>
#> 1 A 1 USA var1 10 0.739
#> 2 A 1 USA var1 11 -0.319
#> 3 A 1 USA var1 12 -0.327
#> 4 A 1 USA var2 10 0.0406
#> 5 A 1 USA var2 11 -0.477
#> 6 A 1 USA var2 12 0.206
#> 7 B 2 CAN var1 10 -0.515
#> 8 B 2 CAN var1 11 -0.238
#> 9 B 2 CAN var1 12 -2.26
#> 10 B 2 CAN var2 10 2.64
#> # ... with 44 more rows
Then reshape to the desired wide format
data_wider <- data_longer %>%
pivot_wider(names_from = var, values_from = value)
data_wider
#> # A tibble: 27 x 6
#> firm industry country time var1 var2
#> <fct> <int> <fct> <chr> <dbl> <dbl>
#> 1 A 1 USA 10 0.739 0.0406
#> 2 A 1 USA 11 -0.319 -0.477
#> 3 A 1 USA 12 -0.327 0.206
#> 4 B 2 CAN 10 -0.515 2.64
#> 5 B 2 CAN 11 -0.238 0.254
#> 6 B 2 CAN 12 -2.26 0.863
#> 7 C 3 DEU 10 -1.64 -1.62
#> 8 C 3 DEU 11 1.62 -1.16
#> 9 C 3 DEU 12 0.286 0.140
#> 10 D 4 USA 10 0.916 -0.935
#> # ... with 17 more rows
Created on 2019-10-05 by the reprex package (v0.3.0)

Mutate repeat the value by group dplyr

I want to repeat the value within each group (year), which is equal to the value of the first category "A".
For example. My data frame is:
data = expand.grid(
category = LETTERS[1:3],
year = 2000:2005)
data$value = runif(nrow(data))
I tried to do the following, however, it does not repeat the value three times
test<-data %>% group_by(year) %>% mutate(value2 =value[category == "A"])
test
# A tibble: 18 x 4
# Groups: year [6]
category year value value2
<fct> <int> <dbl> <dbl>
1 A 2000 0.783 0.783
2 B 2000 0.351 0.467
3 C 2000 0.296 0.895
4 A 2001 0.467 0.102
5 B 2001 0.168 0.546
6 C 2001 0.459 0.447
7 A 2002 0.895 0.783
I need the following result:
1 A 2000 0.783 0.783
2 B 2000 0.351 0.783
3 C 2000 0.296 0.783
4 A 2001 0.467 0.467
5 B 2001 0.168 0.467
6 C 2001 0.459 0.467
Edit: After a comment that it might relate to the packages conflict I add the list of packages that I load before:
# install packages if not installed already
list.of.packages <- c("stringr", "timeDate", "bizdays",
"lubridate", "readxl", "dplyr","plyr",
"rootSolve", "RODBC", "glue",
"ggplot2","gridExtra","bdscale", "gtools", "scales", "shiny", "leaflet", "data.table", "plotly")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
#========== Libraries to be loaded ===============
lapply(list.of.packages, require, character.only = TRUE)
#------
here it is little R freak
> data %>% group_by(year) %>%
+ mutate(value_tmp = if_else(category == "A", value, NA_real_),
+ value2 = mean(value_tmp, na.rm = TRUE))
# A tibble: 18 x 5
# Groups: year [6]
category year value value_tmp value2
<fct> <int> <dbl> <dbl> <dbl>
1 A 2000 0.01818495 0.01818495 0.01818495
2 B 2000 0.5649932 NA 0.01818495
3 C 2000 0.5483291 NA 0.01818495
4 A 2001 0.9175864 0.9175864 0.9175864
5 B 2001 0.2415837 NA 0.9175864
6 C 2001 0.2250608 NA 0.9175864
7 A 2002 0.6037224 0.6037224 0.6037224
8 B 2002 0.8712926 NA 0.6037224
9 C 2002 0.6293625 NA 0.6037224
10 A 2003 0.8126948 0.8126948 0.8126948
11 B 2003 0.7540445 NA 0.8126948
12 C 2003 0.02220114 NA 0.8126948
13 A 2004 0.3961279 0.3961279 0.3961279
14 B 2004 0.3638186 NA 0.3961279
15 C 2004 0.8682010 NA 0.3961279
16 A 2005 0.04196315 0.04196315 0.04196315
17 B 2005 0.4879482 NA 0.04196315
18 C 2005 0.8605212 NA 0.04196315
I have obtained the desired results, by slightly modifying the response of Noobie and using fill from tidyverse:
test <- data %>% group_by(year) %>%
mutate(value_tmp = if_else(category == "A", value, NA_real_))%>%
fill(value_tmp)

Resources