I would like to oversample such that I have balance on my binary dependent variable within each group in my data set.
So my data looks like this:
library(dplyr)
library(purrr)
library(tidyr)
# Fix the RNG state so the random X column is reproducible.
# (`seed()` is not an R function; the seeding call is `set.seed()`.)
set.seed(123)
# example data: two countries with an unbalanced binary outcome YES
# (the outer parentheses print the tibble as it is assigned)
(data <- tibble(
  country = c("France", "France", "France",
              "UK", "UK", "UK", "UK", "UK", "UK"),
  YES = c(0, 0, 1,
          0, 0, 0, 0, 1, 1),
  X = rnorm(9, 0, 1)
))
# A tibble: 9 x 3
country YES X
<chr> <dbl> <dbl>
1 France 0 -1.12
2 France 0 -0.200
3 France 1 0.781
4 UK 0 0.100
5 UK 0 0.0997
6 UK 0 -0.380
7 UK 0 -0.0160
8 UK 1 -0.0265
9 UK 1 0.860
I am trying to achieve balance on YES within France and the UK by oversampling. In France I would like to have 4 observations and in the UK 8, so that one random sample could look like this:
# A tibble: 12 x 3
country YES X
<chr> <dbl> <dbl>
1 France 0 -1.12
2 France 0 -0.200
3 France 1 0.781
3 France 1 0.781
4 UK 0 0.100
5 UK 0 0.0997
6 UK 0 -0.380
7 UK 0 -0.0160
8 UK 1 -0.0265
9 UK 1 0.860
8 UK 1 -0.0265
8 UK 1 -0.0265
My approach was this:
# oversample 1's within each country
# Attempt: nest the rows of each country into a list-column `original`, then
# resample each nested tibble within its YES groups.
# `.key` is what triggers the "`.key` is deprecated" warning shown below.
# With prop = 1 every YES group is resampled to its own size, so the nested
# tibbles keep their original row counts (3 and 6 in the printout below)
# instead of growing to the balanced sizes.
(n_data <- data %>%
group_by(country) %>%
nest(.key = "original") %>%
mutate(os = map(original, ~ group_by(., YES))) %>%
mutate(os = map(os, ~ slice_sample(., replace = TRUE, prop = 1))))
# A tibble: 2 x 3
# Groups: country [2]
country original os
<chr> <list> <list>
1 France <tibble [3 x 2]> <tibble [3 x 2]>
2 UK <tibble [6 x 2]> <tibble [6 x 2]>
Warning message:
`.key` is deprecated
So in OS the dimensions should be 4 x 2 and 8 x 2. Does anyone know how to do this?
This seems overcomplicated, but each individual step seems clear and robust:
## Oversample the smaller YES class within each country. Extra rows are drawn
## WITH replacement, so balance is reached even when a class has fewer rows
## than the number of additional rows it needs (e.g. 1 vs 5).
data %>%
  count(country, YES) %>%
  group_by(country) %>%
  ## Figure out how many additional rows are needed to match the largest
  ## YES class of the same country
  mutate(
    goal_rows = max(n),
    extra_rows = goal_rows - n
  ) %>%
  select(country, YES, extra_rows) %>%
  ## Keep only the country/YES combinations that need extra rows
  filter(extra_rows > 0) %>%
  ## Join back to original data to obtain the candidate rows
  left_join(data, by = c("country", "YES")) %>%
  group_by(country, YES) %>%
  ## Draw exactly `extra_rows` row positions per country/YES cell;
  ## extra_rows is constant within a cell, so its first value is the size
  slice(sample.int(n(), size = first(extra_rows), replace = TRUE)) %>%
  ## Return to the by-country grouping this pipeline has always produced
  group_by(country) %>%
  select(-extra_rows) %>%
  ## Combine oversampled rows with original data
  bind_rows(data) %>%
  arrange(country, YES)
# # A tibble: 12 x 3
# # Groups: country [2]
# country YES X
# <chr> <dbl> <dbl>
# 1 France 0 1.88
# 2 France 0 -0.0793
# 3 France 1 0.812
# 4 France 1 0.812
# 5 UK 0 -1.66
# 6 UK 0 -0.797
# 7 UK 0 0.639
# 8 UK 0 -0.141
# 9 UK 1 -0.207
# 10 UK 1 1.30
# 11 UK 1 -0.207
# 12 UK 1 1.30
Related
I am trying to find a simple way to pivot_longer a dataframe that has multiple columns containing different data for each case. Using multiple names in names_to doesn't seem to solve the problem.
Here is a worked example:
#create the dataframe:
library('dplyr')
# Reproducible toy data: ten cases, each with a numeric value and a
# character flag for the years 1990 and 2000.
set.seed(11)
x <- data.frame(
  case = seq_len(10),
  X1990 = runif(n = 10),                    # min = 0, max = 1 are the defaults
  flag.1990 = rep(c("a", "b"), times = 5),
  X2000 = runif(n = 10),
  flag.2000 = rep(c("c", "d"), times = 5)
)
> x
case X1990 flag.1990 X2000 flag.2000
1 1 0.2772497942 a 0.1751129 c
2 2 0.0005183129 b 0.4407503 d
3 3 0.5106083730 a 0.9071830 c
4 4 0.0140479084 b 0.8510419 d
5 5 0.0646897766 a 0.7339875 c
6 6 0.9548492255 b 0.5736857 d
7 7 0.0864958912 a 0.4817655 c
8 8 0.2899750092 b 0.3306110 d
9 9 0.8806991728 a 0.1576602 c
10 10 0.1232162013 b 0.4801341 d
Obviously I cannot just pivot_longer using cols = -case, as that will combine year and flag data. If I try using a character vector in names_to (from section 6.1.3 of https://dcl-wrangle.stanford.edu/pivot-advanced.html):
# Failed attempt: rename the value columns to "value.<year>", then try to
# split every column name into two pieces.
# Both entries of names_to are ordinary output column names here, and
# values_to sends all cell contents (numbers and flags alike) into one column.
# NOTE(review): names_sep is presumably interpreted as a regular expression,
# in which an unescaped '.' matches any character - confirm in the tidyr docs.
x %>%
setNames(c('case','value.1990', 'flag.1990', 'value.2000', 'flag.2000')) %>%
pivot_longer(cols = -case,
names_to = c('value', 'flag'),
names_sep = '.',
values_to = 'value')
Things don't work, because the flag data isn't in the variable name.
The only way I can think to solve this is to break the dataframe into two data frames, pivot them and then join them. For example:
# Workaround: pivot the value columns and the flag columns separately, then
# join the two long tables on case and year.

# Temporary data frame for the year values, then pivot
temp1 <- x %>%
  select(case, X1990, X2000) %>% # select by name; positions break if columns move
  pivot_longer(
    cols = c(X1990, X2000), # pivot longer on year data
    names_to = "year",
    values_to = "value"
  ) %>%
  # remove the literal "X" prefix so that year can be used as a join key
  mutate(year = gsub("X", "", year, fixed = TRUE))

# Temporary data frame for the flags, then pivot
temp2 <- x %>%
  select(case, flag.1990, flag.2000) %>% # select flag variables by name
  pivot_longer(
    cols = c(flag.1990, flag.2000), # pivot longer on flag data
    names_to = "flag.year",
    values_to = "flag"
  ) %>%
  # fixed = TRUE strips the literal text "flag."; read as a regex, the "."
  # would match any character
  mutate(year = gsub("flag.", "", flag.year, fixed = TRUE)) %>%
  select(-flag.year) # flag.year is no longer useful once year is extracted

# Full join of the two long tables gives the final data
final <- full_join(temp1, temp2, by = c("case", "year"))
> final
# A tibble: 20 x 4
case flag year value
<int> <chr> <chr> <dbl>
1 1 a 1990 0.277
2 1 c 2000 0.175
3 2 b 1990 0.000518
4 2 d 2000 0.441
5 3 a 1990 0.511
6 3 c 2000 0.907
7 4 b 1990 0.0140
8 4 d 2000 0.851
9 5 a 1990 0.0647
10 5 c 2000 0.734
11 6 b 1990 0.955
12 6 d 2000 0.574
13 7 a 1990 0.0865
14 7 c 2000 0.482
15 8 b 1990 0.290
16 8 d 2000 0.331
17 9 a 1990 0.881
18 9 c 2000 0.158
19 10 b 1990 0.123
20 10 d 2000 0.480
I assume there is a quicker way to do this. Am I just misreading the documentation on using multiple names in names_to? Any ideas?
In this case one has to use names_to combined with names_pattern:
library(dplyr)
library(tidyr)
> head(x,3)
case X1990 flag.1990 X2000 flag.2000
1 1 0.2772497942 a 0.1751129 c
2 2 0.0005183129 b 0.4407503 d
3 3 0.5106083730 a 0.9071830 c
# Reshape every column except `case`. names_pattern splits each column name
# into two capture groups: the text before the optional dot ("X" or "flag")
# and the four-digit year. The ".value" entry in names_to turns the first
# piece into output column names (X and flag in the printout below), while
# the second piece fills the `year` column.
> x %>%
pivot_longer(cols = -case,
names_to = c(".value", "year"),
names_pattern = "([^\\.]*)\\.*(\\d{4})")
# A tibble: 20 x 4
case year X flag
<int> <chr> <dbl> <chr>
1 1 1990 0.277 a
2 1 2000 0.175 c
3 2 1990 0.000518 b
4 2 2000 0.441 d
5 3 1990 0.511 a
6 3 2000 0.907 c
7 4 1990 0.0140 b
8 4 2000 0.851 d
9 5 1990 0.0647 a
10 5 2000 0.734 c
11 6 1990 0.955 b
12 6 2000 0.574 d
13 7 1990 0.0865 a
14 7 2000 0.482 c
15 8 1990 0.290 b
16 8 2000 0.331 d
17 9 1990 0.881 a
18 9 2000 0.158 c
19 10 1990 0.123 b
20 10 2000 0.480 d
I am trying to get rolling weighted sums across a table, and have a method involving matrix multiplication, but it breaks when some of the data is missing.
So if I use
library(tidyverse)
# Wide example data: one row per country, one numeric column per year.
mydata <- tibble(Country = c("Australia", "Canada"),
"1980" = c(1000, 2000),
"1981" = c(1100, 2100),
"1982" = c(1300, 2300),
"1983" = c(1200, 2400),
"1984" = c(1400, 2200),
"1985" = c(1500, 2500))
# Weights applied to three consecutive years (oldest year first).
weights <- c(3, 4, 6)
# Number of rolling windows: 7 columns minus 3 weights = 4. This works out
# because the extra Country column offsets the "+ 1" in (years - width + 1).
n0 <- ncol(mydata) - length(weights)
# Banded weight matrix (6 year rows x 4 window columns): repeating
# c(0, 0, 0, 0, 3, 4, 6) and dropping the first n0 zeros shifts the weights
# down by one row in each successive column.
matweights <- matrix(rep(c(rep(0, n0), weights), n0)[-(1:n0)], ncol=n0)
# Year values %*% banded weights gives one weighted sum per window; the
# zeros in the band still multiply every year, which is why a single NA in a
# row turns all of that row's sums into NA.
tibble(cbind(mydata[, 1], as.matrix(mydata[, -1]) %*% matweights))
I get what I want with
# A tibble: 2 x 5
Country `1` `2` `3` `4`
<chr> <dbl> <dbl> <dbl> <dbl>
1 Australia 15200 15700 17100 18200
2 Canada 28200 29900 29700 31000
where for example in the top right 18200 is 3*1200 + 4*1400 + 6*1500
But if for example one of the values is missing, say mydata[2, 3] <- NA then I would get
# A tibble: 2 x 5
Country `1` `2` `3` `4`
<chr> <dbl> <dbl> <dbl> <dbl>
1 Australia 15200 15700 17100 18200
2 Canada NA NA NA NA
when I want
# A tibble: 2 x 5
Country `1` `2` `3` `4`
<chr> <dbl> <dbl> <dbl> <dbl>
1 Australia 15200 15700 17100 18200
2 Canada NA NA 29700 31000
The problem with my matrix approach is 0 * NA giving NA when I want it to be 0. I know there are solutions using some kind of apply approach but I suspect that may be slower with a large table.
I really quite like slider for sliding functions—it's very flexible, and has a purrr-like syntax. Here, slide_index_dbl() will let us slide a function and use another variable as an index by which to decide what observations are within the window.
First, reshape to long form and group, then it's a single call within mutate(). .before here specifies how many years back to include; .complete specifies to ignore partial windows.
library(tidyverse)
# Long form: one row per Country/year. convert = TRUE turns the year labels
# into integers (see the <int> column in the printout) so they can act as
# the sliding index. Each window covers the current year plus the two
# before it; incomplete windows yield NA.
out1 <- mydata %>%
  gather(key = "year", value = "value", -Country, convert = TRUE) %>%
  group_by(Country) %>%
  mutate(
    value_3y = slider::slide_index_dbl(
      .x = value,
      .i = year,
      .f = function(window) sum(window * weights),
      .before = 2,
      .complete = TRUE
    )
  )
out1
#> # A tibble: 12 x 4
#> # Groups: Country [2]
#> Country year value value_3y
#> <chr> <int> <dbl> <dbl>
#> 1 Australia 1980 1000 NA
#> 2 Canada 1980 2000 NA
#> 3 Australia 1981 1100 NA
#> 4 Canada 1981 2100 NA
#> 5 Australia 1982 1300 15200
#> 6 Canada 1982 2300 28200
#> 7 Australia 1983 1200 15700
#> 8 Canada 1983 2400 29900
#> 9 Australia 1984 1400 17100
#> 10 Canada 1984 2200 29700
#> 11 Australia 1985 1500 18200
#> 12 Canada 1985 2500 31000
To reshape to wide form:
# Back to wide form: one column per year holding the weighted sum.
out1 %>%
  select(Country, year, value_3y) %>%
  drop_na() %>% # omit to keep partial/empty years
  spread(key = year, value = value_3y)
#> # A tibble: 2 x 5
#> # Groups: Country [2]
#> Country `1982` `1983` `1984` `1985`
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Australia 15200 15700 17100 18200
#> 2 Canada 28200 29900 29700 31000
If the data contains NAs, the code works exactly the same:
# Knock out Canada's 1981 value, then repeat the same sliding computation.
# Any window containing the NA sums to NA; all other windows are unaffected.
mydata[2, 3] <- NA
out2 <- mydata %>%
  gather(key = "year", value = "value", -Country, convert = TRUE) %>%
  group_by(Country) %>%
  mutate(
    value_3y = slider::slide_index_dbl(
      .x = value,
      .i = year,
      .f = function(window) sum(window * weights),
      .before = 2,
      .complete = TRUE
    )
  )
out2
#> # A tibble: 12 x 4
#> # Groups: Country [2]
#> Country year value value_3y
#> <chr> <int> <dbl> <dbl>
#> 1 Australia 1980 1000 NA
#> 2 Canada 1980 2000 NA
#> 3 Australia 1981 1100 NA
#> 4 Canada 1981 NA NA
#> 5 Australia 1982 1300 15200
#> 6 Canada 1982 2300 NA
#> 7 Australia 1983 1200 15700
#> 8 Canada 1983 2400 NA
#> 9 Australia 1984 1400 17100
#> 10 Canada 1984 2200 29700
#> 11 Australia 1985 1500 18200
#> 12 Canada 1985 2500 31000
# Back to wide form; rows whose weighted sum is NA are dropped first.
out2 %>%
  select(Country, year, value_3y) %>%
  drop_na() %>%
  spread(key = year, value = value_3y)
#> # A tibble: 2 x 5
#> # Groups: Country [2]
#> Country `1982` `1983` `1984` `1985`
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Australia 15200 15700 17100 18200
#> 2 Canada NA NA 29700 31000
Using rollapply we have the following matrix:
library(zoo)
# Transpose so that years run down the rows, take the weighted sum over each
# run of 3 consecutive years, then transpose back to one row per country.
t(rollapply(
  data = t(mydata[, -1]),
  width = 3,
  FUN = function(win) sum(win * weights)
))
## [,1] [,2] [,3] [,4]
## [1,] 15200 15700 17100 18200
## [2,] NA NA 29700 31000
Linear filtering option:
# Run a one-sided linear filter along each row (country). The weights are
# reversed before being handed to the filter; the first two positions of
# each row have no full window and come back as NA (see the printout).
t(apply(
  X = mydata[-1],
  MARGIN = 1,
  FUN = stats::filter,
  filter = rev(weights),
  sides = 1
))
# [,1] [,2] [,3] [,4] [,5] [,6]
#[1,] NA NA 15200 15700 17100 18200
#[2,] NA NA NA NA 29700 31000
I am trying to find a simple way to pivot_longer a dataframe that has multiple columns containing different data for each case. Using multiple names in names_to doesn't seem to solve the problem.
Here is a worked example:
#create the dataframe:
library('dplyr')
# Reproducible toy data: ten cases, each with a numeric value and a
# character flag for the years 1990 and 2000.
set.seed(11)
x <- data.frame(
  case = seq_len(10),
  X1990 = runif(n = 10),                    # min = 0, max = 1 are the defaults
  flag.1990 = rep(c("a", "b"), times = 5),
  X2000 = runif(n = 10),
  flag.2000 = rep(c("c", "d"), times = 5)
)
> x
case X1990 flag.1990 X2000 flag.2000
1 1 0.2772497942 a 0.1751129 c
2 2 0.0005183129 b 0.4407503 d
3 3 0.5106083730 a 0.9071830 c
4 4 0.0140479084 b 0.8510419 d
5 5 0.0646897766 a 0.7339875 c
6 6 0.9548492255 b 0.5736857 d
7 7 0.0864958912 a 0.4817655 c
8 8 0.2899750092 b 0.3306110 d
9 9 0.8806991728 a 0.1576602 c
10 10 0.1232162013 b 0.4801341 d
Obviously I cannot just pivot_longer using cols = -case, as that will combine year and flag data. If I try using a character vector in names_to (from section 6.1.3 of https://dcl-wrangle.stanford.edu/pivot-advanced.html):
# Failed attempt: rename the value columns to "value.<year>", then try to
# split every column name into two pieces.
# Both entries of names_to are ordinary output column names here, and
# values_to sends all cell contents (numbers and flags alike) into one column.
# NOTE(review): names_sep is presumably interpreted as a regular expression,
# in which an unescaped '.' matches any character - confirm in the tidyr docs.
x %>%
setNames(c('case','value.1990', 'flag.1990', 'value.2000', 'flag.2000')) %>%
pivot_longer(cols = -case,
names_to = c('value', 'flag'),
names_sep = '.',
values_to = 'value')
Things don't work, because the flag data isn't in the variable name.
The only way I can think to solve this is to break the dataframe into two data frames, pivot them and then join them. For example:
# Workaround: pivot the value columns and the flag columns separately, then
# join the two long tables on case and year.

# Temporary data frame for the year values, then pivot
temp1 <- x %>%
  select(case, X1990, X2000) %>% # select by name; positions break if columns move
  pivot_longer(
    cols = c(X1990, X2000), # pivot longer on year data
    names_to = "year",
    values_to = "value"
  ) %>%
  # remove the literal "X" prefix so that year can be used as a join key
  mutate(year = gsub("X", "", year, fixed = TRUE))

# Temporary data frame for the flags, then pivot
temp2 <- x %>%
  select(case, flag.1990, flag.2000) %>% # select flag variables by name
  pivot_longer(
    cols = c(flag.1990, flag.2000), # pivot longer on flag data
    names_to = "flag.year",
    values_to = "flag"
  ) %>%
  # fixed = TRUE strips the literal text "flag."; read as a regex, the "."
  # would match any character
  mutate(year = gsub("flag.", "", flag.year, fixed = TRUE)) %>%
  select(-flag.year) # flag.year is no longer useful once year is extracted

# Full join of the two long tables gives the final data
final <- full_join(temp1, temp2, by = c("case", "year"))
> final
# A tibble: 20 x 4
case flag year value
<int> <chr> <chr> <dbl>
1 1 a 1990 0.277
2 1 c 2000 0.175
3 2 b 1990 0.000518
4 2 d 2000 0.441
5 3 a 1990 0.511
6 3 c 2000 0.907
7 4 b 1990 0.0140
8 4 d 2000 0.851
9 5 a 1990 0.0647
10 5 c 2000 0.734
11 6 b 1990 0.955
12 6 d 2000 0.574
13 7 a 1990 0.0865
14 7 c 2000 0.482
15 8 b 1990 0.290
16 8 d 2000 0.331
17 9 a 1990 0.881
18 9 c 2000 0.158
19 10 b 1990 0.123
20 10 d 2000 0.480
I assume there is a quicker way to do this. Am I just misreading the documentation on using multiple names in names_to? Any ideas?
In this case one has to use names_to combined with names_pattern:
library(dplyr)
library(tidyr)
> head(x,3)
case X1990 flag.1990 X2000 flag.2000
1 1 0.2772497942 a 0.1751129 c
2 2 0.0005183129 b 0.4407503 d
3 3 0.5106083730 a 0.9071830 c
# Reshape every column except `case`. names_pattern splits each column name
# into two capture groups: the text before the optional dot ("X" or "flag")
# and the four-digit year. The ".value" entry in names_to turns the first
# piece into output column names (X and flag in the printout below), while
# the second piece fills the `year` column.
> x %>%
pivot_longer(cols = -case,
names_to = c(".value", "year"),
names_pattern = "([^\\.]*)\\.*(\\d{4})")
# A tibble: 20 x 4
case year X flag
<int> <chr> <dbl> <chr>
1 1 1990 0.277 a
2 1 2000 0.175 c
3 2 1990 0.000518 b
4 2 2000 0.441 d
5 3 1990 0.511 a
6 3 2000 0.907 c
7 4 1990 0.0140 b
8 4 2000 0.851 d
9 5 1990 0.0647 a
10 5 2000 0.734 c
11 6 1990 0.955 b
12 6 2000 0.574 d
13 7 1990 0.0865 a
14 7 2000 0.482 c
15 8 1990 0.290 b
16 8 2000 0.331 d
17 9 1990 0.881 a
18 9 2000 0.158 c
19 10 1990 0.123 b
20 10 2000 0.480 d
I'm trying to take a function and iterate over a data frame of values. The goal here is to summarize the airport delays by groups of 10.
How do you take the value of what is passed into a function as a name? The column origin (EWR, LGA, JFK) should be saved as a column, and it still needs to be passed into the group by function.
library(tidyverse)
library(nycflights13)
head(flights)
#> # A tibble: 6 x 19
#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
#> <int> <int> <int> <int> <int> <dbl> <int> <int>
#> 1 2013 1 1 517 515 2 830 819
#> 2 2013 1 1 533 529 4 850 830
#> 3 2013 1 1 542 540 2 923 850
#> 4 2013 1 1 544 545 -1 1004 1022
#> 5 2013 1 1 554 600 -6 812 837
#> 6 2013 1 1 554 558 -4 740 728
#> # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#> # hour <dbl>, minute <dbl>, time_hour <dttm>
# Intended to bucket `by` into 10 ntiles, keep its raw value as `col_nm`,
# and average `var` within each pcts/col_nm pair.
ntile_summary <- function(data, by, var) {
# Capture the unquoted column arguments so they can be injected with !!
by <- enquo(by)
var <- enquo(var)
data %>%
mutate(pcts = ntile(!!by, n = 10),
col_nm = !!by)
# NOTE(review): no %>% follows the mutate() call above, so group_by() below
# starts a new expression with no data argument - this matches the
# "object 'pcts' not found" error reported further down.
group_by(pcts, col_nm) %>%
# NOTE(review): `na.ram` is presumably a typo for `na.rm` - confirm.
summarize(avg = mean(!!var, na.ram = TRUE))
}
# Parameter grid: every origin value crossed with the first two rows of the
# per-day counts.
params <- expand_grid(
flights %>% count(origin) %>% select(origin),
flights %>% count(day) %>% head(2) %>% select(day)
)
# Direct call with bare column names; fails inside the function (error below).
ntile_summary(flights, day, arr_delay)
#> Error in group_by(pcts, col_nm): object 'pcts' not found
# Attempt to iterate over params; `!origin` is evaluated as a logical
# negation of an object called origin, hence the error below.
purrr::walk(params, ~ntile_summary(flights, !origin, arr_delay))
#> Error in !origin: invalid argument type
Created on 2020-03-15 by the reprex package (v0.3.0)
After the mutate() call, the connecting pipe (%>%) is missing:
#' Average a variable within ntile buckets of another column
#'
#' @param data A data frame, e.g. `flights`.
#' @param by Unquoted column that is split into 10 ntile buckets (`pcts`);
#'   its raw value is also kept as `col_nm`.
#' @param var Unquoted numeric column to average.
#' @return A tibble with one row per `pcts`/`col_nm` pair and the mean of
#'   `var` (missing values dropped) in `avg`.
ntile_summary <- function(data, by, var) {
  # Capture the unquoted column arguments so they can be injected with !!
  by <- enquo(by)
  var <- enquo(var)
  data %>%
    mutate(pcts = ntile(!!by, n = 10),
           col_nm = !!by) %>%
    group_by(pcts, col_nm) %>%
    # `na.rm` was misspelled `na.ram`; mean() silently ignored the unknown
    # argument, so any group containing an NA averaged to NA
    summarize(avg = mean(!!var, na.rm = TRUE))
}
ntile_summary(flights, day, arr_delay)
# NOTE: the printout that follows predates the na.rm fix; with missing
# values dropped, its NA averages become numeric means.
# A tibble: 40 x 3
# Groups: pcts [10]
# pcts col_nm avg
# <int> <int> <dbl>
# 1 1 1 NA
# 2 1 2 NA
# 3 1 3 NA
# 4 1 4 -4.44
# 5 2 4 NA
# 6 2 5 NA
# 7 2 6 NA
# 8 2 7 NA
# 9 3 7 NA
#10 3 8 NA
# … with 30 more rows
We could also make use of the curly-curly operator ({{}}) instead of enquo() + !!:
#' Average a variable within ntile buckets of another column
#'
#' Same contract as the enquo()/!! version, written with the embrace
#' operator.
#'
#' @param data A data frame, e.g. `flights`.
#' @param by Unquoted column that is split into 10 ntile buckets (`pcts`);
#'   its raw value is also kept as `col_nm`.
#' @param var Unquoted numeric column to average.
#' @return A tibble with one row per `pcts`/`col_nm` pair and the mean of
#'   `var` (missing values dropped) in `avg`.
ntile_summary <- function(data, by, var) {
  data %>%
    mutate(col_nm = {{ by }}, pcts = ntile({{ by }}, n = 10)) %>%
    group_by(pcts, col_nm) %>%
    # `na.rm` was misspelled `na.ram`; mean() silently ignored the unknown
    # argument, so any group containing an NA averaged to NA
    summarize(avg = mean({{ var }}, na.rm = TRUE))
}
ntile_summary(flights, day, arr_delay)
# NOTE: the printout that follows predates the na.rm fix; with missing
# values dropped, its NA averages become numeric means.
# A tibble: 40 x 3
# Groups: pcts [10]
# pcts col_nm avg
# <int> <int> <dbl>
# 1 1 1 NA
# 2 1 2 NA
# 3 1 3 NA
# 4 1 4 -4.44
# 5 2 4 NA
# 6 2 5 NA
# 7 2 6 NA
# 8 2 7 NA
# 9 3 7 NA
#10 3 8 NA
# … with 30 more rows
Suppose I have a data.frame as follows:
# Toy panel: nine firms (A-I) with two variables, var1 and var2, each
# measured at times 10, 11 and 12. No seed is set, so values differ per run.
data <- data.frame(
  firm = LETTERS[1:9],
  industry = 1:9,
  country = c("USA", "CAN", "DEU"), # recycled across the nine rows
  var1_10 = rnorm(n = 9),
  var1_11 = rnorm(n = 9),
  var1_12 = rnorm(n = 9),
  var2_10 = rnorm(n = 9),
  var2_11 = rnorm(n = 9),
  var2_12 = rnorm(n = 9)
)
Which looks like this:
head(data)
firm industry country var1_10 var1_11 var1_12 var2_10 var2_11 var2_12
A 1 USA 0.006080107 1.7089981 0.384306433 -0.2814963 -0.31852115 0.4879907
B 2 CAN 0.447786736 -0.6414333 0.683906020 -0.7500779 -0.72770473 -0.1499627
C 3 DEU 1.265955776 -1.6834242 -0.685028075 0.7192065 -0.02291059 -0.2322860
D 4 USA 0.874346857 0.6339960 -0.005798694 1.0982600 -1.57901079 -0.0510445
E 5 CAN 0.692382607 -0.4461135 -0.432249529 1.7461789 -0.49300818 1.1987289
F 6 DEU -1.098814463 0.7868190 2.281716591 -1.0006592 0.95612690 1.0244039
And I would like to have var1 and var2 in long format, but having firm and country as categories. What I mean is something like this:
firm country time var1 var2
1 A USA 10 0.6157731 1.05564854
2 A USA 11 0.2560421 0.42902183
3 D CAN 10 0.7278390 -1.81995641
4 D CAN 11 1.3241109 -0.69197609
5 B DEU 10 0.1471585 -1.93182825
6 B DEU 11 -0.5985394 1.20967201
7 E USA 10 2.1925299 -0.27900005
8 E USA 11 2.3271128 -1.09578323
9 C CAN 10 1.1348696 -0.10218604
10 C CAN 11 -0.1908846 0.35702296
11 F DEU 10 0.4748446 -0.88230257
12 F DEU 11 -0.5454749 -0.05664779
You can use the new tidyr 1.0.0 pivot_longer() and pivot_wider() functions.
@yutannihilation has an excellent presentation explaining these new functions: A Graphical Introduction to tidyr's pivot_*()
library(tidyr)
# Reproducible toy panel: nine firms (A-I) with two variables, var1 and
# var2, each measured at times 10, 11 and 12.
set.seed(2019)
data <- data.frame(
  firm = LETTERS[1:9],
  industry = 1:9,
  country = c("USA", "CAN", "DEU"), # recycled across the nine rows
  var1_10 = rnorm(n = 9),
  var1_11 = rnorm(n = 9),
  var1_12 = rnorm(n = 9),
  var2_10 = rnorm(n = 9),
  var2_11 = rnorm(n = 9),
  var2_12 = rnorm(n = 9)
)
data
#> firm industry country var1_10 var1_11 var1_12 var2_10
#> 1 A 1 USA 0.7385227 -0.3191793 -0.3271264 0.04062997
#> 2 B 2 CAN -0.5147605 -0.2379111 -2.2632252 2.63601650
#> 3 C 3 DEU -1.6401813 1.6186229 0.2855605 -1.61599923
#> 4 D 4 USA 0.9160368 -1.1176011 0.9684286 -0.93455930
#> 5 E 5 CAN -1.2674820 0.2340028 0.8673066 0.63038569
#> 6 F 6 DEU 0.7382478 0.3161516 1.3781350 0.76075998
#> 7 G 7 USA -0.7826228 0.3707686 -0.8082596 -0.51162277
#> 8 H 8 CAN 0.5092959 0.8775886 -0.5121532 1.00190750
#> 9 I 9 DEU -1.4899391 -1.7683235 -1.8039718 -0.38339219
#> var2_11 var2_12
#> 1 -0.47713729 0.20612698
#> 2 0.25420771 0.86320623
#> 3 -1.16349174 0.13977752
#> 4 -0.43793937 -0.22809479
#> 5 -1.72413573 -0.31982812
#> 6 1.72514669 -0.05294738
#> 7 0.09215510 -0.23639840
#> 8 0.07311485 -0.33796351
#> 9 0.64014783 -0.75548467
Create a long table format first
# Stack all var*_* columns: the part of each name before "_" goes to `var`,
# the part after it to `time`, and the cell contents to `value`.
data_longer <- pivot_longer(
  data,
  cols = starts_with("var"),
  names_to = c("var", "time"),
  names_sep = "_",
  values_to = "value"
)
data_longer
#> # A tibble: 54 x 6
#> firm industry country var time value
#> <fct> <int> <fct> <chr> <chr> <dbl>
#> 1 A 1 USA var1 10 0.739
#> 2 A 1 USA var1 11 -0.319
#> 3 A 1 USA var1 12 -0.327
#> 4 A 1 USA var2 10 0.0406
#> 5 A 1 USA var2 11 -0.477
#> 6 A 1 USA var2 12 0.206
#> 7 B 2 CAN var1 10 -0.515
#> 8 B 2 CAN var1 11 -0.238
#> 9 B 2 CAN var1 12 -2.26
#> 10 B 2 CAN var2 10 2.64
#> # ... with 44 more rows
Then reshape to the desired wide format
# Spread the `var` labels back out so var1 and var2 become columns, leaving
# one row per firm/industry/country/time.
data_wider <- pivot_wider(
  data_longer,
  names_from = var,
  values_from = value
)
data_wider
#> # A tibble: 27 x 6
#> firm industry country time var1 var2
#> <fct> <int> <fct> <chr> <dbl> <dbl>
#> 1 A 1 USA 10 0.739 0.0406
#> 2 A 1 USA 11 -0.319 -0.477
#> 3 A 1 USA 12 -0.327 0.206
#> 4 B 2 CAN 10 -0.515 2.64
#> 5 B 2 CAN 11 -0.238 0.254
#> 6 B 2 CAN 12 -2.26 0.863
#> 7 C 3 DEU 10 -1.64 -1.62
#> 8 C 3 DEU 11 1.62 -1.16
#> 9 C 3 DEU 12 0.286 0.140
#> 10 D 4 USA 10 0.916 -0.935
#> # ... with 17 more rows
Created on 2019-10-05 by the reprex package (v0.3.0)