Making long format selectively pivot_longer() tidyverse in R - r

I'm trying to make 4 variables (id, uerate, lnw, exper) in my data.frame (d) into long format.
I also want to add two binary (0/1) columns called DL and DE: if the long-format value comes from lnw, then DL == 1 and DE == 0; if it comes from exper, then DL == 0 and DE == 1.
I was wondering how to correctly specify tidyr::pivot_longer() to obtain my EXPECTED OUTPUT below?
# Read the wage data directly from the URL.
d <- read.csv("https://stats.idre.ucla.edu/wp-content/uploads/2016/02/wages_pp-1.txt")
# everything() pivots all four selected columns, id and uerate included, so
# they do not survive as identifier columns in the long result.
select(d, id, uerate, lnw, exper) %>% pivot_longer(everything()) # tried without success
#### EXPECTED OUTPUT:
## id uerate variable value DE DL
## 1 31 3.21 lnw 1.491 0 1
## 2 31 3.21 lnw 1.433 0 1
## 3 31 3.21 lnw 1.469 0 1
## 4 31 3.29 lnw 1.749 0 1
## 5 31 2.90 lnw 1.931 0 1
## 6 31 2.50 lnw 1.709 0 1
## 7 31 2.60 lnw 2.086 0 1
## 8 31 4.79 lnw 2.129 0 1
## 6403 31 3.21 exper 0.015 1 0
## 6404 31 3.21 exper 0.715 1 0
## 6405 31 3.21 exper 1.734 1 0
## 6406 31 3.29 exper 2.773 1 0
## 6407 31 2.90 exper 3.927 1 0
## 6408 31 2.50 exper 4.946 1 0
## 6409 31 2.60 exper 5.965 1 0
## 6410 31 4.79 exper 6.984 1 0

First select the columns of interest. In pivot_longer(), cols should not be everything(); it should pick every column except 'id' and 'uerate' (-c(id, uerate)). Then use mutate to create the new columns 'DE' and 'DL'.
library(dplyr)
library(tidyr)
# Keep id and uerate as identifier columns, stack lnw and exper into
# variable/value pairs, then flag which source column each row came from.
d %>%
  select(id, uerate, lnw, exper) %>%
  pivot_longer(cols = -c(id, uerate), names_to = "variable") %>%
  mutate(
    DE = as.integer(variable == "exper"),  # 1 for exper rows, 0 otherwise
    DL = as.integer(DE == 0L)              # complement of DE
  ) %>%
  arrange(id, variable)
-output
# A tibble: 12,804 x 6
# id uerate variable value DE DL
# <int> <dbl> <chr> <dbl> <int> <int>
# 1 31 3.22 exper 0.015 1 0
# 2 31 3.22 exper 0.715 1 0
# 3 31 3.22 exper 1.73 1 0
# 4 31 3.30 exper 2.77 1 0
# 5 31 2.90 exper 3.93 1 0
# 6 31 2.50 exper 4.95 1 0
# 7 31 2.60 exper 5.96 1 0
# 8 31 4.80 exper 6.98 1 0
# 9 31 3.22 lnw 1.49 0 1
#10 31 3.22 lnw 1.43 0 1
# … with 12,794 more rows

Related

Difference in values between rows after group_by

I want to calculate the difference in values for the following row after the previous. However, I am getting this error:
Error in mutate():
! Problem while computing ..1 = across(where(is.numeric), diff).
ℹ The error occurred in group 1: vs = 0
Caused by error in across():
! Problem while computing column mpg.
Caused by error in dplyr_internal_error():
Run rlang::last_error() to see where the error occurred.
Here is what I have tried:
# diff() returns one fewer value than it receives, but mutate() needs each
# result to have the group's length (or length 1); that mismatch is what
# raises the error shown above.
mtcars %>% group_by(vs) %>% mutate(across(where(is.numeric), diff))
This seems to do the trick:
# NOTE(review): aggregate() does its own splitting via the `. ~ vs` formula,
# so the preceding group_by() looks redundant; confirm before relying on it.
mtcars %>% group_by(vs) %>% aggregate(. ~ vs, data=., diff) %>% as.data.frame() %>% unnest()
#//--
# A tibble: 30 × 11
vs mpg cyl disp hp drat wt qsec am gear carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 0 0 0 0 0 0.255 0.560 0 0 0
2 0 -2.3 2 200 65 -0.75 0.565 0 -1 -1 -2
3 0 -4.4 0 0 70 0.0600 0.130 -1.18 0 0 2
4 0 2.10 0 -84.2 -65 -0.140 0.500 1.56 0 0 -1
5 0 0.900 0 0 0 0 -0.340 0.200 0 0 0
6 0 -2.10 0 0 0 0 0.0500 0.400 0 0 0
7 0 -4.8 0 196. 25 -0.140 1.47 -0.0200 0 0 1
8 0 0 0 -12 10 0.0700 0.174 -0.160 0 0 0
9 0 4.3 0 -20 15 0.23 -0.0790 -0.400 0 0 0
10 0 0.800 0 -122 -80 -0.47 -1.82 -0.550 0 0 -2
# … with 20 more rows
You could explicitly define the calculation using lag. Or you could do this in base R:
library(tidyverse)
#tidyverse
# Within each vs group, subtract the previous row's value from every numeric
# column; the first row of a group is compared with itself and so yields 0.
mtcars %>%
  group_by(vs) %>%
  mutate(
    across(
      where(is.numeric),
      function(col) col - lag(col, default = first(col))
    )
  ) %>%
  arrange(vs)
#> # A tibble: 32 x 11
#> # Groups: vs [2]
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0 0 0 0 0 0 0 0 0 0 0
#> 2 0 0 0 0 0 0.255 0.560 0 0 0 0
#> 3 -2.3 2 200 65 -0.75 0.565 0 0 -1 -1 -2
#> 4 -4.4 0 0 70 0.0600 0.130 -1.18 0 0 0 2
#> 5 2.10 0 -84.2 -65 -0.140 0.500 1.56 0 0 0 -1
#> 6 0.900 0 0 0 0 -0.340 0.200 0 0 0 0
#> 7 -2.10 0 0 0 0 0.0500 0.400 0 0 0 0
#> 8 -4.8 0 196. 25 -0.140 1.47 -0.0200 0 0 0 1
#> 9 0 0 -12 10 0.0700 0.174 -0.160 0 0 0 0
#> 10 4.3 0 -20 15 0.23 -0.0790 -0.400 0 0 0 0
#> # ... with 22 more rows
#base R
# Split mtcars by vs, take row-to-row differences of every column within each
# piece, then stack the per-group results back into a single data frame.
# apply() operates on a matrix, so each piece is coerced to one first.
by(mtcars, mtcars$vs, \(x) apply(x, 2, diff)) |>
do.call(what = rbind.data.frame)
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> 0.Mazda RX4 Wag 0.0 0 0.0 0 0.00 0.255 0.56 0 0 0 0
#> 0.Hornet Sportabout -2.3 2 200.0 65 -0.75 0.565 0.00 0 -1 -1 -2
#> 0.Duster 360 -4.4 0 0.0 70 0.06 0.130 -1.18 0 0 0 2
#> 0.Merc 450SE 2.1 0 -84.2 -65 -0.14 0.500 1.56 0 0 0 -1
#> 0.Merc 450SL 0.9 0 0.0 0 0.00 -0.340 0.20 0 0 0 0
#> 0.Merc 450SLC -2.1 0 0.0 0 0.00 0.050 0.40 0 0 0 0
#> 0.Cadillac Fleetwood -4.8 0 196.2 25 -0.14 1.470 -0.02 0 0 0 1
#> 0.Lincoln Continental 0.0 0 -12.0 10 0.07 0.174 -0.16 0 0 0 0
#> 0.Chrysler Imperial 4.3 0 -20.0 15 0.23 -0.079 -0.40 0 0 0 0
#> 0.Dodge Challenger 0.8 0 -122.0 -80 -0.47 -1.825 -0.55 0 0 0 -2
#> 0.AMC Javelin -0.3 0 -14.0 0 0.39 -0.085 0.43 0 0 0 0
#> 0.Camaro Z28 -1.9 0 46.0 95 0.58 0.405 -1.89 0 0 0 2
#> 0.Pontiac Firebird 5.9 0 50.0 -70 -0.65 0.005 1.64 0 0 0 -2
#> 0.Porsche 914-2 6.8 -4 -279.7 -84 1.35 -1.705 -0.35 0 1 2 0
#> 0.Ford Pantera L -10.2 4 230.7 173 -0.21 1.030 -2.20 0 0 0 2
#> 0.Ferrari Dino 3.9 -2 -206.0 -89 -0.60 -0.400 1.00 0 0 0 2
#> 0.Maserati Bora -4.7 2 156.0 160 -0.08 0.800 -0.90 0 0 0 2
#> 1.Hornet 4 Drive -1.4 2 150.0 17 -0.77 0.895 0.83 0 -1 -1 0
#> 1.Valiant -3.3 0 -33.0 -5 -0.32 0.245 0.78 0 0 0 0
#> 1.Merc 240D 6.3 -2 -78.3 -43 0.93 -0.270 -0.22 0 0 1 1
#> 1.Merc 230 -1.6 0 -5.9 33 0.23 -0.040 2.90 0 0 0 0
#> 1.Merc 280 -3.6 2 26.8 28 0.00 0.290 -4.60 0 0 0 2
#> 1.Merc 280C -1.4 0 0.0 0 0.00 0.000 0.60 0 0 0 0
#> 1.Fiat 128 14.6 -2 -88.9 -57 0.16 -1.240 0.57 0 1 0 -3
#> 1.Honda Civic -2.0 0 -3.0 -14 0.85 -0.585 -0.95 0 0 0 1
#> 1.Toyota Corolla 3.5 0 -4.6 13 -0.71 0.220 1.38 0 0 0 -1
#> 1.Toyota Corona -12.4 0 49.0 32 -0.52 0.630 0.11 0 -1 -1 0
#> 1.Fiat X1-9 5.8 0 -41.1 -31 0.38 -0.530 -1.11 0 1 1 0
#> 1.Lotus Europa 3.1 0 16.1 47 -0.31 -0.422 -2.00 0 0 1 1
#> 1.Volvo 142E -9.0 0 25.9 -4 0.34 1.267 1.70 0 0 -1 0

Scatterplot with multi variables

Scatterplot reference
data set
Can someone help me create three scatter plots as in the first picture? Ideally using the plot() function.
# library() stops with an error when a package is missing; require() would
# only return FALSE and let the script fail later with a less obvious message.
library(tidyverse)
library(ggplot2)

# Simulated stand-in for the data set: 18 images, half flagged female, split
# into three nationality groups of six via mutually exclusive 0/1 indicators.
df <- tibble(
  image = 1:18,
  m_r_exsal = rnorm(18, 5, 2),
  m_r_sal = rnorm(18, 6, 2),
  female = c(rep(1, 18 / 2), rep(0, 18 / 2)),
  lg_salary = rnorm(18, 5, 1.5),
  deviation = rnorm(18, 1, 1),
  chinese = c(rep(1, 6), rep(0, 18 / 3 * 2)),
  european = c(rep(0, 6), rep(1, 6), rep(0, 6)),
  american = c(rep(0, 18 / 3 * 2), rep(1, 6))
)
Example data:
# A tibble: 18 x 9
image m_r_exsal m_r_sal female lg_salary deviation chinese european american
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 6.19 9.56 1 5.48 1.54 1 0 0
2 2 10.1 5.17 1 3.77 -0.755 1 0 0
3 3 4.96 1.91 1 6.75 0.381 1 0 0
4 4 5.10 4.57 1 4.61 -0.207 1 0 0
5 5 -1.25 6.57 1 2.33 0.880 1 0 0
6 6 6.77 9.10 1 3.07 1.03 1 0 0
7 7 4.04 4.84 1 4.56 1.95 0 1 0
8 8 3.72 4.72 1 5.32 1.17 0 1 0
9 9 7.59 7.05 1 6.24 -0.224 0 1 0
10 10 4.09 3.94 0 5.60 2.52 0 1 0
11 11 4.15 6.05 0 7.08 -0.152 0 1 0
12 12 6.07 5.27 0 5.79 -0.323 0 1 0
13 13 4.49 4.64 0 5.97 0.457 0 0 1
14 14 6.74 4.70 0 3.38 0.377 0 0 1
15 15 7.46 9.02 0 6.65 1.85 0 0 1
16 16 4.29 5.26 0 4.07 2.18 0 0 1
17 17 2.33 1.58 0 8.43 1.06 0 0 1
18 18 4.78 8.75 0 5.03 0.101 0 0 1
Making the plot:
# Recode the 0/1 indicators as text labels. case_when() without a fallback
# leaves non-matching rows as NA; female gets an explicit "male" fallback.
df %>%
mutate(chinese = case_when(chinese == 1 ~ "chinese"),
european = case_when(european == 1 ~ "european"),
american = case_when(american == 1 ~ "american"),
female = case_when(female == 1 ~ "female",
TRUE ~ "male")) %>%
# Paste the three label columns into one; the NA entries arrive as the
# literal text "NA" ...
unite(country, chinese:american, remove = TRUE, sep = "") %>%
# ... which is stripped here, leaving only the single real label per row.
mutate(country = country %>% str_remove_all("NA")) %>%
# One panel per country, points coloured by the female/male label, with a
# linear fit drawn without its confidence band.
ggplot() +
aes(lg_salary, deviation, col = female) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
facet_wrap(~ country)
The output:

How do I apply a function to rows in columns with names containing a particular string in R

I have a dataframe with column names: X, time, "indoors_c" and "outdoors_c". Where time is >= 13, I want to replace the value of the rows in columns with names containing the string "_c" with 0. How do I do this without referring to the entire column name? I am using R studio
Using functions from the tidyverse, use across() together with contains() to select the columns over which to apply the function. Then use if_else as the function, putting 0 wherever the time column is at or above the threshold and keeping the original value otherwise.
Small reproducible example:
library(tidyverse)
# Toy data: one unrelated column, a time index, and two "_c" columns.
data <- tibble(
  X = rnorm(20),
  time = 1:20,
  indoors_c = rnorm(20),
  outdoors_c = rnorm(20)
)

# For every column whose name contains "_c", write 0 where time >= 13 and
# keep the existing value elsewhere.
data %>%
  mutate(
    across(
      contains("_c"),
      function(col) if_else(time >= 13, 0, col)
    )
  )
#> # A tibble: 20 x 4
#> X time indoors_c outdoors_c
#> <dbl> <int> <dbl> <dbl>
#> 1 -1.67 1 -2.41 -1.44
#> 2 1.00 2 0.113 0.701
#> 3 0.386 3 0.0248 -0.425
#> 4 0.266 4 -0.431 -0.0722
#> 5 -0.206 5 0.255 -1.34
#> 6 -0.617 6 0.441 0.761
#> 7 1.42 7 0.481 -0.892
#> 8 0.207 8 -0.112 -0.906
#> 9 1.42 9 -0.465 -0.527
#> 10 -0.934 10 -2.21 2.95
#> 11 -0.419 11 -0.639 -0.113
#> 12 0.812 12 -0.180 0.440
#> 13 0.331 13 0 0
#> 14 -0.454 14 0 0
#> 15 -0.0290 15 0 0
#> 16 0.167 16 0 0
#> 17 -0.150 17 0 0
#> 18 0.922 18 0 0
#> 19 1.77 19 0 0
#> 20 1.62 20 0 0
Created on 2021-05-23 by the reprex package (v2.0.0)
We can use case_when
library(dplyr)
# Zero out the "_c" columns only where time >= 13, as the question asks.
# The earlier condition `time > 0` was true on every row and wiped the whole
# column; the suffix is also spelled in the same case as the column names.
data %>%
  mutate(across(ends_with("_c"), ~ case_when(time >= 13 ~ 0, TRUE ~ .)))

How do I create a dummy variable that depends on values in multiple columns?

I am trying to create a treatment dummy for the states whose 1970 legal1820 is different from their legal1820 in 1979. So I need the proper syntax for something like this: treat = ifelse((legal1820 when (year == 1970)) != (legal1820 when (year == 1979)), 1, 0)
this is the data I am using
# Load the deaths data (read_dta() and as_factor() need haven, which is
# attached in the library list that follows this snippet).
mlda <- read_dta("http://masteringmetrics.com/wp-content/uploads/2015/01/deaths.dta")

# Keep years up to 1990 and convert the labelled codes to factors; age_cat
# preserves the original agegr codes before agegr is overwritten.
dft <- mlda %>%
  filter(year <= 1990) %>%
  mutate(
    dtype = as_factor(dtype, levels = "labels"),
    age_cat = agegr,
    agegr = as_factor(agegr, levels = "labels")
  )

# Lookup table with one row per state / numeric state code pair.
library(tidycensus)
data("fips_codes")
fips_codes <- fips_codes %>%
  mutate(state_code = as.numeric(state_code)) %>%
  select(state, state_code) %>%
  distinct()

# Swap the numeric state code for the lookup's state column, keep the
# 18-20 age group through 1983, spread mrate by death type, and flag
# 1975 onwards as the post period.
dft <- dft %>%
  rename(state_code = state) %>%
  right_join(fips_codes, by = "state_code") %>%
  select(-state_code) %>%
  group_by(state) %>%
  filter(agegr == "18-20 yrs", year <= 1983) %>%
  pivot_wider(names_from = dtype, values_from = mrate) %>%
  mutate(post = ifelse(year >= 1975, 1, 0))  # closing parenthesis was missing
these are the libraries I am using (most of them are for other parts of my code)
library(tidyverse)
library(AER)
library(stargazer)
library(haven)
library(lfe)
library(estimatr)
library(stringr)
library(dplyr)
library(modelsummary)
library(ggplot2)
library(haven)
Is this what you are looking for?
library(dplyr)
# Within each state, compare legal1820 in 1970 with legal1820 in 1979.
# first() reduces the comparison to a single value and unary + turns
# TRUE/FALSE into 1/0, which mutate() then repeats on every row of the state.
mlda %>% group_by(state) %>% mutate(treat = +(first(legal1820[year == 1970] != legal1820[year == 1979])))
Output
# A tibble: 24,786 x 16
# Groups: state [51]
year state legal1820 dtype agegr count pop age legal beertaxa beerpercap winepercap spiritpercap totpercap mrate treat
<dbl> <dbl> <dbl> <dbl+lbl> <dbl+lbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 1970 1 0 1 [all] 1 [15-17 yrs] 224 213574 16.0 0 1.37 0.600 0.0900 0.700 1.38 105. 1
2 1971 1 0 1 [all] 1 [15-17 yrs] 241 220026 16.0 0 1.32 0.660 0.0900 0.760 1.52 110. 1
3 1972 1 0 1 [all] 1 [15-17 yrs] 270 224877 16.0 0 1.28 0.740 0.0900 0.780 1.61 120. 1
4 1973 1 0 1 [all] 1 [15-17 yrs] 258 227256 16.0 0 1.20 0.790 0.100 0.790 1.69 114. 1
5 1974 1 0 1 [all] 1 [15-17 yrs] 224 229025 16.0 0 1.08 0.830 0.160 0.810 1.80 97.8 1
6 1975 1 0.294 1 [all] 1 [15-17 yrs] 207 229739 16.0 0 0.991 0.880 0.160 0.850 1.88 90.1 1
7 1976 1 0.665 1 [all] 1 [15-17 yrs] 231 230696 16.0 0 0.937 0.890 0.150 0.860 1.89 100. 1
8 1977 1 0.668 1 [all] 1 [15-17 yrs] 219 230086 16.0 0 0.880 0.990 0.130 0.840 1.96 95.2 1
9 1978 1 0.667 1 [all] 1 [15-17 yrs] 234 229519 16.0 0 0.817 0.980 0.120 0.880 1.97 102. 1
10 1979 1 0.668 1 [all] 1 [15-17 yrs] 176 227140 16.0 0 0.734 0.980 0.120 0.840 1.94 77.5 1
# ... with 24,776 more rows

Calculating group differences in a "badly" partitioned data set

I tried to solve the problem with questions here on SO but I could not find a satisfying answer. My data frame has the structure
# Four treatment groups of four rows each, with three random measurements.
# tibble() replaces the deprecated data_frame(); 1:16 is already the full
# sequence, so wrapping it in seq() added nothing.
X <- tibble(
  treat = rep(c(1, 2, 3, 4), each = 4),
  id = 1:16,
  x = rnorm(16),
  y = rnorm(16),
  z = rnorm(16)
)
Looks like
# A tibble: 16 x 5
treat id x y z
<int> <int> <dbl> <dbl> <dbl>
1 1 1 -0.0724 1.26 0.317
2 1 2 -0.486 -0.628 0.392
3 1 3 -0.406 -0.706 1.18
4 1 4 -1.35 -1.27 2.36
5 2 5 -0.0751 -0.0394 0.568
6 2 6 0.243 0.873 0.132
7 2 7 0.138 0.611 -0.700
8 2 8 -0.732 1.02 -0.811
9 3 9 -0.0278 1.78 0.568
10 3 10 0.526 1.18 1.03
11 3 11 1.43 0.0937 -0.0825
12 3 12 -0.299 -0.117 0.367
13 4 13 1.05 2.04 0.678
14 4 14 -1.93 0.201 0.250
15 4 15 0.624 1.09 0.852
16 4 16 0.502 0.119 -0.843
Every fourth value in treat is a control and now I want to calculate the difference in x, y and z between the treatments and the controls. For example, for the first treatment I would like to calculate
# Row 1 minus the control (row 4) of the first group; the x value in the
# table above is -0.0724, not -0.724.
-0.0724 - (-1.35) #x
1.26 - (-1.27) #y
0.317 - 2.36 #z
For the second treatment, accordingly,
-0.486 - (-1.35) #x
-0.628 - (-1.27) #y
0.392 - 2.36 #z
... and so on.
I would like to use a dplyr / tidyverse solution but I have no idea how to do that in a "smooth" way. I found a solution already by using joins but this seems rather tedious compared to the "smooth" solution dplyr usually offers.
With dplyr, we can group_by treat, select the specific columns (x:z) inside mutate, and subtract the 4th value of each group from every value in that group using the nth function.
library(dplyr)
# Subtract each group's 4th row (the control) from every row of x, y and z.
# across() with a lambda supersedes mutate_at() + funs(); funs() is deprecated.
X %>%
  group_by(treat) %>%
  mutate(across(x:z, ~ .x - nth(.x, 4)))
#treat id x y z
# <dbl> <int> <dbl> <dbl> <dbl>
# 1 1 1 -0.631 0.971 0.206
# 2 1 2 -0.301 -1.49 0.189
# 3 1 3 1.49 1.17 0.133
# 4 1 4 0 0 0
# 5 2 5 1.39 -0.339 0.934
# 6 2 6 2.98 0.511 0.319
# 7 2 7 1.73 -0.297 0.0745
# 8 2 8 0 0 0
# 9 3 9 -1.05 -0.778 -2.86
#10 3 10 -0.805 -1.84 -2.38
#11 3 11 0.864 0.684 -3.43
#12 3 12 0 0 0
#13 4 13 -1.39 -0.843 1.67
#14 4 14 -1.68 1.55 -0.656
#15 4 15 -2.34 0.722 0.0638
#16 4 16 0 0 0
This can be also written as
# Same calculation with plain indexing: .x[4] is the group's 4th (control)
# value. across() with a lambda supersedes mutate_at() + funs(); funs() is
# deprecated.
X %>%
  group_by(treat) %>%
  mutate(across(x:z, ~ .x - .x[4]))
data
# Seeded so the random columns are reproducible.
set.seed(123)
# tibble() replaces the deprecated data_frame(); 1:16 is already the full
# sequence, so wrapping it in seq() added nothing.
X <- tibble(
  treat = rep(c(1, 2, 3, 4), each = 4),
  id = 1:16,
  x = rnorm(16),
  y = rnorm(16),
  z = rnorm(16)
)

Resources