Scatterplot with multi variables - r

Scatterplot reference
data set
Can someone help me create three scatter plots as in the first picture? Ideally using the plot() function.

# Attach packages with library(): unlike require(), it stops with an error
# when a package is missing instead of silently returning FALSE.
library(tidyverse)
library(ggplot2)

# Simulated example data: 18 images with a 0/1 `female` flag (first 9 rows
# are 1) and three mutually exclusive 0/1 origin flags covering 6 rows each.
# The numeric columns are random draws, so the values change on every run
# unless set.seed() is called beforehand.
df <- tibble(
  image = 1:18,
  m_r_exsal = rnorm(18, mean = 5, sd = 2),
  m_r_sal = rnorm(18, mean = 6, sd = 2),
  female = rep(c(1, 0), each = 9),
  lg_salary = rnorm(18, mean = 5, sd = 1.5),
  deviation = rnorm(18, mean = 1, sd = 1),
  chinese = rep(c(1, 0, 0), each = 6),
  european = rep(c(0, 1, 0), each = 6),
  american = rep(c(0, 0, 1), each = 6)
)
Example data:
# A tibble: 18 x 9
image m_r_exsal m_r_sal female lg_salary deviation chinese european american
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 6.19 9.56 1 5.48 1.54 1 0 0
2 2 10.1 5.17 1 3.77 -0.755 1 0 0
3 3 4.96 1.91 1 6.75 0.381 1 0 0
4 4 5.10 4.57 1 4.61 -0.207 1 0 0
5 5 -1.25 6.57 1 2.33 0.880 1 0 0
6 6 6.77 9.10 1 3.07 1.03 1 0 0
7 7 4.04 4.84 1 4.56 1.95 0 1 0
8 8 3.72 4.72 1 5.32 1.17 0 1 0
9 9 7.59 7.05 1 6.24 -0.224 0 1 0
10 10 4.09 3.94 0 5.60 2.52 0 1 0
11 11 4.15 6.05 0 7.08 -0.152 0 1 0
12 12 6.07 5.27 0 5.79 -0.323 0 1 0
13 13 4.49 4.64 0 5.97 0.457 0 0 1
14 14 6.74 4.70 0 3.38 0.377 0 0 1
15 15 7.46 9.02 0 6.65 1.85 0 0 1
16 16 4.29 5.26 0 4.07 2.18 0 0 1
17 17 2.33 1.58 0 8.43 1.06 0 0 1
18 18 4.78 8.75 0 5.03 0.101 0 0 1
Making the plot:
# Recode the 0/1 indicator columns into labels, collapse the three origin
# flags into a single `country` column, and plot `deviation` against
# `lg_salary` with one panel per country and one fitted line per gender.
df %>%
  mutate(
    # case_when() without a fallback yields NA where the flag is not 1
    chinese = case_when(chinese == 1 ~ "chinese"),
    european = case_when(european == 1 ~ "european"),
    american = case_when(american == 1 ~ "american"),
    female = case_when(
      female == 1 ~ "female",
      TRUE ~ "male"
    )
  ) %>%
  # na.rm = TRUE drops the NA placeholders before the values are pasted
  # together. This avoids stripping the literal text "NA" afterwards, which
  # would also damage any label that happens to contain those two letters.
  unite(country, chinese:american, remove = TRUE, sep = "", na.rm = TRUE) %>%
  ggplot() +
  aes(lg_salary, deviation, col = female) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ country)
The output:

Related

Difference in values between rows after group_by

I want to calculate, within each group, the difference between each row's values and those of the previous row. However, I am getting this error:
Error in mutate():
! Problem while computing ..1 = across(where(is.numeric), diff).
ℹ The error occurred in group 1: vs = 0
Caused by error in across():
! Problem while computing column mpg.
Caused by error in dplyr_internal_error():
Run rlang::last_error() to see where the error occurred.
Here is what I have tried:
# Attempt from the question; this is the call that raises the error quoted
# above.
# NOTE(review): diff() returns one value fewer than it receives, which is
# presumably why mutate() rejects the result for each group — confirm.
mtcars %>% group_by(vs) %>% mutate(across(where(is.numeric), diff))
This seems to do the trick:
# Successive differences of every column within each level of `vs`.
# aggregate() is given its data explicitly, so a preceding group_by() adds
# nothing, and its result is already a data frame. Because the two groups
# have different sizes, each differenced column comes back as a list column;
# unnest() expands those. `cols` is named explicitly because calling
# unnest() without it is deprecated.
aggregate(. ~ vs, data = mtcars, FUN = diff) %>%
  unnest(cols = where(is.list))
#//--
# A tibble: 30 × 11
vs mpg cyl disp hp drat wt qsec am gear carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 0 0 0 0 0 0.255 0.560 0 0 0
2 0 -2.3 2 200 65 -0.75 0.565 0 -1 -1 -2
3 0 -4.4 0 0 70 0.0600 0.130 -1.18 0 0 2
4 0 2.10 0 -84.2 -65 -0.140 0.500 1.56 0 0 -1
5 0 0.900 0 0 0 0 -0.340 0.200 0 0 0
6 0 -2.10 0 0 0 0 0.0500 0.400 0 0 0
7 0 -4.8 0 196. 25 -0.140 1.47 -0.0200 0 0 1
8 0 0 0 -12 10 0.0700 0.174 -0.160 0 0 0
9 0 4.3 0 -20 15 0.23 -0.0790 -0.400 0 0 0
10 0 0.800 0 -122 -80 -0.47 -1.82 -0.550 0 0 -2
# … with 20 more rows
You could explicitly define the calculation using lag. Or you could do this in base R:
library(tidyverse)
# tidyverse: within each `vs` group, subtract the previous row from the
# current one. The first row of a group is compared with itself
# (default = first value), so it comes out as 0.
mtcars |>
  group_by(vs) |>
  mutate(
    across(
      where(is.numeric),
      \(col) col - lag(col, default = first(col))
    )
  ) |>
  arrange(vs)
#> # A tibble: 32 x 11
#> # Groups: vs [2]
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0 0 0 0 0 0 0 0 0 0 0
#> 2 0 0 0 0 0 0.255 0.560 0 0 0 0
#> 3 -2.3 2 200 65 -0.75 0.565 0 0 -1 -1 -2
#> 4 -4.4 0 0 70 0.0600 0.130 -1.18 0 0 0 2
#> 5 2.10 0 -84.2 -65 -0.140 0.500 1.56 0 0 0 -1
#> 6 0.900 0 0 0 0 -0.340 0.200 0 0 0 0
#> 7 -2.10 0 0 0 0 0.0500 0.400 0 0 0 0
#> 8 -4.8 0 196. 25 -0.140 1.47 -0.0200 0 0 0 1
#> 9 0 0 -12 10 0.0700 0.174 -0.160 0 0 0 0
#> 10 4.3 0 -20 15 0.23 -0.0790 -0.400 0 0 0 0
#> # ... with 22 more rows
# base R: split the rows by `vs`, difference every column inside each piece,
# then stack the per-group results into a single data frame.
do.call(
  rbind.data.frame,
  lapply(split(mtcars, mtcars$vs), function(grp) apply(grp, 2, diff))
)
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> 0.Mazda RX4 Wag 0.0 0 0.0 0 0.00 0.255 0.56 0 0 0 0
#> 0.Hornet Sportabout -2.3 2 200.0 65 -0.75 0.565 0.00 0 -1 -1 -2
#> 0.Duster 360 -4.4 0 0.0 70 0.06 0.130 -1.18 0 0 0 2
#> 0.Merc 450SE 2.1 0 -84.2 -65 -0.14 0.500 1.56 0 0 0 -1
#> 0.Merc 450SL 0.9 0 0.0 0 0.00 -0.340 0.20 0 0 0 0
#> 0.Merc 450SLC -2.1 0 0.0 0 0.00 0.050 0.40 0 0 0 0
#> 0.Cadillac Fleetwood -4.8 0 196.2 25 -0.14 1.470 -0.02 0 0 0 1
#> 0.Lincoln Continental 0.0 0 -12.0 10 0.07 0.174 -0.16 0 0 0 0
#> 0.Chrysler Imperial 4.3 0 -20.0 15 0.23 -0.079 -0.40 0 0 0 0
#> 0.Dodge Challenger 0.8 0 -122.0 -80 -0.47 -1.825 -0.55 0 0 0 -2
#> 0.AMC Javelin -0.3 0 -14.0 0 0.39 -0.085 0.43 0 0 0 0
#> 0.Camaro Z28 -1.9 0 46.0 95 0.58 0.405 -1.89 0 0 0 2
#> 0.Pontiac Firebird 5.9 0 50.0 -70 -0.65 0.005 1.64 0 0 0 -2
#> 0.Porsche 914-2 6.8 -4 -279.7 -84 1.35 -1.705 -0.35 0 1 2 0
#> 0.Ford Pantera L -10.2 4 230.7 173 -0.21 1.030 -2.20 0 0 0 2
#> 0.Ferrari Dino 3.9 -2 -206.0 -89 -0.60 -0.400 1.00 0 0 0 2
#> 0.Maserati Bora -4.7 2 156.0 160 -0.08 0.800 -0.90 0 0 0 2
#> 1.Hornet 4 Drive -1.4 2 150.0 17 -0.77 0.895 0.83 0 -1 -1 0
#> 1.Valiant -3.3 0 -33.0 -5 -0.32 0.245 0.78 0 0 0 0
#> 1.Merc 240D 6.3 -2 -78.3 -43 0.93 -0.270 -0.22 0 0 1 1
#> 1.Merc 230 -1.6 0 -5.9 33 0.23 -0.040 2.90 0 0 0 0
#> 1.Merc 280 -3.6 2 26.8 28 0.00 0.290 -4.60 0 0 0 2
#> 1.Merc 280C -1.4 0 0.0 0 0.00 0.000 0.60 0 0 0 0
#> 1.Fiat 128 14.6 -2 -88.9 -57 0.16 -1.240 0.57 0 1 0 -3
#> 1.Honda Civic -2.0 0 -3.0 -14 0.85 -0.585 -0.95 0 0 0 1
#> 1.Toyota Corolla 3.5 0 -4.6 13 -0.71 0.220 1.38 0 0 0 -1
#> 1.Toyota Corona -12.4 0 49.0 32 -0.52 0.630 0.11 0 -1 -1 0
#> 1.Fiat X1-9 5.8 0 -41.1 -31 0.38 -0.530 -1.11 0 1 1 0
#> 1.Lotus Europa 3.1 0 16.1 47 -0.31 -0.422 -2.00 0 0 1 1
#> 1.Volvo 142E -9.0 0 25.9 -4 0.34 1.267 1.70 0 0 -1 0

How to Convert Factor to Dummy Variables?

> dput(head(final,10))
structure(list(Y = c(93.433, 104.456, 163.792, 125.249, 146.837,
78.196, 52.192, 191.33, 75.02, 145.785), X1 = c(5.9701, 9.3506,
9.718, 14.1317, 9.9278, 1.9318, 2.2236, 12.612, 13.8961, 8.1844
), X2 = c(6.047, 9.4063, 9.4967, 13.9422, 10.0581, 1.6575, 1.8749,
12.3052, 13.7316, 8.2732), X3 = c(8.1105, 8.365, 16.8862, 14.8049,
14.1477, 15.9753, 12.0362, 16.5604, 8.1691, 16.4479), x4 = c(1.70843,
0.34726, 4.76446, 2.19965, 2.80567, 7.58081, 5.59927, 3.56611,
-1.10324, 4.76204), x5 = c(1, 1, 1, 2, 1, 1, 3, 1, 2, 1)), row.names = c(NA,
10L), class = "data.frame")
x5 is my factor variable, which has type 1, 2, 3. Now I want to create x6 and x7 such that:
Types x6 x7
type1 0 0
type2 1 0
type3 0 1
How can I do this?
You can use model.matrix:
# Packages
library(magrittr)
library(dplyr)
library(stringr)

# Make x5 a factor so that model.matrix() expands it into indicator columns
final <- final %>%
  as_tibble() %>%
  mutate(x5 = as.factor(x5))

# Make the dummy variables.
# `~ 0 + x5` drops the intercept, giving one 0/1 column per level (x51, x52,
# x53). Passing `data = final` lets model.matrix() name the columns from the
# bare variable, so there is no `final$` prefix to strip afterwards.
# With an intercept (`~ x5`) the first level would instead be the reference
# category: an `(Intercept)` column plus x52 and x53 only.
final <- model.matrix(~ 0 + x5, data = final) %>%
  as_tibble() %>%
  mutate(across(everything(), as.factor)) %>%
  bind_cols(final, .) %>%
  select(-x5)
# A tibble: 10 x 8
Y X1 X2 X3 x4 x51 x52 x53
<dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct>
1 93.4 5.97 6.05 8.11 1.71 1 0 0
2 104. 9.35 9.41 8.36 0.347 1 0 0
3 164. 9.72 9.50 16.9 4.76 1 0 0
4 125. 14.1 13.9 14.8 2.20 0 1 0
5 147. 9.93 10.1 14.1 2.81 1 0 0
6 78.2 1.93 1.66 16.0 7.58 1 0 0
7 52.2 2.22 1.87 12.0 5.60 0 0 1
8 191. 12.6 12.3 16.6 3.57 1 0 0
9 75.0 13.9 13.7 8.17 -1.10 0 1 0
10 146. 8.18 8.27 16.4 4.76 1 0 0

How do I create a dummy variable that depends on values in multiple columns?

I am trying to create a treatment dummy for the states whose legal1820 in 1970 is different from their legal1820 in 1979. So I need the proper syntax for something like this: treat = ifelse((legal1820 when (year == 1970)) != (legal1820 when (year == 1979)) , 1,0)
this is the data I am using
# Load the deaths data, keep years up to 1990 and convert the labelled
# `dtype` and `agegr` columns to factors using their labels. The original
# `agegr` codes are kept in `age_cat`.
mlda <- read_dta("http://masteringmetrics.com/wp-content/uploads/2015/01/deaths.dta")
dft <- mlda %>%
  filter(year <= 1990) %>%
  mutate(
    dtype = as_factor(dtype, levels = "labels"),
    age_cat = agegr,
    agegr = as_factor(agegr, levels = "labels")
  )

# Lookup table with one row per (state, numeric state_code) pair, built from
# the fips_codes data set shipped with tidycensus.
library(tidycensus)
data("fips_codes")
fips_codes <- fips_codes %>%
  mutate(state_code = as.numeric(state_code)) %>%
  select(state, state_code) %>%
  distinct()

# Replace the numeric state code with the `state` value from the lookup,
# keep the 18-20 age group up to 1983, spread `mrate` into one column per
# `dtype`, and add `post` = 1 from 1975 onwards.
# (The closing parenthesis of the final mutate() was missing in the original
# snippet, which made the whole pipeline a syntax error.)
dft <- dft %>%
  rename(state_code = state) %>%
  right_join(fips_codes, by = "state_code") %>%
  select(-state_code) %>%
  group_by(state) %>%
  filter(agegr == "18-20 yrs", year <= 1983) %>%
  pivot_wider(names_from = dtype, values_from = mrate) %>%
  mutate(post = ifelse(year >= 1975, 1, 0))
these are the libraries I am using (most of them are for other parts of my code)
library(tidyverse)
library(AER)
library(stargazer)
library(haven)
library(lfe)
library(estimatr)
library(stringr)
library(dplyr)
library(modelsummary)
library(ggplot2)
library(haven)
Is this what you are looking for?
library(dplyr)
# Give every row of a state treat = 1 when that state's legal1820 value in
# 1970 differs from its value in 1979, and 0 otherwise.
# first() is taken of each year's values before comparing, so the comparison
# is between two scalars and does not depend on both years having the same
# number of rows within a state. If either year is absent for a state, the
# result is NA.
mlda %>%
  group_by(state) %>%
  mutate(
    treat = as.integer(
      first(legal1820[year == 1970]) != first(legal1820[year == 1979])
    )
  )
Output
# A tibble: 24,786 x 16
# Groups: state [51]
year state legal1820 dtype agegr count pop age legal beertaxa beerpercap winepercap spiritpercap totpercap mrate treat
<dbl> <dbl> <dbl> <dbl+lbl> <dbl+lbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 1970 1 0 1 [all] 1 [15-17 yrs] 224 213574 16.0 0 1.37 0.600 0.0900 0.700 1.38 105. 1
2 1971 1 0 1 [all] 1 [15-17 yrs] 241 220026 16.0 0 1.32 0.660 0.0900 0.760 1.52 110. 1
3 1972 1 0 1 [all] 1 [15-17 yrs] 270 224877 16.0 0 1.28 0.740 0.0900 0.780 1.61 120. 1
4 1973 1 0 1 [all] 1 [15-17 yrs] 258 227256 16.0 0 1.20 0.790 0.100 0.790 1.69 114. 1
5 1974 1 0 1 [all] 1 [15-17 yrs] 224 229025 16.0 0 1.08 0.830 0.160 0.810 1.80 97.8 1
6 1975 1 0.294 1 [all] 1 [15-17 yrs] 207 229739 16.0 0 0.991 0.880 0.160 0.850 1.88 90.1 1
7 1976 1 0.665 1 [all] 1 [15-17 yrs] 231 230696 16.0 0 0.937 0.890 0.150 0.860 1.89 100. 1
8 1977 1 0.668 1 [all] 1 [15-17 yrs] 219 230086 16.0 0 0.880 0.990 0.130 0.840 1.96 95.2 1
9 1978 1 0.667 1 [all] 1 [15-17 yrs] 234 229519 16.0 0 0.817 0.980 0.120 0.880 1.97 102. 1
10 1979 1 0.668 1 [all] 1 [15-17 yrs] 176 227140 16.0 0 0.734 0.980 0.120 0.840 1.94 77.5 1
# ... with 24,776 more rows

Making long format selectively pivot_longer() tidyverse in R

I'm trying to make 4 variables (id, uerate, lnw, exper) in my data.frame (d) into long format.
I also want to add two binary (0/1) columns called DL and DE; if the long-formatted value represents lnw then DL==1 and DE==0, and vice versa.
I was wondering how to correctly specify tidyr::pivot_longer() to obtain my EXPECTED OUTPUT below?
# Read the wages data set used in the question.
d <- read.csv("https://stats.idre.ucla.edu/wp-content/uploads/2016/02/wages_pp-1.txt")
# Attempt from the question: everything() makes pivot_longer() stack all four
# selected columns, including id and uerate, so they do not remain as
# identifier columns as in the expected output below.
select(d, id, uerate, lnw, exper) %>% pivot_longer(everything()) # tried without success
#### EXPECTED OUTPUT:
## id uerate variable value DE DL
## 1 31 3.21 lnw 1.491 0 1
## 2 31 3.21 lnw 1.433 0 1
## 3 31 3.21 lnw 1.469 0 1
## 4 31 3.29 lnw 1.749 0 1
## 5 31 2.90 lnw 1.931 0 1
## 6 31 2.50 lnw 1.709 0 1
## 7 31 2.60 lnw 2.086 0 1
## 8 31 4.79 lnw 2.129 0 1
## 6403 31 3.21 exper 0.015 1 0
## 6404 31 3.21 exper 0.715 1 0
## 6405 31 3.21 exper 1.734 1 0
## 6406 31 3.29 exper 2.773 1 0
## 6407 31 2.90 exper 3.927 1 0
## 6408 31 2.50 exper 4.946 1 0
## 6409 31 2.60 exper 5.965 1 0
## 6410 31 4.79 exper 6.984 1 0
We select the columns of interest, and instead of everything() in the columns to select with cols, it should select columns other than 'id', 'uerate' (-c(id, uerate)), then mutate to create new columns 'DE' and 'DL'
library(dplyr)
library(tidyr)
# Keep id and uerate as identifier columns, stack lnw and exper into
# variable/value pairs, then add 0/1 indicators telling which of the two
# each row holds (DE for exper, DL for lnw).
d %>%
  select(id, uerate, lnw, exper) %>%
  pivot_longer(cols = c(lnw, exper), names_to = "variable") %>%
  mutate(
    DE = as.integer(variable == "exper"),
    DL = 1L - DE
  ) %>%
  arrange(id, variable)
-output
# A tibble: 12,804 x 6
# id uerate variable value DE DL
# <int> <dbl> <chr> <dbl> <int> <int>
# 1 31 3.22 exper 0.015 1 0
# 2 31 3.22 exper 0.715 1 0
# 3 31 3.22 exper 1.73 1 0
# 4 31 3.30 exper 2.77 1 0
# 5 31 2.90 exper 3.93 1 0
# 6 31 2.50 exper 4.95 1 0
# 7 31 2.60 exper 5.96 1 0
# 8 31 4.80 exper 6.98 1 0
# 9 31 3.22 lnw 1.49 0 1
#10 31 3.22 lnw 1.43 0 1
# … with 12,794 more rows

Ggplot2: bubbles representing proportions by category?

I've this data :
# A tibble: 19 x 8
country Prop_A Prop_B Prop_C Prop_D Prop_E Prop_F Prop_G
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Austria 1 1 0.912 0.912 0.518 0.999 0.567
2 Belgium 1 1 0.821 1 0.687 0.0990 0.925
3 Denmark NA NA NA NA NA NA NA
4 France 0.750 1 0.361 0.345 0 0.0658 0.563
5 Germany 0.928 1 0.674 0.783 0.128 0.635 0.0828
6 Greece 0 1 0 0 0 1 0
7 Hungary 0.812 1 0.812 0.812 0 0.375 0.188
8 Israel 1 1 1 0.755 0.450 0.241 0.292
9 Italy 0.962 1 0.881 0.516 0.533 0 0.0230
10 Latvia 0 1 1 0 0 0 0
11 Lithuania 0.507 1 1 0.507 0 0 0
12 Malta 1 1 1 1 0 1 0
13 Netherlands 0.818 1 1 0.682 0.5 0.182 0.682
14 Portugal 0.829 1 1 0.829 0 0.610 0.509
15 Romania 1 1 1 1 0 0.273 1
16 Spain 1 1 1 0.787 0.215 0.191 0.653
17 Sweden 0.792 1 0.792 0.167 0.375 0 0
18 Switzerland 0.697 1 1 0.547 0.126 0.724 0.210
19 Turkey 1 1 0.842 0.775 0.585 0.810 0.117
>
0.812 represents 81% for proposal A in Hungary (row 7).
What I want is this kind of graphic :
https://zupimages.net/viewer.php?id=20/13/ob6z.png
I want to have "81%" in the bubble , countries in rows and the different "props" in columns.
I've tried geom_tile, but it doesn't work. I don't know whether my data are badly structured or whether I just haven't found the right command.
Thanks for your help!
Here is one approach to making a bubble plot.
library(tidyverse)
# Bubble chart of the proportions: countries in rows, proposals (A-G) in
# columns, bubble size driven by the percentage, and the percentage printed
# inside each bubble.
df %>%
  # names_prefix strips "Prop_" so `Type` holds just the proposal letter
  pivot_longer(
    cols = starts_with("Prop"),
    names_to = "Type",
    names_prefix = "Prop_"
  ) %>%
  mutate(
    value = value * 100,
    # Build the "81%" style label asked for in the question. Missing
    # proportions keep an NA label so geom_text() drops them rather than
    # drawing the text "NA%".
    label = if_else(is.na(value), NA_character_, paste0(round(value), "%"))
  ) %>%
  ggplot(aes(x = Type, y = country, size = value, label = label)) +
  geom_point(shape = 21, fill = "white") +
  geom_text(size = 3) +
  # guide = "none" hides the size legend; the logical form (guide = F) is
  # deprecated, and F is an ordinary variable that can be reassigned.
  scale_size(range = c(5, 15), guide = "none")
Plot

Resources