How to Convert Factor to Dummy Variables?

How to Convert Factor to Dummy Variables? - r

> dput(head(final,10))
structure(list(Y = c(93.433, 104.456, 163.792, 125.249, 146.837,
78.196, 52.192, 191.33, 75.02, 145.785), X1 = c(5.9701, 9.3506,
9.718, 14.1317, 9.9278, 1.9318, 2.2236, 12.612, 13.8961, 8.1844
), X2 = c(6.047, 9.4063, 9.4967, 13.9422, 10.0581, 1.6575, 1.8749,
12.3052, 13.7316, 8.2732), X3 = c(8.1105, 8.365, 16.8862, 14.8049,
14.1477, 15.9753, 12.0362, 16.5604, 8.1691, 16.4479), x4 = c(1.70843,
0.34726, 4.76446, 2.19965, 2.80567, 7.58081, 5.59927, 3.56611,
-1.10324, 4.76204), x5 = c(1, 1, 1, 2, 1, 1, 3, 1, 2, 1)), row.names = c(NA,
10L), class = "data.frame")
x5 is my factor variable, which has type 1, 2, 3. Now I want to create x6 and x7 such that:
Types x6 x7
type1 0 0
type2 1 0
type3 0 1
How to do?

You can use model.matrix:
# Packages
library(magrittr)
library(dplyr)
library(stringr)
# Make x5 a factor
final <- final %>%
as_tibble() %>%
mutate(
x5 = as.factor(x5)
)
# Make the dummy variables
final <- model.matrix(~0+final$x5) %>%
as_tibble() %>%
rename_all(~str_remove_all(., '.*\\$')) %>%
mutate(
across(everything(), as.factor)
) %>%
bind_cols(final, .) %>%
select(-x5)
# A tibble: 10 x 8
Y X1 X2 X3 x4 x51 x52 x53
<dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct>
1 93.4 5.97 6.05 8.11 1.71 1 0 0
2 104. 9.35 9.41 8.36 0.347 1 0 0
3 164. 9.72 9.50 16.9 4.76 1 0 0
4 125. 14.1 13.9 14.8 2.20 0 1 0
5 147. 9.93 10.1 14.1 2.81 1 0 0
6 78.2 1.93 1.66 16.0 7.58 1 0 0
7 52.2 2.22 1.87 12.0 5.60 0 0 1
8 191. 12.6 12.3 16.6 3.57 1 0 0
9 75.0 13.9 13.7 8.17 -1.10 0 1 0
10 146. 8.18 8.27 16.4 4.76 1 0 0

Related

Scatterplot with multi variables

Scatterplot reference
data set
Can someone help me create three scatter plots as in the first picture? Ideally using the plot() function.

require(tidyverse)
require(ggplot2)
df <- tibble(
image = 1:18,
m_r_exsal = rnorm(18, 5, 2),
m_r_sal = rnorm(18, 6, 2),
female = c(rep(1, 18/2), rep(0, 18/2)),
lg_salary = rnorm(18, 5, 1.5),
deviation = rnorm(18, 1, 1),
chinese = c(rep(1, 6), rep(0, 18/3*2)),
european = c(rep(0, 6), rep(1, 6), rep(0, 6)),
american = c(rep(0, 18/3*2), rep(1, 6))
)
Example data:
# A tibble: 18 x 9
image m_r_exsal m_r_sal female lg_salary deviation chinese european american
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 6.19 9.56 1 5.48 1.54 1 0 0
2 2 10.1 5.17 1 3.77 -0.755 1 0 0
3 3 4.96 1.91 1 6.75 0.381 1 0 0
4 4 5.10 4.57 1 4.61 -0.207 1 0 0
5 5 -1.25 6.57 1 2.33 0.880 1 0 0
6 6 6.77 9.10 1 3.07 1.03 1 0 0
7 7 4.04 4.84 1 4.56 1.95 0 1 0
8 8 3.72 4.72 1 5.32 1.17 0 1 0
9 9 7.59 7.05 1 6.24 -0.224 0 1 0
10 10 4.09 3.94 0 5.60 2.52 0 1 0
11 11 4.15 6.05 0 7.08 -0.152 0 1 0
12 12 6.07 5.27 0 5.79 -0.323 0 1 0
13 13 4.49 4.64 0 5.97 0.457 0 0 1
14 14 6.74 4.70 0 3.38 0.377 0 0 1
15 15 7.46 9.02 0 6.65 1.85 0 0 1
16 16 4.29 5.26 0 4.07 2.18 0 0 1
17 17 2.33 1.58 0 8.43 1.06 0 0 1
18 18 4.78 8.75 0 5.03 0.101 0 0 1
Making the plot:
df %>%
mutate(chinese = case_when(chinese == 1 ~ "chinese"),
european = case_when(european == 1 ~ "european"),
american = case_when(american == 1 ~ "american"),
female = case_when(female == 1 ~ "female",
TRUE ~ "male")) %>%
unite(country, chinese:american, remove = TRUE, sep = "") %>%
mutate(country = country %>% str_remove_all("NA")) %>%
ggplot() +
aes(lg_salary, deviation, col = female) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
facet_wrap(~ country)
The output:

Column median for each five elements of a column Using a dataframe

I have a dataframe with 20 columns and 20 rows. I want to calculate the median of each five row for each column using a loop or a function. I need 20 columns with five median for each of them.

In base R you could accomplish the same by using tapply:
set.seed(1)
m <- matrix(rexp(400, rate=.1), ncol=20)
t(tapply(m, list(col(m), (row(m)-1) %/% 5), median))
1 2 3 4 5 6 7
0 4.360686 5.658655 10.798811 11.081767 8.185142 5.162061 7.430436 ...
1 9.565675 9.968130 9.945558 8.370065 13.456440 7.631800 11.910946 ...
2 12.376036 3.240102 5.946177 17.847654 6.812291 14.195492 3.788268 ...
3 6.547466 7.252143 7.741878 8.145358 2.637383 2.991589 7.851209 ...

library(dplyr)
set.seed(1)
matrix(rexp(400, rate=.1), ncol=20) %>%
as_tibble(.name_repair = ~paste0('X', 1:20)) %>%
group_by(id = rep(1:4, each = 5)) %>%
summarise(
across(everything(), median)
)
# A tibble: 4 x 21
id X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 4.36 5.66 10.8 11.1 8.19 5.16 7.43 10.3 14.8 7.51 6.75 10.9 13.2 7.37 6.92 17.8 4.63 9.56 7.17 5.03
2 2 9.57 9.97 9.95 8.37 13.5 7.63 11.9 8.05 21.1 5.42 6.73 6.22 4.79 10.1 4.33 12.8 6.55 5.00 7.61 3.43
3 3 12.4 3.24 5.95 17.8 6.81 14.2 3.79 17.4 12.9 8.02 10.3 5.62 7.22 5.21 2.92 2.65 6.85 4.29 5.85 0.542
4 4 6.55 7.25 7.74 8.15 2.64 2.99 7.85 7.12 7.62 7.37 6.63 6.46 7.50 12.9 10.9 6.59 1.93 10.5 4.68 11.4

pivot_longer on a mix of matrix columns and regular vector columns

I have a tibble where some columns are matrices. Here's a toy example:
library(dplyr)
library(tidyr)
dat <- structure(list(id = 0:5, matrix_column = structure(c(-1.34333431222985,
-1.54123232044003, -1.7260282725816, -1.8924463753132, -2.0376516335872,
-2.16069643164938, -0.250406602741403, -0.287716094522968, -0.32269823315914,
-0.354360193430544, -0.382155662949252, -0.405883260458378, 1.53709630050992,
1.76715755374983, 1.98313378488307, 2.17881959842109, 2.35072520728221,
2.4974704619887), .Dim = c(6L, 3L)), vector_column = c(10.453112322311,
10.3019556236512, 10.1273409693709, 9.91474471968391, 9.65093549479026,
9.32601906868098)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
The tibble looks like this.
> dat
# A tibble: 6 x 3
id matrix_column[,1] [,2] [,3] vector_column
<int> <dbl> <dbl> <dbl> <dbl>
1 0 -1.34 -0.250 1.54 10.5
2 1 -1.54 -0.288 1.77 10.3
3 2 -1.73 -0.323 1.98 10.1
4 3 -1.89 -0.354 2.18 9.91
5 4 -2.04 -0.382 2.35 9.65
6 5 -2.16 -0.406 2.50 9.33
If I apply pivot_longer from tidyr to the non-id columns, the values in vector_column get replicated to fill the two additional columns required to accommodate matrix_column.
dat %>%
pivot_longer(cols = -id, values_to = "new_column")
# A tibble: 12 x 3
id name new_column[,1] [,2] [,3]
<int> <chr> <dbl> <dbl> <dbl>
1 0 matrix_column -1.34 -0.250 1.54
2 0 vector_column 10.5 10.5 10.5
3 1 matrix_column -1.54 -0.288 1.77
4 1 vector_column 10.3 10.3 10.3
5 2 matrix_column -1.73 -0.323 1.98
6 2 vector_column 10.1 10.1 10.1
7 3 matrix_column -1.89 -0.354 2.18
8 3 vector_column 9.91 9.91 9.91
9 4 matrix_column -2.04 -0.382 2.35
10 4 vector_column 9.65 9.65 9.65
11 5 matrix_column -2.16 -0.406 2.50
12 5 vector_column 9.33 9.33 9.33
Is there a way to have the [,2] and the [,3] columns of new_column to be NA (instead of the same value of [,1]) when name equals vector_column?
Something like
# A tibble: 12 x 3
id name new_column[,1] [,2] [,3]
<int> <chr> <dbl> <dbl> <dbl>
1 0 matrix_column -1.34 -0.250 1.54
2 0 vector_column 10.5 NA NA
3 1 matrix_column -1.54 -0.288 1.77
4 1 vector_column 10.3 NA NA
My real life data have dozens of matrix columns and vector columns.

If you continue with the format of data that you currently have (having dataframe and matrix together) you'll keep on running into trouble to work with it. I would suggest to convert the matrix into dataframe and add them as their separate columns.
library(dplyr)
library(tidyr)
dat$matrix_column %>%
data.frame() %>%
bind_cols(dat %>% select(-matrix_column)) %>%
pivot_longer(cols = -id, values_to = "new_column")
# id name new_column
# <int> <chr> <dbl>
# 1 0 X1 -1.34
# 2 0 X2 -0.250
# 3 0 X3 1.54
# 4 0 vector_column 10.5
# 5 1 X1 -1.54
# 6 1 X2 -0.288
# 7 1 X3 1.77
# 8 1 vector_column 10.3
# 9 2 X1 -1.73
#10 2 X2 -0.323
# … with 14 more rows

How do I create a dummy variable that depends on values in multiple columns?

I am trying to create a treatment dummy for the states whose 1970 legal1820 is different from their legal1820 in 1979. So I need the proper syntax for somethihng like this treat = ifelse((legal1820 when (year == 1970)) != (legal1820 when (year == 1979)) , 1,0)
this is the data I am using
mlda <- read_dta("http://masteringmetrics.com/wp-content/uploads/2015/01/deaths.dta")
dft <- mlda %>%
filter(year <= 1990) %>%
mutate(dtype = as_factor(dtype, levels = "labels"),
age_cat = agegr,
agegr = as_factor(agegr, levels = "labels"))
library(tidycensus)
data("fips_codes")
fips_codes <- fips_codes %>%
mutate(state_code = as.numeric(state_code)) %>%
select(state, state_code) %>%
distinct()
dft <- dft %>%
rename(state_code = state) %>%
right_join(fips_codes, by = "state_code") %>%
select(-state_code)%>%
group_by(state)%>%
filter(agegr == "18-20 yrs", year <= 1983)%>%
pivot_wider(names_from = dtype, values_from = mrate)%>%
mutate(post = ifelse(year >= 1975 ,1,0)
these are the libraries I am using (most of them are for other parts of my code)
library(tidyverse)
library(AER)
library(stargazer)
library(haven)
library(lfe)
library(estimatr)
library(stringr)
library(dplyr)
library(modelsummary)
library(ggplot2)
library(haven)

Is this what you are looking for?
library(dplyr)
mlda %>% group_by(state) %>% mutate(treat = +(first(legal1820[year == 1970] != legal1820[year == 1979])))
Output
# A tibble: 24,786 x 16
# Groups: state [51]
year state legal1820 dtype agegr count pop age legal beertaxa beerpercap winepercap spiritpercap totpercap mrate treat
<dbl> <dbl> <dbl> <dbl+lbl> <dbl+lbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 1970 1 0 1 [all] 1 [15-17 yrs] 224 213574 16.0 0 1.37 0.600 0.0900 0.700 1.38 105. 1
2 1971 1 0 1 [all] 1 [15-17 yrs] 241 220026 16.0 0 1.32 0.660 0.0900 0.760 1.52 110. 1
3 1972 1 0 1 [all] 1 [15-17 yrs] 270 224877 16.0 0 1.28 0.740 0.0900 0.780 1.61 120. 1
4 1973 1 0 1 [all] 1 [15-17 yrs] 258 227256 16.0 0 1.20 0.790 0.100 0.790 1.69 114. 1
5 1974 1 0 1 [all] 1 [15-17 yrs] 224 229025 16.0 0 1.08 0.830 0.160 0.810 1.80 97.8 1
6 1975 1 0.294 1 [all] 1 [15-17 yrs] 207 229739 16.0 0 0.991 0.880 0.160 0.850 1.88 90.1 1
7 1976 1 0.665 1 [all] 1 [15-17 yrs] 231 230696 16.0 0 0.937 0.890 0.150 0.860 1.89 100. 1
8 1977 1 0.668 1 [all] 1 [15-17 yrs] 219 230086 16.0 0 0.880 0.990 0.130 0.840 1.96 95.2 1
9 1978 1 0.667 1 [all] 1 [15-17 yrs] 234 229519 16.0 0 0.817 0.980 0.120 0.880 1.97 102. 1
10 1979 1 0.668 1 [all] 1 [15-17 yrs] 176 227140 16.0 0 0.734 0.980 0.120 0.840 1.94 77.5 1
# ... with 24,776 more rows

pmap_ variants operating on data.frames as lists

I have a recollection that purrr::pmap_* can treat a data.frame as a list but the syntax eludes me.
Imagine we wanted to fit a separate lm object for each value of mtcars$vs and mtcars$am
library(tidyverse)
library(broom)
d1 <- mtcars %>%
group_by(
vs, am
) %>%
nest %>%
mutate(
coef = data %>%
map(
~lm(mpg ~ wt, data =.) %>%
tidy
)
)
If I wanted to extract the coefficient estimates as an un-nested data.frame, and append the values of am and vs, I might try
d1[, -3] %>%
pmap_dfr(
function(i, j, k)
k %>%
mutate(
vs = i,
am = j
)
)
But this results in an error. More explicitly declaring these variables as separate lists has the desired effect
list(
d1$vs,
d1$am,
d1$coef
) %>%
pmap_dfr(
function(i, j, k)
k %>%
mutate(
vs = i,
am = j
)
)
Is there a succinct way for pmap_* to treat a data.frame as a list?

We can use the standard option to extract the components (..1, ..2, etc)
d1[, -3] %>%
pmap_dfr(~ ..3 %>%
mutate(vs = ..1, am = ..2))
# A tibble: 8 x 7
# term estimate std.error statistic p.value vs am
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 (Intercept) 42.4 3.30 12.8 0.000213 0 1
#2 wt -7.91 1.14 -6.93 0.00227 0 1
#3 (Intercept) 44.1 6.96 6.34 0.00144 1 1
#4 wt -7.77 3.36 -2.31 0.0689 1 1
#5 (Intercept) 31.5 8.98 3.51 0.0171 1 0
#6 wt -3.38 2.80 -1.21 0.281 1 0
#7 (Intercept) 25.1 3.51 7.14 0.0000315 0 0
#8 wt -2.44 0.842 -2.90 0.0159 0 0

This is because the second list has no names attribute. If you unname d1 it works. The fact that you used the list function in the second example doesn't make a difference (except that it removed the names), because both objects are lists (data frames are lists).
d1[, -3] %>%
unname %>%
pmap_dfr(
function(i, j, k)
k %>%
mutate(
vs = i,
am = j
)
)
# # A tibble: 8 x 7
# term estimate std.error statistic p.value vs am
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 (Intercept) 42.4 3.30 12.8 0.000213 0 1
# 2 wt -7.91 1.14 -6.93 0.00227 0 1
# 3 (Intercept) 44.1 6.96 6.34 0.00144 1 1
# 4 wt -7.77 3.36 -2.31 0.0689 1 1
# 5 (Intercept) 31.5 8.98 3.51 0.0171 1 0
# 6 wt -3.38 2.80 -1.21 0.281 1 0
# 7 (Intercept) 25.1 3.51 7.14 0.0000315 0 0
# 8 wt -2.44 0.842 -2.90 0.0159 0 0
You can also name the arguments in your first code block's function to match (or use ..1 etc) for the same result
d1[, -3] %>%
pmap_dfr(
function(vs, am, coef)
coef %>%
mutate(
vs = vs,
am = am
)
)
# # A tibble: 8 x 7
# term estimate std.error statistic p.value vs am
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 (Intercept) 42.4 3.30 12.8 0.000213 0 1
# 2 wt -7.91 1.14 -6.93 0.00227 0 1
# 3 (Intercept) 44.1 6.96 6.34 0.00144 1 1
# 4 wt -7.77 3.36 -2.31 0.0689 1 1
# 5 (Intercept) 31.5 8.98 3.51 0.0171 1 0
# 6 wt -3.38 2.80 -1.21 0.281 1 0
# 7 (Intercept) 25.1 3.51 7.14 0.0000315 0 0
# 8 wt -2.44 0.842 -2.90 0.0159 0 0
You could also use wap from the experimental rap package
library(rap)
d1[, -3] %>%
wap( ~ coef %>%
mutate(
vs = vs,
am = am)) %>%
bind_rows
# # A tibble: 8 x 7
# term estimate std.error statistic p.value vs am
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 (Intercept) 42.4 3.30 12.8 0.000213 0 1
# 2 wt -7.91 1.14 -6.93 0.00227 0 1
# 3 (Intercept) 44.1 6.96 6.34 0.00144 1 1
# 4 wt -7.77 3.36 -2.31 0.0689 1 1
# 5 (Intercept) 31.5 8.98 3.51 0.0171 1 0
# 6 wt -3.38 2.80 -1.21 0.281 1 0
# 7 (Intercept) 25.1 3.51 7.14 0.0000315 0 0
# 8 wt -2.44 0.842 -2.90 0.0159 0 0

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

How to Convert Factor to Dummy Variables? - r

Related

Scatterplot with multi variables

Column median for each five elements of a column Using a dataframe

pivot_longer on a mix of matrix columns and regular vector columns

How do I create a dummy variable that depends on values in multiple columns?

pmap_ variants operating on data.frames as lists

Categories

Resources