I have a list like below
# Two small data frames with the same columns and the rows in opposite order.
d1 <- data.frame(
  y1 = c(1, 2, 3),
  y2 = c(4, 5, 6)
)
d2 <- data.frame(
  y1 = c(3, 2, 1),
  y2 = c(6, 5, 4)
)

# Collect both data frames in a list; the elements have no names yet.
my.list <- list(d1, d2)

# Display the structure of the unnamed list.
str(my.list)
List of 2
$ :'data.frame': 3 obs. of 2 variables:
..$ y1: num [1:3] 1 2 3
..$ y2: num [1:3] 4 5 6
$ :'data.frame': 3 obs. of 2 variables:
..$ y1: num [1:3] 3 2 1
..$ y2: num [1:3] 6 5 4
What I want is to assign a name of my choice to each element of this list,
for example
$myFirst :'data.frame': 3 obs. of 2 variables:
..$ y1: num [1:3] 1 2 3
..$ y2: num [1:3] 4 5 6
$mySecond :'data.frame': 3 obs. of 2 variables:
..$ y1: num [1:3] 3 2 1
..$ y2: num [1:3] 6 5 4
I tried to do it with
# Names intended for the list elements, in order.
myNam<-c("myFirst","mySecond")
# Assigns to names(myNam), i.e. the character vector itself, rather than to
# names(my.list); the right-hand side is the result of sapply(my.list, paste).
names(myNam) <- sapply(my.list,paste)
where is the problem ?
Here is a new data set on which the function does not work:
df<- structure(list(A = structure(list(breaks = c(-10, -9, -8, -7,
-6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4), counts = c(1L, 0L, 1L,
5L, 9L, 38L, 56L, 105L, 529L, 2858L, 17L, 2L, 0L, 2L), density = c(0.000276014352746343,
0, 0.000276014352746343, 0.00138007176373171, 0.00248412917471709,
0.010488545404361, 0.0154568037537952, 0.028981507038366, 0.146011592602815,
0.788849020149048, 0.00469224399668783, 0.000552028705492686,
0, 0.000552028705492686), mids = c(-9.5, -8.5, -7.5, -6.5, -5.5,
-4.5, -3.5, -2.5, -1.5, -0.5, 0.5, 1.5, 2.5, 3.5), xname = "x",
equidist = TRUE), .Names = c("breaks", "counts", "density",
"mids", "xname", "equidist"), class = "histogram"), B = structure(list(
breaks = c(-7, -6, -5, -4, -3, -2, -1, 0), counts = c(2L,
0L, 6L, 2L, 2L, 1L, 3L), density = c(0.125, 0, 0.375, 0.125,
0.125, 0.0625, 0.1875), mids = c(-6.5, -5.5, -4.5, -3.5,
-2.5, -1.5, -0.5), xname = "x", equidist = TRUE), .Names = c("breaks",
"counts", "density", "mids", "xname", "equidist"), class = "histogram"),
C = structure(list(breaks = c(-7, -6, -5, -4, -3, -2, -1,
0, 1), counts = c(2L, 2L, 4L, 5L, 14L, 22L, 110L, 3L), density = c(0.0123456790123457,
0.0123456790123457, 0.0246913580246914, 0.0308641975308642,
0.0864197530864197, 0.135802469135802, 0.679012345679012,
0.0185185185185185), mids = c(-6.5, -5.5, -4.5, -3.5, -2.5,
-1.5, -0.5, 0.5), xname = "x", equidist = TRUE), .Names = c("breaks",
"counts", "density", "mids", "xname", "equidist"), class = "histogram")), .Names = c("A",
"B", "C"))
We need to assign the 'myNam' to the names of my.list
# Set the names of the list elements in place: the i-th entry of myNam becomes
# the name of the i-th element of my.list.
names(my.list) <- myNam
# Display the structure again; each element now carries its name.
str(my.list)
#List of 2
#$ myFirst :'data.frame': 3 obs. of 2 variables:
# ..$ y1: num [1:3] 1 2 3
# ..$ y2: num [1:3] 4 5 6
#$ mySecond:'data.frame': 3 obs. of 2 variables:
# ..$ y1: num [1:3] 3 2 1
# ..$ y2: num [1:3] 6 5 4
or with setNames
# setNames() returns a named copy and leaves my.list itself unchanged;
# assign the result (e.g. back to my.list) to keep the names.
setNames(my.list, myNam)
The OP's code
sapply(my.list,paste)
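loops through the list elements and converts each column into a single string (for example "c(1, 2, 3)"). The result is then assigned to names(myNam) rather than to names(my.list), so the list itself is never named.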
Related
I was able to run Fisher's exact test in R on my data:
# Fisher's exact test on the cross-tabulation of the two dtQ1 columns, with a
# simulated p-value (simulate.p.value = TRUE). The table() call is kept inline
# because the printed "data:" line echoes this expression.
fisherTest <- fisher.test(table(dtQ1$IHD_other_healthy, dtQ1$ACE_FREQ_SUM), workspace = 2e8, simulate.p.value=TRUE)
# Print the test result.
fisherTest
and the results are significant:
Fisher's Exact Test for Count Data with simulated p-value (based on 2000 replicates)
data: table(dtQ1$IHD_other_healthy, dtQ1$ACE_FREQ_SUM)
p-value = 0.0004998
alternative hypothesis: two.sided
Now, i am trying to run the pairwise test:
# Pairwise Fisher tests on the same cross-tabulation, requesting Bonferroni
# adjustment of the p-values; this is the call that raises the error shown below.
pairwise_fisher_test(table(dtQ1$IHD_other_healthy, dtQ1$ACE_FREQ_SUM), p.adjust.method = "bonferroni")
but i get an error:
Error in pairwise_fisher_test(table(dtQ1$IHD_other_healthy, dtQ1$ACE_FREQ_SUM), :
A two-dimensionnal contingency table required.
How can I create a two-dimensional contingency table from my data?
str(dtQ1)
Classes ‘data.table’ and 'data.frame': 1018 obs. of 2 variables:
$ IHD_other_healthy: dbl+lbl [1:1018] 1, 1, 1, 2, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 2, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 2, 1...
..# format.spss : chr "F8.2"
..# display_width: int 19
..# labels : Named num 0 1 2
.. ..- attr(*, "names")= chr [1:3] "no illness" "other than IHD" "IHD"
$ ACE_FREQ_SUM : num 3 0 1 0 1 1 0 2 1 1 ...
..- attr(*, "format.spss")= chr "F8.2"
..- attr(*, "display_width")= int 14
- attr(*, ".internal.selfref")=<externalptr>
I have the following dataframe df:
adj_coords
1 2, 3, 4, 5, 6, 7
2 1, 3, 7, 8, 9, 10
3 1, 2, 4, 10, 11, 12
4 1, 3, 5, 12, 13, 14
5 1, 4, 6, 14, 15, 16
6 1, 5, 7, 16, 17, 18
adj_coords_material_amounts
1 0.0000, 0.0000, 0.0000, 0.0000, 632.6667, 264.3333
2 263.0000, 0.0000, 264.3333, 262.6667, 0.0000, 238.6667
3 263.0000, 0.0000, 0.0000, 238.6667, 0.0000, 298.3333
4 263.0000, 0.0000, 0.0000, 298.3333, 300.6667, 279.3333
5 263.0000, 0.0000, 632.6667, 279.3333, 0.0000, 273.3333
6 263.0000, 0.0000, 264.3333, 273.3333, 0.0000, 0.0000
df<-structure(list(adj_coords = list(2:7, c(1L, 3L, 7L, 8L, 9L, 10L
), c(1L, 2L, 4L, 10L, 11L, 12L), c(1L, 3L, 5L, 12L, 13L, 14L),
c(1L, 4L, 6L, 14L, 15L, 16L), c(1L, 5L, 7L, 16L, 17L, 18L
)), adj_coords_material_amounts = list(c(0, 0, 0, 0, 632.666666666666,
264.333333333334), c(263, 0, 264.333333333334, 262.666666666667,
0, 238.666666666667), c(263, 0, 0, 238.666666666667, 0, 298.333333333333
), c(263, 0, 0, 298.333333333333, 300.666666666667, 279.333333333334
), c(263, 0, 632.666666666666, 279.333333333334, 0, 273.333333333334
), c(263, 0, 264.333333333334, 273.333333333334, 0, 0))), row.names = c(NA,
6L), class = "data.frame")
I would like to sample one element from each row of adj_coords but only where the corresponding element in adj_coords_material_amounts is >0.
Loop over each paired set of adj_coords and adj_coords_material_amounts using mapply, and sample one value from the elements whose corresponding amount is > 0.
##set.seed(1)
# For every row, draw one coordinate among those whose material amount is
# positive. `co` is the row's coordinate vector, `ma` the matching amounts.
mapply(
  \(co, ma) {
    candidates <- co[ma > 0]
    # No positive amount in this row: return a missing value of the same type
    # as the coordinates instead of failing.
    if (length(candidates) == 0L) {
      return(co[NA_integer_])
    }
    # Draw a position, not a value: sample(x, 1) would sample from 1:x when
    # only a single candidate x is left. For two or more candidates this
    # consumes the random stream exactly like sample(candidates, 1).
    candidates[sample.int(length(candidates), 1L)]
  },
  df[["adj_coords"]], df[["adj_coords_material_amounts"]]
)
#[1] 6 10 12 1 6 1
I am not that familiar with dplyr, but below is one of my attempts:
# Sample one (coordinate, amount) pair per original row, restricted to pairs
# with a positive amount. Rows without any positive amount are removed by the
# filter step and therefore do not appear in the result.
df %>%
  # Tag each original row; seq_len(n()) also works on a zero-row input,
  # where 1:n() would produce c(1, 0) and make mutate() fail.
  mutate(id = seq_len(n())) %>%
  # One output row per list element, keeping the two list columns aligned.
  unnest(c(adj_coords, adj_coords_material_amounts)) %>%
  filter(adj_coords_material_amounts > 0) %>%
  group_by(id) %>%
  slice_sample(n = 1) %>%
  ungroup() %>%
  # The helper id is no longer needed once sampling is done.
  select(!id)
and you will see
# A tibble: 6 × 2
adj_coords adj_coords_material_amounts
<int> <dbl>
1 7 264.
2 8 263.
3 1 263
4 14 279.
5 16 273.
6 1 263
I'm wondering if there is a way to simplify this code and avoid the repetition, given that the column names are identical except for one digit that increases with each operation.
# Original, repetitive version: the same formula is written once per channel
# (1 to 7); only the channel digit in ATN<i>.1, Sen1Ch<i> and RefCh<i> changes.
# Rows with Status == 1 are set to NA, all others to -100 * log(Sen1Ch / RefCh).
out <- df %>%
mutate (ATN1.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch1/RefCh1)),
ATN2.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch2/RefCh2)),
ATN3.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch3/RefCh3)),
ATN4.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch4/RefCh4)),
ATN5.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch5/RefCh5)),
ATN6.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch6/RefCh6)),
ATN7.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch7/RefCh7)))
This is a small subset of my data if you wanna play with it
df = structure(list(Status = c(1, 17, 1, 1, 1, 1, 2, 0, 0, 0), ATN1.1 = c(NA,
NA, NA, NA, NA, NA, 0, 0.187761662304176, 0.373310604025045,
0.570139498143909), ATN2.1 = c(NA, NA, NA, NA, NA, NA, 0, 0.136443172947395,
0.269071359915515, 0.407552762179439), ATN3.1 = c(NA, NA, NA,
NA, NA, NA, 0, 0.113733164068766, 0.224219770615697, 0.336923929839777
), ATN4.1 = c(NA, NA, NA, NA, NA, NA, 0, 0.0942969310983806,
0.186894753425896, 0.279629737677226), ATN5.1 = c(NA, NA, NA,
NA, NA, NA, 0, 0.0753327883349684, 0.149617411430523, 0.22690457078205
), ATN6.1 = c(NA, NA, NA, NA, NA, NA, 0, 0.0493106158715682,
0.100348708536177, 0.155828822066352), ATN7.1 = c(NA, NA, NA,
NA, NA, NA, 0, 0.0526398637123631, 0.103191368342497, 0.154644102801848
), ATN0.1.1 = c(NA, NA, NA, NA, NA, NA, 15.054824247419, 15.054824247419,
15.054824247419, 15.054824247419), ATN0.2.1 = c(NA, NA, NA, NA,
NA, NA, 24.1338734012274, 24.1338734012274, 24.1338734012274,
24.1338734012274), ATN0.3.1 = c(NA, NA, NA, NA, NA, NA, 27.4233147524393,
27.4233147524393, 27.4233147524393, 27.4233147524393), ATN0.4.1 = c(NA,
NA, NA, NA, NA, NA, 20.8560560826831, 20.8560560826831, 20.8560560826831,
20.8560560826831), ATN0.5.1 = c(NA, NA, NA, NA, NA, NA, 17.1645092239121,
17.1645092239121, 17.1645092239121, 17.1645092239121), ATN0.6.1 = c(NA,
NA, NA, NA, NA, NA, 4.4180613710882, 4.4180613710882, 4.4180613710882,
4.4180613710882), ATN0.7.1 = c(NA, NA, NA, NA, NA, NA, 10.8192165605015,
10.8192165605015, 10.8192165605015, 10.8192165605015), Sen1Ch1 = c(0,
99, 0, 783198, 785643, 787093, 786717, 785935, 784922, 783784
), Sen2Ch1 = c(0, 324, 0, 793643, 796398, 798041, 798658, 798957,
799003, 798951), Sen1Ch2 = c(0, 53, 0, 739627, 741339, 742308,
741804, 741195, 740403, 739520), Sen2Ch2 = c(0, 416, 0, 743716,
745420, 746399, 746532, 746599, 746467, 746279), Sen1Ch3 = c(0,
49, 0, 720709, 722113, 722900, 722515, 722002, 721364, 720681
), Sen2Ch3 = c(0, 294, 0, 734485, 735877, 736650, 736749, 736783,
736664, 736513), Sen1Ch4 = c(0, 61, 0, 732332, 732529, 732487,
731524, 730678, 729723, 728756), Sen2Ch4 = c(0, 222, 0, 737261,
737172, 736976, 736329, 735869, 735302, 734762), Sen1Ch5 = c(0,
59, 0, 765776, 767327, 768116, 767883, 767617, 767121, 766567
), Sen2Ch5 = c(0, 248, 0, 775632, 777074, 777800, 777883, 777970,
777832, 777655), Sen1Ch6 = c(0, 57, 0, 899145, 901398, 902644,
902723, 902737, 902436, 902095), Sen2Ch6 = c(0, 352, 0, 926157,
928263, 929423, 929746, 930043, 930042, 930025), Sen1Ch7 = c(0,
45, 0, 845802, 848332, 849736, 849960, 850137, 849979, 849764
), Sen2Ch7 = c(0, 360, 0, 867160, 869852, 871321, 871830, 872308,
872428, 872500), RefCh1 = c(0, 10100, 0, 908802, 911770, 913546,
914536, 915344, 915862, 916336), RefCh2 = c(0, 6200, 0, 940232,
942473, 943743, 944281, 944794, 945037, 945218), RefCh3 = c(0,
6200, 0, 947069, 948944, 950017, 950484, 950890, 951100, 951271
), RefCh4 = c(0, 14700, 0, 900977, 901433, 901543, 901167, 900974,
900630, 900271), RefCh5 = c(0, 8250, 0, 908355, 910304, 911295,
911674, 912045, 912133, 912179), RefCh6 = c(0, 6200, 0, 939365,
941703, 942978, 943500, 943980, 944147, 944314), RefCh7 = c(0,
6200, 0, 941728, 944713, 946375, 947078, 947774, 948077, 948325
)), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"
))
You can feed dynamic variable names to mutate with !!sym for example:
# Recompute ATN<i>.1 for channels 1 to 7 with dynamically built column names.
# Start from df once and keep updating `out`; restarting from df inside the
# loop would discard every column computed in earlier iterations.
out <- df
for (i in seq_len(7)) {
  atn <- sym(sprintf("ATN%s.1", i))
  sen <- sym(paste0("Sen1Ch", i))
  ref <- sym(paste0("RefCh", i))
  out <- out %>%
    mutate(
      # `:=` is required because the left-hand side is unquoted with !!.
      # NA_real_ matches the double result of the log expression.
      !!atn := ifelse(Status == 1, NA_real_, -100 * log((!!sen) / (!!ref)))
    )
}
Note you need := inside the mutate.
Here is a base R solution with mapply. First, define an auxiliary function f to make the code more readable. Then use regular expressions to get the names of the columns to be changed and of the columns that take part in the formula. Finally, call the function f in a mapply loop.
# Attenuation helper: -100 * log(x / y), blanked out where Status is 1.
#
# x, y    Numeric vectors (numerator and denominator of the ratio).
# Status  Vector of status codes, compared element-wise against 1.
#
# Returns a double vector: NA where Status == 1, -100 * log(x / y) elsewhere.
f <- function(x, y, Status) {
  # NA_real_ (not NA_integer_) keeps the result double even when every
  # element of Status equals 1.
  ifelse(Status == 1, NA_real_, -100 * log(x / y))
}
# Pick up the three parallel groups of column names by pattern: the ATN<i>.1
# targets and the Sen1Ch<i> / RefCh<i> inputs of the formula.
atn <- grep("^ATN\\d\\.1$", names(df), value = TRUE)
sen1ch <- grep("^Sen1Ch", names(df), value = TRUE)
refch <- grep("^RefCh", names(df), value = TRUE)

# Apply f to each (Sen1Ch<i>, RefCh<i>) pair, passing the shared Status column
# to every call, and store the results in the ATN columns.
df[atn] <- unname(Map(
  function(sen, ref) f(sen, ref, Status = df$Status),
  df[sen1ch],
  df[refch]
))

# Print the updated data.
df
#> # A tibble: 10 x 36
#> Status ATN1.1 ATN2.1 ATN3.1 ATN4.1 ATN5.1 ATN6.1 ATN7.1 ATN0.1.1 ATN0.2.1
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 NA NA NA NA NA NA NA NA NA
#> 2 17 463. 476. 484. 548. 494. 469. 493. NA NA
#> 3 1 NA NA NA NA NA NA NA NA NA
#> 4 1 NA NA NA NA NA NA NA NA NA
#> 5 1 NA NA NA NA NA NA NA NA NA
#> 6 1 NA NA NA NA NA NA NA NA NA
#> 7 2 15.1 24.1 27.4 20.9 17.2 4.42 10.8 15.1 24.1
#> 8 0 15.2 24.3 27.5 21.0 17.2 4.47 10.9 15.1 24.1
#> 9 0 15.4 24.4 27.6 21.0 17.3 4.52 10.9 15.1 24.1
#> 10 0 15.6 24.5 27.8 21.1 17.4 4.57 11.0 15.1 24.1
#> # ... with 26 more variables: ATN0.3.1 <dbl>, ATN0.4.1 <dbl>, ATN0.5.1 <dbl>,
#> # ATN0.6.1 <dbl>, ATN0.7.1 <dbl>, Sen1Ch1 <dbl>, Sen2Ch1 <dbl>,
#> # Sen1Ch2 <dbl>, Sen2Ch2 <dbl>, Sen1Ch3 <dbl>, Sen2Ch3 <dbl>, Sen1Ch4 <dbl>,
#> # Sen2Ch4 <dbl>, Sen1Ch5 <dbl>, Sen2Ch5 <dbl>, Sen1Ch6 <dbl>, Sen2Ch6 <dbl>,
#> # Sen1Ch7 <dbl>, Sen2Ch7 <dbl>, RefCh1 <dbl>, RefCh2 <dbl>, RefCh3 <dbl>,
#> # RefCh4 <dbl>, RefCh5 <dbl>, RefCh6 <dbl>, RefCh7 <dbl>
Created on 2022-04-14 by the reprex package (v2.0.1)
I am trying to calculate the marginal effects after fitting a multinomial logistic regression with mlogit.
# Fit the multinomial logit model on the second data set and inspect the fit.
reg1 <- mlogit::mlogit(formula = value ~ 1 | ScoreEnvAtt, data = listDatasets[[2]])
reg1
summary(reg1)
# Mean of ScoreEnvAtt per alternative (grouped by index(reg1)$alt), collected in
# a data frame that is passed to effects(). This is the line reported as failing
# below.
z <- with(listDatasets[[2]], data.frame(ScoreEnvAtt = tapply(ScoreEnvAtt, index(reg1)$alt, mean)))
# Marginal effects of ScoreEnvAtt evaluated at z.
effects(reg1,covariate="ScoreEnvAtt",data=z)
But I get an error when defining the z.
z <- with(listDatasets[[2]], data.frame(ScoreEnvAtt = tapply(ScoreEnvAtt, index(reg1)$alt, mean)))
Error in tapply(ScoreEnvAtt, index(reg1)$alt, mean) :
Argumente müssen die selbe Länge haben
(Arguments must have the same length.) I just can't solve it. Does anyone have an idea?
head(listDatasets[[2]])
index Age ScoreEnvAtt MoneyInvested Gender Beliefs_eff_Green Beliefs_eff_ESG Beliefs_eff_Comp Beliefs_perf_ESG Beliefs_perf_Green Beliefs_perf_Comp Guilt Social.Altruistic Biospheric Egoistic DummyMedium
1.SS_Green_1 1 26 4.2 13 2 4 3 3 2 3 2 71 6.000000 6.000000 5 1
1.SS_Green_2 1 26 4.2 13 2 4 3 3 2 3 2 71 6.000000 6.000000 5 1
1.SS_Green_3 1 26 4.2 13 2 4 3 3 2 3 2 71 6.000000 6.000000 5 1
1.SS_Green_4 1 26 4.2 13 2 4 3 3 2 3 2 71 6.000000 6.000000 5 1
2.SS_Green_1 2 30 4.8 2 2 4 3 4 2 3 3 26 6.666667 5.333333 5 1
2.SS_Green_2 2 30 4.8 2 2 4 3 4 2 3 3 26 6.666667 5.333333 5 1
DummyHigh CompensationGroup Past_compensation Knowledge_CO2 variable value price
1.SS_Green_1 0 Group1 2 1 SS_Green_1 FALSE 1.5
1.SS_Green_2 0 Group1 2 1 SS_Green_2 FALSE 1.5
1.SS_Green_3 0 Group1 2 1 SS_Green_3 FALSE 1.5
1.SS_Green_4 0 Group1 2 1 SS_Green_4 TRUE 1.3
2.SS_Green_1 0 Group2 2 2 SS_Green_1 FALSE 1.5
2.SS_Green_2 0 Group2 2 2 SS_Green_2 FALSE 1.5
Edit:
> dput(head(listDatasets[[2]]))
structure(list(index = c(`1.SS_Green_1` = 1, `1.SS_Green_2` = 1,
`1.SS_Green_3` = 1, `1.SS_Green_4` = 1, `2.SS_Green_1` = 2, `2.SS_Green_2` = 2
), Age = c(`1.SS_Green_1` = 26, `1.SS_Green_2` = 26, `1.SS_Green_3` = 26,
`1.SS_Green_4` = 26, `2.SS_Green_1` = 30, `2.SS_Green_2` = 30
), ScoreEnvAtt = c(`1.SS_Green_1` = 4.2, `1.SS_Green_2` = 4.2,
`1.SS_Green_3` = 4.2, `1.SS_Green_4` = 4.2, `2.SS_Green_1` = 4.8,
`2.SS_Green_2` = 4.8), MoneyInvested = c(`1.SS_Green_1` = 13,
`1.SS_Green_2` = 13, `1.SS_Green_3` = 13, `1.SS_Green_4` = 13,
`2.SS_Green_1` = 2, `2.SS_Green_2` = 2), Gender = c(`1.SS_Green_1` = 2,
`1.SS_Green_2` = 2, `1.SS_Green_3` = 2, `1.SS_Green_4` = 2, `2.SS_Green_1` = 2,
`2.SS_Green_2` = 2), Beliefs_eff_Green = c(`1.SS_Green_1` = 4,
`1.SS_Green_2` = 4, `1.SS_Green_3` = 4, `1.SS_Green_4` = 4, `2.SS_Green_1` = 4,
`2.SS_Green_2` = 4), Beliefs_eff_ESG = c(`1.SS_Green_1` = 3,
`1.SS_Green_2` = 3, `1.SS_Green_3` = 3, `1.SS_Green_4` = 3, `2.SS_Green_1` = 3,
`2.SS_Green_2` = 3), Beliefs_eff_Comp = c(`1.SS_Green_1` = 3,
`1.SS_Green_2` = 3, `1.SS_Green_3` = 3, `1.SS_Green_4` = 3, `2.SS_Green_1` = 4,
`2.SS_Green_2` = 4), Beliefs_perf_ESG = c(`1.SS_Green_1` = 2,
`1.SS_Green_2` = 2, `1.SS_Green_3` = 2, `1.SS_Green_4` = 2, `2.SS_Green_1` = 2,
`2.SS_Green_2` = 2), Beliefs_perf_Green = c(`1.SS_Green_1` = 3,
`1.SS_Green_2` = 3, `1.SS_Green_3` = 3, `1.SS_Green_4` = 3, `2.SS_Green_1` = 3,
`2.SS_Green_2` = 3), Beliefs_perf_Comp = c(`1.SS_Green_1` = 2,
`1.SS_Green_2` = 2, `1.SS_Green_3` = 2, `1.SS_Green_4` = 2, `2.SS_Green_1` = 3,
`2.SS_Green_2` = 3), Guilt = c(`1.SS_Green_1` = 71, `1.SS_Green_2` = 71,
`1.SS_Green_3` = 71, `1.SS_Green_4` = 71, `2.SS_Green_1` = 26,
`2.SS_Green_2` = 26), Social.Altruistic = c(`1.SS_Green_1` = 6,
`1.SS_Green_2` = 6, `1.SS_Green_3` = 6, `1.SS_Green_4` = 6, `2.SS_Green_1` = 6.66666666666667,
`2.SS_Green_2` = 6.66666666666667), Biospheric = c(`1.SS_Green_1` = 6,
`1.SS_Green_2` = 6, `1.SS_Green_3` = 6, `1.SS_Green_4` = 6, `2.SS_Green_1` = 5.33333333333333,
`2.SS_Green_2` = 5.33333333333333), Egoistic = c(`1.SS_Green_1` = 5,
`1.SS_Green_2` = 5, `1.SS_Green_3` = 5, `1.SS_Green_4` = 5, `2.SS_Green_1` = 5,
`2.SS_Green_2` = 5), DummyMedium = structure(c(`1.SS_Green_1` = 2L,
`1.SS_Green_2` = 2L, `1.SS_Green_3` = 2L, `1.SS_Green_4` = 2L,
`2.SS_Green_1` = 2L, `2.SS_Green_2` = 2L), .Label = c("0", "1"
), class = c("pseries", "factor")), DummyHigh = structure(c(`1.SS_Green_1` = 1L,
`1.SS_Green_2` = 1L, `1.SS_Green_3` = 1L, `1.SS_Green_4` = 1L,
`2.SS_Green_1` = 1L, `2.SS_Green_2` = 1L), .Label = c("0", "1"
), class = c("pseries", "factor")), CompensationGroup = structure(c(`1.SS_Green_1` = 1L,
`1.SS_Green_2` = 1L, `1.SS_Green_3` = 1L, `1.SS_Green_4` = 1L,
`2.SS_Green_1` = 2L, `2.SS_Green_2` = 2L), .Label = c("Group1",
"Group2", "Group3"), class = c("pseries", "factor")), Past_compensation = c(`1.SS_Green_1` = 2,
`1.SS_Green_2` = 2, `1.SS_Green_3` = 2, `1.SS_Green_4` = 2, `2.SS_Green_1` = 2,
`2.SS_Green_2` = 2), Knowledge_CO2 = c(`1.SS_Green_1` = 1, `1.SS_Green_2` = 1,
`1.SS_Green_3` = 1, `1.SS_Green_4` = 1, `2.SS_Green_1` = 2, `2.SS_Green_2` = 2
), variable = structure(c(`1.SS_Green_1` = 1L, `1.SS_Green_2` = 2L,
`1.SS_Green_3` = 3L, `1.SS_Green_4` = 4L, `2.SS_Green_1` = 1L,
`2.SS_Green_2` = 2L), .Label = c("SS_Green_1", "SS_Green_2",
"SS_Green_3", "SS_Green_4"), class = c("pseries", "factor")),
value = c(`1.SS_Green_1` = FALSE, `1.SS_Green_2` = FALSE,
`1.SS_Green_3` = FALSE, `1.SS_Green_4` = TRUE, `2.SS_Green_1` = FALSE,
`2.SS_Green_2` = FALSE), price = c(`1.SS_Green_1` = 1.5,
`1.SS_Green_2` = 1.5, `1.SS_Green_3` = 1.5, `1.SS_Green_4` = 1.3,
`2.SS_Green_1` = 1.5, `2.SS_Green_2` = 1.5)), index = structure(list(
chid = structure(c(1L, 1L, 1L, 1L, 2L, 2L), .Label = c("1",
"2"), class = "factor"), alt = structure(c(1L, 2L, 3L, 4L,
1L, 2L), .Label = c("SS_Green_1", "SS_Green_2", "SS_Green_3",
"SS_Green_4"), class = "factor")), class = "data.frame", row.names = c("1.SS_Green_1",
"1.SS_Green_2", "1.SS_Green_3", "1.SS_Green_4", "2.SS_Green_1",
"2.SS_Green_2")), choice = "value", row.names = c("1.SS_Green_1",
"1.SS_Green_2", "1.SS_Green_3", "1.SS_Green_4", "2.SS_Green_1",
"2.SS_Green_2"), class = c("mlogit.data", "data.frame"))
I have this kind of data.
library(dplyr)
glimpse(samp)
Observations: 5
Variables: 5
$ review_count <int> 68, 3, 7, 9, 5
$ Alcohol <fct> full_bar, NA, full_bar, beer_and_wi...
$ BikeParking <fct> True, NA, False, NA, NA
$ BusinessAcceptsBitcoin <fct> NA, NA, NA, NA, NA
$ BusinessAcceptsCreditCards <fct> True, NA, NA, True, True
I want to create 1-p dummy features. The createDummyFeatures function of the mlr package has the option reference to do this.
library(mlr)
# Reference (n - 1) dummy coding of the factor columns; review_count is the
# target and is left untouched.
dummy <- createDummyFeatures(
  samp,
  target = "review_count",
  method = "reference"
)
The problem is that it doesn't keep the original column names.
glimpse(dummy)
Observations: 5
Variables: 6
$ review_count <int> 68, 3, 7, 9, 5
$ Alcohol.full_bar <dbl> 1, NA, 1, 0, NA
$ Alcohol.none <dbl> 0, NA, 0, 0, NA
$ True <dbl> 1, NA, 0, NA, NA
$ True.1 <dbl> NA, NA, NA, NA, NA
$ True.2 <dbl> 1, NA, NA, 1, 1
The question is how can I keep them?
One idea is to create them with the 1-of-n method and then remove all columns whose names contain "False".
# One-of-n dummy coding first, then drop every indicator column whose name
# contains "False".
dummy2 <- samp %>%
  createDummyFeatures(target = "review_count") %>%
  select(-contains("False"))

# Inspect the remaining columns.
glimpse(dummy2)
Observations: 5
Variables: 7
$ review_count <int> 68, 3, 7, 9, 5
$ Alcohol.beer_and_wine <dbl> 0, NA, 0, 1, NA
$ Alcohol.full_bar <dbl> 1, NA, 1, 0, NA
$ Alcohol.none <dbl> 0, NA, 0, 0, NA
$ BikeParking.True <dbl> 1, NA, 0, NA, NA
$ BusinessAcceptsBitcoin.True <dbl> NA, NA, NA, NA, NA
$ BusinessAcceptsCreditCards.True <dbl> 1, NA, NA, 1, 1
However, I don't know if this is the same as n-1 coding, especially for the factors with more than 2 levels. (The dummy coding is for an XGBoost regression where "review_count" is the target variable.)
dput(samp)
structure(list(review_count = c(68L, 3L, 7L, 9L, 5L), Alcohol = structure(c(2L,
NA, 2L, 1L, NA), .Label = c("beer_and_wine", "full_bar", "none"
), class = "factor"), BikeParking = structure(c(2L, NA, 1L, NA,
NA), .Label = c("False", "True"), class = "factor"), BusinessAcceptsBitcoin = structure(c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), .Label = c("False",
"True"), class = "factor"), BusinessAcceptsCreditCards = structure(c(2L,
NA, NA, 2L, 2L), .Label = c("False", "True"), class = "factor")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -5L))
Edit
For those who have the same problem, I fixed this issue using caret.
library(caret)
# Build the dummy-variable encoder for all columns of samp. fullRank = TRUE is
# spelled out because `T` is an ordinary variable that can be reassigned.
dummy_dat <- dummyVars("~ .", data = samp, fullRank = TRUE)
# Apply the encoder to samp and collect the result as a data frame.
dat <- data.frame(predict(dummy_dat, newdata = samp))