Adding a row with Sum and mean of the columns - r

I have a data frame like the one below.
`> am_me
Group.1 Group.2 x.x x.y
2 AM clearterminate 3 21.00000
3 AM display.cryptic 86 30.12791
4 AM price 71 898.00000`
I would like to get a result like the one below.
`> am_me_t
Group.2 x.x x.y
2 clearterminate 3 21
3 display.cryptic 86 30.1279069767442
4 price 71 898
41 AM 160 316.375968992248`
I removed the first column and got the result below.
`> am_res
Group.2 x.x x.y
2 clearterminate 3 21.00000
3 display.cryptic 86 30.12791
4 price 71 898.00000`
When I try to use rbind to add "AM" as a new row, as shown below, I get a warning message and an NA in the result.
`> am_me_t <- rbind(am_res, c("AM", colSums(am_res[2]), colMeans(am_res[3])))
Warning message:
invalid factor level, NAs generated in: "[<-.factor"(`*tmp*`, ri, value = "AM")
Group.2 x.x x.y
2 clearterminate 3 21
3 display.cryptic 86 30.1279069767442
4 price 71 898
41 <NA> 160 316.375968992248`
For your information, Output of edit(am_me)
`> edit(am_me)
structure(list(Group.1 = structure(as.integer(c(2, 2, 2)), .Label = c("1Y",
"AM", "BE", "CM", "CO", "LX", "SN", "US", "VK", "VS"), class = "factor"),
Group.2 = structure(as.integer(c(2, 5, 9)), .Label = c("bestbuy",
"clearterminate", "currency.display", "display", "display.cryptic",
"fqa", "mileage.display", "ping", "price", "reissue", "reissuedisplay",
"shortaccess.followon"), class = "factor"), x.x = as.integer(c(3,
86, 71)), x.y = c(21, 30.1279069767442, 898)), .Names = c("Group.1",
"Group.2", "x.x", "x.y"), row.names = c("2", "3", "4"), class = "data.frame")`
Also
`> edit(me)
structure(list(Group.1 = structure(as.integer(c(1, 2, 2, 2, 3,
4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 8, 8,
8, 8, 9, 9, 10, 10, 10, 10, 10, 10)), .Label = c("1Y", "AM",
"BE", "CM", "CO", "LX", "SN", "US", "VK", "VS"), class = "factor"),
Group.2 = structure(as.integer(c(8, 2, 5, 9, 10, 1, 2, 5,
9, 1, 2, 5, 9, 1, 2, 3, 4, 7, 9, 11, 12, 2, 4, 6, 1, 2, 5,
9, 2, 5, 1, 2, 3, 5, 9, 10)), .Label = c("bestbuy", "clearterminate",
"currency.display", "display", "display.cryptic", "fqa",
"mileage.display", "ping", "price", "reissue", "reissuedisplay",
"shortaccess.followon"), class = "factor"), x.x = as.integer(c(1,
3, 86, 71, 1, 2, 5, 1, 52, 10, 7, 27, 15, 5, 267, 14, 4,
1, 256, 1, 1, 80, 1, 78, 2, 10, 23, 6, 1, 2, 4, 3, 3, 11,
1, 1)), x.y = c(5, 21, 30.1279069767442, 898, 12280, 800,
56.4, 104, 490.442307692308, 1759.1, 18.1428571428571, 1244.81481481481,
518.533333333333, 3033.2, 18.5468164794007, 20, 3788.5, 23,
2053.49609375, 3863, 6376, 17.825, 240, 1752.21794871795,
1114.5, 34, 1369.60869565217, 1062.16666666667, 23, 245,
5681.5, 11.3333333333333, 13.3333333333333, 1273.81818181818,
2076, 5724)), .Names = c("Group.1", "Group.2", "x.x", "x.y"
), row.names = c("1", "2", "3", "4", "5", "6", "7", "8", "9",
"10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
"21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31",
"32", "33", "34", "35", "36"), class = "data.frame")
Group.1 Group.2 x.x x.y
1 1Y ping 1 5.00000
2 AM clearterminate 3 21.00000
3 AM display.cryptic 86 30.12791
4 AM price 71 898.00000
5 BE reissue 1 12280.00000
6 CM bestbuy 2 800.00000
7 CM clearterminate 5 56.40000
8 CM display.cryptic 1 104.00000
9 CM price 52 490.44231
10 CO bestbuy 10 1759.10000
11 CO clearterminate 7 18.14286
12 CO display.cryptic 27 1244.81481
13 CO price 15 518.53333
14 LX bestbuy 5 3033.20000
15 LX clearterminate 267 18.54682
16 LX currency.display 14 20.00000
17 LX display 4 3788.50000
18 LX mileage.display 1 23.00000
19 LX price 256 2053.49609
20 LX reissuedisplay 1 3863.00000
21 LX shortaccess.followon 1 6376.00000
22 SN clearterminate 80 17.82500
23 SN display 1 240.00000
24 SN fqa 78 1752.21795
25 US bestbuy 2 1114.50000
26 US clearterminate 10 34.00000
27 US display.cryptic 23 1369.60870
28 US price 6 1062.16667
29 VK clearterminate 1 23.00000
30 VK display.cryptic 2 245.00000
31 VS bestbuy 4 5681.50000
32 VS clearterminate 3 11.33333
33 VS currency.display 3 13.33333
34 VS display.cryptic 11 1273.81818
35 VS price 1 2076.00000
36 VS reissue 1 5724.00000`

The type of the Group.2 column is factor, and that limits the possible values. You can transform it to character with am_me$Group.2 <- as.character(am_me$Group.2), after that the AM value will be added without errors.
Note that you can also use sum() and mean() for single column operations.

Related

Using Tidyverse to Output a Series of Summary Statistic Tables per User/Participant

I am trying to produce a table of mean scores for each participant in my tibble. The number of observations is much larger than the data given below, but this tibble should be sufficient. I need to produce a table for each unique user_id. I would like the table to have 10 rows, 8 of which are the means of indicators 1-8 per timepoint, and the other two of which are the domain means per timepoint. The mean of domain 0 is the mean of indicators 1-4, and the mean of domain 1 is the mean of indicators 5-8. I would also like the output tables to have four columns, one per timepoint. Thus, each user_id's output table should be 10 by 4. I have attempted this with tidyverse and would appreciate help. Also, some users (in fact, several) will not have values at every timepoint.
structure(list(Group = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1), user_id = c("Kim", "Kim",
"Kim", "Kim", "Kim", "Kim", "Kim",
"Kim", "Bob", "Bob", "Bob", "Bob",
"Bob", "Bob", "Bob", "Bob", "Bob",
"Bob", "Bob", "Bob", "Bob", "Bob",
"Bob", "Bob", "George", "George", "George", "George",
"George", "George", "George", "George", "George", "George", "George",
"George", "George", "George", "George", "George"), indicator = c("1",
"2", "3", "4", "5", "6", "7", "8", "1", "1", "2", "2", "3", "3",
"4", "4", "5", "5", "6", "6", "7", "7", "8", "8", "1", "1", "2",
"2", "3", "3", "4", "4", "5", "5", "6", "6", "7", "7", "8", "8"
), Timepoint = c(1, 1, 1, 1, 1, 1, 1, 1, 3, 4, 3, 4, 3,
4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4,
3, 4, 3, 4, 3, 4), score = c(3.5, 3.5, 2, 3, 3.5, 4,
3, 4, 2, 3, 2.5, 3, 1.5, 1.5, 0.5, 3, 2, 4, 2.5, 4, 2.5, 3.5,
3, 3.5, 3.5, 3, 2.5, 2.5, 2.5, 2, 2, 3, 3.5, 3.5, 3.5, 3.5, 3,
3, 3, 2.5)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-40L))
Attempted tidyverse code:
user_tables <- d %>%
group_by(user_id,indicator,Timepoint) %>%
summarise(Time1 = mean[which(indicator == 1 & Timepoint == 1)], mean[which(indicator == 2 & Timepoint == 1)], mean[which(indicator == 3 & Timepoint == 1)], mean[which(indicator == 4 & Timepoint == 1)], mean[which(indicator == 5 & Timepoint == 1)], mean[which(indicator == 6 & Timepoint == 1)], mean[which(indicator == 7 & Timepoint == 1)], mean[which(indicator == 8 & Timepoint == 1)],
Time2 = mean[which(indicator == 1 & Timepoint == 2)], mean[which(indicator == 2 & Timepoint == 2)], mean[which(indicator == 3 & Timepoint == 2)], mean[which(indicator == 4 & Timepoint == 2)], mean[which(indicator == 5 & Timepoint == 2)], mean[which(indicator == 6 & Timepoint == 2)], mean[which(indicator == 7 & Timepoint == 2)], mean[which(indicator == 8 & Timepoint == 2)],
Time3 = mean[which(indicator == 1 & Timepoint == 3)], mean[which(indicator == 2 & Timepoint == 3)], mean[which(indicator == 3 & Timepoint == 3)], mean[which(indicator == 4 & Timepoint == 3)], mean[which(indicator == 5 & Timepoint == 3)], mean[which(indicator == 6 & Timepoint == 3)], mean[which(indicator == 7 & Timepoint == 3)], mean[which(indicator == 8 & Timepoint == 3)],
Time4 = mean[which(indicator == 1 & Timepoint == 4)], mean[which(indicator == 2 & Timepoint == 4)], mean[which(indicator == 3 & Timepoint == 4)], mean[which(indicator == 4 & Timepoint == 4)], mean[which(indicator == 5 & Timepoint == 4)], mean[which(indicator == 6 & Timepoint == 4)], mean[which(indicator == 7 & Timepoint == 4)], mean[which(indicator == 8 & Timepoint == 4)]) %>%
split(., .$user_id)
Ultimately, I would like a table like this per user (where the NAs are the appropriate means) (Note: This one is for Bob - he didn't have scores for time 1 or time 2):
structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 1.625, 2, 2.5, 1.5, 0.5, 2.5, 2,
2.5, 2.5, 3, 2.625, 3, 3, 1.5, 3, 3.75, 4, 4, 3.5, 3.5), .Dim = c(10L,
4L), .Dimnames = list(c("Domain 0", "Ind 1", "Ind 2", "Ind 3",
"Ind 4", "Domain 1", "Ind 5", "Ind 6", "Ind 7", "Ind 8"), c("Time 1",
"Time 2", "Time 3", "Time 4")))
Thank you!
Since you are adding rows, you could do:
# Mean score per user, timepoint and indicator, spread to one column per
# timepoint, with a domain-mean row placed on top of each domain's indicators.
df %>%
# `+(indicator>4)` turns the comparison into 0/1 and is used as the domain id.
# NOTE(review): indicator is a character column in the sample data, so `>`
# compares strings; fine for single-digit indicators - confirm for 10 and up.
group_by(Group, user_id, Timepoint, domain = +(indicator>4), indicator) %>%
summarise(sc=mean(score),.groups ='drop_last') %>%
# NOTE(review): the first three arguments are passed by position (presumably
# id_cols, names_from, names_prefix); newer tidyr releases may require them
# to be named - verify against the installed version.
pivot_wider(c(Group, user_id, indicator, domain), Timepoint,'Time_', values_from = sc) %>%
# Nest by the grouping still in effect (judging by the printed result this is
# Group, user_id and domain - TODO confirm).
group_nest()%>%
# Prepend to each nested table a row of column means over its numeric columns;
# the leading NA is meant to land in the indicator column (assumes indicator
# is the first nested column - TODO confirm).
mutate(data = map(data,
~rbind(c(NA,colMeans(select_if(.x,is.numeric), na.rm = TRUE)),.x)))%>%
unnest(data)%>%
# Rows whose indicator is NA are the prepended mean rows: label them
# 'Domain <id>', label the rest 'Ind <n>', then drop the helper column.
mutate(indicator = ifelse(is.na(indicator),
paste0('Domain ', domain), paste0('Ind ', indicator)),
domain = NULL)
A tibble: 30 x 6
Group user_id indicator Time_3 Time_4 Time_1
<dbl> <chr> <chr> <dbl> <dbl> <dbl>
1 1 Bob Domain 0 1.62 2.62 NaN
2 1 Bob Ind 1 2 3 NA
3 1 Bob Ind 2 2.5 3 NA
4 1 Bob Ind 3 1.5 1.5 NA
5 1 Bob Ind 4 0.5 3 NA
6 1 Bob Domain 1 2.5 3.75 NaN
7 1 Bob Ind 5 2 4 NA
8 1 Bob Ind 6 2.5 4 NA
9 1 Bob Ind 7 2.5 3.5 NA
10 1 Bob Ind 8 3 3.5 NA
# ... with 20 more rows
Same basic idea as in @Onyambu’s answer, but simplified a bit with new dplyr
1.0.0 features that allow summarise() to increase the row count:
library(tidyverse)
# Per-user table of indicator means by timepoint, with one extra row per
# domain holding the mean of that domain's indicator means.
have %>%
# Integer division maps indicators 1-4 to domain 0 and 5-8 to domain 1.
mutate(domain = (as.numeric(indicator) - 1) %/% 4) %>%
group_by(user_id, Timepoint, domain, indicator) %>%
# Mean per indicator; the result stays grouped by user_id, Timepoint, domain
# (see the "regrouping output" message printed below the code).
summarise(score = mean(score)) %>%
# For each remaining group, put a row with the group's mean score in front of
# its indicator rows (indicator is left NA on that row).
# NOTE(review): later dplyr releases deprecate cur_data() and multi-row
# summarise() in favour of pick()/reframe() - confirm for the installed version.
summarise(
cur_data() %>% add_row(score = mean(score), .before = 1)
) %>%
# Sorting by Timepoint first presumably makes the pivoted columns come out in
# time order - verify.
arrange(Timepoint) %>%
pivot_wider(
values_from = score,
names_from = Timepoint,
names_prefix = "Time "
) %>%
# Show a single participant for comparison with the wanted table.
filter(user_id == "Bob")
#> `summarise()` regrouping output by 'user_id', 'Timepoint', 'domain' (override with `.groups` argument)
#> `summarise()` regrouping output by 'user_id', 'Timepoint', 'domain' (override with `.groups` argument)
#> # A tibble: 10 x 6
#> # Groups: user_id, domain [2]
#> user_id domain indicator `Time 1` `Time 3` `Time 4`
#> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
#> 1 Bob 0 <NA> NA 1.62 2.62
#> 2 Bob 0 1 NA 2 3
#> 3 Bob 0 2 NA 2.5 3
#> 4 Bob 0 3 NA 1.5 1.5
#> 5 Bob 0 4 NA 0.5 3
#> 6 Bob 1 <NA> NA 2.5 3.75
#> 7 Bob 1 5 NA 2 4
#> 8 Bob 1 6 NA 2.5 4
#> 9 Bob 1 7 NA 2.5 3.5
#> 10 Bob 1 8 NA 3 3.5
Data setup:
have <- structure(list(
Group = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
user_id = c(
"Kim", "Kim", "Kim", "Kim", "Kim", "Kim", "Kim", "Kim", "Bob", "Bob",
"Bob", "Bob", "Bob", "Bob", "Bob", "Bob", "Bob", "Bob", "Bob", "Bob",
"Bob", "Bob", "Bob", "Bob", "George", "George", "George", "George",
"George", "George", "George", "George", "George", "George", "George",
"George", "George", "George", "George", "George"
),
indicator = c(
"1", "2", "3", "4", "5", "6", "7", "8", "1", "1", "2", "2", "3", "3",
"4", "4", "5", "5", "6", "6", "7", "7", "8", "8", "1", "1", "2",
"2", "3", "3", "4", "4", "5", "5", "6", "6", "7", "7", "8", "8"
), Timepoint = c(
1, 1, 1, 1, 1, 1, 1, 1, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4,
3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4
), score = c(
3.5, 3.5, 2, 3, 3.5, 4, 3, 4, 2, 3, 2.5, 3, 1.5, 1.5,
0.5, 3, 2, 4, 2.5, 4, 2.5, 3.5, 3, 3.5, 3.5, 3, 2.5,
2.5, 2.5, 2, 2, 3, 3.5, 3.5, 3.5, 3.5, 3, 3, 3, 2.5
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -40L))
want <- structure(
c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 1.625, 2, 2.5, 1.5, 0.5, 2.5,
2, 2.5, 2.5, 3, 2.625, 3, 3, 1.5, 3, 3.75, 4, 4, 3.5, 3.5),
.Dim = c(10L, 4L),
.Dimnames = list(
c("Domain 0", "Ind 1", "Ind 2", "Ind 3","Ind 4",
"Domain 1", "Ind 5", "Ind 6", "Ind 7", "Ind 8"),
c("Time 1", "Time 2", "Time 3", "Time 4")
)
)
Great question. How about a nest solution? Here, you create a function to summarise, nest the data by user_id, then apply the function to each participant.
library(tidyverse)
df <- structure(list(Group = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1), user_id = c("Kim", "Kim",
"Kim", "Kim", "Kim", "Kim", "Kim",
"Kim", "Bob", "Bob", "Bob", "Bob",
"Bob", "Bob", "Bob", "Bob", "Bob",
"Bob", "Bob", "Bob", "Bob", "Bob",
"Bob", "Bob", "George", "George", "George", "George",
"George", "George", "George", "George", "George", "George", "George",
"George", "George", "George", "George", "George"), indicator = c("1",
"2", "3", "4", "5", "6", "7", "8", "1", "1", "2", "2", "3", "3",
"4", "4", "5", "5", "6", "6", "7", "7", "8", "8", "1", "1", "2",
"2", "3", "3", "4", "4", "5", "5", "6", "6", "7", "7", "8", "8"
), Timepoint = c(1, 1, 1, 1, 1, 1, 1, 1, 3, 4, 3, 4, 3,
4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4,
3, 4, 3, 4, 3, 4), score = c(3.5, 3.5, 2, 3, 3.5, 4,
3, 4, 2, 3, 2.5, 3, 1.5, 1.5, 0.5, 3, 2, 4, 2.5, 4, 2.5, 3.5,
3, 3.5, 3.5, 3, 2.5, 2.5, 2.5, 2, 2, 3, 3.5, 3.5, 3.5, 3.5, 3,
3, 3, 2.5)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-40L))
output <- structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 1.625, 2, 2.5, 1.5, 0.5, 2.5, 2,
2.5, 2.5, 3, 2.625, 3, 3, 1.5, 3, 3.75, 4, 4, 3.5, 3.5), .Dim = c(10L,
4L), .Dimnames = list(c("Domain 0", "Ind 1", "Ind 2", "Ind 3",
"Ind 4", "Domain 1", "Ind 5", "Ind 6", "Ind 7", "Ind 8"), c("Time 1",
"Time 2", "Time 3", "Time 4")))
dfnest <- df %>%
group_nest(user_id)
# Summarise one participant's rows: the mean score (NAs ignored) for every
# indicator/timepoint pair, reshaped so each timepoint present in the input
# becomes its own column named "Timepoint<k>".
#
# .data: a data frame with columns indicator, Timepoint and score.
# Returns a tibble with one row per indicator; it is still grouped by
# indicator, because summarise() only drops the last grouping level.
my_summarise <- function(.data) {
  by_cell <- group_by(.data, indicator, Timepoint)
  cell_means <- summarise(by_cell, mean = mean(score, na.rm = TRUE))
  pivot_wider(
    cell_means,
    names_from = Timepoint,
    values_from = mean,
    names_prefix = 'Timepoint'
  )
}
map(dfnest$data,my_summarise)
#> `summarise()` regrouping output by 'indicator' (override with `.groups` argument)
#> `summarise()` regrouping output by 'indicator' (override with `.groups` argument)
#> `summarise()` regrouping output by 'indicator' (override with `.groups` argument)
#> [[1]]
#> # A tibble: 8 x 3
#> # Groups: indicator [8]
#> indicator Timepoint3 Timepoint4
#> <chr> <dbl> <dbl>
#> 1 1 2 3
#> 2 2 2.5 3
#> 3 3 1.5 1.5
#> 4 4 0.5 3
#> 5 5 2 4
#> 6 6 2.5 4
#> 7 7 2.5 3.5
#> 8 8 3 3.5
#>
#> [[2]]
#> # A tibble: 8 x 3
#> # Groups: indicator [8]
#> indicator Timepoint3 Timepoint4
#> <chr> <dbl> <dbl>
#> 1 1 3.5 3
#> 2 2 2.5 2.5
#> 3 3 2.5 2
#> 4 4 2 3
#> 5 5 3.5 3.5
#> 6 6 3.5 3.5
#> 7 7 3 3
#> 8 8 3 2.5
#>
#> [[3]]
#> # A tibble: 8 x 2
#> # Groups: indicator [8]
#> indicator Timepoint1
#> <chr> <dbl>
#> 1 1 3.5
#> 2 2 3.5
#> 3 3 2
#> 4 4 3
#> 5 5 3.5
#> 6 6 4
#> 7 7 3
#> 8 8 4
Created on 2020-11-12 by the reprex package (v0.3.0)

System is computationally singular using mlogit in R

I am conducting multinomial logistic regressions. They seem to work for all variables except for the price variable, where I get the following error:
reg.M <- mlogit::mlogit(formula = value ~ 1 | price, data = listDatasets[[2]])
Error in solve.default(H, g[!fixed]) :
system is computationally singular: reciprocal condition number = 7.4671e-18
Here is the head of the dataset used (I have 15 like this, with different prices):
> head(listDatasets[[2]])
index Age ScoreEnvAtt MoneyInvested Gender Beliefs_eff_Green Beliefs_eff_ESG Beliefs_eff_Comp Beliefs_perf_ESG Beliefs_perf_Green Beliefs_perf_Comp Guilt Social.Altruistic Biospheric Egoistic DummyMedium
1.SS_Green_1 1 26 4.2 13 2 4 3 3 2 3 2 71 6.000000 6.000000 5 1
1.SS_Green_2 1 26 4.2 13 2 4 3 3 2 3 2 71 6.000000 6.000000 5 1
1.SS_Green_3 1 26 4.2 13 2 4 3 3 2 3 2 71 6.000000 6.000000 5 1
1.SS_Green_4 1 26 4.2 13 2 4 3 3 2 3 2 71 6.000000 6.000000 5 1
2.SS_Green_1 2 30 4.8 2 2 4 3 4 2 3 3 26 6.666667 5.333333 5 1
2.SS_Green_2 2 30 4.8 2 2 4 3 4 2 3 3 26 6.666667 5.333333 5 1
DummyHigh CompensationGroup Past_compensation Knowledge_CO2 variable value price
1.SS_Green_1 0 Group1 2 1 SS_Green_1 FALSE 1.5
1.SS_Green_2 0 Group1 2 1 SS_Green_2 FALSE 1.5
1.SS_Green_3 0 Group1 2 1 SS_Green_3 FALSE 1.5
1.SS_Green_4 0 Group1 2 1 SS_Green_4 TRUE 1.3
2.SS_Green_1 0 Group2 2 2 SS_Green_1 FALSE 1.5
2.SS_Green_2 0 Group2 2 2 SS_Green_2 FALSE 1.5
I have already checked the other threads on this error but cannot find a solution to my problem. Any suggestions? Thanks!
Edit: I tried to do it with multinom instead of mlogit. It does not seem to recognize the levels of the dependent variable.
test <- multinom(value ~ price, listDatasets[[2]])
summary(test)
> summary(test)
Call:
multinom(formula = value ~ price, data = listDatasets[[2]])
Coefficients:
Values Std. Err.
(Intercept) 25.77925 1.464741
price -18.86853 1.037180
Residual Deviance: 653.8391
AIC: 657.8391
Edit 2:
dput(head(data_long))
structure(list(Pride = c(17, 71, 1, 50, 0, 13), Guilt = c(71,
26, 89, 50, 100, 13), Shame = c(36, 77, 5, 50, 67, 8), Joy = c(12,
50, 0, 50, 30, 37), Attitudes1 = c(6, 5, 7, 5, 7, 5), Attitudes2 = c(5,
7, 3, 4, 5, 3), Attitudes3 = c(6, 6, 7, 6, 7, 6), Attitudes4 = c(5,
3, 5, 5, 6, 6), Attitudes5 = c(4, 7, 6, 6, 5, 6), Attitudes6 = c(5,
7, 3, 7, 7, 6), Attitudes7 = c(3, 4, 1, 3, 1, 6), Attitudes8 = c(3,
4, 5, 6, 7, 2), Attitudes9 = c(2, 3, 7, 4, 7, 6), Attitudes10 = c(3,
2, 5, 4, 7, 6), Concern1 = c(6, 4, 7, 6, 7, 6), Concern2 = c(6,
6, 7, 6, 7, 6), Concern3 = c(5, 5, 7, 6, 4, 6), Concern4 = c(6,
6, 5, 6, 4, 5), Concern5 = c(6, 7, 7, 6, 4, 5), Concern6 = c(6,
6, 7, 6, 5, 6), Concern7 = c(6, 6, 7, 6, 7, 6), Concern8 = c(4,
4, 5, 6, 3, 6), Concern9 = c(6, 7, 6, 6, 7, 7), Beliefs_perf_ESG = c(2,
2, 3, 2, 3, 2), Beliefs_perf_Comp = c(2, 3, 2, 3, 3, 2), Beliefs_perf_Green = c(3,
3, 3, NA, 3, 2), Beliefs_eff_ESG = c(3, 3, 4, 3, 4, 3), Beliefs_eff_Comp = c(3,
4, 2, 4, 2, 3), Beliefs_eff_Green = c(4, 4, 5, 5, 5, 3), Eval_Ego = c(3,
2, 3, 4, 2, 3), Eval_Nature1 = c(3, 3, 3, 4, 2, 2), Eval_Nature2 = c(4,
4, 4, 4, 2, 3), Eval_Social = c(3, 2, 4, 4, 2, 3), Reforestation = c(2,
1, 1, 2, 2, 3), Renewable_Energy = c(1, 2, 3, 1, 1, 1), Efficient_Energy = c(3,
4, 4, 3, 3, 2), Methane = c(4, 3, 2, 4, 4, 4), France = c(3,
3, 1, 1, 3, 2), Europe = c(2, 2, 3, 3, 2, 3), Development = c(1,
1, 2, 2, 1, 1), Co_benefits = c(5, 4, 5, 3, 4, 4), Poverty = c(1,
1, 2, 2, 1, 1), Health = c(4, 2, 4, 1, 3, 3), Biodiversity = c(2,
4, 1, 3, 2, 2), Equality = c(3, 5, 5, 5, 5, 5), Economic_Growth = c(5,
3, 3, 4, 4, 4), Knowledge_CO2 = c(1, 2, 3, 2, 2, 2), Past_compensation = c(2,
2, 1, 2, 1, 2), Age = c(26, 30, 30, 30, 21, 40), Gender = c(2,
2, 2, 2, 2, 2), MoneyInvested = c(13, 2, 1, 3, 13, 1), Investment_Experience = c(2,
1, 1, 1, 2, 1), Participant_s_ID = c("1234asdf", "Password04",
"hiquet8350", "masmas2121", "1712flju", "Lemurien4555"), Compensationproject1 = c(1,
NA, NA, 1, NA, 1), Compensationproject2 = c(NA, 1, 1, NA, NA,
NA), Compensationproject3 = c(NA, NA, NA, NA, 1, NA), FL_42_DO_ChoiceExperiment1 = c(6,
14, 3, 13, 10, 7), FL_42_DO_ChoiceExperiment2 = c(3, 1, 4, 6,
13, 2), FL_42_DO_ChoiceExperiment3 = c(7, 10, 1, 7, 11, 13),
FL_42_DO_ChoiceExperiment4 = c(11, 15, 5, 2, 4, 11), FL_42_DO_ChoiceExperiment5 = c(14,
7, 2, 11, 2, 10), FL_42_DO_ChoiceExperiment6 = c(2, 11, 8,
15, 5, 6), FL_42_DO_ChoiceExperiment7 = c(1, 6, 6, 12, 14,
1), FL_42_DO_ChoiceExperiment8 = c(10, 2, 14, 5, 9, 5), FL_42_DO_ChoiceExperiment9 = c(4,
8, 13, 1, 1, 8), FL_42_DO_ChoiceExperiment10 = c(12, 5, 7,
14, 6, 3), FL_42_DO_ChoiceExperiment11 = c(15, 13, 10, 10,
12, 14), FL_42_DO_ChoiceExperiment12 = c(13, 3, 12, 8, 7,
9), FL_42_DO_ChoiceExperiment13 = c(9, 9, 11, 9, 8, 15),
FL_42_DO_ChoiceExperiment14 = c(8, 12, 9, 3, 3, 12), FL_42_DO_ChoiceExperiment15 = c(5,
4, 15, 4, 15, 4), ScoreEnvAtt = c(4.2, 4.8, 4.9, 5, 5.9,
5.2), Eval_NatureScore = c(3.5, 3.5, 3.5, 4, 2, 2.5), Social.Altruistic = c(6,
6.66666666666667, 6, 6, 5, 5.66666666666667), Biospheric = c(6,
5.33333333333333, 7, 6, 7, 6), Egoistic = c(5, 5, 6.33333333333333,
6, 4, 6), GroupEnvAtt = structure(c(2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("low", "medium", "high"), class = "factor"),
DummyMedium = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), CompensationGroup = structure(c(1L,
2L, 2L, 1L, 3L, 1L), .Label = c("Group1", "Group2", "Group3"
), class = "factor"), ID = structure(1:6, .Label = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12",
"13", "14", "15", "16", "17", "18", "19", "20", "21", "22",
"23", "24", "25", "26", "27", "28", "29", "30", "31", "32",
"33", "34", "35", "36", "37", "38", "39", "40", "41", "42",
"43", "44", "45", "46", "47", "48", "49", "50", "51", "52",
"53", "54", "55", "56", "57", "58", "59", "60", "61", "62",
"63", "64", "65", "66", "67", "68", "69", "70", "71", "72",
"73", "74", "75", "76", "77", "78", "79", "80", "81", "82",
"83", "84", "85", "86", "87", "88", "89", "90", "91", "92",
"93", "94", "95", "96", "97", "98", "99", "100", "101", "102",
"103", "104", "105", "106", "107", "108", "109", "110", "111",
"112", "113", "114", "115", "116", "117", "118", "119", "120",
"121", "122", "123", "124", "125", "126", "127", "128", "129",
"130", "131", "132", "133", "134", "135", "136", "137", "138",
"139", "140", "141", "142", "143", "144", "145", "146", "147",
"148", "149", "150", "151", "152", "153", "154", "155", "156",
"157", "158", "159", "160", "161", "162", "163", "164", "165",
"166", "167", "168", "169", "170", "171", "172", "173", "174",
"175", "176", "177", "178", "179", "180", "181", "182", "183",
"184", "185", "186", "187", "188", "189", "190", "191", "192",
"193", "194", "195", "196", "197", "198", "199", "200", "201",
"202", "203", "204", "205", "206", "207", "208", "209", "210",
"211", "212", "213", "214", "215", "216", "217", "218", "219",
"220", "221", "222", "223", "224", "225", "226", "227", "228",
"229", "230", "231", "232", "233", "234", "235", "236", "237",
"238", "239", "240", "241", "242", "243", "244", "245", "246",
"247"), class = "factor"), ChoiceSet = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = c("Baseline_Choice_1", "Baseline_Choice_2",
"Baseline_Choice_3", "Baseline_Choice_4", "SS_Green_1", "SS_Green_2",
"SS_Green_3", "SS_Green_4", "SS_Green_ESG_1", "SS_Green_ESG_2",
"SS_Green_ESG_3", "SS_Green_ESG_4", "SS_ESG_1", "SS_ESG_2",
"SS_ESG_3", "SS_ESG_4", "SS_Comp_Green_1", "SS_Comp_Green_2",
"SS_Comp_Green_3", "SS_Comp_Green_4", "SS_Comp_ESG_1", "SS_Comp_ESG_2",
"SS_Comp_ESG_3", "SS_Comp_ESG_4", "SS_Comp_1", "SS_Comp_2",
"SS_Comp_3", "SS_Comp_4", "SS_All_1", "SS_All_2", "SS_All_3",
"SS_All_4", "WTP_All_1", "WTP_All_2", "WTP_All_3", "WTP_All_4",
"WTP_Comp_1", "WTP_Comp_2", "WTP_Comp_3", "WTP_Comp_4", "WTP_Comp_ESG_1",
"WTP_Comp_ESG_2", "WTP_Comp_ESG_3", "WTP_Comp_ESG_4", "WTP_Comp_Green_1",
"WTP_Comp_Green_2", "WTP_Comp_Green_3", "WTP_Comp_Green_4",
"WTP_ESG_1", "WTP_ESG_2", "WTP_ESG_3", "WTP_ESG_4", "WTP_ESG_Green_1",
"WTP_ESG_Green_2", "WTP_ESG_Green_3", "WTP_ESG_Green_4",
"WTP_Green_1", "WTP_Green_2", "WTP_Green_3", "WTP_Green_4"
), class = "factor"), value = c("Off", "Off", "Off", "Off",
"Off", "Off"), Choice = c("Conventional", "Conventional",
"Conventional", "Conventional", "Conventional", "Conventional"
), price = c(1.5, 1.5, 1.5, 1.5, 1.5, 1.5)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
Is price not an attribute of the choices? The formula you have now models it as an attribute of the subjects, see the difference here from an example in the mlogit documentation:
## a pure "conditional" model
summary(mlogit(mode ~ price + catch, data = Fish)) #what you should have if price is an attribute of the choice options
## a pure "multinomial model"
summary(mlogit(mode ~ 0 | income, data = Fish)) #what you have now
So in your case reg.M <- mlogit::mlogit(formula = value ~ price, data = listDatasets[[2]])
What you did with multinom is what I mean (you fitted a conditional logit model, i.e., modeled price as an attribute of the choices). The model recognizes the levels of the dependent variable, but price in your model is modeled as a variable with a generic coefficient (one that is the same for all alternatives).
"...while working with multinomial logit models, one has to consider three kinds of variables :
• alternative specific variables $x_{ij}$ with a generic coefficient $\beta$,
• individual specific variables $z_i$ with an alternative specific coefficients $\gamma_j$,
• alternative specific variables $w_{ij}$ with an alternative specific coefficient $\delta_j$.
The satisfaction index for the alternative $j$ is then:
$V_{ij} = \alpha_j + \beta x_{ij} + \gamma_j z_i + \delta_j w_{ij}$" (Source)
So if you want the third option from the list above, use this formula: reg.M <- mlogit::mlogit(formula = value ~ 0 | 1 | price, data = listDatasets[[2]])

Replacing NAs in columns with values from rows in a different dataframe in R that have the same ID

I have two dataframes:
deploy.info <- data.frame(Echo_ID = c("20180918_7.5Fa_1", "20180918_Sebre_3", "20190808_Bake_2", "20190808_NH_2"),
uppermost_bin = c(2, 7, 8, 12))
spc <- data.frame(species = c("RS", "GS", "YG", "RR", "BR", "GT", "CB"),
percent_dist = c(0, 25, 80, 100, 98, 60, 100),
percent_dist_from_surf = c(0, 25, 80, 100, 98, 60, 100),
'20180918_7.5Fa_1' = c(1, 1, 1, "NA", "NA", 1, "NA"),
'20180918_Sebre_3' = c(1, 2, "NA", "NA", "NA", 4, "NA"),
'20190808_Bake_2' = c(1, 3, 7, "NA", "NA", 6, "NA"),
'20190808_NH_2' = c(1, 2, 8, "NA", "NA", 6, "NA"))
The last four columns in the spc data frame refer to each Echo_ID that I am dealing with in the deploy.info data frame. I want to replace the NAs in the spc data frame with the uppermost_bin values for each of the Echo_IDs. Does anyone know how to go about doing this?
My desired end product would look like:
i.want.this <- data.frame(species = c("RS", "GS", "YG", "RR", "BR", "GT", "CB"),
percent_dist = c(0, 25, 80, 100, 98, 60, 100),
percent_dist_from_surf = c(0, 25, 80, 100, 98, 60, 100),
'20180918_7.5Fa_1' = c(1, 1, 1, 2, 2, 1, 2),
'20180918_Sebre_3' = c(1, 2, 7, 7, 7, 4, 7),
'20190808_Bake_2' = c(1, 3, 7, 8, 8, 6, 8),
'20190808_NH_2' = c(1, 2, 8, 12, 12, 6, 12))
I have over 100 columns like this and would rather not go in and have to do this change by hand. Any ideas are greatly appreciated.
We can use Map to replace the NA elements in the columns of 'Echo_ID' by the corresponding values of 'uppermost_bin'. In the OP's dataset, the columns were factor, so it was converted to the correct type with type.convert
# Names of the per-deployment columns in spc: the Echo_ID values carry the
# "X" prefix seen in the printed column names.
nm1 <- paste0("X", deploy.info$Echo_ID)

# Re-infer column types so the "NA" strings become real missing values.
spc <- type.convert(spc, as.is = TRUE)

# Pair each deployment column with its uppermost_bin (matched by position)
# and write that value into the column's missing cells.
spc[nm1] <- Map(
  function(column, fill) {
    column[is.na(column)] <- fill
    column
  },
  spc[nm1],
  deploy.info$uppermost_bin
)
spc
# species percent_dist percent_dist_from_surf X20180918_7.5Fa_1 X20180918_Sebre_3 X20190808_Bake_2 X20190808_NH_2
#1 RS 0 0 1 1 1 1
#2 GS 25 25 1 2 3 2
#3 YG 80 80 1 7 7 8
#4 RR 100 100 2 7 8 12
#5 BR 98 98 2 7 8 12
#6 GT 60 60 1 4 6 6
#7 CB 100 100 2 7 8 12

aggregate (R) behaving differently for apparently identical tasks

I've been banging my head against a brick wall for days on this issue; I wonder if anyone can see what is wrong with my code, or tell me if I am overlooking something obvious.
I have this data.frame, where most columns are vectors, either numerical or character, and one column is a list of character vectors:
t0g2 <- structure(list(P = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4,
4, 4, 5, 5, 5, 5), ID = c(8, 10, 7, 9, 5, 2, 3, 4, 8, 9, 1, 2,
8, 1, 4, 10, 4, 10, 2, 7), SC = c("A", "D", "A", "B", "B", "A",
"A", "E", "A", "B", "D", "A", "A", "D", "E", "D", "E", "D", "A",
"A"), FP = list(`40,41,37,8,11` = c("40", "41", "37", "8", "11"
), `49,28,16,41` = c("49", "28", "16", "41"), `15,49` = c("15",
"49"), `27,12,20,35,45` = c("27", "12", "20", "35", "45"), `1,34,43,37` = c("1",
"34", "43", "37"), `41,7,30,2,34,43` = c("41", "7", "30", "2",
"34", "43"), `22,35,31,10,3` = c("22", "35", "31", "10", "3"),
`29,6,15` = c("29", "6", "15"), `40,41,37,8,11` = c("40",
"41", "37", "8", "11"), `27,12,20,35,45` = c("27", "12",
"20", "35", "45"), `10,49,28` = c("10", "49", "28"), `41,7,30,2,34,43` = c("41",
"7", "30", "2", "34", "43"), `40,41,37,8,11` = c("40", "41",
"37", "8", "11"), `10,49,28` = c("10", "49", "28"), `29,6,15` = c("29",
"6", "15"), `49,28,16,41` = c("49", "28", "16", "41"), `29,6,15` = c("29",
"6", "15"), `49,28,16,41` = c("49", "28", "16", "41"), `41,7,30,2,34,43` = c("41",
"7", "30", "2", "34", "43"), `15,49` = c("15", "49"))), class = "data.frame", row.names = c("8",
"10", "7", "9", "5", "2", "3", "4", "81", "91", "1", "21", "82",
"11", "41", "101", "42", "102", "22", "71"))
I want to aggregate it by one of the columns, with the function for the other columns being simply the concatenation of unique values. [Yes, I know this can be done with many ad hoc packages, but I need to do it with base R].
This works perfectly well if I choose numeric column "ID" as the column to aggregate on:
aggregate(x=t0g2[, !(colnames(t0g2) %in% c("ID"))], by=list(ID=t0g2[["ID"]]),
FUN=function(y) unique(unlist(y)))
# ID P SC FP
#1 1 3, 4 D 10, 49, 28
#2 2 2, 3, 5 A 41, 7, 30, 2, 34, 43
#3 3 2 A 22, 35, 31, 10, 3
#4 4 2, 4, 5 E 29, 6, 15
#5 5 2 B 1, 34, 43, 37
#6 7 1, 5 A 15, 49
#7 8 1, 3, 4 A 40, 41, 37, 8, 11
#8 9 1, 3 B 27, 12, 20, 35, 45
#9 10 1, 4, 5 D 49, 28, 16, 41
or with character column "SC":
aggregate(x=t0g2[, !(colnames(t0g2) %in% c("SC"))], by=list(SC=t0g2[["SC"]]),
FUN=function(y) unique(unlist(y)))
# SC P ID FP
#1 A 1, 2, 3, 4, 5 8, 7, 2, 3 40, 41, 37, 8, 11, 15, 49, 7, 30, 2, 34, 43, 22, 35, 31, 10, 3
#2 B 1, 2, 3 9, 5 27, 12, 20, 35, 45, 1, 34, 43, 37
#3 D 1, 3, 4, 5 10, 1 49, 28, 16, 41, 10
#4 E 2, 4, 5 4 29, 6, 15
However, if I try with "P", which as far as I know is just another numerical column, this is what I get:
aggregate(x=t0g2[, !(colnames(t0g2) %in% c("P"))], by=list(P=t0g2[["P"]]),
FUN=function(y) unique(unlist(y)))
# P ID.1 ID.2 ID.3 ID.4 SC.1 SC.2 SC.3 FP
#1 1 8 10 7 9 A D B 40, 41, 37, 8, 11, 49, 28, 16, 15, 27, 12, 20, 35, 45
#2 2 5 2 3 4 B A E 1, 34, 43, 37, 41, 7, 30, 2, 22, 35, 31, 10, 3, 29, 6, 15
#3 3 8 9 1 2 A B D 40, 41, 37, 8, 11, 27, 12, 20, 35, 45, 10, 49, 28, 7, 30, 2, 34, 43
#4 4 8 1 4 10 A D E 40, 41, 37, 8, 11, 10, 49, 28, 29, 6, 15, 16
#5 5 4 10 2 7 E D A 29, 6, 15, 49, 28, 16, 41, 7, 30, 2, 34, 43
Does anybody know what is going on, why this happens?
Literally going mental with this stuff...
EDIT: adding an example of the desired output from aggregating on "P", as requested by jay.sf.
# P ID SC FP
#1 1 8, 10, 7, 9 A, D, B 40, 41, 37, 8, 11, 49, 28, 16, 15, 27, 12, 20, 35, 45
#2 2 5, 2, 3, 4 B, A, E 1, 34, 43, 37, 41, 7, 30, 2, 22, 35, 31, 10, 3, 29, 6, 15
#3 3 8, 9, 1, 2 A, B, D 40, 41, 37, 8, 11, 27, 12, 20, 35, 45, 10, 49, 28, 7, 30, 2, 34, 43
#4 4 8, 1, 4, 10 A, D, E 40, 41, 37, 8, 11, 10, 49, 28, 29, 6, 15, 16
#5 5 4, 10, 2, 7 E, D, A 29, 6, 15, 49, 28, 16, 41, 7, 30, 2, 34, 43
In fact, I found out that by setting simplify=F in aggregate, it works as I want.
I hope this won't backfire.
EDIT 2: it did backfire...
I don't want all my columns to become lists even when they can be vectors, but with simplify = F they do become lists:
sapply(aggregate(x=t0g2[,!(colnames(t0g2) %in% c("P"))],by=list(P=t0g2[["P"]]),FUN=function(y) unique(unlist(y)), simplify = F),class)
# P ID SC FP
#"numeric" "list" "list" "list"
sapply(aggregate(x=t0g2[,!(colnames(t0g2) %in% c("ID"))],by=list(ID=t0g2[["ID"]]),FUN=function(y) unique(unlist(y)), simplify = T),class)
# ID P SC FP
# "numeric" "list" "character" "list"
sapply(aggregate(x=t0g2[,!(colnames(t0g2) %in% c("ID"))],by=list(ID=t0g2[["ID"]]),FUN=function(y) unique(unlist(y)), simplify = F),class)
# ID P SC FP
#"numeric" "list" "list" "list"
So I still don't have a solution... :(
EDIT 3: maybe a viable (if rather clumsy) solution?
t0g2_by_ID <- aggregate(x=t0g2[,!(colnames(t0g2) %in% c("ID"))],by=list(ID=t0g2[["ID"]]),FUN=function(y) unique(unlist(y)), simplify = F)
sapply(t0g2_by_ID,class)
# ID P SC FP
#"numeric" "list" "list" "list"
for (i in 1:NCOL(t0g2_by_ID)) {y = t0g2_by_ID[,i]; if ((class(y) == "list") & (length(y) == length(unlist(y)))) {t0g2_by_ID[,i] <- unlist(y)} }
sapply(t0g2_by_ID,class)
# ID P SC FP
#"numeric" "list" "character" "list"
I tried to avoid the inelegant loop by using sapply, but then any cbind operation turns the result back into a data.frame of lists.
This is the best I can come up with.
If anyone can suggest how to do this better using only base R, that'd be great.
aggregate tries to return a matrix whenever this is possible. See this example:
# data
n <- 10
df <- data.frame(id= rep(1:2, each= n/2),
value= 1:n)
length(unique(df$value[df$id == 1])) == length(unique(df$value[df$id == 2]))
TRUE
Here the number of unique values is the same for every id, so aggregate returns a matrix:
aggregate(x= df[, "value"], by=list(id=df[, "id"]),
FUN=function(y) unique(unlist(y)))
id x.1 x.2 x.3 x.4 x.5
1 1 1 2 3 4 5
2 2 6 7 8 9 10
Now we change the data so that the number of unique values per id is no longer equal:
df$value[2] <- 1
length(unique(df$value[df$id == 1])) == length(unique(df$value[df$id == 2]))
FALSE
In this case aggregate cannot build a matrix, so we get a list column whose values are printed separated by commas:
aggregate(x= df[, "value"], by=list(id=df[, "id"]),
FUN=function(y) unique(unlist(y)))
id x
1 1 1, 3, 4, 5
2 2 6, 7, 8, 9, 10
In your case, every P value has exactly 4 unique ID values and exactly 3 unique SC values, so aggregate returns those results as matrices. This is not true for FP: there the number of unique values differs between groups, so aggregate can't build a matrix and we get the values separated by commas instead.
aggregate has an argument simplify that is TRUE by default, which means it tries to simplify the result to a vector or matrix when possible. Every group of P yields the same number of unique values (4 for ID, 3 for SC), so your aggregated data is being simplified to a matrix. Just set simplify = FALSE to change this behavior:
aggregate(x=t0g2[, !(colnames(t0g2) %in% c("P"))], by=list(P=t0g2[["P"]]),
FUN=function(y) unique(unlist(y)), simplify = F)
#### OUTPUT ####
P ID SC FP
1 1 8, 10, 7, 9 A, D, B 40, 41, 37, 8, 11, 49, 28, 16, 15, 27, 12, 20, 35, 45
2 2 5, 2, 3, 4 B, A, E 1, 34, 43, 37, 41, 7, 30, 2, 22, 35, 31, 10, 3, 29, 6, 15
3 3 8, 9, 1, 2 A, B, D 40, 41, 37, 8, 11, 27, 12, 20, 35, 45, 10, 49, 28, 7, 30, 2, 34, 43
4 4 8, 1, 4, 10 A, D, E 40, 41, 37, 8, 11, 10, 49, 28, 29, 6, 15, 16
5 5 4, 10, 2, 7 E, D, A 29, 6, 15, 49, 28, 16, 41, 7, 30, 2, 34, 43

Reshaping data to wide without quantitative data

I think I understand general reshaping. However, I have data that needs to be reshaped to wide format, but I don't want to show scores or quantitative data indexed by another variable.
Instead, I want to switch one variable from a single variable to five variables based on its values. No other variables should be indexed. I want the values of the one variable to form the other five variables, and values of those variables should simply be the same as their variable names.
I've included an example of a before and after.
Data:
> dput(ansscales3)
structure(list(ATID = c(33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
33, 33, 33), AnswerTypeDesc = c("VS|S|SD|VD", "VS|S|SD|VD", "VS|S|SD|VD",
"VS|S|SD|VD", "VS|S|SD|VD", "VS|S|SD|VD", "VS|S|SD|VD", "VS|S|SD|VD",
"VS|S|SD|VD", "VS|S|SD|VD", "VS|S|SD|VD", "VS|S|SD|VD", "VS|S|SD|VD"
), AValue = c(4, 3, 2, 1, 4, 3, 2, 1, 2, 1, 4, 3, 4), ScaleValue = c(1,
2, 3, 4, 1, 2, 3, 4, 3, 4, 1, 2, 1), ADesc = c("Very Satisfied",
"Satisfied", "Somewhat Dissatisfied", "Very Dissatisfied", "Very Satisfied",
"Satisfied", "Somewhat Dissatisfied", "Very Dissatisfied", "Somewhat Dissatisfied",
"Very Dissatisfied", "Very Satisfied", "Satisfied", "Very Satisfied"
), AOrder = c(1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 1, 2, 1), StatGroup = c("AdjN",
"AdjN", "AdjN", "AdjN", "N", "N", "N", "N", "PctNeg", "PctNeg",
"PctPos", "PctPos", "TopBox"), Cycles = c(11, 11, 11, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11)), .Names = c("ATID", "AnswerTypeDesc",
"AValue", "ScaleValue", "ADesc", "AOrder", "StatGroup", "Cycles"
), row.names = c(NA, -13L), class = "data.frame")
Reshape into:
> dput(atids1)
structure(list(ATID = c(33, 33, 33, 33), AnswerTypeDesc = structure(c(1L,
1L, 1L, 1L), .Label = "VS|S|SD|VD", class = "factor"), AValue = c(4,
3, 2, 1), ScaleValue = c(1, 2, 3, 4), ADesc = c("Very Satisfied",
"Satisfied", "Somewhat Dissatisfied", "Very Dissatisfied"), AOrder = c(1,
2, 3, 4), Cycles = c(11, 11, 11, 11), N = c("N", "N", "N", "N"
), AdjN = c("AdjN", "AdjN", "AdjN", "AdjN"), PctPos = c("PctPos",
"PctPos", "", ""), PctNeg = c("", "", "PctNeg", "PctNeg"), TopBox = c("TopBox",
"", "", "")), .Names = c("ATID", "AnswerTypeDesc", "AValue",
"ScaleValue", "ADesc", "AOrder", "Cycles", "N", "AdjN", "PctPos",
"PctNeg", "TopBox"), row.names = c(NA, -4L), class = "data.frame")
I'm sure this is simple but unfortunately I haven't been able to figure it out using the reshape method.
Here is a solution with reshape2
(note: I've loaded dplyr to trigger the %>% operator but this is just a personal choice of styling code)
library(reshape2)
library(dplyr)
dat1 %>% dcast(... ~ StatGroup, value.var = "StatGroup", fill = "")
ATID AnswerTypeDesc AValue ScaleValue ADesc AOrder Cycles AdjN N PctNeg PctPos TopBox
1 33 VS|S|SD|VD 1 4 Very Dissatisfied 4 11 AdjN N PctNeg
2 33 VS|S|SD|VD 2 3 Somewhat Dissatisfied 3 11 AdjN N PctNeg
3 33 VS|S|SD|VD 3 2 Satisfied 2 11 AdjN N PctPos
4 33 VS|S|SD|VD 4 1 Very Satisfied 1 11 AdjN N PctPos TopBox
Another solution, with tidyr:
library(tidyr)
dat1 %>% spread(StatGroup, StatGroup, fill = "")
ATID AnswerTypeDesc AValue ScaleValue ADesc AOrder Cycles AdjN N PctNeg PctPos TopBox
1 33 VS|S|SD|VD 1 4 Very Dissatisfied 4 11 AdjN N PctNeg
2 33 VS|S|SD|VD 2 3 Somewhat Dissatisfied 3 11 AdjN N PctNeg
3 33 VS|S|SD|VD 3 2 Satisfied 2 11 AdjN N PctPos
4 33 VS|S|SD|VD 4 1 Very Satisfied 1 11 AdjN N PctPos TopBox

Resources