Related
I realize that I may be asking this question into a void because the wnominate package is not widely used, but...
I have data on evaluation of state policies. I would like to know the potential choice polarization using W-NOMINATE Method. I have prepared my data according to the wnominate package vignette (p. 9, 11). But when I try to apply the method to my data, I receive a very strange error:
Error in wnominate(rc_samp, polarity = c(2, 7)) :
Data contains values other than 1 or 6 or 9.
And I do not understand what it is asking for. Can anybody clarify what I am doing wrong? It would be a miracle if someone could help me with this package.
Data and code:
respNames <- samp$id
codeData <- matrix(samp$code, length(samp$code), 1)
colnames(codeData) <- "code"
samp <- samp[, -c(1,2)]
rc_samp <- rollcall(samp, yea = c(5,6,7), nay = c(1,2,3), missing = 4,
notInLegis = 88, legis.names = respNames, legis.data = codeData, desc = "Ideological polarization")
samp_result <- wnominate(rc_samp, polarity = c(7,7))
structure(list(id = structure(1:100, .Label = c("1", "2", "3",
"4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15",
"16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26",
"27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37",
"38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48",
"49", "50", "51", "52", "53", "54", "55", "56", "57", "58", "59",
"60", "61", "62", "63", "64", "65", "66", "67", "68", "69", "70",
"71", "72", "73", "74", "75", "76", "77", "78", "79", "80", "81",
"82", "83", "84", "85", "86", "87", "88", "89", "90", "91", "92",
"93", "94", "95", "96", "97", "98", "99", "100", "101", "102",
"103", "104", "105", "106", "107", "108", "109", "110", "111",
"112", "113", "114", "115", "116", "117", "118", "119", "120",
"121", "122", "123", "124", "125", "126", "127", "128", "129",
"130", "131", "132", "133", "134", "135", "136", "137", "138",
"139", "140", "141", "142", "143", "144", "145", "146", "147",
"148", "149", "150", "151", "152", "153", "154", "155", "156",
"157", "158", "159", "160", "161", "162", "163", "164", "165",
"166", "167", "168", "169", "170", "171", "172", "173", "174",
"175", "176", "177", "178", "179", "180", "181", "182", "183",
"184", "185", "186", "187", "188", "189", "190", "191", "192",
"193", "194", "195", "196", "197", "198", "199", "200", "201",
"202", "203", "204", "205", "206", "207", "208", "209", "210",
"211", "212", "213", "214", "215", "216", "217", "218", "219",
"220", "221", "222", "223", "224", "225", "226", "227", "228",
"229", "230", "231", "232", "233", "234", "235", "236", "237",
"238", "239", "240", "241", "242", "243", "244", "245", "246",
"247", "248", "249", "250", "251", "252", "253", "254", "255",
"256", "257", "258", "259", "260", "261", "262", "263", "264",
"265", "266", "267", "268", "269", "270", "271", "272", "273",
"274", "275", "276", "277", "278", "279", "280", "281", "282",
"283", "284", "285", "286", "287", "288", "289", "290", "291",
"292", "293", "294", "295", "296", "297", "298", "299", "300",
"301", "302", "303", "304", "305", "306", "307", "308", "309",
"310", "311", "312", "313", "314", "315", "316", "317", "318",
"319", "320", "321", "322", "323", "324", "325", "326", "327",
"328", "329", "330", "331", "332", "333", "334", "335", "336",
"337", "338", "339", "340", "341", "342", "343", "344", "345",
"346", "347", "348"), class = "factor"), code = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L), .Label = c("control", "treatment_1", "treatment_2"
), class = "factor"), ideol_samesexmarriage = c(2, 7, 1, 2, 1,
7, 6, 1, 88, 1, 6, 6, 4, 1, 6, 1, 1, 88, 1, 88, 7, 6, 4, 6, 6,
6, 5, 7, 6, 3, 7, 4, 7, 7, 5, 5, 7, 7, 3, 6, 1, 7, 1, 1, 1, 7,
7, 7, 7, 7, 1, 88, 1, 7, 4, 7, 5, 5, 3, 7, 4, 7, 7, 88, 7, 7,
7, 4, 6, 6, 6, 6, 7, 7, 1, 7, 7, 88, 2, 2, 7, 7, 7, 1, 7, 5,
7, 7, 7, 1, 7, 5, 6, 5, 5, 7, 4, 7, 88, 2), ideol_flattaxes = c(5,
7, 1, 2, 2, 1, 6, 6, 2, 1, 6, 2, 4, 5, 1, 1, 6, 1, 4, 2, 3, 1,
1, 1, 4, 1, 2, 3, 1, 1, 1, 1, 3, 5, 4, 1, 88, 7, 5, 4, 1, 4,
1, 5, 3, 4, 4, 7, 2, 2, 1, 3, 5, 1, 4, 4, 5, 3, 3, 1, 3, 6, 1,
4, 3, 1, 2, 2, 2, 4, 6, 7, 3, 1, 5, 7, 1, 7, 1, 6, 7, 3, 6, 2,
2, 4, 1, 1, 1, 6, 2, 4, 5, 1, 5, 1, 2, 1, 1, 2), ideol_progrtaxes = c(2,
1, 7, 5, 88, 5, 1, 1, 7, 7, 2, 6, 6, 1, 5, 5, 1, 6, 6, 5, 6,
5, 6, 7, 4, 7, 5, 6, 5, 7, 7, 7, 5, 4, 3, 88, 6, 1, 4, 4, 1,
4, 5, 2, 7, 5, 7, 1, 7, 7, 7, 2, 4, 7, 4, 7, 5, 6, 3, 7, 5, 1,
7, 4, 6, 7, 6, 5, 7, 5, 1, 2, 5, 7, 3, 2, 7, 2, 7, 2, 1, 5, 1,
7, 88, 3, 7, 7, 7, 3, 7, 5, 2, 7, 3, 7, 5, 5, 1, 6), ideol_democracy = c(4,
7, 7, 3, 4, 7, 5, 1, 7, 6, 6, 5, 4, 7, 5, 6, 6, 88, 1, 6, 7,
5, 6, 6, 6, 2, 4, 3, 5, 6, 6, 2, 7, 6, 5, 7, 7, 3, 6, 4, 6, 7,
2, 4, 6, 7, 7, 7, 7, 6, 6, 6, 5, 6, 4, 5, 5, 6, 7, 7, 2, 5, 7,
4, 6, 7, 6, 4, 6, 4, 5, 6, 4, 7, 1, 6, 6, 88, 5, 6, 5, 4, 2,
3, 4, 5, 7, 3, 7, 6, 6, 5, 6, 7, 5, 7, 4, 4, 1, 7), ideol_dictatorship = c(5,
4, 5, 3, 1, 1, 5, 7, 2, 88, 6, 2, 7, 5, 2, 6, 3, 5, 7, 1, 1,
7, 2, 6, 1, 7, 1, 5, 7, 2, 1, 4, 1, 2, 1, 1, 1, 1, 7, 3, 7, 1,
6, 7, 1, 1, 1, 1, 1, 1, 7, 1, 1, 2, 3, 1, 6, 1, 1, 1, 3, 4, 1,
4, 1, 1, 2, 1, 3, 2, 3, 2, 4, 1, 7, 1, 3, 4, 3, 1, 2, 3, 7, 4,
7, 3, 1, 1, 1, 3, 1, 3, 1, 6, 2, 1, 5, 6, 1, 1), ideol_goveconomy = c(2,
2, 1, 2, 1, 2, 3, 1, 1, 3, 3, 2, 4, 1, 2, 1, 5, 1, 7, 1, 2, 4,
1, 6, 1, 6, 2, 3, 4, 1, 6, 2, 2, 2, 1, 2, 2, 4, 3, 1, 1, 1, 5,
2, 2, 2, 1, 1, 5, 5, 1, 1, 1, 1, 3, 1, 2, 1, 1, 88, 3, 2, 1,
4, 1, 3, 1, 6, 1, 2, 1, 1, 1, 4, 1, 1, 1, 1, 2, 1, 2, 1, 7, 5,
2, 4, 3, 1, 1, 2, 1, 4, 2, 1, 2, 6, 2, 4, 3, 3), ideol_govpaternalism = c(3,
4, 5, 4, 88, 3, 5, 1, 6, 7, 7, 5, 4, 88, 5, 4, 3, 6, 1, 5, 6,
2, 4, 4, 6, 7, 6, 6, 2, 7, 6, 7, 5, 5, 4, 88, 6, 6, 5, 5, 6,
7, 3, 3, 7, 2, 7, 5, 7, 5, 3, 6, 6, 6, 5, 5, 6, 7, 2, 88, 5,
6, 7, 4, 6, 6, 3, 5, 2, 5, 3, 2, 88, 7, 5, 6, 5, 2, 5, 4, 4,
5, 6, 4, 4, 6, 6, 6, 3, 4, 7, 5, 6, 6, 6, 6, 4, 5, 88, 6), ideol_govfreeimmigration = c(3,
4, 1, 2, 1, 2, 4, 1, 4, 1, 4, 3, 1, 2, 2, 1, 1, 4, 1, 3, 6, 2,
1, 2, 2, 5, 2, 3, 2, 6, 5, 2, 5, 3, 1, 2, 2, 2, 2, 1, 1, 6, 1,
2, 3, 2, 5, 2, 3, 1, 3, 4, 3, 4, 2, 2, 4, 3, 3, 1, 1, 88, 6,
2, 1, 5, 3, 3, 3, 4, 2, 5, 4, 7, 1, 3, 4, 2, 2, 1, 4, 4, 4, 1,
2, 4, 4, 5, 6, 2, 1, 4, 4, 4, 4, 6, 2, 2, 88, 2), ideol_govimmigration = c(5,
6, 7, 6, 88, 5, 1, 7, 4, 88, 6, 5, 7, 88, 6, 7, 7, 88, 7, 88,
4, 88, 7, 5, 6, 5, 6, 5, 88, 3, 4, 6, 4, 6, 7, 7, 6, 4, 6, 6,
88, 5, 7, 6, 5, 4, 3, 5, 4, 6, 5, 4, 6, 4, 4, 6, 6, 5, 7, 88,
7, 4, 7, 4, 6, 3, 5, 3, 5, 5, 6, 5, 88, 1, 7, 7, 4, 4, 6, 7,
6, 6, 5, 7, 88, 3, 3, 3, 2, 6, 6, 5, 4, 6, 5, 5, 4, 88, 1, 6),
ideol_commongoals = c(3, 2, 1, 2, 2, 2, 2, 4, 2, 6, 2, 3,
4, 6, 2, 1, 2, 2, 7, 4, 4, 2, 2, 88, 1, 6, 1, 6, 2, 1, 7,
6, 1, 3, 2, 1, 2, 1, 5, 4, 6, 1, 4, 4, 2, 4, 4, 1, 2, 3,
7, 5, 2, 1, 2, 4, 88, 88, 1, 3, 5, 88, 5, 4, 3, 5, 5, 2,
3, 4, 88, 2, 4, 88, 1, 4, 3, 3, 3, 1, 2, 3, 7, 3, 4, 4, 1,
1, 1, 3, 1, 4, 4, 2, 4, 2, 4, 5, 1, 3), ideol_privatefreedom = c(5,
88, 7, 5, 7, 4, 3, 3, 5, 1, 6, 5, 4, 2, 5, 7, 6, 7, 1, 3,
3, 5, 5, 88, 7, 2, 6, 3, 5, 7, 2, 2, 7, 5, 5, 88, 6, 7, 3,
4, 1, 7, 4, 4, 3, 5, 5, 7, 2, 5, 1, 3, 6, 7, 4, 4, 3, 88,
7, 6, 3, 4, 5, 4, 5, 4, 4, 7, 7, 3, 88, 6, 6, 88, 7, 3, 5,
2, 4, 7, 4, 5, 1, 3, 5, 4, 5, 7, 5, 5, 5, 4, 5, 2, 5, 6,
5, 3, 88, 3), ideol_goveconomy_lib = c(5, 2, 4, 2, 6, 3,
1, 5, 2, 1, 5, 5, 4, 5, 2, 6, 2, 7, 1, 5, 4, 5, 3, 4, 2,
2, 6, 4, 5, 7, 2, 1, 2, 3, 5, 5, 88, 4, 3, 5, 5, 6, 1, 3,
2, 4, 5, 5, 5, 1, 5, 88, 5, 6, 4, 3, 2, 1, 2, 88, 5, 5, 1,
4, 6, 4, 1, 3, 4, 3, 6, 5, 4, 1, 7, 5, 1, 1, 2, 6, 3, 7,
1, 1, 3, 4, 1, 5, 4, 6, 6, 4, 6, 3, 3, 3, 2, 3, 1, 4)), row.names = c(NA,
-100L), class = c("tbl_df", "tbl", "data.frame"))
I contacted the package creator and we came to the (though unilateral) conclusion that the W-NOMINATE method is applicable to data with a large number of columns.
Rollcall votes should be coded as either 1, 6, or 9.
I am trying to produce a table of mean scores for each participant in my tibble. The number of observations is much larger than in the data given below, but this tibble should be sufficient. I need to produce a table for each unique user_id. I would like the table to have 10 rows, 8 of which are the means of indicators 1-8 per timepoint, and the other two are domain means per timepoint. The mean of domain 0 is the mean of indicators 1-4, and the mean of domain 1 is the mean of indicators 5-8. I would also like the outputted tables to have four columns, one per timepoint. Thus, each user_id's outputted table should be 10 by 4. I have attempted this with tidyverse and would appreciate help. Also, some users (several, in fact) will not have values at all timepoints.
structure(list(Group = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1), user_id = c("Kim", "Kim",
"Kim", "Kim", "Kim", "Kim", "Kim",
"Kim", "Bob", "Bob", "Bob", "Bob",
"Bob", "Bob", "Bob", "Bob", "Bob",
"Bob", "Bob", "Bob", "Bob", "Bob",
"Bob", "Bob", "George", "George", "George", "George",
"George", "George", "George", "George", "George", "George", "George",
"George", "George", "George", "George", "George"), indicator = c("1",
"2", "3", "4", "5", "6", "7", "8", "1", "1", "2", "2", "3", "3",
"4", "4", "5", "5", "6", "6", "7", "7", "8", "8", "1", "1", "2",
"2", "3", "3", "4", "4", "5", "5", "6", "6", "7", "7", "8", "8"
), Timepoint = c(1, 1, 1, 1, 1, 1, 1, 1, 3, 4, 3, 4, 3,
4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4,
3, 4, 3, 4, 3, 4), score = c(3.5, 3.5, 2, 3, 3.5, 4,
3, 4, 2, 3, 2.5, 3, 1.5, 1.5, 0.5, 3, 2, 4, 2.5, 4, 2.5, 3.5,
3, 3.5, 3.5, 3, 2.5, 2.5, 2.5, 2, 2, 3, 3.5, 3.5, 3.5, 3.5, 3,
3, 3, 2.5)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-40L))
Attempted tidyverse code:
user_tables <- d %>%
group_by(user_id,indicator,Timepoint) %>%
summarise(Time1 = mean[which(indicator == 1 & Timepoint == 1)], mean[which(indicator == 2 & Timepoint == 1)], mean[which(indicator == 3 & Timepoint == 1)], mean[which(indicator == 4 & Timepoint == 1)], mean[which(indicator == 5 & Timepoint == 1)], mean[which(indicator == 6 & Timepoint == 1)], mean[which(indicator == 7 & Timepoint == 1)], mean[which(indicator == 8 & Timepoint == 1)],
Time2 = mean[which(indicator == 1 & Timepoint == 2)], mean[which(indicator == 2 & Timepoint == 2)], mean[which(indicator == 3 & Timepoint == 2)], mean[which(indicator == 4 & Timepoint == 2)], mean[which(indicator == 5 & Timepoint == 2)], mean[which(indicator == 6 & Timepoint == 2)], mean[which(indicator == 7 & Timepoint == 2)], mean[which(indicator == 8 & Timepoint == 2)],
Time3 = mean[which(indicator == 1 & Timepoint == 3)], mean[which(indicator == 2 & Timepoint == 3)], mean[which(indicator == 3 & Timepoint == 3)], mean[which(indicator == 4 & Timepoint == 3)], mean[which(indicator == 5 & Timepoint == 3)], mean[which(indicator == 6 & Timepoint == 3)], mean[which(indicator == 7 & Timepoint == 3)], mean[which(indicator == 8 & Timepoint == 3)],
Time4 = mean[which(indicator == 1 & Timepoint == 4)], mean[which(indicator == 2 & Timepoint == 4)], mean[which(indicator == 3 & Timepoint == 4)], mean[which(indicator == 4 & Timepoint == 4)], mean[which(indicator == 5 & Timepoint == 4)], mean[which(indicator == 6 & Timepoint == 4)], mean[which(indicator == 7 & Timepoint == 4)], mean[which(indicator == 8 & Timepoint == 4)]) %>%
split(., .$user_id)
Ultimately, I would like a table like this per user (where the NAs are the appropriate means) (Note: This one is for Bob - he didn't have scores for time 1 or time 2):
structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 1.625, 2, 2.5, 1.5, 0.5, 2.5, 2,
2.5, 2.5, 3, 2.625, 3, 3, 1.5, 3, 3.75, 4, 4, 3.5, 3.5), .Dim = c(10L,
4L), .Dimnames = list(c("Domain 0", "Ind 1", "Ind 2", "Ind 3",
"Ind 4", "Domain 1", "Ind 5", "Ind 6", "Ind 7", "Ind 8"), c("Time 1",
"Time 2", "Time 3", "Time 4")))
Thank you!
Since you are adding rows, you could do:
df %>%
group_by(Group, user_id, Timepoint, domain = +(indicator>4), indicator) %>%
summarise(sc=mean(score),.groups ='drop_last') %>%
pivot_wider(c(Group, user_id, indicator, domain), Timepoint,'Time_', values_from = sc) %>%
group_nest()%>%
mutate(data = map(data,
~rbind(c(NA,colMeans(select_if(.x,is.numeric), na.rm = TRUE)),.x)))%>%
unnest(data)%>%
mutate(indicator = ifelse(is.na(indicator),
paste0('Domain ', domain), paste0('Ind ', indicator)),
domain = NULL)
A tibble: 30 x 6
Group user_id indicator Time_3 Time_4 Time_1
<dbl> <chr> <chr> <dbl> <dbl> <dbl>
1 1 Bob Domain 0 1.62 2.62 NaN
2 1 Bob Ind 1 2 3 NA
3 1 Bob Ind 2 2.5 3 NA
4 1 Bob Ind 3 1.5 1.5 NA
5 1 Bob Ind 4 0.5 3 NA
6 1 Bob Domain 1 2.5 3.75 NaN
7 1 Bob Ind 5 2 4 NA
8 1 Bob Ind 6 2.5 4 NA
9 1 Bob Ind 7 2.5 3.5 NA
10 1 Bob Ind 8 3 3.5 NA
# ... with 20 more rows
Same basic idea as in #Onyambu’s answer, but simplified a bit with new dplyr
1.0.0 features that allow summarise() to increase the row count:
library(tidyverse)
have %>%
mutate(domain = (as.numeric(indicator) - 1) %/% 4) %>%
group_by(user_id, Timepoint, domain, indicator) %>%
summarise(score = mean(score)) %>%
summarise(
cur_data() %>% add_row(score = mean(score), .before = 1)
) %>%
arrange(Timepoint) %>%
pivot_wider(
values_from = score,
names_from = Timepoint,
names_prefix = "Time "
) %>%
filter(user_id == "Bob")
#> `summarise()` regrouping output by 'user_id', 'Timepoint', 'domain' (override with `.groups` argument)
#> `summarise()` regrouping output by 'user_id', 'Timepoint', 'domain' (override with `.groups` argument)
#> # A tibble: 10 x 6
#> # Groups: user_id, domain [2]
#> user_id domain indicator `Time 1` `Time 3` `Time 4`
#> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
#> 1 Bob 0 <NA> NA 1.62 2.62
#> 2 Bob 0 1 NA 2 3
#> 3 Bob 0 2 NA 2.5 3
#> 4 Bob 0 3 NA 1.5 1.5
#> 5 Bob 0 4 NA 0.5 3
#> 6 Bob 1 <NA> NA 2.5 3.75
#> 7 Bob 1 5 NA 2 4
#> 8 Bob 1 6 NA 2.5 4
#> 9 Bob 1 7 NA 2.5 3.5
#> 10 Bob 1 8 NA 3 3.5
Data setup:
have <- structure(list(
Group = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
user_id = c(
"Kim", "Kim", "Kim", "Kim", "Kim", "Kim", "Kim", "Kim", "Bob", "Bob",
"Bob", "Bob", "Bob", "Bob", "Bob", "Bob", "Bob", "Bob", "Bob", "Bob",
"Bob", "Bob", "Bob", "Bob", "George", "George", "George", "George",
"George", "George", "George", "George", "George", "George", "George",
"George", "George", "George", "George", "George"
),
indicator = c(
"1", "2", "3", "4", "5", "6", "7", "8", "1", "1", "2", "2", "3", "3",
"4", "4", "5", "5", "6", "6", "7", "7", "8", "8", "1", "1", "2",
"2", "3", "3", "4", "4", "5", "5", "6", "6", "7", "7", "8", "8"
), Timepoint = c(
1, 1, 1, 1, 1, 1, 1, 1, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4,
3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4
), score = c(
3.5, 3.5, 2, 3, 3.5, 4, 3, 4, 2, 3, 2.5, 3, 1.5, 1.5,
0.5, 3, 2, 4, 2.5, 4, 2.5, 3.5, 3, 3.5, 3.5, 3, 2.5,
2.5, 2.5, 2, 2, 3, 3.5, 3.5, 3.5, 3.5, 3, 3, 3, 2.5
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -40L))
want <- structure(
c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 1.625, 2, 2.5, 1.5, 0.5, 2.5,
2, 2.5, 2.5, 3, 2.625, 3, 3, 1.5, 3, 3.75, 4, 4, 3.5, 3.5),
.Dim = c(10L, 4L),
.Dimnames = list(
c("Domain 0", "Ind 1", "Ind 2", "Ind 3","Ind 4",
"Domain 1", "Ind 5", "Ind 6", "Ind 7", "Ind 8"),
c("Time 1", "Time 2", "Time 3", "Time 4")
)
)
Great question. How about a nest solution? Here, you create a function to summarise, nest the data by user_id, then apply the function to each participant.
library(tidyverse)
df <- structure(list(Group = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1), user_id = c("Kim", "Kim",
"Kim", "Kim", "Kim", "Kim", "Kim",
"Kim", "Bob", "Bob", "Bob", "Bob",
"Bob", "Bob", "Bob", "Bob", "Bob",
"Bob", "Bob", "Bob", "Bob", "Bob",
"Bob", "Bob", "George", "George", "George", "George",
"George", "George", "George", "George", "George", "George", "George",
"George", "George", "George", "George", "George"), indicator = c("1",
"2", "3", "4", "5", "6", "7", "8", "1", "1", "2", "2", "3", "3",
"4", "4", "5", "5", "6", "6", "7", "7", "8", "8", "1", "1", "2",
"2", "3", "3", "4", "4", "5", "5", "6", "6", "7", "7", "8", "8"
), Timepoint = c(1, 1, 1, 1, 1, 1, 1, 1, 3, 4, 3, 4, 3,
4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4,
3, 4, 3, 4, 3, 4), score = c(3.5, 3.5, 2, 3, 3.5, 4,
3, 4, 2, 3, 2.5, 3, 1.5, 1.5, 0.5, 3, 2, 4, 2.5, 4, 2.5, 3.5,
3, 3.5, 3.5, 3, 2.5, 2.5, 2.5, 2, 2, 3, 3.5, 3.5, 3.5, 3.5, 3,
3, 3, 2.5)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-40L))
output <- structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 1.625, 2, 2.5, 1.5, 0.5, 2.5, 2,
2.5, 2.5, 3, 2.625, 3, 3, 1.5, 3, 3.75, 4, 4, 3.5, 3.5), .Dim = c(10L,
4L), .Dimnames = list(c("Domain 0", "Ind 1", "Ind 2", "Ind 3",
"Ind 4", "Domain 1", "Ind 5", "Ind 6", "Ind 7", "Ind 8"), c("Time 1",
"Time 2", "Time 3", "Time 4")))
dfnest <- df %>%
group_nest(user_id)
# Summarise one participant's nested data: mean score per
# indicator x timepoint, then spread the timepoints into columns
# named "Timepoint1", "Timepoint3", etc. Participants missing a
# timepoint simply get no column for it.
my_summarise <- function(.data) {
.data %>%
# one row per indicator/timepoint combination
group_by(indicator,Timepoint) %>%
summarise(mean = mean(score, na.rm = TRUE)) %>%
# reshape to wide: one column per timepoint this participant has
pivot_wider(values_from = mean,names_from = Timepoint,names_prefix = 'Timepoint')
}
map(dfnest$data,my_summarise)
#> `summarise()` regrouping output by 'indicator' (override with `.groups` argument)
#> `summarise()` regrouping output by 'indicator' (override with `.groups` argument)
#> `summarise()` regrouping output by 'indicator' (override with `.groups` argument)
#> [[1]]
#> # A tibble: 8 x 3
#> # Groups: indicator [8]
#> indicator Timepoint3 Timepoint4
#> <chr> <dbl> <dbl>
#> 1 1 2 3
#> 2 2 2.5 3
#> 3 3 1.5 1.5
#> 4 4 0.5 3
#> 5 5 2 4
#> 6 6 2.5 4
#> 7 7 2.5 3.5
#> 8 8 3 3.5
#>
#> [[2]]
#> # A tibble: 8 x 3
#> # Groups: indicator [8]
#> indicator Timepoint3 Timepoint4
#> <chr> <dbl> <dbl>
#> 1 1 3.5 3
#> 2 2 2.5 2.5
#> 3 3 2.5 2
#> 4 4 2 3
#> 5 5 3.5 3.5
#> 6 6 3.5 3.5
#> 7 7 3 3
#> 8 8 3 2.5
#>
#> [[3]]
#> # A tibble: 8 x 2
#> # Groups: indicator [8]
#> indicator Timepoint1
#> <chr> <dbl>
#> 1 1 3.5
#> 2 2 3.5
#> 3 3 2
#> 4 4 3
#> 5 5 3.5
#> 6 6 4
#> 7 7 3
#> 8 8 4
Created on 2020-11-12 by the reprex package (v0.3.0)
I've been banging my head against a brick wall for days on this issue; I wonder if anyone can see what is wrong with my code, or tell me if I am overlooking something obvious.
I have this data.frame, where most columns are vectors, either numerical or character, and one column is a list of character vectors:
t0g2 <- structure(list(P = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4,
4, 4, 5, 5, 5, 5), ID = c(8, 10, 7, 9, 5, 2, 3, 4, 8, 9, 1, 2,
8, 1, 4, 10, 4, 10, 2, 7), SC = c("A", "D", "A", "B", "B", "A",
"A", "E", "A", "B", "D", "A", "A", "D", "E", "D", "E", "D", "A",
"A"), FP = list(`40,41,37,8,11` = c("40", "41", "37", "8", "11"
), `49,28,16,41` = c("49", "28", "16", "41"), `15,49` = c("15",
"49"), `27,12,20,35,45` = c("27", "12", "20", "35", "45"), `1,34,43,37` = c("1",
"34", "43", "37"), `41,7,30,2,34,43` = c("41", "7", "30", "2",
"34", "43"), `22,35,31,10,3` = c("22", "35", "31", "10", "3"),
`29,6,15` = c("29", "6", "15"), `40,41,37,8,11` = c("40",
"41", "37", "8", "11"), `27,12,20,35,45` = c("27", "12",
"20", "35", "45"), `10,49,28` = c("10", "49", "28"), `41,7,30,2,34,43` = c("41",
"7", "30", "2", "34", "43"), `40,41,37,8,11` = c("40", "41",
"37", "8", "11"), `10,49,28` = c("10", "49", "28"), `29,6,15` = c("29",
"6", "15"), `49,28,16,41` = c("49", "28", "16", "41"), `29,6,15` = c("29",
"6", "15"), `49,28,16,41` = c("49", "28", "16", "41"), `41,7,30,2,34,43` = c("41",
"7", "30", "2", "34", "43"), `15,49` = c("15", "49"))), class = "data.frame", row.names = c("8",
"10", "7", "9", "5", "2", "3", "4", "81", "91", "1", "21", "82",
"11", "41", "101", "42", "102", "22", "71"))
I want to aggregate it by one of the columns, with the function for the other columns being simply the concatenation of unique values. [Yes, I know this can be done with many ad hoc packages, but I need to do it with base R].
This works perfectly well if I choose numeric column "ID" as the column to aggregate on:
aggregate(x=t0g2[, !(colnames(t0g2) %in% c("ID"))], by=list(ID=t0g2[["ID"]]),
FUN=function(y) unique(unlist(y)))
# ID P SC FP
#1 1 3, 4 D 10, 49, 28
#2 2 2, 3, 5 A 41, 7, 30, 2, 34, 43
#3 3 2 A 22, 35, 31, 10, 3
#4 4 2, 4, 5 E 29, 6, 15
#5 5 2 B 1, 34, 43, 37
#6 7 1, 5 A 15, 49
#7 8 1, 3, 4 A 40, 41, 37, 8, 11
#8 9 1, 3 B 27, 12, 20, 35, 45
#9 10 1, 4, 5 D 49, 28, 16, 41
or with character column "SC":
aggregate(x=t0g2[, !(colnames(t0g2) %in% c("SC"))], by=list(SC=t0g2[["SC"]]),
FUN=function(y) unique(unlist(y)))
# SC P ID FP
#1 A 1, 2, 3, 4, 5 8, 7, 2, 3 40, 41, 37, 8, 11, 15, 49, 7, 30, 2, 34, 43, 22, 35, 31, 10, 3
#2 B 1, 2, 3 9, 5 27, 12, 20, 35, 45, 1, 34, 43, 37
#3 D 1, 3, 4, 5 10, 1 49, 28, 16, 41, 10
#4 E 2, 4, 5 4 29, 6, 15
However, if I try with "P", which as far as I know is just another numerical column, this is what I get:
aggregate(x=t0g2[, !(colnames(t0g2) %in% c("P"))], by=list(P=t0g2[["P"]]),
FUN=function(y) unique(unlist(y)))
# P ID.1 ID.2 ID.3 ID.4 SC.1 SC.2 SC.3 FP
#1 1 8 10 7 9 A D B 40, 41, 37, 8, 11, 49, 28, 16, 15, 27, 12, 20, 35, 45
#2 2 5 2 3 4 B A E 1, 34, 43, 37, 41, 7, 30, 2, 22, 35, 31, 10, 3, 29, 6, 15
#3 3 8 9 1 2 A B D 40, 41, 37, 8, 11, 27, 12, 20, 35, 45, 10, 49, 28, 7, 30, 2, 34, 43
#4 4 8 1 4 10 A D E 40, 41, 37, 8, 11, 10, 49, 28, 29, 6, 15, 16
#5 5 4 10 2 7 E D A 29, 6, 15, 49, 28, 16, 41, 7, 30, 2, 34, 43
Does anybody know what is going on, why this happens?
Literally going mental with this stuff...
EDIT: adding an example of the desired output from aggregating on "P", as requested by jay.sf.
# P ID SC FP
#1 1 8, 10, 7, 9 A, D, B 40, 41, 37, 8, 11, 49, 28, 16, 15, 27, 12, 20, 35, 45
#2 2 5, 2, 3, 4 B, A, E 1, 34, 43, 37, 41, 7, 30, 2, 22, 35, 31, 10, 3, 29, 6, 15
#3 3 8, 9, 1, 2 A, B, D 40, 41, 37, 8, 11, 27, 12, 20, 35, 45, 10, 49, 28, 7, 30, 2, 34, 43
#4 4 8, 1, 4, 10 A, D, E 40, 41, 37, 8, 11, 10, 49, 28, 29, 6, 15, 16
#5 5 4, 10, 2, 7 E, D, A 29, 6, 15, 49, 28, 16, 41, 7, 30, 2, 34, 43
In fact, I found out that by setting simplify=F in aggregate, it works as I want.
I hope this won't backfire.
EDIT 2: it did backfire...
I don't want all my columns to become lists even when they can be vectors, but with simplify = F they do become lists:
sapply(aggregate(x=t0g2[,!(colnames(t0g2) %in% c("P"))],by=list(P=t0g2[["P"]]),FUN=function(y) unique(unlist(y)), simplify = F),class)
# P ID SC FP
#"numeric" "list" "list" "list"
sapply(aggregate(x=t0g2[,!(colnames(t0g2) %in% c("ID"))],by=list(ID=t0g2[["ID"]]),FUN=function(y) unique(unlist(y)), simplify = T),class)
# ID P SC FP
# "numeric" "list" "character" "list"
sapply(aggregate(x=t0g2[,!(colnames(t0g2) %in% c("ID"))],by=list(ID=t0g2[["ID"]]),FUN=function(y) unique(unlist(y)), simplify = F),class)
# ID P SC FP
#"numeric" "list" "list" "list"
So I still don't have a solution... :(
EDIT 3: maybe a viable (if rather clumsy) solution?
t0g2_by_ID <- aggregate(x=t0g2[,!(colnames(t0g2) %in% c("ID"))],by=list(ID=t0g2[["ID"]]),FUN=function(y) unique(unlist(y)), simplify = F)
sapply(t0g2_by_ID,class)
# ID P SC FP
#"numeric" "list" "list" "list"
for (i in 1:NCOL(t0g2_by_ID)) {y = t0g2_by_ID[,i]; if ((class(y) == "list") & (length(y) == length(unlist(y)))) {t0g2_by_ID[,i] <- unlist(y)} }
sapply(t0g2_by_ID,class)
# ID P SC FP
#"numeric" "list" "character" "list"
I tried to avoid the inelegant loop by using sapply, but then any cbind operation goes back to a data.frame of lists.
This is the best I can come up with.
If anyone can suggest how to do this better using only base R, that'd be great.
aggregate obviously tries to give a matrix where this is possible. See This example:
# data
n <- 10
df <- data.frame(id= rep(1:2, each= n/2),
value= 1:n)
length(unique(df$value[df$id == 1])) == length(unique(df$value[df$id == 2]))
TRUE
Here the length of unique is same for every id value, thus aggregate provides a matrix
aggregate(x= df[, "value"], by=list(id=df[, "id"]),
FUN=function(y) unique(unlist(y)))
id x.1 x.2 x.3 x.4 x.5
1 1 1 2 3 4 5
2 2 6 7 8 9 10
Now we change data so that length of unique per id is not equal
df$value[2] <- 1
length(unique(df$value[df$id == 1])) == length(unique(df$value[df$id == 2]))
FALSE
In this case we get an output with values separated by ,:
aggregate(x= df[, "value"], by=list(id=df[, "id"]),
FUN=function(y) unique(unlist(y)))
id x
1 1 1, 3, 4, 5
2 2 6, 7, 8, 9, 10
In your case you have for every P value exactly 4 unique ID values and exactly 3 unique SC values, hence, aggregate shows those results as a matrix. This is not true for FP: here aggregate can't provide a matrix, hence, we get the values separated by ,
aggregate has an argument simplify that is TRUE by default, which means it tries to simplify to a vector or matrix when possible. All groups in P have n = 4, so your aggregated data is being simplified to a matrix. Just set simplify = FALSE to change this behavior:
aggregate(x=t0g2[, !(colnames(t0g2) %in% c("P"))], by=list(P=t0g2[["P"]]),
FUN=function(y) unique(unlist(y)), simplify = F)
#### OUTPUT ####
P ID SC FP
1 1 8, 10, 7, 9 A, D, B 40, 41, 37, 8, 11, 49, 28, 16, 15, 27, 12, 20, 35, 45
2 2 5, 2, 3, 4 B, A, E 1, 34, 43, 37, 41, 7, 30, 2, 22, 35, 31, 10, 3, 29, 6, 15
3 3 8, 9, 1, 2 A, B, D 40, 41, 37, 8, 11, 27, 12, 20, 35, 45, 10, 49, 28, 7, 30, 2, 34, 43
4 4 8, 1, 4, 10 A, D, E 40, 41, 37, 8, 11, 10, 49, 28, 29, 6, 15, 16
5 5 4, 10, 2, 7 E, D, A 29, 6, 15, 49, 28, 16, 41, 7, 30, 2, 34, 43
Having a hard time figuring out an efficient solution to the following problem. The question is very verbose because I'm not sure if I'm making this problem harder than it can be.
Given a named vector
t <- c(2, 0, 0, 30, 0, 0, 10, 2000, 0, 20, 0, 40, 60, 10)
names(t) <- c(1, 0, 0, 2, 0, 0, 3, 4, 0, 5, 0, 6, 7, 8)
I want to split t into a list of 4 elements that's balanced based on the sum of the resulting list elements while keeping the order of elements, and only splitting on non-zero elements. Expected outcome
L[1] <- c(2, 0, 0, 30, 0, 0, 10) # sum = 42
L[2] <- c(2000, 0) # sum = 2000
L[3] <- c(20, 0, 40) # sum = 60
L[4] <- c(60, 10) # sum = 70
The error function I use is minimizing sd(rowSums(L)) or sd(sapply(L, sum))
Trying to split the vector using something like the following doesn't quite work
split(t, cut(cumsum(t), 4))
# $`(-0.17,544]`
# 1 0 0 2 0 0 3
# 2 0 0 30 0 0 10
# $`(544,1.09e+03]`
# named numeric(0)
# $`(1.09e+03,1.63e+03]`
# named numeric(0)
# $`(1.63e+03,2.17e+03]`
# 4 0 5 0 6 7 8
# 2000 0 20 0 40 60 10
I wrote a function to split the list the way that I wanted (see error function above)
# Partition the named vector `val` into `nchunks` contiguous chunks,
# breaking only at non-zero elements, so the chunk sums are as balanced
# as possible (minimal sd of chunk sums). Brute force: it enumerates
# every 0/1 break pattern over the non-zero elements, so work grows as
# 2^(number of non-zero elements) -- infeasible for long inputs.
# Returns a 0/1 vector of length(val); a 1 marks the start of a new chunk,
# so cumsum() of the result gives chunk ids for split().
break_at <- function(val, nchunks) {
# nchunks chunks require nchunks - 1 break points
nchunks <- nchunks - 1
nonzero <- val[val != 0]
# every 0/1 vector of length(nonzero): candidate break patterns
all_groupings <- as.matrix(gtools::permutations(n = 2, r = length(nonzero), v = c(1, 0), repeats.allowed = TRUE))
# keep only patterns with exactly the required number of breaks
all_groupings <- all_groupings[rowSums(all_groupings) == nchunks, ]
# choose the pattern whose chunk sums have the smallest standard deviation
which_grouping <- which.min(
sapply(
1:nrow(all_groupings),
function(i) {
sd(
sapply(
split(
nonzero,
# cumsum turns the 0/1 break pattern into chunk ids
cumsum(all_groupings[i,])
),
sum
)
)
}
)
)
mark_breaks <- rep(0, length(val))
# NOTE(review): this matches names(val) against POSITIONS within the
# non-zero subvector; it works here only because the non-zero elements
# are named "1".."8" in order -- verify for general inputs.
mark_breaks[names(val) %in% which(all_groupings[which_grouping,]==1)] <- 1
return(mark_breaks)
}
You can see the result is much better
break_at(t, 4)
# 0 0 0 0 0 0 0 1 0 1 0 0 1 0
split(t, cumsum(break_at(t, 4)))
# $`0`
# 1 0 0 2 0 0 3
# 2 0 0 30 0 0 10
# $`1`
# 4 0
# 2000 0
# $`2`
# 5 0 6
# 20 0 40
# $`3`
# 7 8
# 60 10
It works by using gtools::permutations(n = 2, r = length(nonzero), v = c(1, 0), repeats.allowed = TRUE) to look at all potential splits. See how the above works for r = 3
# [,1] [,2] [,3]
# [1,] 0 0 0
# [2,] 0 0 1
# [3,] 0 1 0
# [4,] 0 1 1
# [5,] 1 0 0
# [6,] 1 0 1
# [7,] 1 1 0
# [8,] 1 1 1
which I then filter, all_groupings[rowSums(all_groupings) == nchunks, ]. This only looks at potential splits that produce nchunks.
My issue is that this works horribly with my real data because of the number of permutations involved.
hard <- structure(c(2, 0, 1, 2, 0, 1, 1, 1, 5, 0, 0, 0, 0, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 0, 0, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1,
1, 1, 2, 0, 2, 0, 1, 4, 0, 0, 0, 1, 3, 0, 0, 4, 0, 0, 0, 2, 0,
1, 1, 1, 3, 0, 0, 1, 1, 1, 1, 2, 0, 1, 2, 0, 1, 1, 2, 0, 1, 6,
0, 0, 0, 0, 0, 1, 1, 1, 3, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0,
1, 1, 2, 0, 1, 2, 0, 1, 1, 4, 0, 0, 0, 1, 1, 3, 0, 0, 1, 2, 0,
1, 1, 2, 0, 1, 3, 0, 0, 1, 3, 0, 0, 1, 1, 1, 2, 0, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 2, 0, 3,
0, 0, 1, 1, 2, 0, 1, 2, 0, 1, 1, 1, 2, 0, 2, 0, 1, 3, 0, 0, 1,
1, 1, 1, 1, 2, 0, 1, 1, 1, 2, 0, 1, 2, 0, 1, 1, 1, 1, 1, 1, 2,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
0, 1, 1, 1, 1, 1, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
1, 2, 0, 1, 1, 1, 2, 0, 1, 1, 1, 2, 0, 8, 0, 0, 0, 0, 0, 0, 0,
1, 2, 0, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1,
3, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1,
1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 3, 0,
0, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1,
1, 1, 1, 2, 0, 1, 1, 1, 1, 5, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 2, 0, 1, 1, 1, 1, 2, 0, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 2, 0, 1, 1, 2, 0, 1, 2, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 2,
0, 1, 9, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 4, 0, 0, 0, 1, 1, 1,
1, 6, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 1, 3, 0, 0, 1, 1, 1, 3,
0, 0, 7, 0, 0, 0, 0, 0, 0, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 1), .Names = c("1", "0",
"2", "3", "0", "4", "5", "6", "7", "0", "0", "0", "0", "8", "9",
"10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
"21", "22", "23", "24", "0", "0", "25", "26", "27", "28", "29",
"30", "31", "32", "33", "34", "35", "36", "37", "38", "39", "0",
"40", "41", "42", "43", "0", "44", "45", "46", "47", "48", "49",
"50", "51", "52", "0", "53", "0", "54", "55", "0", "0", "0",
"56", "57", "0", "0", "58", "0", "0", "0", "59", "0", "60", "61",
"62", "63", "0", "0", "64", "65", "66", "67", "68", "0", "69",
"70", "0", "71", "72", "73", "0", "74", "75", "0", "0", "0",
"0", "0", "76", "77", "78", "79", "0", "0", "80", "81", "82",
"83", "84", "85", "86", "87", "88", "0", "89", "90", "91", "0",
"92", "93", "0", "94", "95", "96", "0", "0", "0", "97", "98",
"99", "0", "0", "100", "101", "0", "102", "103", "104", "0",
"105", "106", "0", "0", "107", "108", "0", "0", "109", "110",
"111", "112", "0", "113", "114", "115", "116", "117", "118",
"119", "120", "121", "122", "123", "124", "125", "126", "127",
"128", "129", "130", "131", "0", "132", "133", "134", "0", "135",
"0", "0", "136", "137", "138", "0", "139", "140", "0", "141",
"142", "143", "144", "0", "145", "0", "146", "147", "0", "0",
"148", "149", "150", "151", "152", "153", "0", "154", "155",
"156", "157", "0", "158", "159", "0", "160", "161", "162", "163",
"164", "165", "166", "0", "167", "168", "169", "170", "171",
"172", "173", "174", "175", "176", "177", "178", "179", "180",
"181", "182", "183", "184", "185", "186", "0", "187", "188",
"189", "190", "191", "192", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "193", "194", "195", "196", "197", "0", "198",
"199", "200", "201", "0", "202", "203", "204", "205", "0", "206",
"0", "0", "0", "0", "0", "0", "0", "207", "208", "0", "209",
"210", "211", "212", "213", "214", "215", "0", "216", "217",
"218", "219", "220", "221", "0", "222", "223", "224", "225",
"0", "0", "226", "227", "228", "229", "230", "231", "232", "233",
"234", "235", "236", "237", "238", "239", "240", "0", "241",
"242", "243", "244", "245", "246", "247", "248", "0", "249",
"250", "251", "252", "253", "254", "0", "255", "256", "257",
"258", "259", "260", "0", "0", "261", "262", "263", "264", "0",
"265", "266", "267", "268", "269", "270", "271", "272", "273",
"274", "0", "275", "276", "277", "278", "279", "280", "281",
"282", "0", "283", "284", "285", "286", "287", "0", "0", "0",
"0", "288", "0", "0", "0", "0", "0", "289", "290", "291", "292",
"293", "294", "295", "296", "297", "298", "299", "300", "301",
"302", "303", "304", "305", "306", "307", "308", "309", "310",
"311", "312", "313", "314", "315", "316", "317", "318", "319",
"320", "321", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "322", "323", "324", "325", "326", "327", "328", "329",
"330", "331", "332", "333", "334", "335", "336", "337", "338",
"339", "340", "341", "0", "342", "343", "344", "345", "346",
"0", "347", "0", "348", "349", "350", "351", "352", "353", "354",
"355", "356", "357", "358", "359", "360", "0", "361", "362",
"363", "0", "364", "365", "0", "366", "367", "0", "0", "0", "0",
"0", "0", "0", "368", "0", "369", "370", "0", "0", "0", "0",
"0", "0", "0", "0", "371", "0", "0", "372", "0", "0", "0", "373",
"374", "375", "376", "377", "0", "0", "0", "0", "0", "378", "0",
"0", "0", "0", "0", "379", "380", "0", "0", "381", "382", "383",
"384", "0", "0", "385", "0", "0", "0", "0", "0", "0", "386",
"387", "388", "0", "389", "390", "391", "392", "393", "394",
"395", "396", "397", "398", "399", "400", "401", "402", "0",
"403", "404", "405", "406", "407", "408", "409"))
I don't know if there is an analytical solution, but if you treat it as an integer programming problem you could use the "SANN" heuristic implemented in optim. For example, consider some (sub-optimal) random split points to cut the vector t:
> startpar <- sort(sample(length(t)-1, 3))
> startpar
[1] 5 6 9
> # result in a sub-optimal split
> split(t, cut(1:length(t), c(0, startpar, length(t)), labels = 1:4))
$`1`
1 0 0 2 0
2 0 0 30 0
$`2`
0
0
$`3`
3 4 0
10 2000 0
$`4`
5 0 6 7 8
20 0 40 60 10
The error function could be written as
> # from manual: A function to be minimized (or maximized)
> fn <- function(par, vec){
+ ind_vec <- cut(1:length(vec), c(0, par, length(vec)), labels = 1:4)
+ sd(unlist(lapply(split(vec, ind_vec), sum)))
+ }
> # evaluated at the starting parameters
> fn(startpar, t)
[1] 979.5625
The "SANN" heuristic (simulated annealing) needs a method to generate a new candidate solution. There can be more sophisticated ways to select either the functions or the starting values, but the present choices still lead to a near-optimal solution (and maybe in acceptable time?).
> # from manual: For the "SANN" method it specifies a function to generate a new candidate point
> gr <- function(par, vec){
+ ind <- sample(length(par), 1)
+ par[ind] <- par[ind] + sample(-1:1, 1)
+ par[ind] <- max(c(par[ind], ifelse(ind == 1, 1, par[ind - 1] + 1)))
+ par[ind] <- min(c(par[ind], ifelse(ind == 3, length(vec) - 1, par[ind + 1] - 1)))
+ par
+ }
Applied to the toy data
> optimpar <- optim(startpar, fn, gr, method = "SANN", vec = t)$par
> split(t, cut(1:length(t), c(0, optimpar, length(t)), labels = 1:4))
$`1`
1 0 0 2
2 0 0 30
$`2`
0 0 3
0 0 10
$`3`
4
2000
$`4`
0 5 0 6 7 8
0 20 0 40 60 10
> fn(optimpar, t)
[1] 972.7329
>
Applied to the real data
> # use for "hard"
> startpar <- sort(sample(length(hard)-1, 3))
> optimpar <- optim(startpar, fn, gr, method = "SANN", vec = hard)
> optimpar
$par
[1] 146 293 426
$value
[1] 4.573474
...[output shortened]
[Edit] since my initial results were sub-optimal.
I'm sure you found a sufficient alternative yourself already, but for the sake of completeness: Regarding the present toy and real data examples a better choice for gr (I'll call it gr2 for later reference) would have a different sampling length (e.g. dependent on the length of the data) in order to generate the new candidate which will be less dependent from the incumbent (the current solution). For example
> gr2 <- function(par, vec){
+ ind <- sample(length(par), 1)
+ l <- round(log(length(vec), 2))
+ par[ind] <- par[ind] + sample(-l:l, 1)
+ par[ind] <- max(c(par[ind], ifelse(ind == 1, 1, par[ind - 1] + 1)))
+ par[ind] <- min(c(par[ind], ifelse(ind == 3, length(vec) - 1, par[ind + 1] - 1)))
+ par
+ }
For the real data resulting in
> set.seed(1337)
>
> startpar <- sort(sample(length(hard)-1, 3))
> opt <- optim(startpar, fn, gr2, method = "SANN", vec = hard)
> opt$value
[1] 4.5
> lapply(split(hard, cut(1:length(hard), c(0, opt$par, length(hard)), labels = 1:4)), sum)
$`1`
[1] 140
$`2`
[1] 141
$`3`
[1] 144
$`4`
[1] 150
And for the toy data resulting in
> startpar <- sort(sample(length(t)-1, 3))
> opt <- optim(startpar, fn, gr2, method = "SANN", vec = t)
> opt$value
[1] 971.4024
> split(t, cut(1:length(t), c(0, opt$par, length(t)), labels = 1:4))
$`1`
1 0 0 2 0 0 3
2 0 0 30 0 0 10
$`2`
4
2000
$`3`
0 5 0 6
0 20 0 40
$`4`
7 8
60 10
Regarding the optimality for the real data (using gr2), I ran a short simulation of 100 optimization runs from different starting parameters: Each of those runs terminated at a value of 4.5.
By using dynamic programming you can get the true optimum in O(N^2) time. The trick is to see that minimizing the standard deviation is the same as minimizing the sum of squares of the group sums. Since the error contributions of each subvector are independent, we can reduce the search-space of
possible splits by ignoring extensions of suboptimal splits of subvectors.
If for instance (3, 5) is a better split for V[1:7] than (2, 4), then
every split of V starting with (3, 5, 8,...) is better than every split starting with
(2, 4, 8, ...).
So if we for each 1 < k < len(V) find the best 2-group split of 'V[1:k]',
we can find the best 3-group split of each V[1:k] by only considering extensions of the optimal 2-group splits of the subvectors V[1:k]. In general we find the best (n+1)-group split by extending the optimal n-group splits.
The balanced.split function below takes in a vector of values and the number of splits and returns a list of subvectors. This yields a solution with row sums 140, 141, 144, 150 on the hard set.
#' Split a vector into n.splits contiguous chunks whose sums are as balanced
#' as possible (minimal sd of the chunk sums), splitting only in front of
#' non-zero elements. Dynamic programming, O(N^2) in the number of non-zero
#' elements.
#'
#' @param all.values numeric vector to split (element order is preserved)
#' @param n.splits   number of chunks to produce (>= 1)
#' @return a list of n.splits consecutive subvectors of all.values
balanced.split <- function(all.values, n.splits) {
  # Degenerate case: a single group needs no split points at all
  # (the loops below would otherwise run over 1:0).
  if (n.splits == 1) {
    return(list(all.values))
  }
  nonzero.idxs <- which(all.values != 0)
  values <- all.values[nonzero.idxs]
  cumsums <- c(0, cumsum(values))
  # error.table[i, j] = squared sum of the segment values[i:(j-1)].
  # Minimizing the total of these squares is equivalent to minimizing the
  # sd of the group sums, because the overall total is fixed.
  error.table <- outer(cumsums, cumsums, FUN = "-")^2
  # index.matrix[i, k] = last split of the optimal (i+1)-group
  # split of values[1:k]; used to backtrace the best path.
  index.matrix <- array(dim = c(n.splits - 1, ncol(error.table)))
  cur.best.splits <- error.table[1, ]
  # Iteratively extend the best i-group splits to (i+1)-group splits.
  for (i in seq_len(n.splits - 1)) {
    # Column-wise recycling: error.sums[p, k] = best i-group cost of
    # values[1:(p-1)] plus the cost of closing with segment values[p:(k-1)].
    error.sums <- cur.best.splits + error.table
    index.matrix[i, ] <- apply(error.sums, 2, which.min)
    # cur.best.splits[k] = minimal error of an (i+1)-group split of values[1:k]
    cur.best.splits <- apply(error.sums, 2, min)
  }
  # Backtrace the optimal split positions from the last column.
  cur.idx <- ncol(index.matrix)
  splits <- vector("numeric", n.splits - 1)
  for (i in (n.splits - 1):1) {
    cur.idx <- index.matrix[i, cur.idx]
    splits[i] <- cur.idx
  }
  # Map split positions back to the original vector and cut it up.
  splits <- c(1, nonzero.idxs[splits], length(all.values) + 1)
  chunks <- vector("list", n.splits)
  for (i in seq_len(n.splits)) {
    chunks[[i]] <- all.values[splits[i]:(splits[i + 1] - 1)]
  }
  return(chunks)
}
Below is more detailed code for the same algorithm
# Matrix of per-segment error contributions: result[i, j] is the squared
# sum of the segment values[i:(j-1)], where i and j index the vector of
# cumulative sums (which carries a leading 0).
.makeErrorTable <- function(values) {
  cumsums <- c(0, cumsum(values))
  # `^` instead of the undocumented `**` alias; double-quoted FUN.
  outer(cumsums, cumsums, FUN = "-")^2
}
# Backtrace the optimal split points through an index matrix: start at
# the last column and repeatedly follow the stored predecessor index,
# filling the path from the bottom row upward.
.findPath <- function(index.matrix){
  n.steps <- nrow(index.matrix)
  path <- numeric(n.steps)
  col <- ncol(index.matrix)
  for (step in n.steps:1) {
    col <- index.matrix[step, col]
    path[step] <- col
  }
  return(path)
}
# Find the optimal split positions for n.splits groups, given the table of
# squared segment sums, via dynamic programming: pass i extends the best
# i-group splits of every prefix by one more closing segment.
#
# Args:
#   error.table: matrix where [i, j] is the squared sum of segment i:(j-1)
#   n.splits:    number of groups to form
# Returns: vector of n.splits - 1 split positions (cumulative-sum indices)
.findSplits <- function(error.table, n.splits) {
  n.diffs <- nrow(error.table)
  # idx.table[i, k] = last split of the optimal (i+1)-group split of the
  # prefix ending at k; used afterwards to backtrace the optimal path.
  idx.table <- array(dim = c(n.splits - 1, n.diffs))
  cur.best.splits <- error.table[1, ]
  for (i in seq_len(n.splits - 1)) {
    # Column-wise recycling adds the best i-group prefix cost to the cost
    # of every candidate closing segment.
    error.sums <- cur.best.splits + error.table
    idx.table[i, ] <- apply(error.sums, 2, which.min)
    cur.best.splits <- apply(error.sums, 2, min)
  }
  return(.findPath(idx.table))
}
# Split `values` into consecutive chunks at the given split points
# (each split is the index where a new chunk starts).
#
# Args:
#   values: vector to cut
#   splits: chunk start positions, excluding position 1 (may be empty)
# Returns: list of length(splits) + 1 consecutive subvectors of values
.splitChunks <- function(values, splits) {
  bounds <- c(1, splits, length(values) + 1)
  n.chunks <- length(bounds) - 1
  # Preallocate the result instead of growing the list element by element.
  chunks <- vector("list", n.chunks)
  for (i in seq_len(n.chunks)) {
    chunks[[i]] <- values[bounds[i]:(bounds[i + 1] - 1)]
  }
  return(chunks)
}
#' Split all.values into n.splits consecutive chunks so that the chunk sums
#' are as balanced as possible (minimal sd(sum(chunk))), cutting only in
#' front of non-zero elements.
balanced.split <- function(all.values, n.splits) {
  keep <- which(all.values != 0)
  table.of.errors <- .makeErrorTable(all.values[keep])
  best.splits <- .findSplits(table.of.errors, n.splits)
  return(.splitChunks(all.values, keep[best.splits]))
}
The following solution is "split[ting] t into a list of 4 elements that's balanced based on the sum of the resulting list elements while keeping the order of elements, and only splitting on non-zero elements.".
It's not producing your exact expected output though, but to my understanding your optimization rules were not requirements but just things you've tried to get those balanced lists. And it should be efficient :).
t <- c(2, 0, 0, 30, 0, 0, 10, 2000, 0, 20, 0, 40, 60, 10)
# Bucket each position by which quartile of the running total it falls in,
# so every bucket collects roughly a quarter of the grand total.
running.total <- cumsum(t)
groups <- cut(running.total,
              breaks = quantile(running.total, probs = seq(0, 1, 0.25)),
              include.lowest = TRUE)
lapply(unique(groups), function(g) t[groups == g])
# [[1]]
# [1] 2 0 0 30 0 0
#
# [[2]]
# [1] 10
#
# [[3]]
# [1] 2000 0 20 0
#
# [[4]]
# [1] 40 60 10
On your hard data, the results are quite well "balanced" :
# Same quartile-of-the-running-total approach, applied to the "hard" data.
t2 <- as.numeric(hard)
cum.t2 <- cumsum(t2)
groups <- cut(cum.t2,
              breaks = quantile(cum.t2, probs = seq(0, 1, 0.25)),
              include.lowest = TRUE)
L2 <- lapply(unique(groups), function(g) t2[groups == g])
sapply(L2, sum)
# [1] 144 145 149 137
To compare with 138 143 144 150 using currently chosen solution.
I have a dataframe like the one below.
`> am_me
Group.1 Group.2 x.x x.y
2 AM clearterminate 3 21.00000
3 AM display.cryptic 86 30.12791
4 AM price 71 898.00000`
I would like to get a result like the one below.
`> am_me_t
Group.2 x.x x.y
2 clearterminate 3 21
3 display.cryptic 86 30.1279069767442
4 price 71 898
41 AM 160 316.375968992248`
I have taken out the first column and got the result shown below.
`> am_res
Group.2 x.x x.y
2 clearterminate 3 21.00000
3 display.cryptic 86 30.12791
4 price 71 898.00000`
When I try rbind to add "AM" as a new row, as shown below, I get a warning message and NA values.
`> am_me_t <- rbind(am_res, c("AM", colSums(am_res[2]), colMeans(am_res[3])))
Warning message:
invalid factor level, NAs generated in: "[<-.factor"(`*tmp*`, ri, value = "AM")
Group.2 x.x x.y
2 clearterminate 3 21
3 display.cryptic 86 30.1279069767442
4 price 71 898
41 <NA> 160 316.375968992248`
For your information, Output of edit(am_me)
`> edit(am_me)
structure(list(Group.1 = structure(as.integer(c(2, 2, 2)), .Label = c("1Y",
"AM", "BE", "CM", "CO", "LX", "SN", "US", "VK", "VS"), class = "factor"),
Group.2 = structure(as.integer(c(2, 5, 9)), .Label = c("bestbuy",
"clearterminate", "currency.display", "display", "display.cryptic",
"fqa", "mileage.display", "ping", "price", "reissue", "reissuedisplay",
"shortaccess.followon"), class = "factor"), x.x = as.integer(c(3,
86, 71)), x.y = c(21, 30.1279069767442, 898)), .Names = c("Group.1",
"Group.2", "x.x", "x.y"), row.names = c("2", "3", "4"), class = "data.frame")`
Also
`> edit(me)
structure(list(Group.1 = structure(as.integer(c(1, 2, 2, 2, 3,
4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 8, 8,
8, 8, 9, 9, 10, 10, 10, 10, 10, 10)), .Label = c("1Y", "AM",
"BE", "CM", "CO", "LX", "SN", "US", "VK", "VS"), class = "factor"),
Group.2 = structure(as.integer(c(8, 2, 5, 9, 10, 1, 2, 5,
9, 1, 2, 5, 9, 1, 2, 3, 4, 7, 9, 11, 12, 2, 4, 6, 1, 2, 5,
9, 2, 5, 1, 2, 3, 5, 9, 10)), .Label = c("bestbuy", "clearterminate",
"currency.display", "display", "display.cryptic", "fqa",
"mileage.display", "ping", "price", "reissue", "reissuedisplay",
"shortaccess.followon"), class = "factor"), x.x = as.integer(c(1,
3, 86, 71, 1, 2, 5, 1, 52, 10, 7, 27, 15, 5, 267, 14, 4,
1, 256, 1, 1, 80, 1, 78, 2, 10, 23, 6, 1, 2, 4, 3, 3, 11,
1, 1)), x.y = c(5, 21, 30.1279069767442, 898, 12280, 800,
56.4, 104, 490.442307692308, 1759.1, 18.1428571428571, 1244.81481481481,
518.533333333333, 3033.2, 18.5468164794007, 20, 3788.5, 23,
2053.49609375, 3863, 6376, 17.825, 240, 1752.21794871795,
1114.5, 34, 1369.60869565217, 1062.16666666667, 23, 245,
5681.5, 11.3333333333333, 13.3333333333333, 1273.81818181818,
2076, 5724)), .Names = c("Group.1", "Group.2", "x.x", "x.y"
), row.names = c("1", "2", "3", "4", "5", "6", "7", "8", "9",
"10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
"21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31",
"32", "33", "34", "35", "36"), class = "data.frame")
Group.1 Group.2 x.x x.y
1 1Y ping 1 5.00000
2 AM clearterminate 3 21.00000
3 AM display.cryptic 86 30.12791
4 AM price 71 898.00000
5 BE reissue 1 12280.00000
6 CM bestbuy 2 800.00000
7 CM clearterminate 5 56.40000
8 CM display.cryptic 1 104.00000
9 CM price 52 490.44231
10 CO bestbuy 10 1759.10000
11 CO clearterminate 7 18.14286
12 CO display.cryptic 27 1244.81481
13 CO price 15 518.53333
14 LX bestbuy 5 3033.20000
15 LX clearterminate 267 18.54682
16 LX currency.display 14 20.00000
17 LX display 4 3788.50000
18 LX mileage.display 1 23.00000
19 LX price 256 2053.49609
20 LX reissuedisplay 1 3863.00000
21 LX shortaccess.followon 1 6376.00000
22 SN clearterminate 80 17.82500
23 SN display 1 240.00000
24 SN fqa 78 1752.21795
25 US bestbuy 2 1114.50000
26 US clearterminate 10 34.00000
27 US display.cryptic 23 1369.60870
28 US price 6 1062.16667
29 VK clearterminate 1 23.00000
30 VK display.cryptic 2 245.00000
31 VS bestbuy 4 5681.50000
32 VS clearterminate 3 11.33333
33 VS currency.display 3 13.33333
34 VS display.cryptic 11 1273.81818
35 VS price 1 2076.00000
36 VS reissue 1 5724.00000`
The type of the Group.2 column is factor, and that limits the possible values. You can transform it to character with am_me$Group.2 <- as.character(am_me$Group.2), after that the AM value will be added without errors.
Note that you can also use sum() and mean() for single column operations.