Related
I have data like this:
df<-structure(list(levels_incised___1 = c(0, 0, 0, 0, 0, 0), levels_incised___2 = c(1,
0, 0, 0, 0, 0), levels_incised___3 = c(1, 0, 0, 0, 0, 0), levels_incised___4 = c(1,
0, 0, 0, 0, 0), levels_incised___5 = c(1, 0, 0, 0, 0, 0), levels_incised___6 = c(1,
0, 0, 0, 0, 0), levels_incised___7 = c(1, 0, 0, 0, 0, 0), levels_incised___8 = c(1,
1, 1, 0, 0, 0), levels_incised___9 = c(1, 1, 1, 0, 0, 0), levels_incised___10 = c(1,
1, 1, 0, 0, 0), levels_incised___11 = c(0, 1, 0, 0, 0, 0), levels_incised___12 = c(0,
1, 0, 0, 0, 0), levels_incised___13 = c(0, 1, 0, 0, 0, 0), levels_incised___14 = c(0,
1, 0, 0, 0, 0), levels_incised___15 = c(0, 1, 0, 0, 0, 0), levels_incised___16 = c(0,
0, 0, 0, 0, 0), levels_incised___17 = c(0, 0, 0, 0, 0, 0), levels_incised___18 = c(0,
0, 0, 0, 0, 0), levels_incised___19 = c(0, 0, 0, 0, 0, 0), levels_incised___20 = c(0,
0, 0, 0, 0, 0), levels_incised___21 = c(0, 0, 0, 0, 0, 0), levels_incised___22 = c(0,
0, 0, 0, 1, 0), levels_incised___23 = c(0, 0, 0, 0, 1, 1), levels_incised___24 = c(0,
0, 0, 0, 1, 1), levels_incised___25 = c(0, 0, 0, 0, 1, 1), levels_incised___26 = c(0,
0, 0, 0, 1, 1), levels_incised___27 = c(0, 0, 0, 1, 1, 1), levels_incised___28 = c(0,
0, 0, 1, 1, 1), levels_incised___29 = c(0, 0, 0, 1, 1, 0), levels_incised___30 = c(0,
0, 0, 1, 1, 0), levels_incised___31 = c(0, 0, 0, 0, 0, 0)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
That originally came from this Redcap input where each button was one of those columns:
And I need to create a column at the end (lets call it Level) with these possible inputs:
Cervical (any of the c buttons)
Thoracic (the t's)
Lumbar (the L's)
Sacral (sacral)
Thoracocervical (t's or c's)
Thoracolumbar (t's or l's)
Lumbosacral (l's and sacral)
So for instance, the patient in the first row had "1"'s in levels_incised_2 through levels_incised_10... meaning they had values in both the cervical range and the thoracic range. So that patient should get "Thoracocervical".
The patient in row 2 had 1's in 8 through 15, so they'd only get a "thoracic"
Does anyone know the most straight forward way to accomplish this?
Oh one last detail, there's 100+ other columns so it'd be nice if I could select/name these specific ones to count
A few things to resolve here:
find a way to convert levels...# to one of the C/T/... categories;
produce logic to infer based on presence of groups.
I think the first can be done by extracting the number and using findInterval to determine with of C/T/... each column belongs to. From there, we can do some simple c_across to find "any" in a group, and case_when to get your Level labels.
library(dplyr)
# helper function for renaming
func <- function(z) {
num <- as.integer(gsub("\\D", "", z))
grp <- c("C","T","L","S","Co","unclear")[findInterval(num, 1+c(0, 7, 19, 24, 29, 30, 31))]
grp <- paste0(grp, ave(grp, grp, FUN = seq_along))
# fix those that do not need numbering
grp[grepl("^Co", grp)] <- "Co"
grp[grepl("^unc", grp)] <- "unclear"
grp
}
out <- df %>%
rename_with(.cols = starts_with("levels"), .fn = func) %>%
rowwise() %>%
mutate(
anyC = sum(c_across(C1:C7)) > 0,
anyT = sum(c_across(T1:T12)) > 0,
anyL = sum(c_across(L1:L5)) > 0,
anyS = sum(c_across(S1:S5)) > 0
) %>%
ungroup() %>%
mutate(
Level = case_when(
anyC & anyT & anyL ~ "More than 2?",
anyL & anyS ~ "Lumbosacral",
anyT & anyL ~ "Thoracolumbar",
anyT & anyC ~ "Thoracocervical",
anyS ~ "Sacral",
anyL ~ "Lumbar",
anyT ~ "Thoracic",
anyC ~ "Cervical",
TRUE ~ "Nothing?"
)
)
out
# # A tibble: 6 x 36
# C1 C2 C3 C4 C5 C6 C7 T1 T2 T3 T4 T5 T6 T7 T8 T9 T10 T11 T12 L1 L2 L3 L4 L5 S1 S2 S3 S4 S5 Co unclear anyC anyT anyL anyS Level
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <lgl> <lgl> <lgl> <chr>
# 1 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 TRUE TRUE FALSE FALSE Thoracocervical
# 2 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 FALSE TRUE FALSE FALSE Thoracic
# 3 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 FALSE TRUE FALSE FALSE Thoracic
# 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 FALSE FALSE FALSE TRUE Sacral
# 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 FALSE FALSE TRUE TRUE Lumbosacral
# 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 FALSE FALSE TRUE TRUE Lumbosacral
out$Level
# [1] "Thoracocervical" "Thoracic" "Thoracic" "Sacral" "Lumbosacral" "Lumbosacral"
If you don't want to keep the renaming, then you can combine the Level result to your original frame with cbind(df, Level = out$Level).
using package dplyr:
## vertebra codes needed later on
vertebra_codes <- c(
paste0('C',1:7), paste0('T',1:12),
paste0('L',1:5), paste0('S',1:5),
'X', ## for Coccyx
'-' ## for unknown
)
df %>%
mutate(
## assuming each row is a case:
case_id = paste0('case_',row_number())
) %>%
## reshape the data from wide to long format:
pivot_longer(
cols = -case_id,
names_to = 'level_incised', values_to = 'is_incised'
) %>%
mutate(
## remove the redundant 'levels_incised__' prefix:
level_incised = gsub('.*_','',level_incised),
## assign the vertebra corresponding to 'level':
vertebra = vertebra_codes[as.integer(level_incised)],
## assign the spine region (e.g.: all lumbal vert. start with 'L'
spine_region = substr(vertebra,1,1)
) %>%
filter(is_incised == 1) %>% ## we're interested in incised vert. only
## remove replicates (more than one vertebra per spine region affected:
distinct(case_id, spine_region) %>%
## do the counts per case:
group_by(case_id) %>%
## string together the affected regions per case:
summarise(incised_regions = paste(spine_region, collapse = ','))
result:
# A tibble: 6 x 2
case_id incised_regions
<chr> <chr>
1 case_1 C,T
2 case_2 T
3 case_3 T
4 case_4 S,X
5 case_5 L,S,X
6 case_6 L,S
(Note that original `df` remains unchanged throughout the processing pipeline. However you can break up the pipeline by removing the `%>%` operator and inspect the intermediary steps, or assign them to temporary objects.)
extra / for fun: example code to ggplot the spine with vertebra status (incised or not) per patient.
This question already has answers here:
How to select the rows with maximum values in each group with dplyr? [duplicate]
(6 answers)
Closed 2 years ago.
I want to group by data set based on some IDs, then leave the grouped data that has largest value in the column. Here is a description of my data set.
BSTN ASTN1 BSTN2 ASTN2 BSTN3 ASTN3 BSTN4 ASTN4 BSTN5 ASTN TRNID TRNID2 TRNID3 TRNID4 TRNID5 count
1 150 0 0 0 0 0 0 0 0 152 1674 0 0 0 0 1
2 150 0 0 0 0 0 0 0 0 152 1676 0 0 0 0 2
3 150 0 0 0 0 0 0 0 0 152 1678 0 0 0 0 2
4 150 0 0 0 0 0 0 0 0 152 1680 0 0 0 0 13
5 150 0 0 0 0 0 0 0 0 152 1682 0 0 0 0 3
6 150 0 0 0 0 0 0 0 0 152 1684 0 0 0 0 4
I want to group and summarise this data into a single row based on IDs the first 10 columns BSTN ASTN1 BSTN2 ASTN2 BSTN3 ASTN3 BSTN4 ASTN4 BSTN5 ASTN.
Then for the rest of the columns, TRNID TRNID2 TRNID3 TRNID4 TRNID5 I would like to replace them with the row with maximum value in column count.
What I want as my final output would look as below.
BSTN ASTN1 BSTN2 ASTN2 BSTN3 ASTN3 BSTN4 ASTN4 BSTN5 ASTN TRNID TRNID2 TRNID3 TRNID4 TRNID5 count
150 0 0 0 0 0 0 0 0 152 1680 0 0 0 0 13
How would summarise my data? I have 2,931,959 rows with more groups of BSTN, ASTNs.
dput(head(A_Routetable2))
structure(list(BSTN = c(150, 150, 150, 150, 150, 150), ASTN1 = c(0,
0, 0, 0, 0, 0), BSTN2 = c(0, 0, 0, 0, 0, 0), ASTN2 = c(0, 0,
0, 0, 0, 0), BSTN3 = c(0, 0, 0, 0, 0, 0), ASTN3 = c(0, 0, 0,
0, 0, 0), BSTN4 = c(0, 0, 0, 0, 0, 0), ASTN4 = c(0, 0, 0, 0,
0, 0), BSTN5 = c(0, 0, 0, 0, 0, 0), ASTN = c(152, 152, 152, 152,
152, 152), TRNID = c(1674, 1676, 1678, 1680, 1682, 1684), TRNID2 = c(0,
0, 0, 0, 0, 0), TRNID3 = c(0, 0, 0, 0, 0, 0), TRNID4 = c(0, 0,
0, 0, 0, 0), TRNID5 = c(0, 0, 0, 0, 0, 0), count = c(1L, 2L,
2L, 13L, 3L, 4L)), row.names = c(NA, -6L), groups = structure(list(
BSTN = c(150, 150, 150, 150, 150, 150), ASTN1 = c(0, 0, 0,
0, 0, 0), BSTN2 = c(0, 0, 0, 0, 0, 0), ASTN2 = c(0, 0, 0,
0, 0, 0), BSTN3 = c(0, 0, 0, 0, 0, 0), ASTN3 = c(0, 0, 0,
0, 0, 0), BSTN4 = c(0, 0, 0, 0, 0, 0), ASTN4 = c(0, 0, 0,
0, 0, 0), BSTN5 = c(0, 0, 0, 0, 0, 0), ASTN = c(152, 152,
152, 152, 152, 152), TRNID = c(1674, 1676, 1678, 1680, 1682,
1684), TRNID2 = c(0, 0, 0, 0, 0, 0), TRNID3 = c(0, 0, 0,
0, 0, 0), TRNID4 = c(0, 0, 0, 0, 0, 0), .rows = structure(list(
1L, 2L, 3L, 4L, 5L, 6L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 6L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
You can group_by position and then select row with max value in count.
library(dplyr)
df %>% group_by(across(1:10)) %>% slice(which.max(count))
# BSTN ASTN1 BSTN2 ASTN2 BSTN3 ASTN3 BSTN4 ASTN4 BSTN5 ASTN TRNID TRNID2 TRNID3 TRNID4 TRNID5 count
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#1 150 0 0 0 0 0 0 0 0 152 1680 0 0 0 0 13
Or group by range of columns
df %>% group_by(across(BSTN:ASTN)) %>%slice(which.max(count))
The dput shared by OP is grouped which results an error with across. We can ungroup the data first and run the above which runs without any error. However functions in the previous version of dplyr work without any error on it. For example - group_by_at
A_Routetable2 %>% group_by_at(1:10) %>% slice(which.max(count))
I have been struggling with this for a while now and I haven't been able to find a comparable question asked anywhere, hence my first question on here!
I'm fairly new to R so please excuse any obvious errors I have made.
I have a dataset which has a row for each subscription that a user has or has had. Some users have multiple rows, while some others only have one. Only active or previously active subscriptions are present.
I have two variables which state when the subscription has started and when it ended called, Begindate and Enddate respectively. I already have relationlength variables created which state the amount of days between these two variables for each type of subscription. This means that the relationlength variables only give the amount of days for when a subscription was active.
What I would like to do is create empty rows in between the different subscription rows for the time periods in which no subscription was active, starting from the earliest Begindate known for the specific user and ending on a given date where all subscriptions end (20-04-2022).
I have tried to compare the date difference from the first begindate known for a user and the final date and subtracting the relation length known for the other subscription types. However, I could not make this work.
An example of what the df currently looks like:
(rl standing for relationlength)
ID Begindate Enddate Subscrtype active rl_fixed rl_promotional Productgroup
1 2019-08-26 2022-04-20 fixed 1 968 0 1
1 2018-08-24 2019-08-23 fixed 0 364 0 1
1 2015-08-24 2016-08-23 promo 0 0 364 2
2 2019-08-26 2019-09-12 fixed 0 17 0 1
2 2018-08-24 2019-08-23 fixed 0 364 0 1
What I would like it to look like:
ID Begindate Enddate Subscrtype active rl_fixed rl_promo rl_none Productgroup
1 2019-08-26 2022-04-20 fixed 1 968 0 0 1
1 2019-08-24 2019-08-25 none 0 0 0 2 NA
1 2018-08-24 2019-08-23 fixed 0 364 0 0 1
1 2016-08-24 2018-08-23 none 0 0 0 729 NA
1 2015-08-24 2016-08-23 promo 0 0 364 0 2
2 2019-09-13 2022-04-20 none 0 0 0 950 NA
2 2019-08-26 2019-09-12 fixed 0 17 0 0 1
2 2019-08-24 2019-08-25 none 0 0 0 2 NA
2 2018-08-24 2019-08-23 fixed 0 364 0 0 1
The end goal is to aggregate and have a clear overview of the specific relation lengths for the different types of relations possible for a user.
Thank you in advance!
dput for one specific user in the real df:
structure(list(ï..CRM.relatienummer = structure(c(1L, 1L, 1L,
1L, 1L, 1L), .Label = "1", class = "factor"), Begindatum = c("2019-08-26",
"2018-08-24", "2017-08-24", "2016-08-24", "2015-08-20", "2016-06-01"
), Einddatum = c("2022-04-20", "2019-08-23", "2018-08-23", "2017-08-23",
"2016-05-31", "2016-08-19"), Type.abonnement = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "Actie", class = "factor"), Status_dummy = c(1,
0, 0, 0, 0, 0), relationlength_fixed = c(0, 0, 0, 0, 0, 0), relationlength_promo = c(968,
364, 364, 364, 285, 79), relationlength_trial = c(0, 0, 0, 0,
0, 0), fixed_dummy = c(0, 0, 0, 0, 0, 0), trial_dummy = c(0,
0, 0, 0, 0, 0), promotional_dummy = c(1, 1, 1, 1, 1, 1)), row.names = c("1:20610",
"2:38646", "2:39231", "2:39232", "2:39248", "2:39837"), class = "data.frame")
Edit:
I have tried to run this code:
dfs <- split(testdata,testdata$ï..CRM.relatienummer)
r <- lapply(seq(length(dfs)), function(k){
v <- dfs[[k]]
vt <- data.frame(unique(v$ï..CRM.relatienummer),
as.character((as.Date(v$Einddatum)+1)[-1]),
as.character((as.Date(v$Begindatum)-1)[-nrow(v)]),
0,
0,
0,
0,
(as.Date(v$Begindatum)-1)[-nrow(v)] - (as.Date(v$Einddatum)+1)[-1],
NA,
0,
0,
0,
0,
0)
colnames(vt) <- c(colnames(v)[-ncol(v)],"rl_none",colnames(v)[ncol(v)])
(testdata <- rbind(data.frame(v[-ncol(v)],rl_none = 0,v[ncol(v)]),vt))[order(as.Date(testdata$Begindatum),decreasing = T),]
})
res <- data.frame(Reduce(rbind,r),row.names = NULL)
On this dataframe, with no luck unfortunately:
structure(list(ï..CRM.relatienummer = structure(c("d45248b8974dc4f8ff948779e0fd07e20f304e929ada4e14c0420aebed81e9b5",
"2ab04e80b3e64601147df977d6054c04ffa80014b3691b25dd1cc8ef85cea06a",
"2ab04e80b3e64601147df977d6054c04ffa80014b3691b25dd1cc8ef85cea06a",
"bcf2c99e6dc974380f967204b9623dce2c8a3fad694dc0b4430fcbf77f8f39f3",
"bcf2c99e6dc974380f967204b9623dce2c8a3fad694dc0b4430fcbf77f8f39f3",
"f8610cd0237858ac9384d6ba209759ae306860ffabb3f8e6c3d6fc68dbaddc51",
"e5b8b3f46165e48aec8bbe65ed1cb29d18a0492fbcac44803372f672348459db",
"c737815b2365b01a8a85c380364a0f721685a131de98cd7790b4d40bb8c4e05b",
"b9c0272caa8d5d3497d28cce3bda5d3d17c22f18c5f65c5e82c572b410a8ea71",
"b9c0272caa8d5d3497d28cce3bda5d3d17c22f18c5f65c5e82c572b410a8ea71",
"539c6c3e604245008daefbe500ff29357bee91f82a7896126bd0f69848524cb7",
"d361338bed51cb9c8aa73fd8914cbf392f4e05e7b073f637f7b150cf02b89c8c",
"505d3df3f1298e07aa96073490b72acd2391da06ad4cfbd5a9fbde3a3de79684",
"826443481cbb5b4e061040d443a0ce8d94322615d8ffae1e68b2ff7d896afcf7",
"2b59a1ec028c261c0f22cd6a49220dc7cec9a9fb0fabe2296b4ba77a60cfdaae"
), class = c("hash", "sha256")), Begindatum = c("2019-06-14",
"2019-03-01", "2019-09-02", "2019-03-03", "2019-04-01", "2019-09-21",
"2019-02-02", "2019-06-11", "2019-02-05", "2019-02-09", "2019-07-24",
"2019-05-08", "2019-09-27", "2019-08-03", "2019-04-03"), Einddatum = c("2022-04-20",
"2019-09-01", "2022-04-20", "2019-03-31", "2022-04-20", "2022-04-20",
"2019-02-14", "2019-07-08", "2019-02-11", "2020-02-08", "2019-09-03",
"2019-06-18", "2019-11-07", "2019-08-16", "2022-04-20"), Status_dummy = c(1,
0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1), relationlength_fixed = c(0,
184, 961, 28, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0), relationlength_promo = c(1041,
0, 0, 0, 1115, 942, 12, 0, 0, 364, 0, 0, 0, 0, 1113), relationlength_trial = c(0,
0, 0, 0, 0, 0, 0, 27, 0, 0, 41, 41, 41, 13, 0), rl_none = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), fixed_dummy = c(0,
1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), trial_dummy = c(0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0), promotional_dummy = c(1,
0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1), active_subscr_dummy = c(3,
0, 5, 0, 3, 3, 0, 0, 0, 3, 0, 0, 1, 0, 3), hashedEmail = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c("1:1",
"1:2", "1:3", "1:4", "1:5", "1:6", "1:7", "1:8", "1:9", "1:10",
"1:11", "1:12", "1:13", "1:14", "1:15"), class = "data.frame")
Hopefully this is what you are expecting
dfs <- split(df,df$ID)
r <- lapply(seq(length(dfs)), function(k){
v <- dfs[[k]]
vt <- data.frame(unique(v$ID),
as.character((as.Date(v$Enddate)+1)[-1]),
as.character((as.Date(v$Begindate)-1)[-nrow(v)]),
"none",
0,
0,
0,
(as.Date(v$Begindate)-1)[-nrow(v)] - (as.Date(v$Enddate)+1)[-1],
NA)
colnames(vt) <- c(colnames(v)[-ncol(v)],"rl_none",colnames(v)[ncol(v)])
(df <- rbind(data.frame(v[-ncol(v)],rl_none = 0,v[ncol(v)]),vt))[order(as.Date(df$Begindate),decreasing = T),]
})
res <- data.frame(Reduce(rbind,r),row.names = NULL)
which gives
> res
ID Begindate Enddate Subscrtype active rl_fixed rl_promo rl_none Productgroup
1 1 2019-08-26 2022-04-20 fixed 1 968 0 0 1
2 1 2019-08-24 2019-08-25 none 0 0 0 1 NA
3 1 2018-08-24 2019-08-23 fixed 0 364 0 0 1
4 1 2016-08-24 2018-08-23 none 0 0 0 729 NA
5 1 2015-08-24 2016-08-23 promo 0 0 364 0 2
6 2 2019-08-26 2019-09-12 fixed 0 17 0 0 1
7 2 2019-08-24 2019-08-25 none 0 0 0 1 NA
8 2 2018-08-24 2019-08-23 fixed 0 364 0 0 1
DATA
structure(list(ID = c(1L, 1L, 1L, 2L, 2L), Begindate = structure(c(3L,
2L, 1L, 3L, 2L), .Label = c("2015-08-24", "2018-08-24", "2019-08-26"
), class = "factor"), Enddate = structure(c(4L, 2L, 1L, 3L, 2L
), .Label = c("2016-08-23", "2019-08-23", "2019-09-12", "2022-04-20"
), class = "factor"), Subscrtype = structure(c(1L, 1L, 2L, 1L,
1L), .Label = c("fixed", "promo"), class = "factor"), active = c(1L,
0L, 0L, 0L, 0L), rl_fixed = c(968L, 364L, 0L, 17L, 364L), rl_promo = c(0L,
0L, 364L, 0L, 0L), Productgroup = c(1L, 1L, 2L, 1L, 1L)), class = "data.frame", row.names = c(NA,
-5L))
I've got the following data:
dataset <- structure(list(id = structure(c(2L, 3L, 1L, 3L, 1L, 9L), .Label = c("215101",
"215559", "216566", "217284", "219435", "220209", "220249", "220250",
"225678", "225679", "225687", "225869", "228420", "228435", "230621",
"230623", "233063", "233097", "233098", "235546", "235560", "235567",
"236379"), class = "factor"), cat1 = c("A", "B", "B", "A", "A",
"A"), cat2 = c("item 1", "item 1", "item 2", "item 5", "item 3",
"item 28"), cat3 = c("theme 2", "theme 2", "theme 1", "theme 4",
"theme 10", "theme 40")), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -6L))
I would like to create kind of model matrix with one hot encoded columns features created from columns cat2 and cat3. Therefore, my output would look like this:
structure(list(id = structure(c(1L, 1L, 2L, 3L, 3L, 9L), .Label = c("215101",
"215559", "216566", "217284", "219435", "220209", "220249", "220250",
"225678", "225679", "225687", "225869", "228420", "228435", "230621",
"230623", "233063", "233097", "233098", "235546", "235560", "235567",
"236379"), class = "factor"), cat1 = c("A", "B", "A", "A", "B",
"A"), `item 1` = c(0, 0, 1, 0, 1, 0), `item 2` = c(0, 1, 0, 0,
0, 0), `item 28` = c(0, 0, 0, 0, 0, 1), `item 3` = c(1, 0, 0,
0, 0, 0), `item 5` = c(0, 0, 0, 1, 0, 0), `theme 1` = c(0, 1,
0, 0, 0, 0), `theme 10` = c(1, 0, 0, 0, 0, 0), `theme 2` = c(0,
0, 1, 0, 1, 0), `theme 4` = c(0, 0, 0, 1, 0, 0), `theme 40` = c(0,
0, 0, 0, 0, 1)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-6L))
However, I don't have my independent variable in this dataset and I would like to preserve id and cat1 columns. How can I do that?
You could use merge and dcast twice.
library(reshape2)
merge(dcast(dataset, id + cat1 ~ cat2, fun.aggregate = length),
dcast(dataset, id + cat1 ~ cat3, fun.aggregate = length),
by = c("id", "cat1"))
# id cat1 item 1 item 2 item 28 item 3 item 5 theme 1 theme 10 theme 2 theme 4 theme 40
#1 215101 A 0 0 0 1 0 0 1 0 0 0
#2 215101 B 0 1 0 0 0 1 0 0 0 0
#3 215559 A 1 0 0 0 0 0 0 1 0 0
#4 216566 A 0 0 0 0 1 0 0 0 1 0
#5 216566 B 1 0 0 0 0 0 0 1 0 0
#6 225678 A 0 0 1 0 0 0 0 0 0 1
If you have more then two variables to spread you might melt you data first. This will save you some typing.
dcast(melt(dataset, id.vars = c("id", "cat1")), id + cat1 ~ value, fun.aggregate = length)
I have data like,
df <- structure(list(Sex = structure(c(1L, 1L, 2L, 1L, 2L, 2L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("F", "M"), class = "factor"),
Age = c(19L, 16L, 16L, 13L, 16L, 30L, 16L, 30L, 16L, 30L,
30L, 16L, 19L, 1L, 30L), I = c(1, 1, 0, 0, 1, 0, 1, 0, 1,
0, 0, 0, 1, 0, 1), E = c(0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1,
1, 0, 1, 0), S = c(1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0,
0, 1), N = c(0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0),
F = c(1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1), T = c(0,
1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0), C = c(1, 1, 1,
0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1), D = c(0, 0, 0, 1, 0,
1, 0, 1, 0, 1, 1, 1, 1, 0, 0), type = c("CIFS", "CITN", "CESF",
"DEFS", "CIFN", "DETS", "CITS", "DEFS", "CIFN", "DEFN", "DETS",
"DETS", "DINF", "CENT", "CIFS"), PO = runif(15, -3, 3), AO = runif(15, -3, 3)), .Names = c("Sex",
"Age", "I", "E", "S", "N", "F", "T", "C", "D", "type", "PO",
"AO"), class = c("tbl_dt", "tbl", "data.table", "data.frame"), row.names = c(NA,
-15L))
I want to sort the column type. Not the column but the characters in it. And get the same structure afterwards. For example, CIFS should then be CFIS. I tried to do it as,
df <- within(df, {
type <- apply(sapply(strsplit(df[, type], split=''), sort), 2,
function(x) paste0(x, collapse = ''))
})
Is there any simpler solution, that I have missed to find.
Since you are using data.table, I would suggest
df[, type := paste(sort(unlist(strsplit(type, ""))), collapse = ""), by = type]
like described in How to sort letters in a string?
This should work for both data.frame and data.table (base R only):
df$type <- vapply(strsplit(df$type, split=''),FUN=function(x)paste(sort(x),collapse=''),"")
Result:
> df
Sex Age I E S N F T C D type PO AO
1 F 19 1 0 1 0 1 0 1 0 CFIS 2.9750666 2.0308410
2 F 16 1 0 0 1 0 1 1 0 CINT 0.7902187 2.0891158
3 M 16 0 1 1 0 1 0 1 0 CEFS -1.7173785 2.4774140
4 F 13 0 1 1 0 1 0 0 1 DEFS 1.5352127 -1.9272470
5 M 16 1 0 0 1 1 0 1 0 CFIN -0.2160741 1.7359897
6 M 30 0 1 1 0 0 1 0 1 DEST 2.6314981 -0.6252466
7 F 16 1 0 1 0 0 1 1 0 CIST -1.6032894 -1.9938226
8 M 30 0 1 1 0 1 0 0 1 DEFS 0.7748583 -2.0935737
9 F 16 1 0 0 1 1 0 1 0 CFIN -2.9368356 0.3363364
10 F 30 0 1 0 1 1 0 0 1 DEFN -0.6506217 2.6681535
11 F 30 0 1 1 0 0 1 0 1 DEST -0.4432578 0.4627441
12 F 16 0 1 1 0 0 1 0 1 DEST 2.0236760 2.7684298
13 F 19 1 0 0 1 1 0 0 1 DFIN -1.1774931 2.6546726
14 F 1 0 1 0 1 0 1 1 0 CENT -2.2365388 2.7902646
15 F 30 1 0 1 0 1 0 1 0 CFIS -1.6139238 -2.4982620