Checking that two different variables match in R

I have a dataset on which I need to run a check of two variables, "Call" and "cluster".
I need to check that calls of the same value all have the same cluster number.
The problem is that the k-means clustering method I used assigns different cluster numbers each time I run it.
So I need a check that, for example, looks at the allele 1 call and verifies that the numbers under the cluster column are all the same (the numbers could be listed as 2 in one run and then change to 3 if I re-run the code).
Please see the data structure below:
structure(list(RFU1 = c(-0.295502405, 0.964070798, 3381.332182,
3532.769062, 3431.836843, 3242.966511, 2104.791167, 2220.008503,
3548.252161, 3506.51418, 2290.273178, 2281.587684, -5.64819475,
-11.73109864, 3784.914039, 3619.00781, 3618.211608, 3248.106466,
3394.650325, 3339.870196, 2449.202902, 2426.835174, 3432.153478,
2761.392304, -9.267907504, -7.365704356, 3743.092314, 3787.241702,
2172.027787, 2096.845649, 2135.649551, 2149.145547, 2293.757257,
2348.099108, 2321.019045, 2022.168867, -17.93532331, -12.59832941,
3805.416768, 3498.998138, 2304.597239, 2509.63987, 2181.11547,
2261.011876, 3432.453036, 3662.758933, 2371.11049, 3068.827061,
2612.107589, 2687.824075, 3179.315918, 3688.525218, 3465.327523,
3405.154043, 2535.514915, 2452.200079, 374.435299, 423.6015308,
3742.515563, 3578.777925, 2634.955017, 2527.514043, 3817.579252,
3550.999412, -10.72035816, 3294.486334, 3352.40368, 3463.150507,
3472.576514, 3741.898759, 3571.369947, 3720.645869, 3739.569593,
3855.583168, 418.6837047, 49.47548241, 2171.034284, 2155.314713,
3432.363384, 3582.508917, 3425.415274, 3487.203299, 3505.23909,
3413.342096, 113.5100691, 128.6414053, 2454.588175, 2323.061591,
3188.705702, 3376.950546, 3291.072437, 3181.001961, 3195.013863,
3776.919197, 2284.22659, 2277.338631), RFU2 = c(-8.346468029,
235.4058561, 637.9218251, 650.3759507, 617.4161748, 604.0792911,
4270.310727, 4199.615749, 689.863543, 712.6144338, 4274.287194,
4541.168491, -1.626221758, -2.437395631, 802.0941252, 730.5998997,
686.9037384, 625.8245403, 644.3586836, 642.8833044, 4937.691887,
5159.479928, 725.4449756, 573.3910899, -4.006398006, 213.2859144,
739.7910786, 731.0150586, 4408.81923, 4767.533969, 4302.641493,
4325.913445, 4597.47663, 4666.904418, 4800.357526, 4142.535329,
-17.23239968, 178.5311942, 778.305843, 743.1438168, 4214.507094,
4553.703511, 4629.339014, 4459.697405, 661.7299014, 727.1054982,
4553.170272, 5482.231486, 4520.517999, 4737.802036, 643.3599887,
726.4314715, 696.5968338, 697.6099599, 411.8118071, 409.4943424,
5687.32635, 5757.51512, 766.4240193, 779.2403225, 4745.055632,
4582.267792, 749.5679421, 675.8747055, -7.254521898, 628.3467565,
631.116767, 672.7064514, 687.2642132, 718.1192917, 731.785499,
668.3686048, 784.8055727, 791.3155894, 4471.047168, 4501.597841,
4504.670332, 4442.621066, 682.0632225, 706.6204595, 680.5242182,
683.9558692, 684.2909706, 618.6535251, 5727.684954, 6098.485474,
5099.952926, 4779.742057, 571.4303822, 614.9258218, 602.9830491,
651.2847695, 591.8833499, 742.2387568, 4443.376841, 4716.792177
), cluster = c(2L, 2L, 4L, 4L, 4L, 4L, 1L, 1L, 4L, 4L, 1L, 1L,
2L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 1L, 1L, 4L, 4L, 2L, 2L, 4L, 4L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 4L, 4L, 1L, 1L, 1L, 1L,
4L, 4L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 4L, 4L,
1L, 1L, 4L, 4L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 3L,
1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 1L, 1L, 4L, 4L, 4L, 4L,
4L, 4L, 1L, 1L)), row.names = c(NA, -96L), class = "data.frame")
[Plot: first run of k-means clustering]
[Plot: second run of k-means clustering]

Since it is only a question of labels, you can always relabel your groups afterwards. I used 4 clusters based on the within-sum-of-squares criterion:
suppressPackageStartupMessages(library(dplyr))
dfr <- as_tibble(dfr)
# elbow criterion: total within-cluster sum of squares for k = 1..10
wss <- lapply(1:10, function(ct) kmeans(x = dfr[, c("RFU1", "RFU2")], centers = ct)$tot.withinss)
# plot(unlist(wss))
Method 1: fix the random seed
# --- Experiment 1
set.seed(123)
groups <- kmeans(x = dfr[,c("RFU1", "RFU2")], centers = 4)
dfr$cluster <- groups$cluster
dfr
#> # A tibble: 96 x 4
#> id RFU1 RFU2 cluster
#> <chr> <dbl> <dbl> <int>
#> 1 1 -0.296 -8.35 4
#> 2 2 0.964 235. 4
#> 3 3 3381. 638. 2
#> 4 4 3533. 650. 2
#> 5 5 3432. 617. 2
#> 6 6 3243. 604. 3
#> 7 7 2105. 4270. 1
#> 8 8 2220. 4200. 1
#> 9 9 3548. 690. 2
#> 10 10 3507. 713. 2
#> # ... with 86 more rows
# --- Experiment 2
set.seed(123)
groups <- kmeans(x = dfr[,c("RFU1", "RFU2")], centers = 4)
dfr$cluster2 <- groups$cluster
dfr
#> # A tibble: 96 x 5
#> id RFU1 RFU2 cluster cluster2
#> <chr> <dbl> <dbl> <int> <int>
#> 1 1 -0.296 -8.35 4 4
#> 2 2 0.964 235. 4 4
#> 3 3 3381. 638. 2 2
#> 4 4 3533. 650. 2 2
#> 5 5 3432. 617. 2 2
#> 6 6 3243. 604. 3 3
#> 7 7 2105. 4270. 1 1
#> 8 8 2220. 4200. 1 1
#> 9 9 3548. 690. 2 2
#> 10 10 3507. 713. 2 2
#> # ... with 86 more rows
Method 2: labels don't matter, only the groups do, so do nothing
Method 3: labels do matter
3.1 my data is ordered: label by cluster centers
# order(order(x)) gives the rank of each element, so lab maps each old
# cluster number to its rank by the RFU1 coordinate of the cluster center
lab <- order(order(groups$centers[, 1])) # c(2, 4, 3, 1)
# head(cbind(dfr$cluster2, lab[dfr$cluster2]))
dfr <- dfr %>%
  mutate(label1 = lab[cluster2])
3.2 my data has important individuals: label using references
# Suppose individuals 1, 4, 6 and 7 could name the classes
dfr <- dfr %>%
  tibble::rownames_to_column(var = "id") %>%
  mutate(id = paste0("id_", id))
refs <- tibble(id = dfr$id[c(1, 4, 6, 7)],
               cluster = dfr$cluster2[c(1, 4, 6, 7)])
dfr %>%
  mutate(label2 = refs$id[c(2, 4, 3, 1)[cluster2]]
         # label2 = letters[1:4][cluster2]
  )
#> # A tibble: 96 x 7
#> id RFU1 RFU2 cluster cluster2 label1 label2
#> <chr> <dbl> <dbl> <int> <int> <int> <chr>
#> 1 1 -0.296 -8.35 4 4 1 1
#> 2 2 0.964 235. 4 4 1 1
#> 3 3 3381. 638. 2 2 4 7
#> 4 4 3533. 650. 2 2 4 7
#> 5 5 3432. 617. 2 2 4 7
#> 6 6 3243. 604. 3 3 3 6
#> 7 7 2105. 4270. 1 1 2 4
#> 8 8 2220. 4200. 1 1 2 4
#> 9 9 3548. 690. 2 2 4 7
#> 10 10 3507. 713. 2 2 4 7
#> # ... with 86 more rows
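Whichever relabeling you choose, the consistency check the question asks for is then a short dplyr summary. A minimal sketch, assuming the data also carries the "Call" column mentioned in the question (it is not part of the dput above):
# TRUE for every Call value whose rows all share a single cluster number
# ("Call" is an assumed column; it is not in the posted data)
dfr %>%
  group_by(Call) %>%
  summarise(consistent = n_distinct(cluster) == 1)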

Related

Group by partial string matches

I have a table with a list of categories, each with a count value, that I'd like to collapse based on similarity. For example, Mariner-1_Amel and Mariner-10 would become a single category, Mariner, and anything with 'Jockey' or 'hAT' in the name should be collapsed together.
I'm struggling to find a solution that can cope with all the possibilities. Is there an easy dplyr solution?
Reproducible with:
> dput(tibs)
structure(list(type = c("(TTAAG)n_1", "AMARI_1", "Copia-4_LH-I",
"DNA", "DNA-1_CQ", "DNA/hAT-Charlie", "DNA/hAT-Tip100", "DNA/MULE-MuDR",
"DNA/P", "DNA/PiggyBac", "DNA/TcMar-Mariner", "DNA/TcMar-Tc1",
"DNA/TcMar-Tigger", "G3_DM", "Gypsy-10_CFl-I", "hAT-1_DAn", "hAT-16_SM",
"hAT-N4_RPr", "HELITRON7_CB", "Jockey-1_DAn", "Jockey-1_DEl",
"Jockey-12_DF", "Jockey-5_DTa", "Jockey-6_DYa", "Jockey-6_Hmel",
"Jockey-7_HMM", "Jockey-8_Hmel", "LINE/Dong-R4", "LINE/I", "LINE/I-Jockey",
"LINE/I-Nimb", "LINE/Jockey", "LINE/L1", "LINE/L2", "LINE/R1",
"LINE/R2", "LINE/R2-NeSL", "LINE/Tad1", "LTR/Gypsy", "Mariner_CA",
"Mariner-1_AMel", "Mariner-10_HSal", "Mariner-13_ACe", "Mariner-15_HSal",
"Mariner-16_DAn", "Mariner-19_RPr", "Mariner-30_SM", "Mariner-39_SM",
"Mariner-42_HSal", "Mariner-46_HSal", "Mariner-49_HSal", "TE-5_EL",
"Unknown", "Utopia-1_Crp"), n = c(1L, 1L, 1L, 2L, 1L, 18L, 3L,
9L, 2L, 8L, 21L, 12L, 18L, 1L, 3L, 1L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 2L, 1L, 2L, 7L, 2L, 7L, 24L, 1L, 1L, 5L, 3L, 1L,
1L, 7L, 1L, 5L, 1L, 1L, 5L, 5L, 1L, 1L, 3L, 5L, 5L, 2L, 1L, 190L,
1L)), row.names = c(NA, -54L), class = c("tbl_df", "tbl", "data.frame"
))
It seems to me that your broader types are mostly/entirely at the beginning of the string. You could therefore use just the first alphanumeric sequence ([[:alnum:]]+) of the type as the broader type. This would give you the following types:
library(tidyverse)
tibs %>%
  mutate(type_short = str_extract(type, "[[:alnum:]]+")) %>%
  count(type_short, sort = TRUE)
#> # A tibble: 15 x 2
#> type_short n
#> <chr> <int>
#> 1 Mariner 12
#> 2 LINE 11
#> 3 DNA 10
#> 4 Jockey 8
#> 5 hAT 3
#> 6 AMARI 1
#> 7 Copia 1
#> 8 G3 1
#> 9 Gypsy 1
#> 10 HELITRON7 1
#> 11 LTR 1
#> 12 TE 1
#> 13 TTAAG 1
#> 14 Unknown 1
#> 15 Utopia 1
You can easily use the new column to group_by:
tibs %>%
  mutate(type_short = str_extract(type, "[[:alnum:]]+")) %>%
  group_by(type_short) %>%
  summarise(n = sum(n))
#> # A tibble: 15 x 2
#> type_short n
#> <chr> <int>
#> 1 AMARI 1
#> 2 Copia 1
#> 3 DNA 94
#> 4 G3 1
#> 5 Gypsy 3
#> 6 hAT 5
#> 7 HELITRON7 1
#> 8 Jockey 10
#> 9 LINE 54
#> 10 LTR 7
#> 11 Mariner 35
#> 12 TE 1
#> 13 TTAAG 1
#> 14 Unknown 190
#> 15 Utopia 1
Theoretically, you could also try to use string similarity here. Yet your types do not have great similarity among themselves. A relative Levenshtein distance (distance / characters of the longer string) for example retrieves results like this:
strings <- c("Mariner-1_Amel", "Mariner-10")
adist(strings) / max(nchar(strings))
#> [,1] [,2]
#> [1,] 0.0000000 0.3571429
#> [2,] 0.3571429 0.0000000
This could be interpreted as the two strings being about 36% dissimilar (i.e. 64% similar). Finding a good threshold might be hard in that case.
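If you still wanted to pursue that route, one possibility is hierarchical clustering on the relative distance matrix. A minimal sketch in base R; the cut height of 0.5 is an assumed, arbitrary threshold:
types <- c("Mariner-1_Amel", "Mariner-10", "Jockey-1_DAn", "hAT-1_DAn")
# relative Levenshtein distance: edit distance over the longer string
d <- adist(types) / outer(nchar(types), nchar(types), pmax)
# complete-linkage clustering, cut at the assumed height of 0.5
grp <- cutree(hclust(as.dist(d)), h = 0.5)
split(types, grp)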
This solution uses the dplyr function case_when together with base R's grepl.
library(dplyr)
tibs %>%
  mutate(category = case_when(
    grepl("hAT|Jockey", type) ~ "Jockey",
    grepl("Mariner", type) ~ "Mariner",
    grepl("DNA", type) ~ "DNA",
    grepl("LINE", type) ~ "LINE",
    TRUE ~ as.character(type)
  ),
  category = factor(category))
If there is no commonality to define the groups, you can define individual conditions using case_when.
library(dplyr)
library(stringr)
tibs %>%
  mutate(category = case_when(str_detect(type, 'Mariner-\\d+') ~ 'Mariner',
                              str_detect(type, 'Jockey|hAT') ~ 'common'
                              # add more conditions here; any type matching
                              # none of them becomes NA unless a default
                              # such as TRUE ~ type is added last
                              ))

Computing the positive response rate for survey dataset using dplyr

I have a large survey dataset which looks as follows:
trust09 q16a q16b q16c q16f q16g q23e
1 5A3 3 3 3 4 3 3
2 5A3 2 2 2 2 3 2
3 5A3 4 4 4 5 5 5
4 5A3 3 3 2 4 4 3
5 5A3 NA NA NA NA NA NA
6 5A3 4 4 4 4 4 3
....
....
159524 TAN 2 2 3 4 4 3
159525 TAN 4 3 2 1 3 3
159526 TAN 4 4 4 4 4 4
159527 TAN 4 NA 4 2 3 4
159528 TAN 4 4 4 4 4 4
159529 TAN 4 4 4 5 4 5
trust09 is the code for the hospital or organisation; the remaining columns are survey questions scored from 1 ('Strongly disagree') to 5 ('Strongly agree').
Each row corresponds to a response from a respondent belonging to some hospital.
From this data, I want to calculate the positive response rate, or PRR, for each survey question for each hospital, i.e. the number of respondents who answered 'Agree' (4) or 'Strongly agree' (5), expressed as a percentage of the total number of respondents.
I can get the total no. of respondents quite easily from the following code:
df0 <- nss08 %>%
  select(trust09, q16a, q16b, q16c, q16f, q16g, q23e) %>%
  group_by(trust09) %>%
  summarise_all(funs(length(.)))
Which gives me the following table:
trust09 q16a q16b q16c q16f q16g q23e
<chr> <int> <int> <int> <int> <int> <int>
1 5A3 414 414 414 414 414 414
2 5A4 298 298 298 298 298 298
3 5A5 271 271 271 271 271 271
4 5A7 384 384 384 384 384 384
5 5A8 343 343 343 343 343 343
6 5A9 502 502 502 502 502 502
I can easily count the number of responses for 'Agree' (4) and 'Strongly Agree' (5) for a single survey question using the following code:
df1 <- nss08 %>%
  select(trust09, q16a) %>%
  group_by(trust09) %>%
  filter(q16a == 4 | q16a == 5) %>%
  summarise_all(funs(length(.)))
which gives this sample data:
trust09 q16a
<chr> <int>
1 5A3 124
2 5A4 65
3 5A5 107
4 5A7 142
5 5A8 126
6 5A9 159
....
I also get the same result using:
aggregate(q16a ~ trust09, data = nss08[nss08$q16a == 4|nss08$q16a == 5, ], length)
I then simply merge these two data sets and calculate the PRR for the variable/question q16a, i.e. the number of respondents who answered 'Agree' (4) or 'Strongly Agree' (5) for this question, divided by the total responses for the question and then multiplied by 100.
The problem occurs when I wish to do the same for all the remaining variables simultaneously, rather than writing separate code for each single variable.
I have tried the following, but I get an error message:
myList <- vector("list", length = length(myVars))
for (x in seq_along(myVars)){
myList[x] <- aggregate(myVars[x] ~ trust09, data = nss08[nss08$myVars[,x] == 4|nss08$myVars[,x] == 5, ], length)}
I have also tried this without any success:
for (x in seq_along(myVars)){
myList[[x]] <- nss08 %>%
select(trust09, myVars[x]) %>%
group_by(trust09) %>%
filter(myVars[x] == 4|myVars[x] == 5) %>%
summarise(length(myVars[x]))
}
Maybe you can see from the code what I am trying to do here.
How can I do the whole process more efficiently, with less code, and end up with a data frame of positive response rates for each of the variables/survey questions?
Thank you.
Based on your dplyr code, I made this function; you can call it in a for loop or within apply:
library(dplyr)
xx <- function(arg) {
  var <- sym(arg)  # turn the column name (a string) into a symbol
  df1 <- df %>%
    select(trust09, !!var) %>%
    group_by(trust09) %>%
    filter(!!var == 4 | !!var == 5) %>%
    summarise(n = length(!!var))
  return(df1)
}
xx('q16a')
# A tibble: 2 x 2
  trust09     n
  <fct>   <int>
1 5A3         2
2 TAN         5
Assuming your data frame contains trust09 and all other columns correspond to questions you want to summarize, you can use summarise_all, counting the number of 4 and 5 responses with sum(col %in% 4:5, na.rm=TRUE) and dividing by length(col) directly:
df %>% group_by(trust09) %>% summarise_all(~ sum(. %in% 4:5, na.rm = T)/length(.))
# here . refers to all other columns individually except the group variable
# A tibble: 2 x 7
# trust09 q16a q16b q16c q16f q16g q23e
# <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 5A3 0.3333333 0.3333333 0.3333333 0.6666667 0.5000000 0.1666667
#2 TAN 0.8333333 0.5000000 0.6666667 0.6666667 0.6666667 0.6666667
The data used:
dput(df)
structure(list(trust09 = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L), .Label = c("5A3", "TAN"), class = "factor"),
q16a = c(3L, 2L, 4L, 3L, NA, 4L, 2L, 4L, 4L, 4L, 4L, 4L),
q16b = c(3L, 2L, 4L, 3L, NA, 4L, 2L, 3L, 4L, NA, 4L, 4L),
q16c = c(3L, 2L, 4L, 2L, NA, 4L, 3L, 2L, 4L, 4L, 4L, 4L),
q16f = c(4L, 2L, 5L, 4L, NA, 4L, 4L, 1L, 4L, 2L, 4L, 5L),
q16g = c(3L, 3L, 5L, 4L, NA, 4L, 4L, 3L, 4L, 3L, 4L, 4L),
q23e = c(3L, 2L, 5L, 3L, NA, 3L, 3L, 3L, 4L, 4L, 4L, 5L)), .Names = c("trust09",
"q16a", "q16b", "q16c", "q16f", "q16g", "q23e"), class = "data.frame", row.names = c(NA,
12L))
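For completeness, a minimal sketch of the same idea using the newer across() syntax (assuming dplyr >= 1.0), expressed directly as a percentage; note that it drops NAs from the denominator, unlike length(.) above:
df %>%
  group_by(trust09) %>%
  summarise(across(everything(),
                   ~ 100 * sum(.x %in% 4:5) / sum(!is.na(.x))))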

Find all numbers in range with local min and global max

I have a dataframe testData made up of many unique ids. My objective is to identify whether or not each id contains all of the possible integers in the range of month, yday, and week, where the min is the minimum value for that id and the max is the maximum value of the entire column.
Please note this is different from the related question here
In other words, if an id has all possible values in that range for month, it should receive a t. For example, under month where id = 1, the min value is 2 and the max value for the whole column is 5, so 1 should receive a true because the values 2, 3, 4 and 5 are all present. Where id = 2, however, only the values 1, 2, 4 and 5 occur; 3 was skipped, so 2 should receive an f.
So far, I have a formula that takes all the values in the entire range of the column (but NOT the min value per id):
library(data.table)
setDT(testData)
output <- testData[, .(month = all(unique(testData$month) %in% .SD$month),
                       yday  = all(unique(testData$yday)  %in% .SD$yday),
                       week  = all(unique(testData$week)  %in% .SD$week)),
                   by = id]
Any idea how I could integrate min where min is the minimum value per id and max is the maximum value in the range?
> testData
id month yday week
1 1 2 1 1
2 3 1 2 1
3 4 1 3 1
4 2 1 4 1
5 3 3 5 2
6 4 3 6 3
7 2 2 7 1
8 3 1 8 3
9 1 2 9 2
10 5 4 10 3
11 3 2 11 1
12 4 4 12 1
13 5 4 13 2
14 1 3 14 3
15 1 4 15 1
16 1 5 16 2
17 2 4 17 3
18 2 5 18 1
19 5 5 19 1
> dput(testData)
structure(list(id = c(1L, 3L, 4L, 2L, 3L, 4L, 2L, 3L, 1L, 5L,
3L, 4L, 5L, 1L, 1L, 1L, 2L, 2L, 5L), month = c(2L, 1L, 1L, 1L,
3L, 3L, 2L, 1L, 2L, 4L, 2L, 4L, 4L, 3L, 4L, 5L, 4L, 5L, 5L),
yday = 1:19, week = c(1L, 1L, 1L, 1L, 2L, 3L, 1L, 3L, 2L,
3L, 1L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 1L)), .Names = c("id",
"month", "yday", "week"), class = "data.frame", row.names = c(NA,
-19L))
In the end, the output should look like this:
> output
id month yday week
1 1 t f t
2 2 f f f
3 3 f f t
4 4 f f f
5 5 t f t
Using dplyr you can group by id and then just check whether all elements of the range are among the values present for each group. Note that min(month) gives the min for the grouped id variable, but max(testData$month) gives the max over the whole column.
library(dplyr)
tD2 <- testData %>%
  group_by(id) %>%
  summarise(month = all(min(month):max(testData$month) %in% month),
            yday  = all(min(yday):max(testData$yday) %in% yday),
            week  = all(min(week):max(testData$week) %in% week))
tD2
tD2
# A tibble: 5 × 4
id month yday week
<int> <lgl> <lgl> <lgl>
1 1 TRUE FALSE TRUE
2 2 FALSE FALSE FALSE
3 3 FALSE FALSE TRUE
4 4 FALSE FALSE FALSE
5 5 TRUE FALSE TRUE
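The same logic also ports back to the data.table attempt from the question. A minimal sketch; inside j with by = id, min() is evaluated per group, while max(testData$...) stays global:
library(data.table)
setDT(testData)
testData[, .(month = all(min(month):max(testData$month) %in% month),
             yday  = all(min(yday):max(testData$yday)   %in% yday),
             week  = all(min(week):max(testData$week)   %in% week)),
         by = id]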

Tidy data.frame with repeated column names

I have a program that gives me data in this format
toy
file_path Condition Trial.Num A B C ID A B C ID A B C ID
1 root/some.extension Baseline 1 2 3 5 car 2 1 7 bike 4 9 0 plane
2 root/thing.extension Baseline 2 3 6 45 car 5 4 4 bike 9 5 4 plane
3 root/else.extension Baseline 3 4 4 6 car 7 5 4 bike 68 7 56 plane
4 root/uniquely.extension Treatment 1 5 3 7 car 1 7 37 bike 9 8 7 plane
5 root/defined.extension Treatment 2 6 7 3 car 4 6 8 bike 9 0 8 plane
My goal is to tidy the format into something that can at least be tidied more easily with reshape, by having unique column names:
tidy_toy
file_path Condition Trial.Num A B C ID
1 root/some.extension Baseline 1 2 3 5 car
2 root/thing.extension Baseline 2 3 6 45 car
3 root/else.extension Baseline 3 4 4 6 car
4 root/uniquely.extension Treatment 1 5 3 7 car
5 root/defined.extension Treatment 2 6 7 3 car
6 root/some.extension Baseline 1 2 1 7 bike
7 root/thing.extension Baseline 2 5 4 4 bike
8 root/else.extension Baseline 3 7 5 4 bike
9 root/uniquely.extension Treatment 1 1 7 37 bike
10 root/defined.extension Treatment 2 4 6 8 bike
11 root/some.extension Baseline 1 4 9 0 plane
12 root/thing.extension Baseline 2 9 5 4 plane
13 root/else.extension Baseline 3 68 7 56 plane
14 root/uniquely.extension Treatment 1 9 8 7 plane
15 root/defined.extension Treatment 2 9 0 8 plane
If I try to melt toy it doesn't work, because only the first ID column gets used for id.vars (hence everything gets tagged as car) and the identically named variables get dropped.
Here's the dput of both tables
structure(list(file_path = structure(c(3L, 4L, 2L, 5L, 1L), .Label = c("root/defined.extension",
"root/else.extension", "root/some.extension", "root/thing.extension",
"root/uniquely.extension"), class = "factor"), Condition = structure(c(1L,
1L, 1L, 2L, 2L), .Label = c("Baseline", "Treatment"), class = "factor"),
Trial.Num = c(1L, 2L, 3L, 1L, 2L), A = 2:6, B = c(3L, 6L,
4L, 3L, 7L), C = c(5L, 45L, 6L, 7L, 3L), ID = structure(c(1L,
1L, 1L, 1L, 1L), .Label = "car", class = "factor"), A = c(2L,
5L, 7L, 1L, 4L), B = c(1L, 4L, 5L, 7L, 6L), C = c(7L, 4L,
4L, 37L, 8L), ID = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "bike", class = "factor"),
A = c(4L, 9L, 68L, 9L, 9L), B = c(9L, 5L, 7L, 8L, 0L), C = c(0L,
4L, 56L, 7L, 8L), ID = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "plane", class = "factor")), .Names = c("file_path",
"Condition", "Trial.Num", "A", "B", "C", "ID", "A", "B", "C",
"ID", "A", "B", "C", "ID"), class = "data.frame", row.names = c(NA,
-5L))
structure(list(file_path = structure(c(3L, 4L, 2L, 5L, 1L, 3L,
4L, 2L, 5L, 1L, 3L, 4L, 2L, 5L, 1L), .Label = c("root/defined.extension",
"root/else.extension", "root/some.extension", "root/thing.extension",
"root/uniquely.extension"), class = "factor"), Condition = structure(c(1L,
1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L), .Label = c("Baseline",
"Treatment"), class = "factor"), Trial.Num = c(1L, 2L, 3L, 1L,
2L, 1L, 2L, 3L, 1L, 2L, 1L, 2L, 3L, 1L, 2L), A = c(2L, 3L, 4L,
5L, 6L, 2L, 5L, 7L, 1L, 4L, 4L, 9L, 68L, 9L, 9L), B = c(3L, 6L,
4L, 3L, 7L, 1L, 4L, 5L, 7L, 6L, 9L, 5L, 7L, 8L, 0L), C = c(5L,
45L, 6L, 7L, 3L, 7L, 4L, 4L, 37L, 8L, 0L, 4L, 56L, 7L, 8L), ID = structure(c(2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L), .Label = c("bike",
"car", "plane"), class = "factor")), .Names = c("file_path",
"Condition", "Trial.Num", "A", "B", "C", "ID"), class = "data.frame", row.names = c(NA,
-15L))
You can use the make.unique function to create unique column names. After that you can use melt from the data.table package, which is able to create multiple value columns based on patterns in the column names:
# make the column names unique
names(toy) <- make.unique(names(toy))
# let the 'Condition' column start with a small letter 'c'
# so it won't be detected by the patterns argument from melt
names(toy)[2] <- tolower(names(toy)[2])
# load the 'data.table' package
library(data.table)
# tidy the data into long format
tidy_toy <- melt(setDT(toy),
                 measure.vars = patterns('^A', '^B', '^C', '^ID'),
                 value.name = c('A', 'B', 'C', 'ID'))
which gives:
> tidy_toy
file_path condition Trial.Num variable A B C ID
1: root/some.extension Baseline 1 1 2 3 5 car
2: root/thing.extension Baseline 2 1 3 6 45 car
3: root/else.extension Baseline 3 1 4 4 6 car
4: root/uniquely.extension Treatment 1 1 5 3 7 car
5: root/defined.extension Treatment 2 1 6 7 3 car
6: root/some.extension Baseline 1 2 2 1 7 bike
7: root/thing.extension Baseline 2 2 5 4 4 bike
8: root/else.extension Baseline 3 2 7 5 4 bike
9: root/uniquely.extension Treatment 1 2 1 7 37 bike
10: root/defined.extension Treatment 2 2 4 6 8 bike
11: root/some.extension Baseline 1 3 4 9 0 plane
12: root/thing.extension Baseline 2 3 9 5 4 plane
13: root/else.extension Baseline 3 3 68 7 56 plane
14: root/uniquely.extension Treatment 1 3 9 8 7 plane
15: root/defined.extension Treatment 2 3 9 0 8 plane
Another option is to use a list of column-indexes for measure.vars:
tidy_toy <- melt(setDT(toy),
                 measure.vars = list(c(4, 8, 12), c(5, 9, 13), c(6, 10, 14), c(7, 11, 15)),
                 value.name = c('A', 'B', 'C', 'ID'))
Making the column-names unique isn't necessary then.
A more complicated method that creates names that are better distinguishable by the patterns argument:
# select the names that are not unique
tt <- table(names(toy))
idx <- which(names(toy) %in% names(tt)[tt > 1])
nms <- names(toy)[idx]
# make them unique
names(toy)[idx] <- paste(nms,
                         rep(seq(length(nms) / length(names(tt)[tt > 1])),
                             each = length(names(tt)[tt > 1])),
                         sep = '.')
# your columnnames are now unique:
> names(toy)
[1] "file_path" "Condition" "Trial.Num" "A.1" "B.1" "C.1" "ID.1" "A.2"
[9] "B.2" "C.2" "ID.2" "A.3" "B.3" "C.3" "ID.3"
# tidy the data into long format
tidy_toy <- melt(setDT(toy),
                 measure.vars = patterns('^A.\\d', '^B.\\d', '^C.\\d', '^ID.\\d'),
                 value.name = c('A', 'B', 'C', 'ID'))
which will give the same end-result.
As mentioned in the comments, the janitor package can be helpful for this problem as well. Its clean_names() function works similarly to make.unique. See here for an explanation.
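A minimal sketch of that route, assuming the janitor package is installed; clean_names() should de-duplicate the names by appending numeric suffixes, which the patterns argument can then match on:
library(janitor)
# expected to yield snake_case, de-duplicated names such as
# a, b, c, id, a_2, ..., id_3
toy <- clean_names(toy)
names(toy)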
With the tidyverse we can do:
library(tidyverse)
toy %>%
  repair_names(sep = "_") %>%
  pivot_longer(-(1:3), names_to = c(".value", "id"), names_sep = "_") %>%
  select(-id)
#> # A tibble: 15 x 7
#> file_path Condition Trial.Num A B C ID
#> <fct> <fct> <int> <int> <int> <int> <fct>
#> 1 root/some.extension Baseline 1 2 3 5 car
#> 2 root/some.extension Baseline 1 2 1 7 bike
#> 3 root/some.extension Baseline 1 4 9 0 plane
#> 4 root/thing.extension Baseline 2 3 6 45 car
#> 5 root/thing.extension Baseline 2 5 4 4 bike
#> 6 root/thing.extension Baseline 2 9 5 4 plane
#> 7 root/else.extension Baseline 3 4 4 6 car
#> 8 root/else.extension Baseline 3 7 5 4 bike
#> 9 root/else.extension Baseline 3 68 7 56 plane
#> 10 root/uniquely.extension Treatment 1 5 3 7 car
#> 11 root/uniquely.extension Treatment 1 1 7 37 bike
#> 12 root/uniquely.extension Treatment 1 9 8 7 plane
#> 13 root/defined.extension Treatment 2 6 7 3 car
#> 14 root/defined.extension Treatment 2 4 6 8 bike
#> 15 root/defined.extension Treatment 2 9 0 8 plane
#> Warning message:
#> Expected 2 pieces. Missing pieces filled with `NA` in 4 rows [1, 2, 3, 4].
The warning is harmless here: repair_names leaves the first A/B/C/ID block unsuffixed, so names_sep finds only one piece for those four names.

Create n data sets from one data set without repetition using stratified sampling

I have a data set train which has, say, 500 rows. I would like to get a data frame with n columns, each containing 500/n values (row numbers, with no repetition across columns), based on stratified sampling of a column in train, say train$y.
I have tried the following but it returns duplicate values,
library(caret)
n <- 10 # I want to divide my data set in to 10 parts
data_partition <- createDataPartition(y = train$y, times = 10,
p = 1/n, list = F)
To summarize with an example:
If I have a data set train with 100 rows and a column train$y (value 0 or 1), I would like to get 10 data sets of 10 rows each from train. They should be stratified on train$y, and no row should appear in more than one of the 10 data sets.
Example input:
ID x y
1 1 0
2 2 0
3 3 1
4 1 1
5 2 1
6 4 1
7 4 0
8 4 1
9 3 1
10 1 1
11 2 1
12 3 0
13 4 1
14 5 1
15 6 1
16 10 1
17 9 1
18 3 0
19 7 0
20 8 1
Expected output (first 4 columns, with the details of each set shown alongside):
ID x y sample set 1 set 2 set 3
1 1 0 set 2 ID x y ID x y ID x y
2 2 0 set 3 8 4 1 11 2 1 17 9 1
3 3 1 set 3 9 3 1 12 3 0 5 2 1
4 1 1 set 3 10 1 1 13 4 1 6 4 1
5 2 1 set 3 18 3 0 1 1 0 7 4 0
6 4 1 set 3 19 7 0 14 5 1 2 2 0
7 4 0 set 3 20 8 1 15 6 1 3 3 1
8 4 1 set 1 16 10 1 4 1 1
9 3 1 set 1
10 1 1 set 1
11 2 1 set 2
12 3 0 set 2
13 4 1 set 2
14 5 1 set 2
15 6 1 set 2
16 10 1 set 2
17 9 1 set 3
18 3 0 set 1
19 7 0 set 1
20 8 1 set 1
In the above example, given ID, x and y as input, I would like to get the sample column, which I can segregate into those 3 tables (to the right) whenever I want to.
Please observe that y in the data has 14 ones and 6 zeros, a ratio of 70:30, and the output sets have almost the same ratio.
Sample dataset in a copy/run friendly format:
data <- structure(list(ID = 1:20, x = c(1L, 2L, 3L, 1L, 2L, 4L, 4L, 4L,
3L, 1L, 2L, 3L, 4L, 5L, 6L, 10L, 9L, 3L, 7L, 8L), y = c(0L, 0L,
1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L,
0L, 1L)), .Names = c("ID", "x", "y"), class = "data.frame", row.names = c(NA,
-20L))
It can be done using the caret package. Try the code below:
# Creating the dataset
data <- structure(list(ID = 1:20, x = c(1L, 2L, 3L, 1L, 2L, 4L, 4L, 4L,
3L, 1L, 2L, 3L, 4L, 5L, 6L, 10L, 9L, 3L, 7L, 8L), y = c(0L, 0L,
1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L,
0L, 1L)), .Names = c("ID", "x", "y"), class = "data.frame", row.names = c(NA, -20L))
# Solution
library(caret)
# createFolds assigns each row to one of k folds, stratified on data$y
k <- createFolds(data$y, k = 3, list = FALSE)
# check the class balance within each fold
addmargins(table(k, data$y))
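To get from the fold vector to the output sketched in the question, a minimal addition (the "sample" column name and "set" labels are just illustrative choices):
# attach the requested 'sample' column and split into the three sets
data$sample <- paste("set", k)
sets <- split(data[, c("ID", "x", "y")], data$sample)
sets$`set 1`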
