How can I visualise my factor variable by name, rather than as numbers, once the clustering is done? I want a more elegant data visualisation, not the one in this picture:
The plot I obtained from my k-means clustering does not seem to be as nice as the one in https://afit-r.github.io/kmeans_clustering. As you can see, the author manages to plot the cities as a factor, with the names visible.
Here is the data I have:
# data_piv: 9 rows, one per level of the Comorbidities factor, and 13 integer
# symptom columns (chills ... temperature). The object carries the classes
# grouped_df / tbl_df / tbl / data.frame and is grouped by Comorbidities, with
# one row per group.
data_piv <- structure(list(Comorbidities = structure(1:9, .Label = c("asthma",
"diabetes_type_one", "diabetes_type_two", "heart_disease", "hypertension",
"kidney_disease", "liver_disease", "lung_condition", "obesity"
), class = "factor"), chills = c(26L, 22L, 23L, 43L, 22L, 15L,
43L, 24L, 20L), cough = c(58L, 57L, 56L, 57L, 59L, 60L, 62L,
58L, 59L), diarrhoea = c(21L, 14L, 16L, 25L, 19L, 21L, 25L, 19L,
22L), fatigue = c(59L, 51L, 53L, 62L, 54L, 49L, 62L, 56L, 54L
), headache = c(44L, 30L, 34L, 44L, 39L, 33L, 48L, 43L, 42L),
loss_smell_taste = c(21L, 21L, 19L, 25L, 19L, 23L, 28L, 20L,
19L), muscle_ache = c(47L, 44L, 43L, 60L, 46L, 43L, 56L,
45L, 46L), nasal_congestion = c(34L, 25L, 32L, 36L, 33L,
33L, 46L, 38L, 34L), nausea_vomiting = c(11L, 10L, 9L, 18L,
7L, 12L, 28L, 13L, 9L), shortness_breath = c(61L, 36L, 32L,
53L, 35L, 50L, 44L, 50L, 37L), sore_throat = c(46L, 36L,
39L, 51L, 49L, 50L, 57L, 45L, 49L), sputum = c(47L, 34L,
41L, 50L, 39L, 41L, 47L, 46L, 43L), temperature = c(20L,
31L, 31L, 32L, 23L, 18L, 38L, 23L, 20L)), row.names = c(NA,
-9L), groups = structure(list(Comorbidities = structure(1:9, .Label = c("asthma",
"diabetes_type_one", "diabetes_type_two", "heart_disease", "hypertension",
"kidney_disease", "liver_disease", "lung_condition", "obesity"
), class = "factor"), .rows = structure(list(1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 9L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
Here is the k-means clustering I have applied:
# Standardise the 13 symptom columns (columns 2 to 14 of data_piv), turn the
# resulting matrix back into a data frame, then split the rows into 4 clusters
# using 25 random starts.
data_scaled <- scale(x = data_piv[2:14])
data_scaled <- as.data.frame(data_scaled)
km_res <- kmeans(x = data_scaled, centers = 4, nstart = 25)
And tried to plot it :
# Plot the k-means result; note that the full data_piv (including the
# Comorbidities column) is passed as `data` here, not the scaled symptom data.
fviz_cluster(km_res, data = data_piv)
Of course, the picture above was obtained with Comorbidities converted to an integer. As I said, I dislike it because it is not elegant. Instead of the numbers, I want the plot to show the actual names of the factor levels. Can someone help?
This could help:
library(factoextra)

# Convert to a plain data.frame: this drops the grouped_df / tbl_df classes so
# that custom row names can be set on the object.
data_piv <- as.data.frame(data_piv)

# Use the comorbidity names as row names, so the plot can show names rather
# than the row numbers 1-9.
data_piv$Comorbidities <- as.character(data_piv$Comorbidities)
rownames(data_piv) <- data_piv$Comorbidities

# Scale every column except the label column; selecting by name avoids the
# hard-coded positions 2:14 and keeps working if columns are added or moved.
symptom_cols <- setdiff(names(data_piv), "Comorbidities")
data_scaled <- as.data.frame(scale(data_piv[symptom_cols]))

# kmeans() starts from randomly chosen centres, so fix the seed to make the
# cluster assignment (and therefore the plot) reproducible between runs.
set.seed(123)
km_res <- kmeans(data_scaled, centers = 4, nstart = 25)

# Pass the same scaled data that was clustered; it carries the row names.
fviz_cluster(km_res, data = data_scaled)
This question already has answers here:
Create binary column (0/1) based on condition in another column
(2 answers)
Closed 3 years ago.
I am trying to create a new column in R (yes/no indicator) where if the data in X3 is >= 50 it would = 1(yes) or if <= 49 it would = 0(no).
I have tried various combinations of ifelse statements, but I just cannot get it to work. I need this step in order to construct my confidence interval.
dput (crime)
structure(list(Y = c(478L, 494L, 643L, 341L, 773L, 603L, 484L,
546L, 424L, 548L, 506L, 819L, 541L, 491L, 514L, 371L, 457L, 437L,
570L, 432L, 619L, 357L, 623L, 547L, 792L, 799L, 439L, 867L, 912L,
462L, 859L, 805L, 652L, 776L, 919L, 732L, 657L, 1419L, 989L,
821L, 1740L, 815L, 760L, 936L, 863L, 783L, 715L, 1504L, 1324L,
940L), X1 = c(184L, 213L, 347L, 565L, 327L, 260L, 325L, 102L,
38L, 226L, 137L, 369L, 109L, 809L, 29L, 245L, 118L, 148L, 387L,
98L, 608L, 218L, 254L, 697L, 827L, 693L, 448L, 942L, 1017L, 216L,
673L, 989L, 630L, 404L, 692L, 1517L, 879L, 631L, 1375L, 1139L,
3545L, 706L, 451L, 433L, 601L, 1024L, 457L, 1441L, 1022L, 1244L
), X2 = c(40L, 32L, 57L, 31L, 67L, 25L, 34L, 33L, 36L, 31L, 35L,
30L, 44L, 32L, 30L, 16L, 29L, 36L, 30L, 23L, 33L, 35L, 38L, 44L,
28L, 35L, 31L, 39L, 27L, 36L, 38L, 46L, 29L, 32L, 39L, 44L, 33L,
43L, 22L, 30L, 86L, 30L, 32L, 43L, 20L, 55L, 44L, 37L, 82L, 66L
), X3 = c(74L, 72L, 70L, 71L, 72L, 68L, 68L, 62L, 69L, 66L, 60L,
81L, 66L, 67L, 65L, 64L, 64L, 62L, 59L, 56L, 46L, 54L, 54L, 45L,
57L, 57L, 61L, 52L, 44L, 43L, 48L, 57L, 47L, 50L, 48L, 49L, 72L,
59L, 49L, 54L, 62L, 47L, 45L, 48L, 69L, 42L, 49L, 57L, 72L, 67L
), X4 = c(11L, 11L, 18L, 11L, 9L, 8L, 12L, 13L, 7L, 9L, 13L,
4L, 9L, 11L, 12L, 10L, 12L, 7L, 15L, 15L, 22L, 14L, 20L, 26L,
12L, 9L, 19L, 17L, 21L, 18L, 19L, 14L, 19L, 19L, 16L, 13L, 13L,
14L, 9L, 13L, 22L, 17L, 34L, 26L, 23L, 23L, 18L, 15L, 22L, 26L
), X5 = c(31L, 43L, 16L, 25L, 29L, 32L, 24L, 28L, 25L, 58L, 21L,
77L, 37L, 37L, 35L, 42L, 21L, 81L, 31L, 50L, 24L, 27L, 22L, 18L,
23L, 60L, 14L, 31L, 24L, 23L, 22L, 25L, 25L, 21L, 32L, 31L, 13L,
21L, 46L, 27L, 18L, 39L, 15L, 23L, 7L, 23L, 30L, 35L, 15L, 18L
), X6 = c(20L, 18L, 16L, 19L, 24L, 15L, 14L, 11L, 12L, 15L, 9L,
36L, 12L, 16L, 11L, 14L, 10L, 27L, 16L, 15L, 8L, 13L, 11L, 8L,
11L, 18L, 12L, 10L, 9L, 8L, 10L, 12L, 9L, 9L, 11L, 14L, 22L,
13L, 13L, 12L, 15L, 11L, 10L, 12L, 12L, 11L, 12L, 13L, 16L, 16L
), X7 = structure(list(Y = c(478L, 494L, 643L, 341L, 773L, 603L,
484L, 546L, 424L, 548L, 506L, 819L, 541L, 491L, 514L, 371L, 457L,
437L, 570L, 432L, 619L, 357L, 623L, 547L, 792L, 799L, 439L, 867L,
912L, 462L, 859L, 805L, 652L, 776L, 919L, 732L, 657L, 1419L,
989L, 821L, 1740L, 815L, 760L, 936L, 863L, 783L, 715L, 1504L,
1324L, 940L), X1 = c(184L, 213L, 347L, 565L, 327L, 260L, 325L,
102L, 38L, 226L, 137L, 369L, 109L, 809L, 29L, 245L, 118L, 148L,
387L, 98L, 608L, 218L, 254L, 697L, 827L, 693L, 448L, 942L, 1017L,
216L, 673L, 989L, 630L, 404L, 692L, 1517L, 879L, 631L, 1375L,
1139L, 3545L, 706L, 451L, 433L, 601L, 1024L, 457L, 1441L, 1022L,
1244L), X2 = c(40L, 32L, 57L, 31L, 67L, 25L, 34L, 33L, 36L, 31L,
35L, 30L, 44L, 32L, 30L, 16L, 29L, 36L, 30L, 23L, 33L, 35L, 38L,
44L, 28L, 35L, 31L, 39L, 27L, 36L, 38L, 46L, 29L, 32L, 39L, 44L,
33L, 43L, 22L, 30L, 86L, 30L, 32L, 43L, 20L, 55L, 44L, 37L, 82L,
66L), X3 = c(74L, 72L, 70L, 71L, 72L, 68L, 68L, 62L, 69L, 66L,
60L, 81L, 66L, 67L, 65L, 64L, 64L, 62L, 59L, 56L, 46L, 54L, 54L,
45L, 57L, 57L, 61L, 52L, 44L, 43L, 48L, 57L, 47L, 50L, 48L, 49L,
72L, 59L, 49L, 54L, 62L, 47L, 45L, 48L, 69L, 42L, 49L, 57L, 72L,
67L), X4 = c(11L, 11L, 18L, 11L, 9L, 8L, 12L, 13L, 7L, 9L, 13L,
4L, 9L, 11L, 12L, 10L, 12L, 7L, 15L, 15L, 22L, 14L, 20L, 26L,
12L, 9L, 19L, 17L, 21L, 18L, 19L, 14L, 19L, 19L, 16L, 13L, 13L,
14L, 9L, 13L, 22L, 17L, 34L, 26L, 23L, 23L, 18L, 15L, 22L, 26L
), X5 = c(31L, 43L, 16L, 25L, 29L, 32L, 24L, 28L, 25L, 58L, 21L,
77L, 37L, 37L, 35L, 42L, 21L, 81L, 31L, 50L, 24L, 27L, 22L, 18L,
23L, 60L, 14L, 31L, 24L, 23L, 22L, 25L, 25L, 21L, 32L, 31L, 13L,
21L, 46L, 27L, 18L, 39L, 15L, 23L, 7L, 23L, 30L, 35L, 15L, 18L
), X6 = c(20L, 18L, 16L, 19L, 24L, 15L, 14L, 11L, 12L, 15L, 9L,
36L, 12L, 16L, 11L, 14L, 10L, 27L, 16L, 15L, 8L, 13L, 11L, 8L,
11L, 18L, 12L, 10L, 9L, 8L, 10L, 12L, 9L, 9L, 11L, 14L, 22L,
13L, 13L, 12L, 15L, 11L, 10L, 12L, 12L, 11L, 12L, 13L, 16L, 16L
), X7 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0,
1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0)), row.names = c(NA, -50L), .Names = c("Y",
"X1", "X2", "X3", "X4", "X5", "X6", "X7"), class = "data.frame")), .Names = c("Y",
"X1", "X2", "X3", "X4", "X5", "X6", "X7"), row.names = c(NA,
-50L), class = "data.frame")
The dput output didn't work for me, but I managed to load the data from columns Y and X3 into a data frame (called data) and used dplyr::mutate to do the transformation with a straightforward ifelse condition.
library(dplyr)

# Add a 0/1 indicator column: 1 where X3 is at least 50, 0 otherwise.
data <- mutate(data, X3_cat = ifelse(X3 >= 50, 1, 0))
I want to run the bfastts function (https://www.rdocumentation.org/packages/bfast/versions/1.5.7/topics/bfastts) on a series of dates formatted as dd-mm-yyyy and a series of values to convert them into a time series. For this function the dates need to be of "POSIXlt" type. However, when running the code
dv<-as.POSIXct.POSIXlt(dates, tz="")
I am getting the error
Error in as.POSIXct.POSIXlt(dates, tz = "") : invalid 'x' argument
When running
dv<-strftime(as.POSIXct.POSIXlt(dates, tz="")
The returned list seems to be empty. When inputting my date list without conversion into the bfastts I'm getting the error
Error in as.POSIXlt.default(dates) :
do not know how to convert 'dates' to class “POSIXlt”
I am not used to coding in R, as I usually work in Python. I've tried googling all three errors but can't find a solution. Could anyone give me some pointers?
Edit:
dput(dates) gives me:
list(V1 = structure(c(19L, 57L, 31L, 59L, 33L, 34L, 4L, 7L, 40L,
12L, 50L, 56L, 3L, 37L, 6L, 39L, 46L, 17L, 43L, 55L, 30L, 2L,
36L, 38L, 11L, 21L, 49L, 24L, 27L, 10L, 45L, 14L, 1L, 18L, 47L,
54L, 29L, 32L, 8L, 42L, 9L, 16L, 44L, 48L, 23L, 51L, 52L, 26L,
35L, 5L, 15L, 20L, 22L, 25L, 53L, 28L, 58L, 41L, 13L), .Label = c("1-8-2016",
"11-5-2015", "11-7-2014", "12-10-2013", "12-2-2018", "12-8-2014",
"13-11-2013", "13-3-2017", "14-4-2017", "14-6-2016", "14-7-2015",
"15-12-2013", "15-2-2019", "16-7-2016", "17-4-2018", "17-6-2017",
"18-12-2014", "18-9-2016", "19-4-2013", "19-5-2018", "2-10-2015",
"20-6-2018", "20-8-2017", "21-12-2015", "22-7-2018", "23-10-2017",
"23-2-2016", "23-8-2018", "24-1-2017", "24-3-2015", "24-7-2013",
"25-2-2017", "25-8-2013", "26-9-2013", "27-1-2018", "27-5-2015",
"27-7-2014", "28-6-2015", "28-8-2014", "29-11-2013", "29-12-2018",
"29-3-2017", "3-1-2015", "3-7-2017", "30-6-2016", "31-10-2014",
"4-10-2016", "4-8-2017", "5-12-2015", "5-3-2014", "5-9-2017",
"7-10-2017", "7-8-2018", "8-1-2017", "8-3-2015", "8-5-2014",
"8-7-2013", "8-9-2018", "9-8-2013"), class = "factor"))
The problem is that dates is a list, whereas you actually want its first element (V1). You also have to specify that the dates you are providing are in dd-mm-yyyy format, which you can do with format = "%d-%m-%Y". Thus the following works:
# Extract the V1 element of the dates list and parse it as day-month-year.
as.POSIXlt(x = dates[["V1"]], tz = "", format = "%d-%m-%Y")
# [1] "2013-04-19 CEST" "2013-07-08 CEST" "2013-07-24 CEST" "2013-08-09 CEST"
# ...
Data
# dates: a list with a single element V1, a factor of 59 observations whose
# labels are date strings written as day-month-year (e.g. "19-4-2013").
dates <- list(V1 = structure(c(19L, 57L, 31L, 59L, 33L, 34L, 4L, 7L, 40L,
12L, 50L, 56L, 3L, 37L, 6L, 39L, 46L, 17L, 43L, 55L, 30L, 2L,
36L, 38L, 11L, 21L, 49L, 24L, 27L, 10L, 45L, 14L, 1L, 18L, 47L,
54L, 29L, 32L, 8L, 42L, 9L, 16L, 44L, 48L, 23L, 51L, 52L, 26L,
35L, 5L, 15L, 20L, 22L, 25L, 53L, 28L, 58L, 41L, 13L),
.Label = c("1-8-2016", "11-5-2015", "11-7-2014", "12-10-2013", "12-2-2018", "12-8-2014",
"13-11-2013", "13-3-2017", "14-4-2017", "14-6-2016", "14-7-2015",
"15-12-2013", "15-2-2019", "16-7-2016", "17-4-2018", "17-6-2017",
"18-12-2014", "18-9-2016", "19-4-2013", "19-5-2018", "2-10-2015",
"20-6-2018", "20-8-2017", "21-12-2015", "22-7-2018", "23-10-2017",
"23-2-2016", "23-8-2018", "24-1-2017", "24-3-2015", "24-7-2013",
"25-2-2017", "25-8-2013", "26-9-2013", "27-1-2018", "27-5-2015",
"27-7-2014", "28-6-2015", "28-8-2014", "29-11-2013", "29-12-2018",
"29-3-2017", "3-1-2015", "3-7-2017", "30-6-2016", "31-10-2014",
"4-10-2016", "4-8-2017", "5-12-2015", "5-3-2014", "5-9-2017",
"7-10-2017", "7-8-2018", "8-1-2017", "8-3-2015", "8-5-2014",
"8-7-2013", "8-9-2018", "9-8-2013"), class = "factor"))
I'm trying to add several time dependent covariates to a dataset for survival analysis using tmerge from the survival package. I mean to add each sequentially, as recommended in the vignette on the subject, but the output from the first addition does not work as I intended.
More specifically, I have one simple data.frame with the ids of the individual (organizations) and the number of days (age) until the organization ceases activities. The second data.frame has the ids and the number of days until the organization experiences a "transition" event. Not all organizations experience a transition, so not all organizations are present in the second data.frame.
In the first call to tmerge I format the first data.frame in the format the package uses. In the second I try to add a variable that counts the number of transitions an organization has experienced. For most organizations, the result is as I expect, but for a small number the result does not make sense and there is no obvious reason to me why it fails.
The data.frames are small, so I post them along with the code below.
# ages: 64 rows, one per organisation, with its id and age (the number of days
# until the organisation ceases activities).
# Note that the ids are NOT in ascending order: 43 follows 46, and ids 8, 19,
# 22, 23, 33 and 41 come last.
ages <- structure(list(id = c(1L, 2L, 5L, 6L, 9L, 10L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 20L, 21L, 24L, 26L, 27L, 28L, 29L, 30L, 31L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 42L, 45L, 46L, 43L, 48L, 49L, 50L, 51L, 52L, 54L, 55L, 57L, 58L, 59L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 68L, 69L, 70L, 71L, 72L, 73L, 74L, 75L, 8L, 19L, 22L, 23L, 33L, 41L), age = c(13668, 21550, 15249, 21550, 16045, 21550, 14976, 14976, 6574, 21550, 4463, 16927, 16927, 15706, 4567, 21306, 17235, 22158, 19692, 17632, 17597, 4383, 5811, 7704, 5063, 17351, 17015, 16801, 4383, 5080, 13185, 12604, 19784, 5310, 15369, 13239, 1638, 21323, 10914, 21262, 7297, 17214, 17508, 14199, 14062, 2227, 8434, 4593, 14429, 21323, 4782, 10813, 2667, 2853, 5709, 3140, 12237, 7882, 21550, 15553, 16466, 16621, 19534, 21842)), .Names = c("id", "age"), row.names = c(NA, 64L), class = "data.frame")
# First tmerge call: merge ages with itself to put it into tmerge's format,
# using age as the stop time for each id.
ages1 <- tmerge(ages, ages, id=id, tstop=age)
# transitions: 17 rows of id plus the number of days (difftime, units "days")
# until a transition event. Ids 2 and 43 each appear twice; organisations
# without a transition are absent.
transitions <- structure(list(id = c(2L, 2L, 6L, 8L, 10L, 19L, 22L, 23L, 24L, 31L, 33L, 41L, 43L, 43L, 52L, 55L, 66L), transition = structure(c(18993, 13668, 15249, 15706, 15887, 11609, 4023, 9316, 16193, 1461, 4584, 17824, 3713, 11261, 16818, 10670, 15479), class = "difftime", units = "days")), .Names = c("id", "transition"), row.names = c(3L, 4L, 7L, 8L, 11L, 20L, 25L, 27L, 28L, 35L, 38L, 47L, 49L, 51L, 59L, 61L, 73L), class = "data.frame")
# Second tmerge call: add a cumulative time-dependent covariate counting the
# transitions each organisation has experienced.
# NOTE(review): as written this produces spurious extra rows for a few ids
# (e.g. id 22). ages1 inherits the unsorted id order of ages; sorting it by id
# first, ages1 <- ages1[order(ages1$id), ], is the reported fix.
newdata <- tmerge(ages1, transitions, id=id, transition=cumtdc(transition))
As an example of one that fails, consider id=22. It experiences one transition after 4023 days. So, tmerge should create two new rows with id=22: one for 0 to 4023 and one for 4023 to 16466 (the age the organization 'dies'). Both of these are created, but so is a third unnecessary row for id=22 with a start of 0 and a stop of 16466.
There are 17 transitions spread across the 64 organizations and I count 3 errors like the one above and cannot figure out what sets these 3 apart from the remaining (successful) cases. I could easily fix these 3 but as other TVCs are added, the time cost of detecting and fixing such errors will rise exponentially. Any ideas about what I'm missing?
The problem is solved by simply sorting ages1 by id before the second tmerge call: ages1 <- ages1[order(ages1$id), ]. The package creator provided this solution.