I need to get the centroid for each cluster computed by the hierarchical method.
First, here is part of my dataset, to give a reproducible example:
> dput(DATABASE[1:20,])
structure(list(TYPE_PEAU = c(2L, 2L, 3L, 2L, 2L, 2L, 2L, 4L,
3L, 2L, 2L, 2L, 2L, 1L, 4L, 2L, 2L, 2L, 4L, 2L), SENSIBILITE = c(3L,
2L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 3L, 3L, 3L,
1L, 3L, 3L), IMPERFECTIONS = c(2L, 2L, 3L, 3L, 1L, 2L, 2L, 3L,
2L, 2L, 2L, 1L, 1L, 1L, 3L, 1L, 2L, 1L, 2L, 2L), BRILLANCE = c(3L,
3L, 1L, 3L, 1L, 3L, 3L, 1L, 1L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L,
3L, 3L, 3L), GRAIN_PEAU = c(3L, 3L, 3L, 1L, 3L, 3L, 3L, 2L, 3L,
2L, 1L, 3L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 3L), RIDES_VISAGE = c(3L,
1L, 1L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 2L, 1L, 3L, 1L, 3L, 3L, 3L,
3L, 3L, 3L), MAINS = c(2L, 2L, 3L, 3L, 1L, 1L, 1L, 3L, 3L, 3L,
3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 2L), PEAU_CORPS = c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 3L, 2L, 3L, 2L, 3L, 2L,
2L, 1L), INTERET_ALIM_NATURELLE = c(1L, 1L, 3L, 1L, 1L, 1L, 1L,
1L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 1L, 1L, 1L), INTERET_ORIGINE_GEO = c(1L,
1L, 2L, 3L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 3L, 1L, 1L, 3L, 3L, 1L,
1L, 1L, 1L), INTERET_VACANCES = c(1L, 2L, 3L, 1L, 1L, 1L, 1L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 3L, 1L, 2L), INTERET_ENVIRONNEMENT = c(1L,
3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), AGE_INTERVAL = c(3L, 3L, 4L, 2L, 2L, 3L, 3L, 4L,
4L, 3L, 4L, 2L, 1L, 3L, 3L, 2L, 2L, 2L, 2L, 3L), ATTENTE_BEAUTE_1 = c(1L,
6L, 4L, 4L, 6L, 6L, 3L, 1L, 1L, 4L, 3L, 6L, 2L, 5L, 5L, 6L, 7L,
4L, 6L, 3L), ATTENTE_BEAUTE_2 = c(2L, 2L, 3L, 6L, 4L, 1L, 4L,
7L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 2L, 6L, 2L, 2L, 2L), MILIEU_VIE = c(1L,
1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L), PROFIL_SELECTIONNE = c(1L, 32L, 21L, 23L, 34L, 31L,
15L, 6L, 1L, 20L, 14L, 34L, 9L, 28L, 28L, 32L, 42L, 20L, 32L,
14L), NOMBRE_ACHAT = c(14L, 6L, 3L, 9L, 8L, 13L, 10L, 14L, 4L,
3L, 10L, 8L, 12L, 3L, 7L, 6L, 4L, 13L, 3L, 3L), NOMBRE_CADEAU = c(2L,
1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
2L, 1L, 1L)), .Names = c("TYPE_PEAU", "SENSIBILITE", "IMPERFECTIONS",
"BRILLANCE", "GRAIN_PEAU", "RIDES_VISAGE", "MAINS", "PEAU_CORPS",
"INTERET_ALIM_NATURELLE", "INTERET_ORIGINE_GEO", "INTERET_VACANCES",
"INTERET_ENVIRONNEMENT", "AGE_INTERVAL", "ATTENTE_BEAUTE_1",
"ATTENTE_BEAUTE_2", "MILIEU_VIE", "PROFIL_SELECTIONNE", "NOMBRE_ACHAT",
"NOMBRE_CADEAU"), row.names = c(NA, 20L), class = "data.frame")
Then I used the following:
# Pairwise distances between the rows of DATABASE
mydist <- dist(x = DATABASE)
# Hierarchical clustering on those distances, cut into 3 groups
clusters <- cutree(tree = hclust(d = mydist), k = 3)
> clusters
[1] 1 2 3 3 2 2 3 1 1 3 1 2 1 3 2 2 2 3 2 1 3 2 1 1 1 1 2 1 2 1 3 3 2 3 2 2 1 1 1 1 3 2 1 1 3 2 1 2 2 1 2 2 3 1 3 1 3
[58] 1 3 2 2 1 1 2 1 2 2 2 3 2 3 1 2 2 1 1 3 3 2 1 2 2 1 2 3 3 3 1 2 1 2 1 1 1 1 1 3 2 2 2 1 1 3 2 2 1 1 1 2 1 1 1 1 3
[115] 1 2 2 1 2 3 1 1 2 3 1 1 1 2 1 3 1 2 3 2 2 1 2 1 1 3 3 2 1 2 2 1 1 1 1 2 1 2 2 3 3 1 1 3 1 3 3 3 3 2 3 1 2 3 3 3 1
[172] 1 2 2 1 1 2 1 2 2 1 3 3 1 2 2 1 1 1 2 2 1 1 1 1 3 2 3 3 1 1 2 2 2 3 1 1 1 2 2 1 2 1 3 1 2 1 3 3 1 1 1 1 2 1 2 2 2
[229] 3 3 1 1 2 1 3 2 2 2 1 1 2 1 3 1 2 1 3 1 3 1 3 1 1 1 1 2 2 1 3 3 3 2 1 2 3 2 2 1 1 3 1 2 3 1 1 2 1 1 1 1 2 2 2 3 2
[286] 1 2 1 1 2 1 2 1 2 2 1 2 3 1 3 1 3 1 1 3 1 1 2 2 1 3 3 2 2 1 2 1 1 2 2 1 3 3 2 2 1 3 3 3 1 1 1 1 3 3 2 1 3 1 2 1 2
[343] 1 2 3 3 2 3 1 3 2 3 3 1 2 2 1 2 2 3 2 1 3 2 2 1 2 3 2 3 3 3 2 2 3 2 1 1 1 2 3 2 2 1 2 2 2 1 2 1 1 1 3 1 2 2 1 1 2
[400] 1 1 1 1 1 2 2 2
Please note that the objective is to compute the inter- and intra-cluster inertia.
So I need to compute the distance between each centroid and all the points that are included in its cluster,
and then use these distances to compute the inter and intra inertia.
You can define the centroids as the means of variables, per cluster, in DATABASE.
# Hierarchical clustering on the pairwise row distances, cut into 3 groups
mydist <- dist(x = DATABASE)
clusters <- cutree(tree = hclust(d = mydist), k = 3)
## Centroids: mean of every variable within each cluster
## (one row per cluster, one column per variable)
sapply(DATABASE, function(column) tapply(column, clusters, mean))
## or
# Store each observation's cluster label next to its variables
DATABASE$cluster <- clusters
# Centroids: one row per cluster holding the mean of every other column.
# The function is passed directly to summarise_all(); the funs() wrapper
# has been deprecated since dplyr 0.8.0 and is not needed here.
library(dplyr)
centroids <- DATABASE %>%
  group_by(cluster) %>%
  summarise_all(mean)
## Distance between centroids (first column is the cluster label, so drop it)
dist(centroids[, -1], method = "euclidean")
## Example for distance in cluster 1 (distance between all observations of cluster 1)
DATABASE %>%
  filter(cluster == 1) %>%
  select(-cluster) %>%
  dist()
you might want to specify your k value into 1:3 not just 3
here is the code and how to find the center (mean)
I have some data; an example is shown below:
# Toy data: ids 1..5 (three rows each) and one letter label per id
a <- rep(seq_len(5), each = 3)
b <- rep(c("a", "b", "c", "a", "c"), each = 3)
df <- data.frame(a = a, b = b)
I want to select all the rows that have the value "a" in column b.
I tried to do it with
# NOTE: `a` here is the vector created by `a = rep(1:5, each=3)`, i.e. the same
# values as df$a, so `df$a %in% a` is TRUE for every row and all rows are returned.
df[df$a %in% a,]
Can someone give me an idea how to get them out?
# 24 rows: the 12 labels B02..B07, C02..C07 listed once for V1 = 1 and once
# for V1 = 2. V2 is a factor whose levels follow that same label order.
df2 <- data.frame(
  V1 = rep(1:2, each = 12L),
  V2 = factor(
    rep(1:12, times = 2L),
    labels = paste0(rep(c("B", "C"), each = 6L), "0", 2:7)
  )
)
I want to select specific rows whose V2 value starts with B: not all of them, just B02, B03, B04 and B05:
1 B02
1 B03
1 B04
1 B05
2 B02
2 B03
2 B04
2 B05
I also want to have the original data with those rows removed.
We need to check the 'b' column
# Keep every column, but only the rows whose b value is "a"
subset(df, b %in% "a")
For the updated question with 'df2', we can use paste to create the strings 'B02' to 'B05' and use %in% to subset
# sprintf("B%02d", 2:5) builds the labels "B02", "B03", "B04", "B05"
subset(df2, V2 %in% sprintf("B%02d", 2:5))
Or another option is grep
# Logical mask: TRUE where V2 is exactly "B0" followed by one digit from 2 to 5
df2[grepl("^B0[2-5]$", df2$V2), ]
> df
a b
1 1 a
2 1 a
3 1 a
4 2 b
5 2 b
6 2 b
7 3 c
8 3 c
9 3 c
10 4 a
11 4 a
12 4 a
13 5 c
14 5 c
15 5 c
This basically says:
Keep all columns of df, but only the rows whose value in column b is equal to 'a'
> rows_with_a<-df[df$b=='a', ]
> rows_with_a
a b
1 1 a
2 1 a
3 1 a
10 4 a
11 4 a
12 4 a
How would you go about creating the graph below in R? I want to show the duration of different treatments for different patients.
Mock data here:
Start Day Stop Day
Patient 1 Drug 1 1 3
Drug 2 2 5
Drug 3 3 8
Patient 2 Drug 1 2 4
Drug 2 2 5
Drug 3 1 6
Patient 3 Drug 1 4 7
Drug 2 3 8
Drug 3 5 6
Your graph can be generated using geom_segment in the ggplot2 package:
# Mock data: three patients, each given the same three drugs, with the first
# and last day of every treatment
df <- data.frame(
  Patient = factor(rep(c("Patient1", "Patient2", "Patient3"), each = 3L)),
  Drug = factor(rep(c("Drug1", "Drug2", "Drug3"), times = 3L)),
  StartDay = c(1L, 2L, 3L, 2L, 2L, 1L, 4L, 3L, 5L),
  StopDay = c(3L, 5L, 8L, 4L, 5L, 6L, 7L, 8L, 6L)
)
# Reverse the level order of Drug (Drug3, Drug2, Drug1)
df$Drug <- factor(df$Drug, levels = rev(levels(df$Drug)))
library(ggplot2)
# One thick horizontal segment per treatment, running from StartDay to StopDay
# at the height of its drug, coloured by drug, with one facet row per patient
ggplot(df, aes(colour = Drug)) +
  geom_segment(
    aes(x = StartDay, xend = StopDay, y = Drug, yend = Drug),
    lwd = 12
  ) +
  facet_grid(Patient ~ .) +
  labs(x = "Days")
This question already has answers here:
Subset dataframe by multiple logical conditions of rows to remove
(8 answers)
Closed 6 years ago.
I have a dataset that contains 10 "houses" with energy production for every minute of the day. Like so:
HouseID Time KwH
1 1 X
2 1 X
3 1 X
4 1 X
5 1 X
6 1 X
7 1 X
8 1 X
9 1 X
10 1 X
1 2 X
2 2 X
3 2 X
4 2 X
5 2 X
6 2 X
7 2 X
8 2 X
9 2 X
10 2 X
I would like to delete the rows with houseIDs 6 until 10 so that I would be left with only the observations of houseID 1,2,3,4 and 5.
You can try
# Drop every row whose HouseID is one of 6..10; all columns are kept
newdf <- subset(df1, !(HouseID %in% 6:10))
# HouseID Time KwH
#1 1 1 X
#2 2 1 X
#3 3 1 X
#4 4 1 X
#5 5 1 X
#11 1 2 X
#12 2 2 X
#13 3 2 X
#14 4 2 X
#15 5 2 X
data
# Example data: houses 1..10 observed at Time 1 and again at Time 2;
# KwH is a one-level factor holding the placeholder "X"
df1 <- data.frame(
  HouseID = rep(1:10, times = 2L),
  Time = rep(1:2, each = 10L),
  KwH = factor(rep("X", 20L))
)
Assuming df is the name of your data frame then just use the following:
# Keep only the rows of houses 1 to 5.
# `%in%` tests each HouseID for membership in 1:5. `HouseID == 1:5` would
# instead recycle 1:5 along the column and compare position by position,
# which selects the intended rows only if the rows happen to be ordered
# 1, 2, ..., 10 within every block.
df2 <- subset(df, HouseID %in% 1:5)
I have two data frames
# df1: 20 rows in groups (g1, g2); val1 numbers the rows 1..20 and val2
# counts 1, 2, ... within each (g1, g2) group
df1 <- data.frame(
  g1 = factor(rep(c("A", "B"), each = 10L)),
  g2 = factor(rep(rep(c("a", "b", "c"), times = 2L),
                  times = c(4L, 3L, 3L, 3L, 4L, 3L))),
  val1 = 1:20,
  val2 = sequence(c(4L, 3L, 3L, 3L, 4L, 3L))
)
# df2: 17 rows with the same grouping columns and the target values val3
df2 <- data.frame(
  g1 = factor(rep(c("A", "B"), times = c(8L, 9L))),
  g2 = factor(rep(rep(c("a", "b", "c"), times = 2L),
                  times = c(3L, 3L, 3L, 3L, 3L, 2L))),
  val3 = c(5L, 6L, 7L, 3L, 4L, 5L, 2L, 3L, 4L,
           8L, 9L, 10L, 4L, 5L, 6L, 5L, 6L)
)
> df1
g1 g2 val1 val2
1 A a 1 1
2 A a 2 2
3 A a 3 3
4 A a 4 4
5 A b 5 1
6 A b 6 2
7 A b 7 3
8 A c 8 1
9 A c 9 2
10 A c 10 3
11 B a 11 1
12 B a 12 2
13 B a 13 3
14 B b 14 1
15 B b 15 2
16 B b 16 3
17 B b 17 4
18 B c 18 1
19 B c 19 2
20 B c 20 3
> df2
g1 g2 val3
1 A a 5
2 A a 6
3 A a 7
4 A b 3
5 A b 4
6 A b 5
7 A c 2
8 A c 3
9 B c 4
10 B a 8
11 B a 9
12 B a 10
13 B b 4
14 B b 5
15 B b 6
16 B c 5
17 B c 6
My aim is to rescale df1$val2 to take values between the min and max values of df2$val3 within the respective groups.
I tried this:
library(dplyr)
# Attempt at a per-group rescale of val2 onto the range of val3.
# min(val2) / max(val2) are evaluated inside the (g1, g2) groups of df1, but
# max(df2$val3) / min(df2$val3) read the whole val3 column of df2, so every
# group is rescaled to the same overall range rather than to its own.
# NOTE(review): `%.%` appears to be the chaining operator of early dplyr
# releases (the role `%>%` plays today) — confirm against the installed version.
df1 <- df1 %.% group_by(g1, g2) %.% mutate(rescaled=(max(df2$val3)-min(df2$val3))*(val2-min(val2))/(max(val2)-min(val2))+min(df2$val3))
But the output is different from what I expect. The problem is that I can neither cbind nor merge the two data frames due to their different lengths. Any hints?
Does this work?
library(plyr)
# Largest and smallest val3 within every (g1, g2) group of df2
df3 <- ddply(
  df2, c("g1", "g2"), summarize,
  max.val = max(val3),
  min.val = min(val3)
)
# Left join on the group keys: every row of df1 is kept and gains the
# max.val / min.val of its own group
merged.df <- merge(x = df1, y = df3, by = c("g1", "g2"), all.x = TRUE)
## Now rescale merged.df$val2 as desired