Get sum of unique rows in table function in R

Get sum of unique rows in table function in R - r

Suppose I have data which looks like this
Id Name Price sales Profit Month Category Mode Supplier
1 A 2 5 8 1 X K John
1 A 2 6 9 2 X K John
1 A 2 5 8 3 X K John
2 B 2 4 6 1 X L Sam
2 B 2 3 4 2 X L Sam
2 B 2 5 7 3 X L Sam
3 C 2 5 11 1 X M John
3 C 2 5 11 2 X L John
3 C 2 5 11 3 X K John
4 D 2 8 10 1 Y M John
4 D 2 8 10 2 Y K John
4 D 2 5 7 3 Y K John
5 E 2 5 9 1 Y M Sam
5 E 2 5 9 2 Y L Sam
5 E 2 5 9 3 Y M Sam
6 F 2 4 7 1 Z M Kyle
6 F 2 5 8 2 Z L Kyle
6 F 2 5 8 3 Z M Kyle
if I apply table function, it will just combines are the rows and result will be
K L M
X 4 4 1
Y 2 1 3
Z 0 1 2
Now what if I want not the sum of all rows but only sum of those rows with Unique Id
so it looks like
K L M
X 2 2 1
Y 1 1 2
Z 0 1 1
Thanks

If df is your data.frame:
# Subset original data.frame to keep columns of interest
df1 <- df[,c("Id", "Category", "Mode")]
# Remove duplicated rows
df1 <- df1[!duplicated(df1),]
# Create table
with(df1, table(Category, Mode))
# Mode
# Category K L M
# X 2 2 1
# Y 1 1 2
# Z 0 1 1
Or in one line using unique
table(unique(df[c("Id", "Category", "Mode")])[-1])
df <- structure(list(Id = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L), Name = structure(c(1L, 1L, 1L,
2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L), .Label = c("A",
"B", "C", "D", "E", "F"), class = "factor"), Price = c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), sales = c(5L, 6L, 5L, 4L, 3L, 5L, 5L, 5L, 5L, 8L, 8L, 5L,
5L, 5L, 5L, 4L, 5L, 5L), Profit = c(8L, 9L, 8L, 6L, 4L, 7L, 11L,
11L, 11L, 10L, 10L, 7L, 9L, 9L, 9L, 7L, 8L, 8L), Month = c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L), Category = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("X", "Y", "Z"
), class = "factor"), Mode = structure(c(1L, 1L, 1L, 2L, 2L,
2L, 3L, 2L, 1L, 3L, 1L, 1L, 3L, 2L, 3L, 3L, 2L, 3L), .Label = c("K",
"L", "M"), class = "factor"), Supplier = structure(c(1L, 1L,
1L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L, 2L
), .Label = c("John", "Kyle", "Sam"), class = "factor")), .Names = c("Id",
"Name", "Price", "sales", "Profit", "Month", "Category", "Mode",
"Supplier"), class = "data.frame", row.names = c(NA, -18L))

We can try
library(data.table)
dcast(unique(setDT(df1[c('Category', 'Mode', 'Id')])),
Category~Mode, value.var='Id', length)
# Category K L M
#1: X 2 2 1
#2: Y 1 1 2
#3: Z 0 1 1
Or with dplyr
library(dplyr)
df1 %>%
distinct(Id, Category, Mode) %>%
group_by(Category, Mode) %>%
tally() %>%
spread(Mode, n, fill=0)
# Category K L M
# (chr) (dbl) (dbl) (dbl)
#1 X 2 2 1
#2 Y 1 1 2
#3 Z 0 1 1
Or as #David Arenburg suggested, a variant of the above is
df1 %>%
distinct(Id, Category, Mode) %>%
select(Category, Mode) %>%
table()

Related

Find the centroid of Clusters generated by the hclust function

I need to get the centroid for each cluster computed by the hierarchical method.
First, this is a part of my dataset to get reproductible example:
> dput(DATABASE[1:20,])
structure(list(TYPE_PEAU = c(2L, 2L, 3L, 2L, 2L, 2L, 2L, 4L,
3L, 2L, 2L, 2L, 2L, 1L, 4L, 2L, 2L, 2L, 4L, 2L), SENSIBILITE = c(3L,
2L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 3L, 3L, 3L,
1L, 3L, 3L), IMPERFECTIONS = c(2L, 2L, 3L, 3L, 1L, 2L, 2L, 3L,
2L, 2L, 2L, 1L, 1L, 1L, 3L, 1L, 2L, 1L, 2L, 2L), BRILLANCE = c(3L,
3L, 1L, 3L, 1L, 3L, 3L, 1L, 1L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L,
3L, 3L, 3L), GRAIN_PEAU = c(3L, 3L, 3L, 1L, 3L, 3L, 3L, 2L, 3L,
2L, 1L, 3L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 3L), RIDES_VISAGE = c(3L,
1L, 1L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 2L, 1L, 3L, 1L, 3L, 3L, 3L,
3L, 3L, 3L), MAINS = c(2L, 2L, 3L, 3L, 1L, 1L, 1L, 3L, 3L, 3L,
3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 2L), PEAU_CORPS = c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 3L, 2L, 3L, 2L, 3L, 2L,
2L, 1L), INTERET_ALIM_NATURELLE = c(1L, 1L, 3L, 1L, 1L, 1L, 1L,
1L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 1L, 1L, 1L), INTERET_ORIGINE_GEO = c(1L,
1L, 2L, 3L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 3L, 1L, 1L, 3L, 3L, 1L,
1L, 1L, 1L), INTERET_VACANCES = c(1L, 2L, 3L, 1L, 1L, 1L, 1L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 3L, 1L, 2L), INTERET_ENVIRONNEMENT = c(1L,
3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), AGE_INTERVAL = c(3L, 3L, 4L, 2L, 2L, 3L, 3L, 4L,
4L, 3L, 4L, 2L, 1L, 3L, 3L, 2L, 2L, 2L, 2L, 3L), ATTENTE_BEAUTE_1 = c(1L,
6L, 4L, 4L, 6L, 6L, 3L, 1L, 1L, 4L, 3L, 6L, 2L, 5L, 5L, 6L, 7L,
4L, 6L, 3L), ATTENTE_BEAUTE_2 = c(2L, 2L, 3L, 6L, 4L, 1L, 4L,
7L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 2L, 6L, 2L, 2L, 2L), MILIEU_VIE = c(1L,
1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L), PROFIL_SELECTIONNE = c(1L, 32L, 21L, 23L, 34L, 31L,
15L, 6L, 1L, 20L, 14L, 34L, 9L, 28L, 28L, 32L, 42L, 20L, 32L,
14L), NOMBRE_ACHAT = c(14L, 6L, 3L, 9L, 8L, 13L, 10L, 14L, 4L,
3L, 10L, 8L, 12L, 3L, 7L, 6L, 4L, 13L, 3L, 3L), NOMBRE_CADEAU = c(2L,
1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
2L, 1L, 1L)), .Names = c("TYPE_PEAU", "SENSIBILITE", "IMPERFECTIONS",
"BRILLANCE", "GRAIN_PEAU", "RIDES_VISAGE", "MAINS", "PEAU_CORPS",
"INTERET_ALIM_NATURELLE", "INTERET_ORIGINE_GEO", "INTERET_VACANCES",
"INTERET_ENVIRONNEMENT", "AGE_INTERVAL", "ATTENTE_BEAUTE_1",
"ATTENTE_BEAUTE_2", "MILIEU_VIE", "PROFIL_SELECTIONNE", "NOMBRE_ACHAT",
"NOMBRE_CADEAU"), row.names = c(NA, 20L), class = "data.frame")
then I used as follow :
mydist = dist(DATABASE)
clusters = cutree(hclust(mydist),k=3)
> clusters
[1] 1 2 3 3 2 2 3 1 1 3 1 2 1 3 2 2 2 3 2 1 3 2 1 1 1 1 2 1 2 1 3 3 2 3 2 2 1 1 1 1 3 2 1 1 3 2 1 2 2 1 2 2 3 1 3 1 3
[58] 1 3 2 2 1 1 2 1 2 2 2 3 2 3 1 2 2 1 1 3 3 2 1 2 2 1 2 3 3 3 1 2 1 2 1 1 1 1 1 3 2 2 2 1 1 3 2 2 1 1 1 2 1 1 1 1 3
[115] 1 2 2 1 2 3 1 1 2 3 1 1 1 2 1 3 1 2 3 2 2 1 2 1 1 3 3 2 1 2 2 1 1 1 1 2 1 2 2 3 3 1 1 3 1 3 3 3 3 2 3 1 2 3 3 3 1
[172] 1 2 2 1 1 2 1 2 2 1 3 3 1 2 2 1 1 1 2 2 1 1 1 1 3 2 3 3 1 1 2 2 2 3 1 1 1 2 2 1 2 1 3 1 2 1 3 3 1 1 1 1 2 1 2 2 2
[229] 3 3 1 1 2 1 3 2 2 2 1 1 2 1 3 1 2 1 3 1 3 1 3 1 1 1 1 2 2 1 3 3 3 2 1 2 3 2 2 1 1 3 1 2 3 1 1 2 1 1 1 1 2 2 2 3 2
[286] 1 2 1 1 2 1 2 1 2 2 1 2 3 1 3 1 3 1 1 3 1 1 2 2 1 3 3 2 2 1 2 1 1 2 2 1 3 3 2 2 1 3 3 3 1 1 1 1 3 3 2 1 3 1 2 1 2
[343] 1 2 3 3 2 3 1 3 2 3 3 1 2 2 1 2 2 3 2 1 3 2 2 1 2 3 2 3 3 3 2 2 3 2 1 1 1 2 3 2 2 1 2 2 2 1 2 1 1 1 3 1 2 2 1 1 2
[400] 1 1 1 1 1 2 2 2
Please Note that the objectif is to compute the inter and intra inertia:
So i need to compute the distance between each centroid and all points that are included in its cluster.
So I need to compute the distance between each centroid and its concerned cluster
to used then for computing the inter and intra inertia.

You can define the centroids as the means of variables, per cluster, in DATABASE.
mydist <- dist(DATABASE)
clusters <- cutree(hclust(mydist), k = 3)
## Col means in each cluster
apply(DATABASE, 2, function (x) tapply(x, clusters, mean))
## or
DATABASE$cluster <- clusters # add cluster to DATABASE
# Now take means per group
library(dplyr)
centroids <- DATABASE %>%
group_by(cluster) %>%
summarise_all(funs(mean))
## Distance between centroids
dist(centroids[, -1], method = "euclidean")
## Example for distance in cluster 1 (distance between all observations of cluster 1)
DATABASE %>%
filter(cluster == 1) %>%
select(-cluster) %>%
dist()

you might want to specify your k value into 1:3 not just 3
here is the code and how to find the center (mean)

how to select specific row by a column

I have a data, as an example I show below
a = rep(1:5, each=3)
b = rep(c("a","b","c","a","c"), each = 3)
df = data.frame(a,b)
I want to select all the rows that have the "a"
I tried to do it with
df[df$a %in% a,]
Can someone give me an idea how to get them out?
df2<- structure(list(V1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), V2 = structure(c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L), .Label = c("B02", "B03",
"B04", "B05", "B06", "B07", "C02", "C03", "C04", "C05", "C06",
"C07"), class = "factor")), .Names = c("V1", "V2"), class = "data.frame", row.names = c(NA,
-24L))
I want to select specific rows that start with B but not all of them and just 02, 03, 04, 05
1 B02
1 B03
1 B04
1 B05
2 B02
2 B03
2 B04
2 B05
I also want to have the original data without them too

We need to check the 'b' column
df[df$b %in% 'a',]
For the updated question with 'df2', we can use paste to create the strings 'B02' to 'B05' and use %in% to subset
df2[df2$V2 %in% paste0("B0", 2:5),]
Or another option is grep
df2[grep("^B0[2-5]$", df2$V2),]

> df
a b
1 1 a
2 1 a
3 1 a
4 2 b
5 2 b
6 2 b
7 3 c
8 3 c
9 3 c
10 4 a
11 4 a
12 4 a
13 5 c
14 5 c
15 5 c
This basically says:
For all columns in df choose rows that have value equal to a
> rows_with_a<-df[df$b=='a', ]
> rows_with_a
a b
1 1 a
2 1 a
3 1 a
10 4 a
11 4 a
12 4 a

ggplot2 - how to create a clustered timeline?

How would you go about creating the graph below in R? I want to show the duration of different treatments for different patients.
Mock data here:
Start Day Stop Day
Patient 1 Drug 1 1 3
Drug 2 2 5
Drug 3 3 8
Patient 2 Drug 1 2 4
Drug 2 2 5
Drug 3 1 6
Patient 3 Drug 1 4 7
Drug 2 3 8
Drug 3 5 6

Your graph can be generated using geom_segment in the ggplot2 package:
df <- structure(list(Patient = structure(c(1L, 1L, 1L, 2L, 2L, 2L,
3L, 3L, 3L), .Label = c("Patient1", "Patient2", "Patient3"), class = "factor"),
Drug = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("Drug1",
"Drug2", "Drug3"), class = "factor"), StartDay = c(1L, 2L,
3L, 2L, 2L, 1L, 4L, 3L, 5L), StopDay = c(3L, 5L, 8L, 4L,
5L, 6L, 7L, 8L, 6L)), .Names = c("Patient", "Drug", "StartDay",
"StopDay"), class = "data.frame", row.names = c(NA, -9L))
df$Drug <- factor(df$Drug, levels(df$Drug)[c(3,2,1)])
library(ggplot2)
ggplot(data=df, aes(color=Drug))+
geom_segment(aes(x=StartDay, xend=StopDay, y=Drug, yend=Drug),lwd=12)+
facet_grid(Patient~.)+xlab("Days")

How to select every 5 other observations in R? [duplicate]

This question already has answers here:
Subset dataframe by multiple logical conditions of rows to remove
(8 answers)
Closed 6 years ago.
I have a dataset that contains 10 "houses" with energy production for every minute of the day. Like so:
HouseID Time KwH
1 1 X
2 1 X
3 1 X
4 1 X
5 1 X
6 1 X
7 1 X
8 1 X
9 1 X
10 1 X
1 2 X
2 2 X
3 2 X
4 2 X
5 2 X
6 2 X
7 2 X
8 2 X
9 2 X
10 2 X
I would like to delete the rows with houseIDs 6 until 10 so that I would be left with only the observations of houseID 1,2,3,4 and 5.

You can try
newdf <- df1[!df1$HouseID %in% 6:10,]
# HouseID Time KwH
#1 1 1 X
#2 2 1 X
#3 3 1 X
#4 4 1 X
#5 5 1 X
#11 1 2 X
#12 2 2 X
#13 3 2 X
#14 4 2 X
#15 5 2 X
data
df1 <- structure(list(HouseID = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), Time = c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), KwH = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "X",
class = "factor")), .Names = c("HouseID", "Time", "KwH"),
class = "data.frame", row.names = c(NA, -20L))

Assuming df is the name of your data frame then just use the following:
df2 <- subset(df, df$HouseID==1:5)

Rescaling by group across data frames

I have two data frames
df1 <- structure(list(g1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor"), g2 = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("a", "b", "c"), class = "factor"), val1 = 1:20, val2 = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 4L, 1L, 2L, 3L)), .Names = c("g1", "g2", "val1", "val2"), row.names = c(NA, -20L), class = "data.frame")
df2 <- structure(list(g1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor"), g2 = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L), .Label = c("a", "b", "c"), class = "factor"), val3 = c(5L, 6L, 7L, 3L, 4L, 5L, 2L, 3L, 4L, 8L, 9L, 10L, 4L, 5L, 6L, 5L, 6L)), .Names = c("g1", "g2", "val3"), row.names = c(NA, -17L), class = "data.frame")
> df1
g1 g2 val1 val2
1 A a 1 1
2 A a 2 2
3 A a 3 3
4 A a 4 4
5 A b 5 1
6 A b 6 2
7 A b 7 3
8 A c 8 1
9 A c 9 2
10 A c 10 3
11 B a 11 1
12 B a 12 2
13 B a 13 3
14 B b 14 1
15 B b 15 2
16 B b 16 3
17 B b 17 4
18 B c 18 1
19 B c 19 2
20 B c 20 3
> df2
g1 g2 val3
1 A a 5
2 A a 6
3 A a 7
4 A b 3
5 A b 4
6 A b 5
7 A c 2
8 A c 3
9 B c 4
10 B a 8
11 B a 9
12 B a 10
13 B b 4
14 B b 5
15 B b 6
16 B c 5
17 B c 6
My aim is to rescale df1$val2 to take values between the min and max values of df2$val3 within the respective groups.
I tried this:
library(dplyr)
df1 <- df1 %.% group_by(g1, g2) %.% mutate(rescaled=(max(df2$val3)-min(df2$val3))*(val2-min(val2))/(max(val2)-min(val2))+min(df2$val3))
But the output is different from what I expect. The problem is that I can neither cbind nor merge the two data frames due to their different lengths. Any hints?

Does this work?
library(plyr)
df3 <- ddply(df2, .(g1, g2), summarize, max.val=max(val3), min.val=min(val3))
merged.df <- merge(df1, df3, by=c("g1", "g2"), all.x=TRUE)
## Now rescale merged.df$val2 as desired

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Get sum of unique rows in table function in R - r

Related

Find the centroid of Clusters generated by the hclust function

how to select specific row by a column

ggplot2 - how to create a clustered timeline?

How to select every 5 other observations in R? [duplicate]

Rescaling by group across data frames

Categories

Resources