Find the centroid of Clusters generated by the hclust function - r

I need to get the centroid for each cluster computed by the hierarchical method.
First, this is a part of my dataset to get reproductible example:
> dput(DATABASE[1:20,])
structure(list(TYPE_PEAU = c(2L, 2L, 3L, 2L, 2L, 2L, 2L, 4L,
3L, 2L, 2L, 2L, 2L, 1L, 4L, 2L, 2L, 2L, 4L, 2L), SENSIBILITE = c(3L,
2L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 3L, 3L, 3L,
1L, 3L, 3L), IMPERFECTIONS = c(2L, 2L, 3L, 3L, 1L, 2L, 2L, 3L,
2L, 2L, 2L, 1L, 1L, 1L, 3L, 1L, 2L, 1L, 2L, 2L), BRILLANCE = c(3L,
3L, 1L, 3L, 1L, 3L, 3L, 1L, 1L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L,
3L, 3L, 3L), GRAIN_PEAU = c(3L, 3L, 3L, 1L, 3L, 3L, 3L, 2L, 3L,
2L, 1L, 3L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 3L), RIDES_VISAGE = c(3L,
1L, 1L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 2L, 1L, 3L, 1L, 3L, 3L, 3L,
3L, 3L, 3L), MAINS = c(2L, 2L, 3L, 3L, 1L, 1L, 1L, 3L, 3L, 3L,
3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 2L), PEAU_CORPS = c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 3L, 2L, 3L, 2L, 3L, 2L,
2L, 1L), INTERET_ALIM_NATURELLE = c(1L, 1L, 3L, 1L, 1L, 1L, 1L,
1L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 1L, 1L, 1L), INTERET_ORIGINE_GEO = c(1L,
1L, 2L, 3L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 3L, 1L, 1L, 3L, 3L, 1L,
1L, 1L, 1L), INTERET_VACANCES = c(1L, 2L, 3L, 1L, 1L, 1L, 1L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 3L, 1L, 2L), INTERET_ENVIRONNEMENT = c(1L,
3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), AGE_INTERVAL = c(3L, 3L, 4L, 2L, 2L, 3L, 3L, 4L,
4L, 3L, 4L, 2L, 1L, 3L, 3L, 2L, 2L, 2L, 2L, 3L), ATTENTE_BEAUTE_1 = c(1L,
6L, 4L, 4L, 6L, 6L, 3L, 1L, 1L, 4L, 3L, 6L, 2L, 5L, 5L, 6L, 7L,
4L, 6L, 3L), ATTENTE_BEAUTE_2 = c(2L, 2L, 3L, 6L, 4L, 1L, 4L,
7L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 2L, 6L, 2L, 2L, 2L), MILIEU_VIE = c(1L,
1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L), PROFIL_SELECTIONNE = c(1L, 32L, 21L, 23L, 34L, 31L,
15L, 6L, 1L, 20L, 14L, 34L, 9L, 28L, 28L, 32L, 42L, 20L, 32L,
14L), NOMBRE_ACHAT = c(14L, 6L, 3L, 9L, 8L, 13L, 10L, 14L, 4L,
3L, 10L, 8L, 12L, 3L, 7L, 6L, 4L, 13L, 3L, 3L), NOMBRE_CADEAU = c(2L,
1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
2L, 1L, 1L)), .Names = c("TYPE_PEAU", "SENSIBILITE", "IMPERFECTIONS",
"BRILLANCE", "GRAIN_PEAU", "RIDES_VISAGE", "MAINS", "PEAU_CORPS",
"INTERET_ALIM_NATURELLE", "INTERET_ORIGINE_GEO", "INTERET_VACANCES",
"INTERET_ENVIRONNEMENT", "AGE_INTERVAL", "ATTENTE_BEAUTE_1",
"ATTENTE_BEAUTE_2", "MILIEU_VIE", "PROFIL_SELECTIONNE", "NOMBRE_ACHAT",
"NOMBRE_CADEAU"), row.names = c(NA, 20L), class = "data.frame")
then I used as follow :
mydist = dist(DATABASE)
clusters = cutree(hclust(mydist),k=3)
> clusters
[1] 1 2 3 3 2 2 3 1 1 3 1 2 1 3 2 2 2 3 2 1 3 2 1 1 1 1 2 1 2 1 3 3 2 3 2 2 1 1 1 1 3 2 1 1 3 2 1 2 2 1 2 2 3 1 3 1 3
[58] 1 3 2 2 1 1 2 1 2 2 2 3 2 3 1 2 2 1 1 3 3 2 1 2 2 1 2 3 3 3 1 2 1 2 1 1 1 1 1 3 2 2 2 1 1 3 2 2 1 1 1 2 1 1 1 1 3
[115] 1 2 2 1 2 3 1 1 2 3 1 1 1 2 1 3 1 2 3 2 2 1 2 1 1 3 3 2 1 2 2 1 1 1 1 2 1 2 2 3 3 1 1 3 1 3 3 3 3 2 3 1 2 3 3 3 1
[172] 1 2 2 1 1 2 1 2 2 1 3 3 1 2 2 1 1 1 2 2 1 1 1 1 3 2 3 3 1 1 2 2 2 3 1 1 1 2 2 1 2 1 3 1 2 1 3 3 1 1 1 1 2 1 2 2 2
[229] 3 3 1 1 2 1 3 2 2 2 1 1 2 1 3 1 2 1 3 1 3 1 3 1 1 1 1 2 2 1 3 3 3 2 1 2 3 2 2 1 1 3 1 2 3 1 1 2 1 1 1 1 2 2 2 3 2
[286] 1 2 1 1 2 1 2 1 2 2 1 2 3 1 3 1 3 1 1 3 1 1 2 2 1 3 3 2 2 1 2 1 1 2 2 1 3 3 2 2 1 3 3 3 1 1 1 1 3 3 2 1 3 1 2 1 2
[343] 1 2 3 3 2 3 1 3 2 3 3 1 2 2 1 2 2 3 2 1 3 2 2 1 2 3 2 3 3 3 2 2 3 2 1 1 1 2 3 2 2 1 2 2 2 1 2 1 1 1 3 1 2 2 1 1 2
[400] 1 1 1 1 1 2 2 2
Please Note that the objectif is to compute the inter and intra inertia:
So i need to compute the distance between each centroid and all points that are included in its cluster.
So I need to compute the distance between each centroid and its concerned cluster
to used then for computing the inter and intra inertia.

You can define the centroids as the means of variables, per cluster, in DATABASE.
mydist <- dist(DATABASE)
clusters <- cutree(hclust(mydist), k = 3)
## Col means in each cluster
apply(DATABASE, 2, function (x) tapply(x, clusters, mean))
## or
DATABASE$cluster <- clusters # add cluster to DATABASE
# Now take means per group
library(dplyr)
centroids <- DATABASE %>%
group_by(cluster) %>%
summarise_all(funs(mean))
## Distance between centroids
dist(centroids[, -1], method = "euclidean")
## Example for distance in cluster 1 (distance between all observations of cluster 1)
DATABASE %>%
filter(cluster == 1) %>%
select(-cluster) %>%
dist()

you might want to specify your k value into 1:3 not just 3
here is the code and how to find the center (mean)

Related

Efficient way to calculate percentage of a specific value in a specific time

I have a csv file like these: this csv filled is called df_plane in R
Situation
flight_uses
People-ID
1
1
1
2
1
1
3
0
1
1
1
2
2
1
2
3
1
2
1
1
3
2
0
3
3
1
3
1
1
4
2
1
4
3
0
4
1
1
5
2
0
5
3
0
5
1
1
6
2
1
6
3
NA
6
1
NA
7
2
1
7
3
1
7
1
1
8
2
0
8
3
0
8
1
NA
9
2
NA
9
3
1
9
1
1
10
2
1
10
3
0
10
1
0
11
2
0
11
3
0
11
I would like to find out what percentage of people uses airplane in situation 2. I would like to know if there is a more efficient way than use the code below. Because with the below code I have to calculate it manually.
table(select(df_plane,situation,flight_uses))
You can use functions from the janitor package.
library(tidyverse)
library(janitor)
#>
#> Attaching package: 'janitor'
#> The following objects are masked from 'package:stats':
#>
#> chisq.test, fisher.test
df_plane <- tibble::tribble(
~Situation, ~flight_uses, ~`People-ID`,
1L, 1L, 1L,
2L, 1L, 1L,
3L, 0L, 1L,
1L, 1L, 2L,
2L, 1L, 2L,
3L, 1L, 2L,
1L, 1L, 3L,
2L, 0L, 3L,
3L, 1L, 3L,
1L, 1L, 4L,
2L, 1L, 4L,
3L, 0L, 4L,
1L, 1L, 5L,
2L, 0L, 5L,
3L, 0L, 5L,
1L, 1L, 6L,
2L, 1L, 6L,
3L, NA, 6L,
1L, NA, 7L,
2L, 1L, 7L,
3L, 1L, 7L,
1L, 1L, 8L,
2L, 0L, 8L,
3L, 0L, 8L,
1L, NA, 9L,
2L, NA, 9L,
3L, 1L, 9L,
1L, 1L, 10L,
2L, 1L, 10L,
3L, 0L, 10L,
1L, 0L, 11L,
2L, 0L, 11L,
3L, 0L, 11L
) |>
clean_names()
df_plane |>
tabyl(situation, flight_uses) |>
adorn_percentages() |>
adorn_pct_formatting()
#> situation 0 1 NA_
#> 1 9.1% 72.7% 18.2%
#> 2 36.4% 54.5% 9.1%
#> 3 54.5% 36.4% 9.1%
Created on 2022-10-26 with reprex v2.0.2
In Situation 2, 54.5% of passengers uses airplane.
You can use mean to calculate the proportion
> with(df_plane,mean(replace(flight_uses, is.na(flight_uses), 0)[Situation==2]))
[1] 0.5454545
Are you asking, of those rows where Situation==2, what is the percent where flight_uses==1?
dplyr approach
dplyr is useful for these types of manipulations:
library(dplyr)
df_plane |>
filter(Situation == 2) |>
summarise(
percent_using_plane = sum(flight_uses==1, na.rm=T) / n() * 100
)
# percent_using_plane
# 1 54.54545
base R
If you want to stick with the base R table syntax (which seems fine in this case but can become unwieldy once calculations get more complicated), you were nearly there:
table(df_plane[df_plane$Situation==2,]$flight_uses) / nrow(df_plane[df_plane$Situation==2,])*100
# 0 1
# 36.36364 54.54545
Use with instead of dplyr::select and wrap it in proportions.
proportions(with(df_plane, table(flight_uses, Situation, useNA='ifany')), 2)
# Situation
# flight_uses 1 2 3
# 0 0.09090909 0.36363636 0.54545455
# 1 0.72727273 0.54545455 0.36363636
# <NA> 0.18181818 0.09090909 0.09090909

remove rows with duplicate values in any other adjacent column

How can i remove rows with any same value that is in another column of the same row? For example,
df<-structure(list(V1 = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L), V2 = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L,
2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), V3 = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L)), row.names = c(NA, -27L
), class = "data.frame")
##Top three rows
V1 V2 V3
1 1 1 1
2 2 1 1
3 3 1 1
4 1 2 1
5 2 2 1
6 3 2 1
7 1 3 1
8 2 3 1
In the following case (only showing 8 rows), I would remove every row accept rows 6 and 8 since they do not have any duplicate values in any column of the same row. I'm preferably looking for a data.table solution since I have a much larger dataframe.
You may use anyDuplicated for each row.
library(data.table)
setDT(df)
df[apply(df, 1, anyDuplicated) == 0]
# V1 V2 V3
#1: 3 2 1
#2: 2 3 1
#3: 3 1 2
#4: 1 3 2
#5: 2 1 3
#6: 1 2 3
An option using pairwise combn on the columns to check if there are equal values
df[!Reduce(`|`, combn(df, 2, FUN = function(x)
x[[1]] == x[[2]], simplify = FALSE))]
V1 V2 V3
1: 3 2 1
2: 2 3 1
3: 3 1 2
4: 1 3 2
5: 2 1 3
6: 1 2 3

How to loop over a grouped dataframe using dplyr::count() and purrr::map()

Background
I want to loop over a grouped dataframe of factor variables to count the
occurrences of each value within a variable using count function from dplyr,
and I think that the purrr::map function would be the most suitable.
However, I cannot get this to work.
I tried to use this post for my needs, but this did not work either.
I also tried to hack together a function based on
this post, but could not get this to work with the grouping variable.
Question
Is it possible to loop over a grouped dataframe in the way that I want? If so,
how?
Thanks in advance for your consideration.
Reproducible example
library(tidyverse)
vars_df <-
structure(list(c = structure(c(2L, 3L, 3L, 2L, 3L, 3L, 2L, 2L,
1L, 2L, 2L, 2L, 3L, 1L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
2L, 3L, 1L, 2L, 1L, 3L, 3L, 2L, 1L, 2L, 2L, 2L, 3L, 3L, 2L, 1L,
1L, 2L, 3L, 3L, 2L, 2L, 3L, 3L, 3L, 2L), .Label = c("1", "2",
"3"), class = "factor"), pastpsyc = structure(c(2L, 1L, NA, 2L,
1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L,
1L, 2L, 2L, 1L, 2L, NA, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), pastmed = structure(c(2L, 1L, NA, 2L,
1L, 1L, 1L, 1L, NA, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L,
2L, 2L, 1L, 1L, 1L, NA, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), hxsuicide = structure(c(2L, 1L, NA,
2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 2L, 2L, 1L, 1L, 2L, NA, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), hxdsh = structure(c(2L, 1L, NA, 1L,
2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 1L, 2L, NA, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), hxtrauma = structure(c(2L, 1L, NA, 2L,
1L, 1L, 1L, 1L, NA, 1L, 1L, NA, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 1L, 1L, NA, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("0",
"1"), class = "factor")), row.names = c(NA, -50L), class = c("tbl_df",
"tbl", "data.frame"))
map_dfr(setNames(c('1', '2', '3'),
c('1', '2', '3')), ~
vars_df %>%
group_by(c) %>%
summarise(across(everything(), function(x)
sum(x == .x, na.rm = TRUE)), .groups = 'drop'), .id = 'var')
#> # A tibble: 9 x 7
#> var c pastpsyc pastmed hxsuicide hxdsh hxtrauma
#> <chr> <fct> <int> <int> <int> <int> <int>
#> 1 1 1 3 2 2 5 1
#> 2 1 2 16 9 18 16 10
#> 3 1 3 12 3 8 11 9
#> 4 2 1 0 0 0 0 0
#> 5 2 2 0 0 0 0 0
#> 6 2 3 0 0 0 0 0
#> 7 3 1 0 0 0 0 0
#> 8 3 2 0 0 0 0 0
#> 9 3 3 0 0 0 0 0
vars_df %>%
group_by(c) %>%
count(pastpsyc)
#> # A tibble: 7 x 3
#> # Groups: c [3]
#> c pastpsyc n
#> <fct> <fct> <int>
#> 1 1 0 4
#> 2 1 1 3
#> 3 2 0 8
#> 4 2 1 16
#> 5 3 0 5
#> 6 3 1 12
#> 7 3 <NA> 2
vars_df %>%
group_by(c) %>%
map(~ count(.))
#> Error in UseMethod("count"): no applicable method for 'count' applied to an object of class "factor"
.get_count <-
function(mygroup) {
quo_var <- enquo(mygroup)
vars_df %>%
group_by(!! quo_var) %>%
count() %>%
ungroup()
}
vars <-
vars_df %>%
colnames()
vars %>%
syms() %>%
map(function(var) .get_count(!!var))
#> [[1]]
#> # A tibble: 3 x 2
#> c n
#> <fct> <int>
#> 1 1 7
#> 2 2 24
#> 3 3 19
#>
#> [[2]]
#> # A tibble: 3 x 2
#> pastpsyc n
#> <fct> <int>
#> 1 0 17
#> 2 1 31
#> 3 <NA> 2
#>
#> [[3]]
#> # A tibble: 3 x 2
#> pastmed n
#> <fct> <int>
#> 1 0 33
#> 2 1 14
#> 3 <NA> 3
#>
#> [[4]]
#> # A tibble: 3 x 2
#> hxsuicide n
#> <fct> <int>
#> 1 0 20
#> 2 1 28
#> 3 <NA> 2
#>
#> [[5]]
#> # A tibble: 3 x 2
#> hxdsh n
#> <fct> <int>
#> 1 0 16
#> 2 1 32
#> 3 <NA> 2
#>
#> [[6]]
#> # A tibble: 3 x 2
#> hxtrauma n
#> <fct> <int>
#> 1 0 26
#> 2 1 20
#> 3 <NA> 4
vars %>%
syms() %>%
group_by(c) %>%
map(function(var) .get_count(!!var))
#> Error in UseMethod("group_by"): no applicable method for 'group_by' applied to an object of class "list"
# Created on 2021-05-26 by the reprex package (v2.0.0)
You can use map as -
library(tidyverse)
vars %>% map(~vars_df %>% count(c, .data[[.x]]))
#[[1]]
# A tibble: 3 x 2
# c n
# <fct> <int>
#1 1 7
#2 2 24
3 3 19
#[[2]]
# A tibble: 7 x 3
# c pastpsyc n
# <fct> <fct> <int>
#1 1 0 4
#2 1 1 3
#3 2 0 8
#4 2 1 16
#5 3 0 5
#6 3 1 12
#7 3 NA 2
#...
#...
A different way to show the output in a long format -
vars_df %>% pivot_longer(cols = -c) %>% count(c, name, value)
# c name value n
# <fct> <chr> <fct> <int>
# 1 1 hxdsh 0 2
# 2 1 hxdsh 1 5
# 3 1 hxsuicide 0 5
# 4 1 hxsuicide 1 2
# 5 1 hxtrauma 0 5
# 6 1 hxtrauma 1 1
# 7 1 hxtrauma NA 1
# 8 1 pastmed 0 4
# 9 1 pastmed 1 2
#10 1 pastmed NA 1
# … with 28 more rows

window function is not working as expected

I have a monthly time series - monthlyTs:
monthlyTs <- ts(all.xts , frequency = 12, start=decimal_date(ymd("2012-01-29")))
head(index(monthlyTs))
1 "2012-01-29 00:00:00 UTC" "2012-02-26 01:22:47 UTC" "2012-03-25
02:45:35 UTC" "2012-04-29 04:29:04 UTC"
[5] "2012-05-27 05:51:52 UTC" "2012-06-24 07:14:39 UTC"
I want to apply a time windows that starts from 2013:
head(window(monthly, start = 2013))
2012-01-29 00:00:00 2
2012-02-26 01:22:47 8 2012-03-25 02:45:35 6 2012-04-29 04:29:04
5 2012-05-27 05:51:52 4 2012-06-24 07:14:39 4
So looks like window function is not filtering as expected. What is wrong?
Fully reproducible example as requested:
christmas.csv - tiny CSV file (google trends for 'Christmas' request)
#Reading data from the csv. Format - [week start date], [views per week]
data = read.csv('christmas.csv', sep=",", header = FALSE, skip = 3,col.names = c("Week","Views"))[[2]]
# creating time series
myTs <- ts(data[[2]], freq=365.25/7, start=decimal_date(ymd("2012-01-29")))
#converting from weekly to month time series
all.xts <- xts(myTs, date_decimal(index(myTs)))
monthlyTs <- ts(all.xts , frequency = 12, start=decimal_date(ymd("2012-01-29")))
head(window(monthlyTs, start = 2013))
2012-01-29 00:00:00 2
2012-02-26 01:22:47 8 2012-03-25 02:45:35 6 2012-04-29 04:29:04 5
2012-05-27 05:51:52 4 2012-06-24 07:14:39 4
There are two problems :
the object all.xts is a weekly and not a monthly time
The value your pass for the argument frequency is not correct
For the second point, try to change the value you pass for the argument start in your call of the function ts with
c(lubridate::year("2012-01-29"), lubridate::month("2012-01-29"))
and change the frequency to value 12. i.e use the line :
ts(all.xts , frequency = 12, start = c(lubridate::year("2012-01-29"), lubridate::month("2012-01-29")) )
Using the output from dput, your code rewrite as follow :
data <- c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 4L, 5L, 5L, 6L, 8L, 11L, 16L, 22L, 33L, 42L,
45L, 55L, 64L, 8L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 5L, 6L, 8L,
12L, 16L, 21L, 27L, 43L, 47L, 56L, 79L, 10L, 5L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 4L, 5L, 5L, 6L, 8L, 12L, 17L, 21L, 27L, 43L, 47L, 53L,
87L, 12L, 5L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 5L, 6L, 6L, 8L, 13L,
17L, 20L, 27L, 44L, 50L, 54L, 100L, 15L, 6L, 3L, 2L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 4L, 5L, 5L, 6L, 8L, 11L, 16L, 21L, 29L, 43L, 48L, 53L, 80L,
46L, 8L, 3L, 2L)
myTs <- ts(data, freq=365.25/7, start=decimal_date(ymd("2012-01-29")))
all.xts <- xts::xts(myTs, date_decimal(index(myTs)))
monthlyTs <- ts(all.xts , frequency = 12, start = c(lubridate::year("2012-01-29"), lubridate::month("2012-01-29")) )
window(monthlyTs, start= c(2013))
The last line will print :
> window(monthlyTs, start= c(2013))
Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
2013 1 1 1 1 1 1 1 1 1 1 1 1
2014 1 1 1 1 2 2 2 2 3 3 3 4
2015 5 5 6 8 11 16 22 33 42 45 55 64
2016 8 4 2 2 2 2 2 2 1 1 1 1
2017 1 1 1 1 1 1 1 1 1 1 1 1
2018 1 1 1 1 1 1 1 2 2 2 2 2
2019 3 3 3 4 4 5 6 8 12 16 21 27
2020 43 47 56 79 10 5 2 2 2 1 1 1
2021 1 1 1 1 1 1 1 1 1 1 1 1
2022 1 1 1 1 1 1 1 1 1 1 2 2
2023 2 2 2 2 3 3 3 4 5 5 6 8
2024 12 17 21 27 43 47 53 87 12 5 2 2
2025 2 1 1 1 1 1 1 1 1 1 1 1
2026 1 1 1 1 1 1 1 1 1 1 1 1
2027 1 2 2 2 2 2 2 2 3 3 3 4
2028 5 6 6 8 13 17 20 27 44 50 54 100
2029 15 6 3 2 2 1 1 1 1 1 1 1
2030 1 1 1 1 1 1 1 1 1 1 1 1
2031 1 1 1 1 1 1 2 2 2 2 2 2
2032 3 3 3 4 5 5 6 8 11 16 21 29
2033 43 48 53 80 46 8 3 2

Get sum of unique rows in table function in R

Suppose I have data which looks like this
Id Name Price sales Profit Month Category Mode Supplier
1 A 2 5 8 1 X K John
1 A 2 6 9 2 X K John
1 A 2 5 8 3 X K John
2 B 2 4 6 1 X L Sam
2 B 2 3 4 2 X L Sam
2 B 2 5 7 3 X L Sam
3 C 2 5 11 1 X M John
3 C 2 5 11 2 X L John
3 C 2 5 11 3 X K John
4 D 2 8 10 1 Y M John
4 D 2 8 10 2 Y K John
4 D 2 5 7 3 Y K John
5 E 2 5 9 1 Y M Sam
5 E 2 5 9 2 Y L Sam
5 E 2 5 9 3 Y M Sam
6 F 2 4 7 1 Z M Kyle
6 F 2 5 8 2 Z L Kyle
6 F 2 5 8 3 Z M Kyle
if I apply table function, it will just combines are the rows and result will be
K L M
X 4 4 1
Y 2 1 3
Z 0 1 2
Now what if I want not the sum of all rows but only sum of those rows with Unique Id
so it looks like
K L M
X 2 2 1
Y 1 1 2
Z 0 1 1
Thanks
If df is your data.frame:
# Subset original data.frame to keep columns of interest
df1 <- df[,c("Id", "Category", "Mode")]
# Remove duplicated rows
df1 <- df1[!duplicated(df1),]
# Create table
with(df1, table(Category, Mode))
# Mode
# Category K L M
# X 2 2 1
# Y 1 1 2
# Z 0 1 1
Or in one line using unique
table(unique(df[c("Id", "Category", "Mode")])[-1])
df <- structure(list(Id = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L), Name = structure(c(1L, 1L, 1L,
2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L), .Label = c("A",
"B", "C", "D", "E", "F"), class = "factor"), Price = c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), sales = c(5L, 6L, 5L, 4L, 3L, 5L, 5L, 5L, 5L, 8L, 8L, 5L,
5L, 5L, 5L, 4L, 5L, 5L), Profit = c(8L, 9L, 8L, 6L, 4L, 7L, 11L,
11L, 11L, 10L, 10L, 7L, 9L, 9L, 9L, 7L, 8L, 8L), Month = c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L), Category = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("X", "Y", "Z"
), class = "factor"), Mode = structure(c(1L, 1L, 1L, 2L, 2L,
2L, 3L, 2L, 1L, 3L, 1L, 1L, 3L, 2L, 3L, 3L, 2L, 3L), .Label = c("K",
"L", "M"), class = "factor"), Supplier = structure(c(1L, 1L,
1L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L, 2L
), .Label = c("John", "Kyle", "Sam"), class = "factor")), .Names = c("Id",
"Name", "Price", "sales", "Profit", "Month", "Category", "Mode",
"Supplier"), class = "data.frame", row.names = c(NA, -18L))
We can try
library(data.table)
dcast(unique(setDT(df1[c('Category', 'Mode', 'Id')])),
Category~Mode, value.var='Id', length)
# Category K L M
#1: X 2 2 1
#2: Y 1 1 2
#3: Z 0 1 1
Or with dplyr
library(dplyr)
df1 %>%
distinct(Id, Category, Mode) %>%
group_by(Category, Mode) %>%
tally() %>%
spread(Mode, n, fill=0)
# Category K L M
# (chr) (dbl) (dbl) (dbl)
#1 X 2 2 1
#2 Y 1 1 2
#3 Z 0 1 1
Or as #David Arenburg suggested, a variant of the above is
df1 %>%
distinct(Id, Category, Mode) %>%
select(Category, Mode) %>%
table()

Resources