Group by 2 factor variables to calculate quartiles in R

I have a data set with the number of records collected [reccount] by hour [hourtime] and feed [FeedCode]. What I'm trying to do is create a column that tells me which quartile each record falls into (probs = 0:4/4), so that I can set up an alert if anything falls below the 1st or the 2nd quartile and investigate the feed to see if something is out of the ordinary.
I tried this first, but realized it wasn't grouping by hourtime and FeedCode:
df <- within(ds, quartile <- as.integer(cut(ds$reccount, quantile(ds$reccount, probs = 0:4/4), include.lowest = TRUE)))
I also tried this, but it's still not returning what I expect:
as <- ddply(ds, .(as.factor(ds$hourtime), ds$FeedCode), function(df) quantile(ds$reccount, probs = 0:4/4))
I just need to add a column that classifies it as which quartile.
Here's the data:
dput(head(dss,30))
structure(list(rownames = c(2371L, 2428L, 2459L, 2493L, 2573L,
2581L, 2606L, 2633L, 2668L, 2683L, 2693L, 2748L, 2756L, 2819L,
2865L, 2889L, 2896L, 2970L, 2988L, 3005L, 3047L, 3067L, 3111L,
3132L, 3154L, 3177L, 3209L, 3241L, 3272L), hourtime = c(3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), reccount = c(2864L,
3492L, 968L, 3271L, 6078L, 767L, 1365L, 6222L, 2515L, 3986L,
4327L, 5764L, 3676L, 5338L, 6407L, 1217L, 3058L, 5673L, 3569L,
3391L, 3169L, 6446L, 4201L, 884L, 3529L, 6461L, 3414L, 3246L,
5486L), FeedCode = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "MDSWJD", class = "factor"), quartile = c(4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L)), .Names = c("rownames",
"hourtime", "reccount", "FeedCode", "quartile"), row.names = c(NA,
29L), class = "data.frame")

You can use ave() to run the quantile binning within each combination of the grouping variables:
dss$quartile <- with(dss,
  ave(reccount, hourtime, FeedCode,
      FUN = function(x) .bincode(x, quantile(x), right = TRUE, include.lowest = TRUE)
  )
)
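With that column in place, the alert described in the question could be a simple subset (just a sketch, assuming "falls below the 1st or the 2nd quartile" means quartile <= 2):
# Rows whose reccount falls in the lowest two within-group quartiles
alerts <- subset(dss, quartile <= 2)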

The quartiles got me a little confused, since probs = 0:4/4 gives five break points {0, 0.25, 0.5, 0.75, 1}, and hence four groups rather than five.
I don't know if I'm missing something, but here is a dplyr approach.
The first block calculates the 25% quantile per {hourtime, FeedCode} group and flags everything below it. The second block splits reccount into four groups (quartiles) within each group and assigns the group number {1 to 4}.
Run the code step by step and let me know if you spot mistakes.
library(dplyr)

# example dataset
dt = data.frame(hourtime = c(1,1,1,1,1,2,2,2,2,2),
                FeedCode = c("A","B","A","B","A","B","A","B","A","B"),
                reccount = c(946,184,1404,937,137,1199,698,1311,1302,560))

dt %>%
  group_by(hourtime, FeedCode) %>%
  mutate(Q25 = quantile(reccount, 0.25),
         FlagBelowQ25 = ifelse(reccount < Q25, 1, 0)) %>%
  ungroup()
# hourtime FeedCode reccount Q25 FlagBelowQ25
# 1 1 A 946 541.50 0
# 2 1 B 184 372.25 1
# 3 1 A 1404 541.50 0
# 4 1 B 937 372.25 0
# 5 1 A 137 541.50 1
# 6 2 B 1199 879.50 0
# 7 2 A 698 849.00 1
# 8 2 B 1311 879.50 0
# 9 2 A 1302 849.00 0
# 10 2 B 560 879.50 1
dt %>%
  group_by(hourtime, FeedCode) %>%
  mutate(Quartile = ntile(reccount, 4)) %>%
  ungroup()
# hourtime FeedCode reccount Quartile
# 1 1 A 946 2
# 2 1 B 184 1
# 3 1 A 1404 3
# 4 1 B 937 3
# 5 1 A 137 1
# 6 2 B 1199 2
# 7 2 A 698 1
# 8 2 B 1311 3
# 9 2 A 1302 3
# 10 2 B 560 1

Related

Aggregation of data in R with assignment of a dummy variable by condition

I have the following dataset
mydata=structure(list(id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), ad_id = c(111L, 111L, 111L,
111L, 1111L, 1111L, 11111L, 11111L, 11111L, 111L, 111L, 1111L,
1111L, 11111L, 11111L, 11111L, 111111L, 111111L), price = c(1L,
0L, 1L, 0L, 2L, 0L, 3L, 0L, 0L, 1L, 0L, 2L, 0L, 3L, 0L, 0L, 1L,
0L), rev = c(2L, 0L, 0L, 2L, 3L, 3L, 4L, 4L, 4L, 2L, 2L, 3L,
3L, 4L, 4L, 4L, 0L, 0L), data = structure(c(1L, 2L, 2L, 3L, 1L,
3L, 1L, 3L, 4L, 1L, 3L, 1L, 3L, 1L, 3L, 4L, 1L, 3L), .Label = c("01.01.2018",
"01.02.2018", "01.03.2018", "02.03.2018"), class = "factor")), .Names = c("id",
"ad_id", "price", "rev", "data"), class = "data.frame", row.names = c(NA,
-18L))
How can I create a dummy variable according to the following logic?
For each id and ad_id I need to aggregate price and rev; each ad_id has a date column (data, in d.m.Y format).
If, for a given id and ad_id over a period of up to 90 days (per the data column), rev is greater than price, then the flag is set to 1, otherwise the flag is 0.
In this reproducible example I just take 1 id and 4 ad_id values.
Aggregated by sum, it looks like this:
id ad_id price rev
1 1 111 2 4
2 1 1111 2 6
3 1 11111 3 12
4 1 111111 1 0
So for id = 1, every ad_id except ad_id = 111111 satisfies rev > price, so in the initial data
ad_id = 111, 1111 and 11111 must have flag = 1, and 111111 must have flag = 0.
Here is the desired output:
id ad_id price rev data flag
1 1 111 1 2 01.01.2018 1
2 1 111 0 0 01.02.2018 1
3 1 111 1 0 01.02.2018 1
4 1 111 0 2 01.03.2018 1
5 1 1111 2 3 01.01.2018 1
6 1 1111 0 3 01.03.2018 1
7 1 11111 3 4 01.01.2018 1
8 1 11111 0 4 01.03.2018 1
9 1 11111 0 4 02.03.2018 1
10 1 111111 1 0 01.01.2018 0
11 1 111111 0 0 01.03.2018 0
How can I implement such a condition?
I am not sure if I understood you correctly, but is this what you are looking for:
library(tidyverse)

mydata %>%
  as_tibble() %>%
  group_by(id, ad_id) %>%
  summarise_at(vars("price", "rev"), sum) %>%
  mutate(flag = if_else(rev > price, 1, 0)) %>%   # flag = 1 only when total rev exceeds total price
  select(id, ad_id, flag) %>%
  left_join(mydata, ., by = c("id", "ad_id"))
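An equivalent join-free sketch (my own variant, computing the group totals inside mutate() so each original row gets its flag directly):
library(dplyr)

mydata %>%
  group_by(id, ad_id) %>%
  mutate(flag = as.integer(sum(rev) > sum(price))) %>%
  ungroup()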

Find the centroid of Clusters generated by the hclust function

I need to get the centroid of each cluster computed by the hierarchical method.
First, here is part of my dataset, to give a reproducible example:
> dput(DATABASE[1:20,])
structure(list(TYPE_PEAU = c(2L, 2L, 3L, 2L, 2L, 2L, 2L, 4L,
3L, 2L, 2L, 2L, 2L, 1L, 4L, 2L, 2L, 2L, 4L, 2L), SENSIBILITE = c(3L,
2L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 3L, 3L, 3L,
1L, 3L, 3L), IMPERFECTIONS = c(2L, 2L, 3L, 3L, 1L, 2L, 2L, 3L,
2L, 2L, 2L, 1L, 1L, 1L, 3L, 1L, 2L, 1L, 2L, 2L), BRILLANCE = c(3L,
3L, 1L, 3L, 1L, 3L, 3L, 1L, 1L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L,
3L, 3L, 3L), GRAIN_PEAU = c(3L, 3L, 3L, 1L, 3L, 3L, 3L, 2L, 3L,
2L, 1L, 3L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 3L), RIDES_VISAGE = c(3L,
1L, 1L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 2L, 1L, 3L, 1L, 3L, 3L, 3L,
3L, 3L, 3L), MAINS = c(2L, 2L, 3L, 3L, 1L, 1L, 1L, 3L, 3L, 3L,
3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 2L), PEAU_CORPS = c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 3L, 2L, 3L, 2L, 3L, 2L,
2L, 1L), INTERET_ALIM_NATURELLE = c(1L, 1L, 3L, 1L, 1L, 1L, 1L,
1L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 1L, 1L, 1L), INTERET_ORIGINE_GEO = c(1L,
1L, 2L, 3L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 3L, 1L, 1L, 3L, 3L, 1L,
1L, 1L, 1L), INTERET_VACANCES = c(1L, 2L, 3L, 1L, 1L, 1L, 1L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 3L, 1L, 2L), INTERET_ENVIRONNEMENT = c(1L,
3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), AGE_INTERVAL = c(3L, 3L, 4L, 2L, 2L, 3L, 3L, 4L,
4L, 3L, 4L, 2L, 1L, 3L, 3L, 2L, 2L, 2L, 2L, 3L), ATTENTE_BEAUTE_1 = c(1L,
6L, 4L, 4L, 6L, 6L, 3L, 1L, 1L, 4L, 3L, 6L, 2L, 5L, 5L, 6L, 7L,
4L, 6L, 3L), ATTENTE_BEAUTE_2 = c(2L, 2L, 3L, 6L, 4L, 1L, 4L,
7L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 2L, 6L, 2L, 2L, 2L), MILIEU_VIE = c(1L,
1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L), PROFIL_SELECTIONNE = c(1L, 32L, 21L, 23L, 34L, 31L,
15L, 6L, 1L, 20L, 14L, 34L, 9L, 28L, 28L, 32L, 42L, 20L, 32L,
14L), NOMBRE_ACHAT = c(14L, 6L, 3L, 9L, 8L, 13L, 10L, 14L, 4L,
3L, 10L, 8L, 12L, 3L, 7L, 6L, 4L, 13L, 3L, 3L), NOMBRE_CADEAU = c(2L,
1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
2L, 1L, 1L)), .Names = c("TYPE_PEAU", "SENSIBILITE", "IMPERFECTIONS",
"BRILLANCE", "GRAIN_PEAU", "RIDES_VISAGE", "MAINS", "PEAU_CORPS",
"INTERET_ALIM_NATURELLE", "INTERET_ORIGINE_GEO", "INTERET_VACANCES",
"INTERET_ENVIRONNEMENT", "AGE_INTERVAL", "ATTENTE_BEAUTE_1",
"ATTENTE_BEAUTE_2", "MILIEU_VIE", "PROFIL_SELECTIONNE", "NOMBRE_ACHAT",
"NOMBRE_CADEAU"), row.names = c(NA, 20L), class = "data.frame")
Then I used the following:
mydist = dist(DATABASE)
clusters = cutree(hclust(mydist),k=3)
> clusters
[1] 1 2 3 3 2 2 3 1 1 3 1 2 1 3 2 2 2 3 2 1 3 2 1 1 1 1 2 1 2 1 3 3 2 3 2 2 1 1 1 1 3 2 1 1 3 2 1 2 2 1 2 2 3 1 3 1 3
[58] 1 3 2 2 1 1 2 1 2 2 2 3 2 3 1 2 2 1 1 3 3 2 1 2 2 1 2 3 3 3 1 2 1 2 1 1 1 1 1 3 2 2 2 1 1 3 2 2 1 1 1 2 1 1 1 1 3
[115] 1 2 2 1 2 3 1 1 2 3 1 1 1 2 1 3 1 2 3 2 2 1 2 1 1 3 3 2 1 2 2 1 1 1 1 2 1 2 2 3 3 1 1 3 1 3 3 3 3 2 3 1 2 3 3 3 1
[172] 1 2 2 1 1 2 1 2 2 1 3 3 1 2 2 1 1 1 2 2 1 1 1 1 3 2 3 3 1 1 2 2 2 3 1 1 1 2 2 1 2 1 3 1 2 1 3 3 1 1 1 1 2 1 2 2 2
[229] 3 3 1 1 2 1 3 2 2 2 1 1 2 1 3 1 2 1 3 1 3 1 3 1 1 1 1 2 2 1 3 3 3 2 1 2 3 2 2 1 1 3 1 2 3 1 1 2 1 1 1 1 2 2 2 3 2
[286] 1 2 1 1 2 1 2 1 2 2 1 2 3 1 3 1 3 1 1 3 1 1 2 2 1 3 3 2 2 1 2 1 1 2 2 1 3 3 2 2 1 3 3 3 1 1 1 1 3 3 2 1 3 1 2 1 2
[343] 1 2 3 3 2 3 1 3 2 3 3 1 2 2 1 2 2 3 2 1 3 2 2 1 2 3 2 3 3 3 2 2 3 2 1 1 1 2 3 2 2 1 2 2 2 1 2 1 1 1 3 1 2 2 1 1 2
[400] 1 1 1 1 1 2 2 2
Please note that the objective is to compute the inter- and intra-cluster inertia, so I need to compute the distance between each centroid and all the points included in its cluster.
You can define the centroids as the means of variables, per cluster, in DATABASE.
mydist   <- dist(DATABASE)
clusters <- cutree(hclust(mydist), k = 3)

## Column means in each cluster
apply(DATABASE, 2, function(x) tapply(x, clusters, mean))

## or
DATABASE$cluster <- clusters   # add cluster membership to DATABASE

# Now take means per group
library(dplyr)
centroids <- DATABASE %>%
  group_by(cluster) %>%
  summarise_all(funs(mean))

## Distance between centroids
dist(centroids[, -1], method = "euclidean")

## Example for distances in cluster 1 (between all observations of cluster 1)
DATABASE %>%
  filter(cluster == 1) %>%
  select(-cluster) %>%
  dist()
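Building on that, here is a rough sketch of the intra- and inter-cluster inertia mentioned in the question (my own addition, assuming inertia is measured as sums of squared Euclidean distances; centroids and DATABASE$cluster come from the code above):
vars <- setdiff(names(DATABASE), "cluster")
cent <- as.data.frame(centroids)            # one row per cluster

# Intra-cluster inertia: squared distance of every point to its own centroid
ctr_rows <- as.matrix(cent[match(DATABASE$cluster, cent$cluster), vars])
intra    <- sum((as.matrix(DATABASE[, vars]) - ctr_rows)^2)

# Inter-cluster inertia: size-weighted squared distance of each centroid
# to the overall mean of the data
sizes   <- as.numeric(table(DATABASE$cluster))
overall <- colMeans(DATABASE[, vars])
inter   <- sum(sizes * rowSums(sweep(as.matrix(cent[, vars]), 2, overall)^2))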
You might want to specify your k value as 1:3, not just 3. Here is the code and how to find the center (mean).

Getting missing values in aggregate function

I have the following data with a peculiar missing-values situation (all values of vnum1 for vcat1 == 3 are missing):
> head(mydf)
vnum1 vcat1
1 -0.1624229 1
2 0.2465567 1
3 NA 3
4 0.7067778 2
5 NA 3
6 -0.2241726 4
> dput(mydf)
structure(list(vnum1 = c(-0.162422853864248, 0.246556718176803,
NA, 0.706777793886275, NA, -0.224172615208867, 0.0545850414695318,
NA, NA, -1.94778020954922, 1.89581259201036, 0.901973743223488,
-0.31255172156186, -1.67311124367419, 0.491316838004494, NA,
-0.699315343799762, 0.668020448193884, 1.45492995320554, 1.17747976289091,
-0.65137204397438, 1.78678696473193, 2.58978935829221, NA, 1.26534157843481,
0.629748102812663, 0.246596558590885, 0.968707124353133, 0.108668693948881,
-0.219419917000748, 2.25307417017233, -0.626124211646445, -1.16298694223082,
-1.23524906047676, -2.34636152907898, NA, 0.408667368960836,
0.272596114054819, 0.747455245383144, -0.745843219461836, -0.0966351379737077,
1.44803320811527, -1.5434982335725, -0.782902668540696, -0.448286848257394,
NA, 0.168327130336994, -0.493721325506037, 0.397253883862878,
1.57070527855864), vcat1 = structure(c(1L, 1L, 3L, 2L, 3L, 4L,
4L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 4L, 3L, 4L, 4L, 4L, 1L, 2L, 4L,
1L, 3L, 2L, 4L, 2L, 1L, 4L, 2L, 2L, 4L, 2L, 1L, 1L, 3L, 1L, 4L,
4L, 4L, 4L, 2L, 4L, 1L, 4L, 3L, 1L, 4L, 4L, 1L), .Label = c("1",
"2", "3", "4"), class = "factor")), .Names = c("vnum1", "vcat1"
), row.names = c(NA, 50L), class = "data.frame")
If I use tapply, I clearly see the missing category:
> with(mydf,tapply(vnum1, vcat1, mean))
1 2 3 4
0.09172749 0.48575555 NA 0.09632024
But it is completely ignored by the aggregate function:
> aggregate(vnum1~vcat1, mydf, mean)
vcat1 vnum1
1 1 0.09172749
2 2 0.48575555
3 4 0.09632024
I want to get it from the aggregate function as well. How can I do that? Thanks.
In the formula method, use na.action = NULL to keep the NA result.
aggregate(vnum1 ~ vcat1, mydf, mean, na.action = NULL)
# vcat1 vnum1
# 1 1 0.09172749
# 2 2 0.48575555
# 3 3 NA
# 4 4 0.09632024
You could have also used the data frame method and not have this worry.
with(mydf, aggregate(list(vnum1 = vnum1), list(vcat1 = vcat1), mean))
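For what it's worth, a dplyr version (my own sketch, not part of the original answer) also keeps the all-NA group:
library(dplyr)

mydf %>%
  group_by(vcat1) %>%
  summarise(vnum1 = mean(vnum1))   # vcat1 == 3 comes back as NA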

sum and conditionally count based on a second column

I have gotten frustrated trying to solve this seemingly simple problem. I have a dataset (df) like this:
structure(list(Year = c(2015L, 2015L, 2015L, 2015L, 2015L, 2015L,
2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L,
2015L, 2015L, 2015L, 2015L, 2015L), Unknown = c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), Temp = c(21L, 21L, 21L, 23L, 23L, 21L, 21L, 22L, 21L, 23L,
23L, 22L, 21L, 21L, 22L, 22L, 21L, 21L, 23L, 23L), Obs = structure(c(1L,
1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 2L, 2L), .Label = c("mdk", "sde"), class = "factor"), State = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "ma", class = "factor"), Zone = c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), Segment = c(8L, 7L, 4L, 17L, 18L, 7L, 2L, 12L, 1L, 17L,
18L, 12L, 9L, 7L, 13L, 11L, 8L, 9L, 17L, 18L), Subseg = c(1L,
3L, 3L, 2L, 2L, 2L, 4L, 0L, 10L, 4L, 2L, 0L, 1L, 1L, 3L, 1L,
2L, 2L, 1L, 1L), Wdir = structure(c(2L, 2L, 1L, 3L, 3L, 2L, 2L,
1L, 2L, 3L, 3L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L), .Label = c("na",
"ne", "nw"), class = "factor"), Wvel = structure(c(1L, 1L, 2L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
2L), .Label = c("5", "na"), class = "factor"), Clouds = structure(c(1L,
1L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 3L, 3L, 1L,
1L, 3L, 3L), .Label = c("1", "4", "na"), class = "factor"), Temp.1 = structure(c(1L,
1L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 3L, 3L, 1L,
1L, 3L, 3L), .Label = c("20", "25", "na"), class = "factor"),
Species = structure(c(7L, 21L, 1L, 21L, 16L, 4L, 16L, 6L,
1L, 17L, 5L, 7L, 5L, 1L, 1L, 6L, 7L, 7L, 24L, 5L), .Label = c("ABDU",
"ABDU", "ABDU", "ABDU", "ABDU", "CAGO", "CAGO", "CAGO", "CAGO",
"CAGO", "GOLD", "GOLD", "GOLD", "GOLD", "GOLD", "MERG", "MERG",
"MERG", "MERG", "MERG", "SCOT", "SCOT", "SCOT", "SCOT",
"SCOT", "SCOT", "SCOT"), class = "factor"), Count = c(5L,
1L, 150L, 3L, 20L, 8L, 5L, 10L, 5L, 1L, 20L, 10L, 2L, 2L,
80L, 40L, 1L, 1000L, 2L, 20L)), .Names = c("Year", "Unknown",
"Temp", "Obs", "State", "Zone", "Segment", "Subseg", "Wdir",
"Wvel", "Clouds", "Temp.1", "Species", "Count"), row.names = c(666L,
614L, 2060L, 1738L, 1459L, 536L, 197L, 2467L, 98L, 1794L, 1449L,
2464L, 696L, 483L, 2644L, 2350L, 686L, 844L, 2989L, 2934L), class = "data.frame")
The head of the data looks like this:
Year Unknown Temp Obs State Zone Segment Subseg Wdir Wvel
666 2015 1 21 mdk ma 2 8 1 ne 5
614 2015 1 21 mdk ma 2 7 3 ne 5
2060 2015 1 21 sde ma 2 4 3 na na
1738 2015 1 23 mdk ma 2 17 2 nw 5
1459 2015 1 23 mdk ma 2 18 2 nw 5
536 2015 1 21 mdk ma 2 7 2 ne 5
Clouds Temp.1 Species Count
666 1 20 CAGO 5
614 1 20 SCOT 1
2060 na na ABDU 150
1738 1 20 SCOT 3
1459 1 20 MERG 20
536 1 20 ABDU 8
Among other things, I want to use dplyr to get the sum of Count for each species as a new column while grouping by Segment. This is the latest code I have tried, after many variations:
df_group = df %>%
  group_by(Segment) %>%
  summarise(temp = round(mean(Temp)),
            WDir = round(mean(Wdir)),
            ABDU = sum(which(Species=="ABDU"), Count),
            CAGO = sum(which(Species=="CAGO"), Count),
            GOLD = sum(which(Species=="GOLD"), Count),
            MERG = sum(which(Species=="MERG"), Count),
            SCOT = sum(which(Species=="SCOT"), Count))
And this is what I get (to show correct format):
Segment temp WDir ABDU CAGO GOLD MERG SCOT
1 1 21 2 6 5 5 5 5
2 2 21 2 5 5 5 6 5
3 4 21 1 151 150 150 150 150
4 7 21 2 16 11 11 11 12
5 8 21 2 6 9 6 6 6
6 9 21 2 1003 1004 1002 1002 1002
The format and general idea are what I want, but the numbers are not adding up the way I want them to. I'm sure it's something simple, but I need some help! Thanks.
The problem is that which() returns a vector of positions, but you're not using those positions to subset. So the sum you are getting is the sum of the matching positions plus the whole Count variable, e.g.:
x <- c("a", "b", "b")
count <- c(10, 11, 12)
sum(which(x == "b"), count)
# 38, because it is 2 + 3 + 10 + 11 + 12
I believe what you want is (or at least one way of writing it):
sum(ifelse(x == "b", count, 0))
# 23 because it is equal to 0 + 11 + 12
Translating into dplyr syntax, your example could look like this:
df_group = df %>%
  group_by(Segment) %>%
  summarise(temp = round(mean(Temp)),
            WDir = round(mean(Wdir)),
            ABDU = sum(ifelse(Species=="ABDU", Count, 0L)),
            CAGO = sum(ifelse(Species=="CAGO", Count, 0L)),
            GOLD = sum(ifelse(Species=="GOLD", Count, 0L)),
            MERG = sum(ifelse(Species=="MERG", Count, 0L)),
            SCOT = sum(ifelse(Species=="SCOT", Count, 0L)))
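As a side note (my own variant, not part of the original answer), logical subsetting avoids the placeholder zeros and reads a little more directly; I leave out WDir here because Wdir is a factor in the dput data:
df_group = df %>%
  group_by(Segment) %>%
  summarise(temp = round(mean(Temp)),
            ABDU = sum(Count[Species == "ABDU"]),   # sum() of an empty vector is 0
            CAGO = sum(Count[Species == "CAGO"]),
            GOLD = sum(Count[Species == "GOLD"]),
            MERG = sum(Count[Species == "MERG"]),
            SCOT = sum(Count[Species == "SCOT"]))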
Another approach, in case you don't want to type out the sum for all your species:
library(reshape2)
library(dplyr)

# I had a problem with duplicate factor levels from your dput,
# so I re-factored Species
df$Species = as.factor(as.character(df$Species))

species.counts = select(df, Segment, Species, Count) %>%
  dcast(formula = Segment ~ Species, value.var = "Count", fun.aggregate = sum)
> head(species.counts)
Segment ABDU CAGO MERG SCOT
1 1 5 0 0 0
2 2 0 0 5 0
3 4 150 0 0 0
4 7 10 0 0 1
5 8 0 6 0 0
6 9 2 1000 0 0
df %>%
  group_by(Segment) %>%
  summarise(temp = round(mean(Temp))) %>%
  left_join(species.counts)
Source: local data frame [11 x 6]
Segment temp ABDU CAGO MERG SCOT
1 1 21 5 0 0 0
2 2 21 0 0 5 0
3 4 21 150 0 0 0
4 7 21 10 0 0 1
5 8 21 0 6 0 0
6 9 21 2 1000 0 0
I also couldn't do the wind direction average, because your dput data only has that as a factor with the directions, not like the head() you showed, but the technique generalizes.
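If you would rather stay in one pipeline, roughly the same reshape can be done with tidyr's spread() instead of reshape2::dcast() (just a sketch; the join onto the per-Segment summary works exactly as above):
library(dplyr)
library(tidyr)

df$Species <- as.character(df$Species)   # sidestep the duplicated factor levels

species.counts <- df %>%
  group_by(Segment, Species) %>%
  summarise(Count = sum(Count)) %>%
  spread(Species, Count, fill = 0)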

Subset data.table using min condition

There is probably a really simple solution to this problem, but I couldn't find it by googling or in the data.table FAQ.
I have a data.table like so:
> test
chr bp ID REF ALT AF AC AN EFFECT IMPACT FUNCLASS CODING GENE pos effRank
1: 1 860416 rs61464428 G A 0.5000000 14 28 UPSTREAM MODIFIER CODING SAMD11 1:860416 21
2: 1 860416 rs61464428 G A 0.5000000 14 28 UPSTREAM MODIFIER CODING SAMD11 1:860416 21
3: 1 860416 rs61464428 G A 0.5000000 14 28 DOWNSTREAM MODIFIER CODING AL645608.1 1:860416 22
4: 1 860461 rs57465118 G A 1.0000000 62 62 UPSTREAM MODIFIER CODING SAMD11 1:860461 21
5: 1 860461 rs57465118 G A 1.0000000 62 62 UPSTREAM MODIFIER CODING SAMD11 1:860461 21
6: 1 860461 rs57465118 G A 1.0000000 62 62 DOWNSTREAM MODIFIER CODING AL645608.1 1:860461 22
7: 1 860521 rs57924093 C A 0.9840000 61 62 UPSTREAM MODIFIER CODING SAMD11 1:860521 21
8: 1 860521 rs57924093 C A 0.9840000 61 62 UPSTREAM MODIFIER CODING SAMD11 1:860521 21
9: 1 860521 rs57924093 C A 0.9840000 61 62 DOWNSTREAM MODIFIER CODING AL645608.1 1:860521 22
10: 1 861261 rs144896029 G A 0.0027270 3 1100 UPSTREAM MODIFIER CODING SAMD11 1:861261 21
11: 1 861261 rs144896029 G A 0.0027270 3 1100 DOWNSTREAM MODIFIER CODING AL645608.1 1:861261 22
12: 1 861332 G A 0.0009074 1 1102 NON_SYNONYMOUS_CODING MODERATE MISSENSE CODING AL645608.1 1:861332 11
13: 1 861332 G A 0.0009074 1 1102 NON_SYNONYMOUS_CODING MODERATE MISSENSE CODING SAMD11 1:861332 11
14: 1 861332 G A 0.0009074 1 1102 NON_SYNONYMOUS_CODING MODERATE MISSENSE CODING SAMD11 1:861332 11
15: 1 861332 G A 0.0009074 1 1102 NON_SYNONYMOUS_CODING MODERATE MISSENSE CODING SAMD11 1:861332 11
16: 1 861332 G A 0.0009074 1 1102 UPSTREAM MODIFIER CODING SAMD11 1:861332 21
17: 1 865455 C G 0.0033190 3 904 UPSTREAM MODIFIER CODING SAMD11 1:865455 21
18: 1 865628 rs41285790 G A 0.0027780 3 1080 NON_SYNONYMOUS_CODING MODERATE MISSENSE CODING SAMD11 1:865628 11
19: 1 865628 rs41285790 G A 0.0027780 3 1080 NON_SYNONYMOUS_CODING MODERATE MISSENSE CODING SAMD11 1:865628 11
20: 1 865628 rs41285790 G A 0.0027780 3 1080 NON_SYNONYMOUS_CODING MODERATE MISSENSE CODING SAMD11 1:865628 11
21: 1 865628 rs41285790 G A 0.0027780 3 1080 SYNONYMOUS_CODING LOW SILENT CODING AL645608.1 1:865628 14
22: 1 865628 rs41285790 G A 0.0027780 3 1080 UPSTREAM MODIFIER CODING SAMD11 1:865628 21
23: 1 866437 rs139076934 C T 0.0009074 1 1102 SYNONYMOUS_CODING LOW SILENT CODING AL645608.1 1:866437 14
24: 1 866437 rs139076934 C T 0.0009074 1 1102 SYNONYMOUS_CODING LOW SILENT CODING SAMD11 1:866437 14
25: 1 866437 rs139076934 C T 0.0009074 1 1102 SYNONYMOUS_CODING LOW SILENT CODING SAMD11 1:866437 14
26: 1 866437 rs139076934 C T 0.0009074 1 1102 SYNONYMOUS_CODING LOW SILENT CODING SAMD11 1:866437 14
27: 1 866461 rs148884928 G A 0.0009074 1 1102 SYNONYMOUS_CODING LOW SILENT CODING SAMD11 1:866461 14
28: 1 866461 rs148884928 G A 0.0009074 1 1102 SYNONYMOUS_CODING LOW SILENT CODING SAMD11 1:866461 14
29: 1 866461 rs148884928 G A 0.0009074 1 1102 SYNONYMOUS_CODING LOW SILENT CODING SAMD11 1:866461 14
30: 1 866461 rs148884928 G A 0.0009074 1 1102 UPSTREAM MODIFIER CODING AL645608.1 1:866461 21
31: 1 866511 rs71576583 CCCCT CCCCTCCCT 1.0000000 148 148 UPSTREAM MODIFIER CODING AL645608.1 1:866511 21
32: 1 871057 C T 0.0009074 1 1102 UPSTREAM MODIFIER CODING SAMD11 1:871057 21
33: 1 871057 C T 0.0009074 1 1102 UPSTREAM MODIFIER CODING AL645608.1 1:871057 21
34: 1 871057 C T 0.0009074 1 1102 UPSTREAM MODIFIER CODING SAMD11 1:871057 21
35: 1 871215 rs28419423 C G 0.0036300 4 1102 SYNONYMOUS_CODING LOW SILENT CODING SAMD11 1:871215 14
36: 1 871215 rs28419423 C G 0.0036300 4 1102 SYNONYMOUS_CODING LOW SILENT CODING SAMD11 1:871215 14
37: 1 871215 rs28419423 C G 0.0036300 4 1102 UPSTREAM MODIFIER CODING SAMD11 1:871215 21
38: 1 871215 rs28419423 C G 0.0036300 4 1102 UPSTREAM MODIFIER CODING SAMD11 1:871215 21
39: 1 871215 rs28419423 C G 0.0036300 4 1102 UPSTREAM MODIFIER CODING AL645608.1 1:871215 21
40: 1 871215 rs28419423 C G 0.0036300 4 1102 DOWNSTREAM MODIFIER CODING SAMD11 1:871215 22
41: 1 871287 C G 0.0009107 1 1098 UPSTREAM MODIFIER CODING SAMD11 1:871287 21
42: 1 871287 C G 0.0009107 1 1098 UPSTREAM MODIFIER CODING SAMD11 1:871287 21
43: 1 871287 C G 0.0009107 1 1098 UPSTREAM MODIFIER CODING AL645608.1 1:871287 21
44: 1 871287 C G 0.0009107 1 1098 DOWNSTREAM MODIFIER CODING SAMD11 1:871287 22
45: 1 871334 rs4072383 G T 0.6680000 474 710 UPSTREAM MODIFIER CODING SAMD11 1:871334 21
46: 1 871334 rs4072383 G T 0.6680000 474 710 UPSTREAM MODIFIER CODING SAMD11 1:871334 21
47: 1 871334 rs4072383 G T 0.6680000 474 710 UPSTREAM MODIFIER CODING AL645608.1 1:871334 21
48: 1 871334 rs4072383 G T 0.6680000 474 710 DOWNSTREAM MODIFIER CODING SAMD11 1:871334 22
49: 1 874415 rs74047412 C T 0.0018250 2 1096 UPSTREAM MODIFIER CODING SAMD11 1:874415 21
50: 1 874415 rs74047412 C T 0.0018250 2 1096 UPSTREAM MODIFIER CODING SAMD11 1:874415 21
chr bp ID REF ALT AF AC AN EFFECT IMPACT FUNCLASS CODING GENE pos effRank
As you can see, many of the rows repeat values in some of the columns. What I want to do is remove the duplicated rows, keeping only the row with the minimum value of the effRank variable. I have set the key to chr, bp, and effRank, so the table should be sorted on those three columns. I got kind of close: the following command returns the rows that I want, but it does not return all of the columns, which I also want.
> test[,min(effRank), by=pos]
pos V1
1: 1:860416 21
2: 1:860461 21
3: 1:860521 21
4: 1:861261 21
5: 1:861332 11
6: 1:865455 21
7: 1:865628 11
8: 1:866437 14
9: 1:866461 14
10: 1:866511 21
11: 1:871057 21
12: 1:871215 14
13: 1:871287 21
14: 1:871334 21
15: 1:874415 21
All I need is a way to make the above command return all columns in the data.table, not just the ones mentioned in the expressions; otherwise it works perfectly. Any help is appreciated. The output of dput is below, for those who wish to make their own example.
Cheers,
Davy
> dput(test)
structure(list(chr = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), bp = c(860416L, 860416L, 860416L,
860461L, 860461L, 860461L, 860521L, 860521L, 860521L, 861261L,
861261L, 861332L, 861332L, 861332L, 861332L, 861332L, 865455L,
865628L, 865628L, 865628L, 865628L, 865628L, 866437L, 866437L,
866437L, 866437L, 866461L, 866461L, 866461L, 866461L, 866511L,
871057L, 871057L, 871057L, 871215L, 871215L, 871215L, 871215L,
871215L, 871215L, 871287L, 871287L, 871287L, 871287L, 871334L,
871334L, 871334L, 871334L, 874415L, 874415L), ID = structure(c(10L,
10L, 10L, 8L, 8L, 8L, 9L, 9L, 9L, 3L, 3L, 1L, 1L, 1L, 1L, 1L,
1L, 7L, 7L, 7L, 7L, 7L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 11L,
1L, 1L, 1L, 5L, 5L, 5L, 5L, 5L, 5L, 1L, 1L, 1L, 1L, 6L, 6L, 6L,
6L, 12L, 12L), .Label = c("", "rs139076934", "rs144896029", "rs148884928",
"rs28419423", "rs4072383", "rs41285790", "rs57465118", "rs57924093",
"rs61464428", "rs71576583", "rs74047412"), class = "factor"),
REF = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L,
1L, 3L, 3L, 3L, 3L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 1L, 1L), .Label = c("C",
"CCCCT", "G"), class = "factor"), ALT = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L,
1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 2L, 4L,
4L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L,
4L, 4L, 4L), .Label = c("A", "CCCCTCCCT", "G", "T"), class = "factor"),
AF = c(0.5, 0.5, 0.5, 1, 1, 1, 0.984, 0.984, 0.984, 0.002727,
0.002727, 0.0009074, 0.0009074, 0.0009074, 0.0009074, 0.0009074,
0.003319, 0.002778, 0.002778, 0.002778, 0.002778, 0.002778,
0.0009074, 0.0009074, 0.0009074, 0.0009074, 0.0009074, 0.0009074,
0.0009074, 0.0009074, 1, 0.0009074, 0.0009074, 0.0009074,
0.00363, 0.00363, 0.00363, 0.00363, 0.00363, 0.00363, 0.0009107,
0.0009107, 0.0009107, 0.0009107, 0.668, 0.668, 0.668, 0.668,
0.001825, 0.001825), AC = c(14L, 14L, 14L, 62L, 62L, 62L,
61L, 61L, 61L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L,
3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 148L, 1L, 1L, 1L,
4L, 4L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 474L, 474L, 474L,
474L, 2L, 2L), AN = c(28L, 28L, 28L, 62L, 62L, 62L, 62L,
62L, 62L, 1100L, 1100L, 1102L, 1102L, 1102L, 1102L, 1102L,
904L, 1080L, 1080L, 1080L, 1080L, 1080L, 1102L, 1102L, 1102L,
1102L, 1102L, 1102L, 1102L, 1102L, 148L, 1102L, 1102L, 1102L,
1102L, 1102L, 1102L, 1102L, 1102L, 1102L, 1098L, 1098L, 1098L,
1098L, 710L, 710L, 710L, 710L, 1096L, 1096L), EFFECT = structure(c(4L,
4L, 1L, 4L, 4L, 1L, 4L, 4L, 1L, 4L, 1L, 2L, 2L, 2L, 2L, 4L,
4L, 2L, 2L, 2L, 3L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L,
4L, 4L, 4L, 3L, 3L, 4L, 4L, 4L, 1L, 4L, 4L, 4L, 1L, 4L, 4L,
4L, 1L, 4L, 4L), .Label = c("DOWNSTREAM", "NON_SYNONYMOUS_CODING",
"SYNONYMOUS_CODING", "UPSTREAM"), class = "factor"), IMPACT = structure(c(3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 3L,
3L, 2L, 2L, 2L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L,
3L, 3L, 3L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L), .Label = c("LOW", "MODERATE", "MODIFIER"
), class = "factor"), FUNCLASS = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L,
2L, 2L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L,
1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = c("", "MISSENSE", "SILENT"), class = "factor"),
CODING = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "CODING", class = "factor"),
GENE = structure(c(2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L), .Label = c("AL645608.1",
"SAMD11"), class = "factor"), pos = structure(c(1L, 1L, 1L,
2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 6L, 7L,
7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 10L, 11L,
11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L,
14L, 14L, 14L, 14L, 15L, 15L), .Label = c("1:860416", "1:860461",
"1:860521", "1:861261", "1:861332", "1:865455", "1:865628",
"1:866437", "1:866461", "1:866511", "1:871057", "1:871215",
"1:871287", "1:871334", "1:874415"), class = "factor"), effRank = c(21L,
21L, 22L, 21L, 21L, 22L, 21L, 21L, 22L, 21L, 22L, 11L, 11L,
11L, 11L, 21L, 21L, 11L, 11L, 11L, 14L, 21L, 14L, 14L, 14L,
14L, 14L, 14L, 14L, 21L, 21L, 21L, 21L, 21L, 14L, 14L, 21L,
21L, 21L, 22L, 21L, 21L, 21L, 22L, 21L, 21L, 21L, 22L, 21L,
21L)), .Names = c("chr", "bp", "ID", "REF", "ALT", "AF",
"AC", "AN", "EFFECT", "IMPACT", "FUNCLASS", "CODING", "GENE",
"pos", "effRank"), row.names = c(NA, -50L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x0000000004260788>, sorted = c("chr",
"bp", "effRank"))
You can use the internal variable .I, which gives the row number. Then subset using those values, as follows:
DT[DT[, .I[which.min(effRank)], pos]$V1]
It's easier to understand if you write it in two lines as follows:
tmp <- DT[, .I[which.min(effRank)], pos]
DT[tmp$V1]
The first line produces a column V1 containing, for each pos group, the row number of that group's minimum effRank (your j expression evaluated by group). The second line then subsets DT with those row numbers.
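For comparison, an equivalent sketch (my own, typically slower on large tables but arguably easier to read) keeps the whole row of each group's minimum via .SD, using the question's test table:
# One row per pos: the row with the smallest effRank in that group, all columns kept
test[, .SD[which.min(effRank)], by = pos]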
