I have gotten frustrated trying to solve this seemingly simple problem. I have a dataset (df) like this:
structure(list(Year = c(2015L, 2015L, 2015L, 2015L, 2015L, 2015L,
2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L,
2015L, 2015L, 2015L, 2015L, 2015L), Unknown = c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), Temp = c(21L, 21L, 21L, 23L, 23L, 21L, 21L, 22L, 21L, 23L,
23L, 22L, 21L, 21L, 22L, 22L, 21L, 21L, 23L, 23L), Obs = structure(c(1L,
1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 2L, 2L), .Label = c("mdk", "sde"), class = "factor"), State = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "ma", class = "factor"), Zone = c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), Segment = c(8L, 7L, 4L, 17L, 18L, 7L, 2L, 12L, 1L, 17L,
18L, 12L, 9L, 7L, 13L, 11L, 8L, 9L, 17L, 18L), Subseg = c(1L,
3L, 3L, 2L, 2L, 2L, 4L, 0L, 10L, 4L, 2L, 0L, 1L, 1L, 3L, 1L,
2L, 2L, 1L, 1L), Wdir = structure(c(2L, 2L, 1L, 3L, 3L, 2L, 2L,
1L, 2L, 3L, 3L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L), .Label = c("na",
"ne", "nw"), class = "factor"), Wvel = structure(c(1L, 1L, 2L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
2L), .Label = c("5", "na"), class = "factor"), Clouds = structure(c(1L,
1L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 3L, 3L, 1L,
1L, 3L, 3L), .Label = c("1", "4", "na"), class = "factor"), Temp.1 = structure(c(1L,
1L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 3L, 3L, 1L,
1L, 3L, 3L), .Label = c("20", "25", "na"), class = "factor"),
Species = structure(c(7L, 21L, 1L, 21L, 16L, 4L, 16L, 6L,
1L, 17L, 5L, 7L, 5L, 1L, 1L, 6L, 7L, 7L, 24L, 5L), .Label = c("ABDU",
"ABDU", "ABDU", "ABDU", "ABDU", "CAGO", "CAGO", "CAGO", "CAGO",
"CAGO", "GOLD", "GOLD", "GOLD", "GOLD", "GOLD", "MERG", "MERG",
"MERG", "MERG", "MERG", "SCOT", "SCOT", "SCOT", "SCOT",
"SCOT", "SCOT", "SCOT"), class = "factor"), Count = c(5L,
1L, 150L, 3L, 20L, 8L, 5L, 10L, 5L, 1L, 20L, 10L, 2L, 2L,
80L, 40L, 1L, 1000L, 2L, 20L)), .Names = c("Year", "Unknown",
"Temp", "Obs", "State", "Zone", "Segment", "Subseg", "Wdir",
"Wvel", "Clouds", "Temp.1", "Species", "Count"), row.names = c(666L,
614L, 2060L, 1738L, 1459L, 536L, 197L, 2467L, 98L, 1794L, 1449L,
2464L, 696L, 483L, 2644L, 2350L, 686L, 844L, 2989L, 2934L), class = "data.frame")
With a header that looks like this:
Year Unknown Temp Obs State Zone Segment Subseg Wdir Wvel
666 2015 1 21 mdk ma 2 8 1 ne 5
614 2015 1 21 mdk ma 2 7 3 ne 5
2060 2015 1 21 sde ma 2 4 3 na na
1738 2015 1 23 mdk ma 2 17 2 nw 5
1459 2015 1 23 mdk ma 2 18 2 nw 5
536 2015 1 21 mdk ma 2 7 2 ne 5
Clouds Temp.1 Species Count
666 1 20 CAGO 5
614 1 20 SCOT 1
2060 na na ABDU 150
1738 1 20 SCOT 3
1459 1 20 MERG 20
536 1 20 ABDU 8
Among other things within dplyr, I want to get a sum of each species as a new column, when I am grouping by segment. This is the final code I have tried with many variations.
df_group = df %>%
group_by(Segment) %>%
summarise(temp = round(mean(Temp)),
WDir = round(mean(Wdir)),
ABDU = sum(which(Species=="ABDU"),Count),
CAGO = sum(which(Species=="CAGO"),Count),
GOLD = sum(which(Species=="GOLD"),Count),
MERG = sum(which(Species=="MERG"),Count),
SCOT = sum(which(Species=="SCOT"),Count))
And this is what I get (to show correct format):
Segment temp WDir ABDU CAGO GOLD MERG SCOT
1 1 21 2 6 5 5 5 5
2 2 21 2 5 5 5 6 5
3 4 21 1 151 150 150 150 150
4 7 21 2 16 11 11 11 12
5 8 21 2 6 9 6 6 6
6 9 21 2 1003 1004 1002 1002 1002
The format and general idea are what I want, but the numbers are not adding up the way I want them to. I'm sure it is simple but need some help! Thanks.
The problem is that which() returns a vector of the positions where the condition is TRUE, but you're not using those positions to subset. So the sum you are getting is the sum of those positions plus every value of the count variable, e.g.
# Toy data: three labels and their counts.
x <- c("a", "b", "b")
count <- c(10, 11, 12)
# which() gives the matching positions (2 and 3); sum() then adds those
# positions to every element of count instead of subsetting count.
sum(which(x == "b"), count)
# 38 because it is 2 + 3 + 10 + 11 + 12
I believe what you want is (or at least one way of writing it):
sum(ifelse(x == "b", count, 0))
# 23 because it is equal to 0 + 11 + 12
Translating into dplyr syntax, your example could look like this:
# Per-segment summary: rounded mean temperature and wind direction, plus one
# column per species holding that species' total Count within the segment.
# ifelse() keeps Count where the species matches and contributes 0 otherwise.
# NOTE(review): mean() needs numeric input; in the posted dput Wdir is a
# factor -- confirm the real column is numeric.
df_group <- df %>%
  group_by(Segment) %>%
  summarise(
    temp = round(mean(Temp)),
    WDir = round(mean(Wdir)),
    ABDU = sum(ifelse(Species == "ABDU", Count, 0L)),
    CAGO = sum(ifelse(Species == "CAGO", Count, 0L)),
    GOLD = sum(ifelse(Species == "GOLD", Count, 0L)),
    MERG = sum(ifelse(Species == "MERG", Count, 0L)),
    SCOT = sum(ifelse(Species == "SCOT", Count, 0L))
  )
Another approach, in case you don't want to type out the sum for all your species:
library(reshape2)
library(dplyr)

# The posted dput has duplicate factor levels for Species, so the factor is
# rebuilt from its character labels.
df$Species <- as.factor(as.character(df$Species))

# Wide table: one row per Segment, one column per Species, cells = summed Count.
species.counts <- df %>%
  select(Segment, Species, Count) %>%
  dcast(formula = Segment ~ Species, value.var = "Count", fun.aggregate = sum)
> head(species.counts)
Segment ABDU CAGO MERG SCOT
1 1 5 0 0 0
2 2 0 0 5 0
3 4 150 0 0 0
4 7 10 0 0 1
5 8 0 6 0 0
6 9 2 1000 0 0
# Rounded mean temperature per segment, joined to the per-species totals.
# The join key is given explicitly so the join cannot silently pick up any
# other column name the two tables happen to share.
df %>%
  group_by(Segment) %>%
  summarise(temp = round(mean(Temp))) %>%
  left_join(species.counts, by = "Segment")
Source: local data frame [11 x 6]
Segment temp ABDU CAGO MERG SCOT
1 1 21 5 0 0 0
2 2 21 0 0 5 0
3 4 21 150 0 0 0
4 7 21 10 0 0 1
5 8 21 0 6 0 0
6 9 21 2 1000 0 0
I also couldn't do the wind direction average, because your dput data only has that as a factor with the directions, not like the head() you showed, but the technique generalizes.
Related
I have a monthly time series - monthlyTs:
monthlyTs <- ts(all.xts , frequency = 12, start=decimal_date(ymd("2012-01-29")))
head(index(monthlyTs))
1 "2012-01-29 00:00:00 UTC" "2012-02-26 01:22:47 UTC" "2012-03-25
02:45:35 UTC" "2012-04-29 04:29:04 UTC"
[5] "2012-05-27 05:51:52 UTC" "2012-06-24 07:14:39 UTC"
I want to apply a time window that starts from 2013:
head(window(monthly, start = 2013))
2012-01-29 00:00:00 2
2012-02-26 01:22:47 8 2012-03-25 02:45:35 6 2012-04-29 04:29:04
5 2012-05-27 05:51:52 4 2012-06-24 07:14:39 4
So it looks like the window function is not filtering as expected. What is wrong?
Fully reproducible example as requested:
christmas.csv - tiny CSV file (google trends for 'Christmas' request)
# Reading data from the csv. Format - [week start date], [views per week]
# The "Views" column is extracted once, here, so `data` is a plain vector.
data <- read.csv("christmas.csv", sep = ",", header = FALSE, skip = 3,
                 col.names = c("Week", "Views"))[["Views"]]
# creating time series (weekly: 365.25 / 7 observations per year), built from
# the whole vector rather than a single element of it
myTs <- ts(data, frequency = 365.25 / 7, start = decimal_date(ymd("2012-01-29")))
# converting from weekly to month time series
all.xts <- xts(myTs, date_decimal(index(myTs)))
monthlyTs <- ts(all.xts, frequency = 12, start = decimal_date(ymd("2012-01-29")))
head(window(monthlyTs, start = 2013))
2012-01-29 00:00:00 2
2012-02-26 01:22:47 8 2012-03-25 02:45:35 6 2012-04-29 04:29:04 5
2012-05-27 05:51:52 4 2012-06-24 07:14:39 4
There are two problems :
the object all.xts is a weekly, not a monthly, time series
the value you pass for the argument frequency is not correct
For the second point, try replacing the value you pass for the argument start in your call to the function ts with
c(lubridate::year("2012-01-29"), lubridate::month("2012-01-29"))
and set the frequency to 12, i.e. use the line:
ts(all.xts , frequency = 12, start = c(lubridate::year("2012-01-29"), lubridate::month("2012-01-29")) )
Using the output from dput, your code can be rewritten as follows:
data <- c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 4L, 5L, 5L, 6L, 8L, 11L, 16L, 22L, 33L, 42L,
45L, 55L, 64L, 8L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 5L, 6L, 8L,
12L, 16L, 21L, 27L, 43L, 47L, 56L, 79L, 10L, 5L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 4L, 5L, 5L, 6L, 8L, 12L, 17L, 21L, 27L, 43L, 47L, 53L,
87L, 12L, 5L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 5L, 6L, 6L, 8L, 13L,
17L, 20L, 27L, 44L, 50L, 54L, 100L, 15L, 6L, 3L, 2L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 4L, 5L, 5L, 6L, 8L, 11L, 16L, 21L, 29L, 43L, 48L, 53L, 80L,
46L, 8L, 3L, 2L)
# Weekly series starting 2012-01-29 (365.25 / 7 observations per year).
# Every package function is namespace-qualified so the snippet runs without
# relying on lubridate, zoo or xts being attached.
myTs <- ts(data, frequency = 365.25 / 7,
           start = lubridate::decimal_date(lubridate::ymd("2012-01-29")))
# Index the same values by calendar date.
all.xts <- xts::xts(myTs, lubridate::date_decimal(zoo::index(myTs)))
# start is given as c(year, month) so that, with frequency = 12, positions in
# the series are calendar months and window() can select by year.
# NOTE(review): this relabels the weekly observations as months without
# aggregating them -- confirm that is intended.
monthlyTs <- ts(all.xts, frequency = 12,
                start = c(lubridate::year("2012-01-29"),
                          lubridate::month("2012-01-29")))
window(monthlyTs, start = c(2013))
The last line will print :
> window(monthlyTs, start= c(2013))
Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
2013 1 1 1 1 1 1 1 1 1 1 1 1
2014 1 1 1 1 2 2 2 2 3 3 3 4
2015 5 5 6 8 11 16 22 33 42 45 55 64
2016 8 4 2 2 2 2 2 2 1 1 1 1
2017 1 1 1 1 1 1 1 1 1 1 1 1
2018 1 1 1 1 1 1 1 2 2 2 2 2
2019 3 3 3 4 4 5 6 8 12 16 21 27
2020 43 47 56 79 10 5 2 2 2 1 1 1
2021 1 1 1 1 1 1 1 1 1 1 1 1
2022 1 1 1 1 1 1 1 1 1 1 2 2
2023 2 2 2 2 3 3 3 4 5 5 6 8
2024 12 17 21 27 43 47 53 87 12 5 2 2
2025 2 1 1 1 1 1 1 1 1 1 1 1
2026 1 1 1 1 1 1 1 1 1 1 1 1
2027 1 2 2 2 2 2 2 2 3 3 3 4
2028 5 6 6 8 13 17 20 27 44 50 54 100
2029 15 6 3 2 2 1 1 1 1 1 1 1
2030 1 1 1 1 1 1 1 1 1 1 1 1
2031 1 1 1 1 1 1 2 2 2 2 2 2
2032 3 3 3 4 5 5 6 8 11 16 21 29
2033 43 48 53 80 46 8 3 2
I have asked this question earlier and received a reply which was not in accordance with my wish. At the time I used stata to do the job. However as I routinely work with such data, I wish to use R to create what I wanted. I have a data set of daily hospital admission by age, sex and diagnoses. I wish to aggregate and reshape the data from long to wide. How could I achieve this objective? Sample data and required output are shown below. The column headers designate prefix of sex, age and diagnoses.
Thanks
Sample data
structure(list(diag = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L), .Label = c("card", "cere"), class = "factor"), sex = structure(c(1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L), .Label = c("Female", "Male"), class = "factor"),
age = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("35-64",
"65-74"), class = "factor"), admissions = c(1L, 1L, 0L, 0L,
6L, 6L, 6L, 1L, 4L, 0L, 0L, 0L, 4L, 6L, 5L, 2L, 2L, 4L, 1L,
0L, 6L, 5L, 6L, 4L), bdate = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L), .Label = c("1987-01-01", "1987-01-02",
"1987-01-03"), class = "factor")), .Names = c("diag", "sex",
"age", "admissions", "bdate"), row.names = c(NA, -24L), class = "data.frame")
Required output
structure(list(date = structure(1:3, .Label = c("01jan1987",
"02jan1987", "03jan1987"), class = "factor"), f3564card = c(1L,
4L, 2L), f6574card = c(1L, 0L, 4L), m3564card = c(0L, 0L, 1L),
m6574card = c(0L, 0L, 0L), f3564cere = c(6L, 4L, 6L), f6574cere = c(6L,
6L, 5L), m3564cere = c(6L, 5L, 6L), m6574cere = c(1L, 2L,
4L)), .Names = c("date", "f3564card", "f6574card", "m3564card",
"m6574card", "f3564cere", "f6574cere", "m3564cere", "m6574cere"
), class = "data.frame", row.names = c(NA, -3L))
Your data are already in a long format that can be used easily by "reshape2", like this:
# dcast() is provided by reshape2, so that is the package to load.
library(reshape2)
# One row per bdate; one column per sex/age/diag combination, filled with the
# admissions value.
dcast(df, bdate ~ sex + age + diag, value.var = "admissions")
# bdate Female_35-64_card Female_35-64_cere Female_65-74_card Female_65-74_cere
# 1 1987-01-01 1 6 1 6
# 2 1987-01-02 4 4 0 6
# 3 1987-01-03 2 6 4 5
# Male_35-64_card Male_35-64_cere Male_65-74_card Male_65-74_cere
# 1 0 6 0 1
# 2 0 5 0 2
# 3 1 6 0 4
I don't see any aggregation in your sample output, but if aggregation is required, you can achieve this with the fun.aggregate argument of dcast.
# Read the sample file. Argument names are written in full and TRUE/FALSE are
# spelled out, because T and F are ordinary variables that can be reassigned.
df <- read.table("D:/Programacao/R/Stackoverflow/Nova pasta/sample.csv",
                 header = TRUE, dec = '.', sep = ',',
                 stringsAsFactors = FALSE)
head(df)
date sex cvd ACS age
1 01 Jul 91 female 0 0 35-64
2 01 Jul 91 male 0 0 35-64
3 01 Jul 91 female 0 0 35-64
4 01 Jul 91 male 1 1 35-64
5 01 Jul 91 female 0 0 65-74
6 02 Jul 91 male 0 0 65-74
Considering that cvd and ACS are not mutually exclusive to males and females respectively,
library(dplyr)
# Total cvd and ACS events for every date / sex / age combination.
# Uses %>%, which dplyr adopted in place of the deprecated %.% operator.
df %>%
  group_by(date, sex, age) %>%
  summarise(vcvd = sum(cvd),
            vacs = sum(ACS))
Source: local data frame [111 x 5]
Groups: date, sex
date sex age vcvd vacs
1 01 Jul 91 female 35-64 0 0
2 01 Jul 91 female 65-74 0 0
3 01 Jul 91 male 35-64 1 1
4 02 Aug 91 female 35-64 0 0
5 02 Jul 91 female 65-74 1 0
6 02 Jul 91 male 65-74 0 0
7 03 Aug 91 female 65-74 0 0
8 03 Jul 91 female 35-64 0 0
9 04 Jul 91 male 35-64 1 0
10 04 Jul 91 male 65-74 0 0
.. ... ... ... ... ...
With the following dataframe I need to obtain monthly sums of the following two variables: "CallsHandled" and "Engaged"
By the following grouping variables: "Month","ID","Location","LANGUAGE","MemRegion"
structure(list(Month = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Week = c(1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L), ID = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A1234",
"F1234"), class = "factor"), Location = structure(c(2L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L), .Label = c("Corona", "Denver"), class = "factor"), LANGUAGE = structure(c(1L,
2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L,
2L, 2L, 1L), .Label = c("English", "Spanish"), class = "factor"),
MemRegion = structure(c(1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NCAL",
"SCAL"), class = "factor"), CallsHandled = c(1L, 1L, 8L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 10L, 1L, 3L, 1L, 8L, 1L, 6L,
1L, 1L, 2L), Engaged = c(120L, 30L, 1243L, 75L, 45L, 55L,
200L, 120L, 30L, 230L, 2065L, 45L, 55L, 200L, 1483L, 30L,
1243L, 75L, 45L, 55L), QueueA = c(0L, 0L, 0L, 1L, 1L, 0L,
0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L),
QueueB = c(1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L), QueueC = c(0L, 1L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L,
0L)), .Names = c("Month", "Week", "ID", "Location", "LANGUAGE",
"MemRegion", "CallsHandled", "Engaged", "QueueA", "QueueB", "QueueC"
), class = "data.frame", row.names = c(NA, -20L))
Additionally, in order to include "Queues A:C" as grouping variables, would I have to combine them into a single column? If so, how?
So there are 2 parts to this question, firstly how do you group things up and sum, and secondly how could you combine Queue A:C into one column.
For the first question you can use the library dplyr which makes it a lot easier and more intuitive.
df <- structure(list(Month = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
Week = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L),
ID = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A1234", "F1234"), class = "factor"),
Location = structure(c(2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L), .Label = c("Corona", "Denver"), class = "factor"),
LANGUAGE = structure(c(1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L), .Label = c("English", "Spanish"), class = "factor"),
MemRegion = structure(c(1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NCAL", "SCAL"), class = "factor"),
CallsHandled = c(1L, 1L, 8L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 10L, 1L, 3L, 1L, 8L, 1L, 6L, 1L, 1L, 2L),
Engaged = c(120L, 30L, 1243L, 75L, 45L, 55L, 200L, 120L, 30L, 230L, 2065L, 45L, 55L, 200L, 1483L, 30L, 1243L, 75L, 45L, 55L),
QueueA = c(0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L),
QueueB = c(1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L),
QueueC = c(0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L)), .Names = c("Month", "Week", "ID", "Location", "LANGUAGE", "MemRegion", "CallsHandled", "Engaged", "QueueA", "QueueB", "QueueC"),
class = "data.frame", row.names = c(NA, -20L))
library(dplyr)
# Totals per Month / ID / Location / LANGUAGE / MemRegion, i.e. all five
# grouping variables asked for. mutate() keeps every original row and repeats
# the group total on each; use summarise() instead for one row per group.
df %>%
  group_by(Month, ID, Location, LANGUAGE, MemRegion) %>%
  mutate(TotalCallsHandled = sum(CallsHandled),
         TotalEngaged = sum(Engaged))
So firstly, we use group_by to group by specifically the variables you have listed, and mutate to sum everything up, and this will do what you think intuitively.
For combining everything into one column, there are probably many ways to do this, but probably the most straightforward way is to create some kind of unique identifier for each column and combine all the columns into one.
# Collapse the three 0/1 flags into one code: QueueA -> 1, QueueB -> 2,
# QueueC -> 3 (assumes exactly one flag is set per row).
# The levels are fixed explicitly so a label never shifts when one of the
# queues is absent from the data; any other code (e.g. no flag set) becomes NA.
df$Queue <- factor(df$QueueA + df$QueueB * 2 + df$QueueC * 3,
                   levels = 1:3, labels = c("A", "B", "C"))
Since everything should be a 0, 1 flag, we can recreate the flags to be 1 -> A, 2 -> B, 3 -> C, and then relevel the factors to be A, B, C again. Then we can simply use group_by function again to get the intended result as above.
# Same totals, now computed within each Queue as well. The three original
# flag columns are dropped afterwards.
df %>%
  group_by(Month, ID, Location, LANGUAGE, Queue) %>%
  mutate(
    TotalCallsHandled = sum(CallsHandled),
    TotalEngaged = sum(Engaged)
  ) %>%
  select(-QueueA, -QueueB, -QueueC)
With output:
Source: local data frame [20 x 11]
Groups: Month, ID, Location, LANGUAGE, Queue
Month Week ID Location LANGUAGE MemRegion CallsHandled Engaged Queue
1 1 1 F1234 Denver English NCAL 1 120 B
2 1 2 F1234 Corona Spanish SCAL 1 30 C
3 1 3 F1234 Corona English NCAL 8 1243 B
4 1 4 F1234 Corona Spanish NCAL 1 75 A
5 1 5 F1234 Corona Spanish SCAL 1 45 A
6 1 6 F1234 Denver English SCAL 2 55 B
7 1 7 F1234 Corona English NCAL 1 200 C
8 1 8 F1234 Corona English NCAL 1 120 B
9 1 9 F1234 Denver English NCAL 1 30 A
10 1 10 F1234 Corona Spanish NCAL 1 230 C
11 1 1 A1234 Corona English NCAL 10 2065 C
12 1 2 A1234 Corona English SCAL 1 45 A
13 1 3 A1234 Corona Spanish NCAL 3 55 A
14 1 4 A1234 Corona English NCAL 1 200 A
15 1 5 A1234 Corona English SCAL 8 1483 B
16 1 6 A1234 Denver English SCAL 1 30 B
17 1 7 A1234 Corona Spanish SCAL 6 1243 C
18 1 8 A1234 Corona Spanish SCAL 1 75 B
19 1 9 A1234 Corona Spanish SCAL 1 45 C
20 1 10 A1234 Corona English SCAL 2 55 B
Variables not shown: TotalCallsHandled (int), TotalEngaged (int)
To make the Queue variables into a single factor variable, you could do this:
# Locate every flag equal to 1 as a (row, col) pair over the three queue columns.
queue_cols <- c("QueueA", "QueueB", "QueueC")
queues <- which(dat[, queue_cols] == 1, arr.ind = TRUE)
# Put the hits back into row order and keep only the column index.
row_order <- order(queues[, "row"])
queues <- queues[row_order, "col"]
# Turn the column index into a labelled factor and append it to dat.
queues <- factor(queues, labels = queue_cols)
dat <- data.frame(dat, queues)
Though, @chappers' approach for this is nicer.
Then, you can use aggregate:
# Sum CallsHandled and Engaged within every combination of the six grouping
# columns. The grouping list is unnamed, so aggregate() labels the grouping
# columns Group.1 ... Group.6.
aggregate(
  dat[, c("CallsHandled", "Engaged")],
  by = with(dat, list(Month, ID, Location, LANGUAGE, MemRegion, queues)),
  FUN = sum
)
# Group.1 Group.2 Group.3 Group.4 Group.5 Group.6 CallsHandled Engaged
#1 1 A1234 Corona English NCAL QueueA 1 200
#2 1 F1234 Denver English NCAL QueueA 1 30
#3 1 A1234 Corona Spanish NCAL QueueA 3 55
#4 1 F1234 Corona Spanish NCAL QueueA 1 75
#5 1 A1234 Corona English SCAL QueueA 1 45
#6 1 F1234 Corona Spanish SCAL QueueA 1 45
#7 1 F1234 Corona English NCAL QueueB 9 1363
#8 1 F1234 Denver English NCAL QueueB 1 120
#9 1 A1234 Corona English SCAL QueueB 10 1538
#10 1 A1234 Denver English SCAL QueueB 1 30
#11 1 F1234 Denver English SCAL QueueB 2 55
#12 1 A1234 Corona Spanish SCAL QueueB 1 75
#13 1 A1234 Corona English NCAL QueueC 10 2065
#14 1 F1234 Corona English NCAL QueueC 1 200
#15 1 F1234 Corona Spanish NCAL QueueC 1 230
#16 1 A1234 Corona Spanish SCAL QueueC 7 1288
#17 1 F1234 Corona Spanish SCAL QueueC 1 30
@chappers' solution aggregates correctly but leaves me with a bunch of duplicate rows for some reason that I can't figure out. This works for factors and reduces the number of rows in my actual dataframe (no duplicates):
# Formula interface: sum both measures within each combination of the five
# grouping columns, giving one row per group.
aggregate(cbind(CallsHandled, Engaged) ~ Month + ID + Location + LANGUAGE + MemRegion,
          data = df, FUN = sum, na.rm = TRUE)
I have asked this question earlier and received a reply which was not in accordance with my wish. At the time I used stata to do the job. However as I routinely work with such data, I wish to use R to create what I wanted. I have a data set of daily hospital admission by age, sex and diagnoses. I wish to aggregate and reshape the data from long to wide. How could I achieve this objective? Sample data and required output are shown below. The column headers designate prefix of sex, age and diagnoses.
Thanks
Sample data
structure(list(diag = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L), .Label = c("card", "cere"), class = "factor"), sex = structure(c(1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L), .Label = c("Female", "Male"), class = "factor"),
age = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("35-64",
"65-74"), class = "factor"), admissions = c(1L, 1L, 0L, 0L,
6L, 6L, 6L, 1L, 4L, 0L, 0L, 0L, 4L, 6L, 5L, 2L, 2L, 4L, 1L,
0L, 6L, 5L, 6L, 4L), bdate = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L), .Label = c("1987-01-01", "1987-01-02",
"1987-01-03"), class = "factor")), .Names = c("diag", "sex",
"age", "admissions", "bdate"), row.names = c(NA, -24L), class = "data.frame")
Required output
structure(list(date = structure(1:3, .Label = c("01jan1987",
"02jan1987", "03jan1987"), class = "factor"), f3564card = c(1L,
4L, 2L), f6574card = c(1L, 0L, 4L), m3564card = c(0L, 0L, 1L),
m6574card = c(0L, 0L, 0L), f3564cere = c(6L, 4L, 6L), f6574cere = c(6L,
6L, 5L), m3564cere = c(6L, 5L, 6L), m6574cere = c(1L, 2L,
4L)), .Names = c("date", "f3564card", "f6574card", "m3564card",
"m6574card", "f3564cere", "f6574cere", "m3564cere", "m6574cere"
), class = "data.frame", row.names = c(NA, -3L))
Your data are already in a long format that can be used easily by "reshape2", like this:
# dcast() is provided by reshape2, so that is the package to load.
library(reshape2)
# One row per bdate; one column per sex/age/diag combination, filled with the
# admissions value.
dcast(df, bdate ~ sex + age + diag, value.var = "admissions")
# bdate Female_35-64_card Female_35-64_cere Female_65-74_card Female_65-74_cere
# 1 1987-01-01 1 6 1 6
# 2 1987-01-02 4 4 0 6
# 3 1987-01-03 2 6 4 5
# Male_35-64_card Male_35-64_cere Male_65-74_card Male_65-74_cere
# 1 0 6 0 1
# 2 0 5 0 2
# 3 1 6 0 4
I don't see any aggregation in your sample output, but if aggregation is required, you can achieve this with the fun.aggregate argument of dcast.
# Read the sample file. Argument names are written in full and TRUE/FALSE are
# spelled out, because T and F are ordinary variables that can be reassigned.
df <- read.table("D:/Programacao/R/Stackoverflow/Nova pasta/sample.csv",
                 header = TRUE, dec = '.', sep = ',',
                 stringsAsFactors = FALSE)
head(df)
date sex cvd ACS age
1 01 Jul 91 female 0 0 35-64
2 01 Jul 91 male 0 0 35-64
3 01 Jul 91 female 0 0 35-64
4 01 Jul 91 male 1 1 35-64
5 01 Jul 91 female 0 0 65-74
6 02 Jul 91 male 0 0 65-74
Considering that cvd and ACS are not mutually exclusive to males and females respectively,
library(dplyr)
# Total cvd and ACS events for every date / sex / age combination.
# Uses %>%, which dplyr adopted in place of the deprecated %.% operator.
df %>%
  group_by(date, sex, age) %>%
  summarise(vcvd = sum(cvd),
            vacs = sum(ACS))
Source: local data frame [111 x 5]
Groups: date, sex
date sex age vcvd vacs
1 01 Jul 91 female 35-64 0 0
2 01 Jul 91 female 65-74 0 0
3 01 Jul 91 male 35-64 1 1
4 02 Aug 91 female 35-64 0 0
5 02 Jul 91 female 65-74 1 0
6 02 Jul 91 male 65-74 0 0
7 03 Aug 91 female 65-74 0 0
8 03 Jul 91 female 35-64 0 0
9 04 Jul 91 male 35-64 1 0
10 04 Jul 91 male 65-74 0 0
.. ... ... ... ... ...
There is probably a really simple solution to this problem, but I couldn't find it from googling, or the data.table FAQ.
I have a data.table like so:
> test
chr bp ID REF ALT AF AC AN EFFECT IMPACT FUNCLASS CODING GENE pos effRank
1: 1 860416 rs61464428 G A 0.5000000 14 28 UPSTREAM MODIFIER CODING SAMD11 1:860416 21
2: 1 860416 rs61464428 G A 0.5000000 14 28 UPSTREAM MODIFIER CODING SAMD11 1:860416 21
3: 1 860416 rs61464428 G A 0.5000000 14 28 DOWNSTREAM MODIFIER CODING AL645608.1 1:860416 22
4: 1 860461 rs57465118 G A 1.0000000 62 62 UPSTREAM MODIFIER CODING SAMD11 1:860461 21
5: 1 860461 rs57465118 G A 1.0000000 62 62 UPSTREAM MODIFIER CODING SAMD11 1:860461 21
6: 1 860461 rs57465118 G A 1.0000000 62 62 DOWNSTREAM MODIFIER CODING AL645608.1 1:860461 22
7: 1 860521 rs57924093 C A 0.9840000 61 62 UPSTREAM MODIFIER CODING SAMD11 1:860521 21
8: 1 860521 rs57924093 C A 0.9840000 61 62 UPSTREAM MODIFIER CODING SAMD11 1:860521 21
9: 1 860521 rs57924093 C A 0.9840000 61 62 DOWNSTREAM MODIFIER CODING AL645608.1 1:860521 22
10: 1 861261 rs144896029 G A 0.0027270 3 1100 UPSTREAM MODIFIER CODING SAMD11 1:861261 21
11: 1 861261 rs144896029 G A 0.0027270 3 1100 DOWNSTREAM MODIFIER CODING AL645608.1 1:861261 22
12: 1 861332 G A 0.0009074 1 1102 NON_SYNONYMOUS_CODING MODERATE MISSENSE CODING AL645608.1 1:861332 11
13: 1 861332 G A 0.0009074 1 1102 NON_SYNONYMOUS_CODING MODERATE MISSENSE CODING SAMD11 1:861332 11
14: 1 861332 G A 0.0009074 1 1102 NON_SYNONYMOUS_CODING MODERATE MISSENSE CODING SAMD11 1:861332 11
15: 1 861332 G A 0.0009074 1 1102 NON_SYNONYMOUS_CODING MODERATE MISSENSE CODING SAMD11 1:861332 11
16: 1 861332 G A 0.0009074 1 1102 UPSTREAM MODIFIER CODING SAMD11 1:861332 21
17: 1 865455 C G 0.0033190 3 904 UPSTREAM MODIFIER CODING SAMD11 1:865455 21
18: 1 865628 rs41285790 G A 0.0027780 3 1080 NON_SYNONYMOUS_CODING MODERATE MISSENSE CODING SAMD11 1:865628 11
19: 1 865628 rs41285790 G A 0.0027780 3 1080 NON_SYNONYMOUS_CODING MODERATE MISSENSE CODING SAMD11 1:865628 11
20: 1 865628 rs41285790 G A 0.0027780 3 1080 NON_SYNONYMOUS_CODING MODERATE MISSENSE CODING SAMD11 1:865628 11
21: 1 865628 rs41285790 G A 0.0027780 3 1080 SYNONYMOUS_CODING LOW SILENT CODING AL645608.1 1:865628 14
22: 1 865628 rs41285790 G A 0.0027780 3 1080 UPSTREAM MODIFIER CODING SAMD11 1:865628 21
23: 1 866437 rs139076934 C T 0.0009074 1 1102 SYNONYMOUS_CODING LOW SILENT CODING AL645608.1 1:866437 14
24: 1 866437 rs139076934 C T 0.0009074 1 1102 SYNONYMOUS_CODING LOW SILENT CODING SAMD11 1:866437 14
25: 1 866437 rs139076934 C T 0.0009074 1 1102 SYNONYMOUS_CODING LOW SILENT CODING SAMD11 1:866437 14
26: 1 866437 rs139076934 C T 0.0009074 1 1102 SYNONYMOUS_CODING LOW SILENT CODING SAMD11 1:866437 14
27: 1 866461 rs148884928 G A 0.0009074 1 1102 SYNONYMOUS_CODING LOW SILENT CODING SAMD11 1:866461 14
28: 1 866461 rs148884928 G A 0.0009074 1 1102 SYNONYMOUS_CODING LOW SILENT CODING SAMD11 1:866461 14
29: 1 866461 rs148884928 G A 0.0009074 1 1102 SYNONYMOUS_CODING LOW SILENT CODING SAMD11 1:866461 14
30: 1 866461 rs148884928 G A 0.0009074 1 1102 UPSTREAM MODIFIER CODING AL645608.1 1:866461 21
31: 1 866511 rs71576583 CCCCT CCCCTCCCT 1.0000000 148 148 UPSTREAM MODIFIER CODING AL645608.1 1:866511 21
32: 1 871057 C T 0.0009074 1 1102 UPSTREAM MODIFIER CODING SAMD11 1:871057 21
33: 1 871057 C T 0.0009074 1 1102 UPSTREAM MODIFIER CODING AL645608.1 1:871057 21
34: 1 871057 C T 0.0009074 1 1102 UPSTREAM MODIFIER CODING SAMD11 1:871057 21
35: 1 871215 rs28419423 C G 0.0036300 4 1102 SYNONYMOUS_CODING LOW SILENT CODING SAMD11 1:871215 14
36: 1 871215 rs28419423 C G 0.0036300 4 1102 SYNONYMOUS_CODING LOW SILENT CODING SAMD11 1:871215 14
37: 1 871215 rs28419423 C G 0.0036300 4 1102 UPSTREAM MODIFIER CODING SAMD11 1:871215 21
38: 1 871215 rs28419423 C G 0.0036300 4 1102 UPSTREAM MODIFIER CODING SAMD11 1:871215 21
39: 1 871215 rs28419423 C G 0.0036300 4 1102 UPSTREAM MODIFIER CODING AL645608.1 1:871215 21
40: 1 871215 rs28419423 C G 0.0036300 4 1102 DOWNSTREAM MODIFIER CODING SAMD11 1:871215 22
41: 1 871287 C G 0.0009107 1 1098 UPSTREAM MODIFIER CODING SAMD11 1:871287 21
42: 1 871287 C G 0.0009107 1 1098 UPSTREAM MODIFIER CODING SAMD11 1:871287 21
43: 1 871287 C G 0.0009107 1 1098 UPSTREAM MODIFIER CODING AL645608.1 1:871287 21
44: 1 871287 C G 0.0009107 1 1098 DOWNSTREAM MODIFIER CODING SAMD11 1:871287 22
45: 1 871334 rs4072383 G T 0.6680000 474 710 UPSTREAM MODIFIER CODING SAMD11 1:871334 21
46: 1 871334 rs4072383 G T 0.6680000 474 710 UPSTREAM MODIFIER CODING SAMD11 1:871334 21
47: 1 871334 rs4072383 G T 0.6680000 474 710 UPSTREAM MODIFIER CODING AL645608.1 1:871334 21
48: 1 871334 rs4072383 G T 0.6680000 474 710 DOWNSTREAM MODIFIER CODING SAMD11 1:871334 22
49: 1 874415 rs74047412 C T 0.0018250 2 1096 UPSTREAM MODIFIER CODING SAMD11 1:874415 21
50: 1 874415 rs74047412 C T 0.0018250 2 1096 UPSTREAM MODIFIER CODING SAMD11 1:874415 21
chr bp ID REF ALT AF AC AN EFFECT IMPACT FUNCLASS CODING GENE pos effRank
As you can see, the values in many of the rows are repeated for some of the columns. What I want to do is remove the duplicated rows, based on the value (the min) of the effRank variable. I have set the key to be chr, bp, and effRank, so the table should be sorted on the basis of those three columns. I got kind of close. The following command returns the rows that I want, but does not return all columns, which I want.
> test[,min(effRank), by=pos]
pos V1
1: 1:860416 21
2: 1:860461 21
3: 1:860521 21
4: 1:861261 21
5: 1:861332 11
6: 1:865455 21
7: 1:865628 11
8: 1:866437 14
9: 1:866461 14
10: 1:866511 21
11: 1:871057 21
12: 1:871215 14
13: 1:871287 21
14: 1:871334 21
15: 1:874415 21
All I need is a way to make the above command return all columns in the data.table, not just the ones mentioned in the expressions. Otherwise, it works perfectly. Any help is appreciated. The output of dput is below, for those who wish to make their own example.
Cheers,
Davy
> dput(test)
structure(list(chr = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), bp = c(860416L, 860416L, 860416L,
860461L, 860461L, 860461L, 860521L, 860521L, 860521L, 861261L,
861261L, 861332L, 861332L, 861332L, 861332L, 861332L, 865455L,
865628L, 865628L, 865628L, 865628L, 865628L, 866437L, 866437L,
866437L, 866437L, 866461L, 866461L, 866461L, 866461L, 866511L,
871057L, 871057L, 871057L, 871215L, 871215L, 871215L, 871215L,
871215L, 871215L, 871287L, 871287L, 871287L, 871287L, 871334L,
871334L, 871334L, 871334L, 874415L, 874415L), ID = structure(c(10L,
10L, 10L, 8L, 8L, 8L, 9L, 9L, 9L, 3L, 3L, 1L, 1L, 1L, 1L, 1L,
1L, 7L, 7L, 7L, 7L, 7L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 11L,
1L, 1L, 1L, 5L, 5L, 5L, 5L, 5L, 5L, 1L, 1L, 1L, 1L, 6L, 6L, 6L,
6L, 12L, 12L), .Label = c("", "rs139076934", "rs144896029", "rs148884928",
"rs28419423", "rs4072383", "rs41285790", "rs57465118", "rs57924093",
"rs61464428", "rs71576583", "rs74047412"), class = "factor"),
REF = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L,
1L, 3L, 3L, 3L, 3L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 1L, 1L), .Label = c("C",
"CCCCT", "G"), class = "factor"), ALT = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L,
1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 2L, 4L,
4L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L,
4L, 4L, 4L), .Label = c("A", "CCCCTCCCT", "G", "T"), class = "factor"),
AF = c(0.5, 0.5, 0.5, 1, 1, 1, 0.984, 0.984, 0.984, 0.002727,
0.002727, 0.0009074, 0.0009074, 0.0009074, 0.0009074, 0.0009074,
0.003319, 0.002778, 0.002778, 0.002778, 0.002778, 0.002778,
0.0009074, 0.0009074, 0.0009074, 0.0009074, 0.0009074, 0.0009074,
0.0009074, 0.0009074, 1, 0.0009074, 0.0009074, 0.0009074,
0.00363, 0.00363, 0.00363, 0.00363, 0.00363, 0.00363, 0.0009107,
0.0009107, 0.0009107, 0.0009107, 0.668, 0.668, 0.668, 0.668,
0.001825, 0.001825), AC = c(14L, 14L, 14L, 62L, 62L, 62L,
61L, 61L, 61L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L,
3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 148L, 1L, 1L, 1L,
4L, 4L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 474L, 474L, 474L,
474L, 2L, 2L), AN = c(28L, 28L, 28L, 62L, 62L, 62L, 62L,
62L, 62L, 1100L, 1100L, 1102L, 1102L, 1102L, 1102L, 1102L,
904L, 1080L, 1080L, 1080L, 1080L, 1080L, 1102L, 1102L, 1102L,
1102L, 1102L, 1102L, 1102L, 1102L, 148L, 1102L, 1102L, 1102L,
1102L, 1102L, 1102L, 1102L, 1102L, 1102L, 1098L, 1098L, 1098L,
1098L, 710L, 710L, 710L, 710L, 1096L, 1096L), EFFECT = structure(c(4L,
4L, 1L, 4L, 4L, 1L, 4L, 4L, 1L, 4L, 1L, 2L, 2L, 2L, 2L, 4L,
4L, 2L, 2L, 2L, 3L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L,
4L, 4L, 4L, 3L, 3L, 4L, 4L, 4L, 1L, 4L, 4L, 4L, 1L, 4L, 4L,
4L, 1L, 4L, 4L), .Label = c("DOWNSTREAM", "NON_SYNONYMOUS_CODING",
"SYNONYMOUS_CODING", "UPSTREAM"), class = "factor"), IMPACT = structure(c(3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 3L,
3L, 2L, 2L, 2L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L,
3L, 3L, 3L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L), .Label = c("LOW", "MODERATE", "MODIFIER"
), class = "factor"), FUNCLASS = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L,
2L, 2L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L,
1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = c("", "MISSENSE", "SILENT"), class = "factor"),
CODING = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "CODING", class = "factor"),
GENE = structure(c(2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L), .Label = c("AL645608.1",
"SAMD11"), class = "factor"), pos = structure(c(1L, 1L, 1L,
2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 6L, 7L,
7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 10L, 11L,
11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L,
14L, 14L, 14L, 14L, 15L, 15L), .Label = c("1:860416", "1:860461",
"1:860521", "1:861261", "1:861332", "1:865455", "1:865628",
"1:866437", "1:866461", "1:866511", "1:871057", "1:871215",
"1:871287", "1:871334", "1:874415"), class = "factor"), effRank = c(21L,
21L, 22L, 21L, 21L, 22L, 21L, 21L, 22L, 21L, 22L, 11L, 11L,
11L, 11L, 21L, 21L, 11L, 11L, 11L, 14L, 21L, 14L, 14L, 14L,
14L, 14L, 14L, 14L, 21L, 21L, 21L, 21L, 21L, 14L, 14L, 21L,
21L, 21L, 22L, 21L, 21L, 21L, 22L, 21L, 21L, 21L, 22L, 21L,
21L)), .Names = c("chr", "bp", "ID", "REF", "ALT", "AF",
"AC", "AN", "EFFECT", "IMPACT", "FUNCLASS", "CODING", "GENE",
"pos", "effRank"), row.names = c(NA, -50L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x0000000004260788>, sorted = c("chr",
"bp", "effRank"))
You can use the internal variable .I, which gives the row number. Then subset using those values, as follows:
# For each pos, take the row number (.I) of the minimum effRank, then subset
# DT by those row numbers so every column is returned.
DT[DT[, .I[which.min(effRank)], by = pos]$V1]
It's easier to understand if you write it in two lines as follows:
# Row number (.I) of the smallest effRank within each pos group; the unnamed
# result column is called V1.
tmp <- DT[, .I[which.min(effRank)], by = pos]
# Use those row numbers to pull the complete rows.
DT[tmp$V1]
The first line generates a column V1 with all the row numbers of the minimum positions (from your j expression) grouped by pos. Then you just subset them.