Here is my data:
x<- data.frame(P=c("M","C","M","C","C","M","C","M"),
Q=c(13,12,12,14,19,15,12,11),
R=c(15,13,21,32,32,21,13,32),
T=c(15,12,12,14,12,11,19,15))
I want to calculate means for variables within each category.
For example, the mean of Q for M is (13+12+15+11)/4 = 12.75 and for C it is (12+14+19+12)/4 = 14.25, and so on.
Next, I want to rank the means for each variable and get the following table:
P Q R T
M 2 2 2
C 1 1 1
In my real data there can be ties, and I want tied values to get equal ranks.
For example, if three means are all 12, they should all get the same rank.
You can try
aggregate(.~P, x, mean)
Or
library(dplyr)
x1 <- x %>%
group_by(P) %>%
summarise_each(funs(mean))
x1
# P Q R T
#1 C 14.25 22.50 14.25
#2 M 12.75 22.25 13.25
For ranking
x1 %>%
mutate_each(funs(rank(-.)), Q:T)
# P Q R T
#1 C 1 1 1
#2 M 2 2 2
Update
Suppose there are ties:
x1$Q[2] <- x1$Q[1]
rank averages ties by default. You can set ties.method = "min" or use min_rank:
x1 %>%
mutate_each(funs(min_rank(-.)), Q:T)
# P Q R T
#1 C 1 1 1
#2 M 1 2 2
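For reference, here's a minimal base R sketch (on a made-up vector, not the data above) of how the tie-handling options compare:
v <- c(12, 12, 12, 15, 11)
rank(-v)                        # ties averaged by default: 3 3 3 1 5
rank(-v, ties.method = "min")   # tied values share the lowest rank: 2 2 2 1 5
dplyr::min_rank(-v)             # equivalent to ties.method = "min"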
For completeness, here's a possible data.table solution using frank from the devel version on GitHub
For means
library(data.table) ## v >= 1.9.5
(Res <- setDT(x)[, lapply(.SD, mean), by = P])
# P Q R T
# 1: M 12.75 22.25 13.25
# 2: C 14.25 22.50 14.25
For ranks
Res[, 2:4 := lapply(-.SD, frank, ties.method = "dense"), .SDcols = 2:4]
Res
# P Q R T
# 1: M 2 2 2
# 2: C 1 1 1
To get the means you can do this:
do.call(rbind, lapply(split(x, f = x$P), function(x)
  data.frame(P = unique(x$P), Q = mean(x$Q), R = mean(x$R), T = mean(x$T))))
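If you also want the ranks from this base R result, one possible sketch (storing the means above in a variable called res, a name not used in the original answer):
res <- do.call(rbind, lapply(split(x, f = x$P), function(x)
  data.frame(P = unique(x$P), Q = mean(x$Q), R = mean(x$R), T = mean(x$T))))
res[-1] <- lapply(res[-1], function(v) rank(-v, ties.method = "min"))
res  # the C row gets rank 1 and the M row rank 2 in each column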
I am trying to compute sums of matches between two tables:
DT1:
M    A1  A2
M01  A   G
M02  G   A
M03  T   C
Mnn  A   G
DT2:
IND  Group  M01  M02  Mnn
I1   1      A    G    G
I2   1      A    G    G
I3   1      G    A    A
I4   2      G    A    G
In   2      G    A    G
Each In is the nth individual, belonging to group 1 or 2, with its allele for each of the markers.
The desired output is, for every marker, the count of each of its two alleles within each group.
##Code for replicability
library(data.table)
#DT1
DT1<-data.table(M=c("M01","M02","M03","Mnn"),
A1= c("A","G","T","A"),
A2=c("G","A","C","G"))
#DT2
DT2<-data.table(IND=c("I1","I2","I3","I4","In"),
Group=c(1,1,1,2,2),
M01=c("A","A","A","G","G"),
M02=c("G","G","A","G","G"),
M03=c("C","C","C","T","C"),
Mnn=c("G","A","A","G","A"))
#M is the nth marker, with its Allele1 (A1) and Allele2 (A2)
#What I have found so far:
for (i in colnames(DT2)){
print(i)
DT1$A1G1[DT1$M==i]<- sum(DT2[[i]][DT2$Group==1] == DT1$A1[DT1$M==i])
DT1$A2G1[DT1$M==i]<- sum(DT2[[i]][DT2$Group==1] == DT1$A2[DT1$M==i])
DT1$A1G2[DT1$M==i]<- sum(DT2[[i]][DT2$Group==2] == DT1$A1[DT1$M==i])
DT1$A2G2[DT1$M==i]<- sum(DT2[[i]][DT2$Group==2] == DT1$A2[DT1$M==i])
}
#The output I want: for every marker, the counts of allele 1 and allele 2 within each of the two groups.
# M A1 A2 A1G1 A2G1 A1G2 A2G2
#1: M01 A G 3 0 0 2
#2: M02 G A 2 1 2 0
#3: M03 T C 0 3 1 1
#4: Mnn A G 2 1 1 1
It does the job, but I feel data.table could do it in one line and with less computation time by avoiding the loop; with up to 50k markers (Mnn) and up to 15k individuals (In), it takes a long time.
Any solution would help me greatly, as I struggle with data.table's logic of keys and indexes when working with two different tables.
We could make the loop a bit more efficient by using colSums, and reduce the number of == comparisons by splitting 'DT2' by 'Group'. In the code below, A1[col(tmp)] expands A1 into a matrix with the same shape as tmp, so each marker column is compared against that marker's own allele (assuming DT2's marker columns are in the same order as DT1's rows):
mcols <- grep("^M", names(DT2), value = TRUE)
lst1 <- split(DT2[, ..mcols], DT2$Group)
for(i in seq_along(lst1)) {
tmp <- lst1[[i]]
DT1[, paste0("A1G", i) := colSums(tmp == A1[col(tmp)], na.rm = TRUE)]
DT1[, paste0("A2G", i) := colSums(tmp == A2[col(tmp)], na.rm = TRUE)][]
}
Output:
> DT1
M A1 A2 A1G1 A2G1 A1G2 A2G2
<char> <char> <char> <num> <num> <num> <num>
1: M01 A G 3 0 0 2
2: M02 G A 2 1 2 0
3: M03 T C 0 3 1 1
4: Mnn A G 2 1 1 1
Benchmarks
On a slightly bigger dataset, I checked the timings for the OP's method and this one:
# data
set.seed(24)
DT1test<-data.table(M=sprintf('M%02d', 1:5000),
A1= sample(c("A","G","T","C"), 5000, replace = TRUE),
A2=sample(c("G","A","T","C"), 5000, replace = TRUE))
DT1testold <- copy(DT1test)
set.seed(42)
m1 <- matrix(sample(c("A", "G", "T", "C"), 5000 * 15000,
replace = TRUE), ncol = 5000, dimnames = list(NULL, DT1test$M))
DT2test<-data.table(IND=paste0("I", 1:15000),
Group=rep(1:300, each = 50))
DT2test <- cbind(DT2test, m1)
Timings, old method:
system.time({
for (i in colnames(DT2test)){
for(j in unique(DT2test$Group)) {
DT1testold[[paste0("A1G", j)]][DT1testold$M==i] <-
sum(DT2test[[i]][DT2test$Group==j] == DT1testold$A1[DT1testold$M==i])
DT1testold[[paste0("A2G", j)]][DT1testold$M==i] <-
sum(DT2test[[i]][DT2test$Group==j] == DT1testold$A2[DT1testold$M==i])
}
}
})
#   user  system elapsed
#502.603 106.631 610.908
Timings, new method:
system.time({
mcols <- grep("^M", names(DT2test), value = TRUE)
lst1 <- split(DT2test[, ..mcols], DT2test$Group)
for(i in seq_along(lst1)) {
tmp <- lst1[[i]]
DT1test[, paste0("A1G", i) := colSums(tmp == A1[col(tmp)],
na.rm = TRUE)]
DT1test[, paste0("A2G", i) := colSums(tmp == A2[col(tmp)],
na.rm = TRUE)][]
}
})
#user system elapsed
#36.079 0.968 36.934
If you melt your two tables, and do a join on M and value, you can count by group, allele, and marker:
Pivot these tables long, and join:
DT_long = melt(DT2,id = c("IND", "Group"),variable.name = "M")[melt(DT1, id="M",variable.name="allele"), on=.(M,value)]
Join DT1 back onto a wide version of the counts by allele, group, and marker:
DT1[dcast(
DT_long[,.N, .(col =paste0(allele,"G",Group),M)],
M~col,value.var="N",fill=0
), on="M"]
Output:
M A1 A2 A1G1 A1G2 A2G1 A2G2
1: M01 A G 3 0 0 2
2: M02 G A 2 2 1 0
3: M03 T C 0 1 3 1
4: Mnn A G 2 1 1 1
Update:
I still find the melt/dcast solution to be faster than the looping approaches. Here is an option that does the dcast separately for each "A" column using a helper function:
DT2_long <- melt(DT2,id = c("IND", "Group"),variable.name = "M")[, .N, .(Group,M, value)]
f <- function(ma, allele) {
dcast(DT2_long[ma, on=.(M,value)][,col:=paste0(allele, "G",Group)],M~col,value.var="N")
}
do.call(cbind, lapply(c("A1", "A2"), \(a) f(DT1[, .(M, value=get(a))], a)))
I have a data frame whose row values are ordered from smallest to largest. I want to compute the differences between adjacent rows, combine rows whose differences are small (e.g., smaller than 1), and return the averaged values of the combined rows. I could check each row difference with a for loop, but that seems very inefficient. Any better ideas? Thanks.
library(dplyr)
DF <- data.frame(ID=letters[1:12],
Values=c(1, 2.2, 3, 5, 6.2, 6.8, 7, 8.5, 10, 12.2, 13, 14))
DF <- DF %>%
mutate(Diff=c(0, diff(Values)))
The expected output of DF would be
ID Values
a 1.0
b/c 2.6 # (2.2+3.0)/2
d 5.0
e/f/g 6.67 # (6.2+6.8+7.0)/3
h 8.5
i 10.0
j/k 12.6 # (12.2+13.0)/2
l 14.0
Here is an option with data.table
library(data.table)
setDT(DF)[, .(ID = toString(ID), Values = round(mean(Values), 2)),
by = .(Diff = cumsum(c(TRUE, diff(Values)>=1)))][, -1, with = FALSE]
# ID Values
#1: a 1.00
#2: b, c 2.60
#3: d 5.00
#4: e, f, g 6.67
#5: h 8.50
#6: i 10.00
#7: j, k 12.60
#8: l 14.00
Calculate the difference between the Values of adjacent rows and check whether it is >= 1. The cumulative sum of that logical vector defines distinct groups, which you can then summarise to get the desired result.
library(dplyr)
DF %>% arrange(Values) %>%
group_by(Diff = cumsum(c(1,diff(Values)) >= 1) ) %>%
summarise(ID = paste0(ID, collapse = "/"), Values = mean(Values)) %>%
ungroup() %>% select(-Diff)
# # A tibble: 8 x 2
# ID Values
# <chr> <dbl>
# 1 a 1.00
# 2 b/c 2.60
# 3 d 5.00
# 4 e/f/g 6.67
# 5 h 8.50
# 6 i 10.0
# 7 j/k 12.6
# 8 l 14.0
library(magrittr)
df <- DF[order(DF$Values),]
df$Values %>%
#Find groups for each row
outer(., ., function(x, y) x >= y & x < y + 1) %>%
# Remove sub-groups
`[<-`(apply(., 1, cumsum) > 1, F) %>%
# Remove sub-group columns
.[, colSums(.) > 0] %>%
# select these groups from data
apply(2, function(x) data.frame(ID = paste(df$ID[x], collapse = '/')
, Values = mean(df$Values[x]))) %>%
# bind results by row
do.call(what = rbind)
# ID Values
# 1 a 1.000000
# 2 b/c 2.600000
# 4 d 5.000000
# 5 e/f/g 6.666667
# 8 h 8.500000
# 9 i 10.000000
# 10 j/k 12.600000
# 12 l 14.000000
Note:
This method is different from those using diff because it groups rows together only if all of their Values are within 1 of each other.
Example:
Change the dataset so that Value is 7.3 at ID g.
Above method: IDs e, f, and g are no longer grouped together, because the value at ID e is 6.2 and 7.3 - 6.2 > 1.
Diff method: IDs e, f, and g are still grouped together, because the diff between e and f is < 1 and the diff between f and g is < 1.
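To make this concrete, a minimal sketch of that scenario (using a plain vector of the sorted Values rather than the DF object, so it is self-contained):
vals <- c(1, 2.2, 3, 5, 6.2, 6.8, 7.3, 8.5, 10, 12.2, 13, 14)  # ID g changed from 7.0 to 7.3
cumsum(c(1, diff(vals)) >= 1)   # diff-based groups
# [1] 1 2 2 3 4 4 4 5 6 7 7 8   -> e, f, g (positions 5-7) still share group 4
# the outer-based method above splits that group, since 7.3 - 6.2 > 1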
I'm working with a large dataset and doing some calculation with the aggregate() function.
This time I need to group by two different columns, and for my calculation I need a user-defined function that also uses two columns of the data.frame. That's where I'm stuck.
Here's an example data set:
dat <- data.frame(Kat = c("a","b","c","a","c","b","a","c"),
Sex = c("M","F","F","F","M","M","F","M"),
Val1 = c(1,2,3,4,5,6,7,8)*10,
Val2 = c(2,6,3,3,1,4,7,4))
> dat
Kat Sex Val1 Val2
a M 10 2
b F 20 6
c F 30 3
a F 40 3
c M 50 1
b M 60 4
a F 70 7
c M 80 4
Example of the user-defined function:
sum(Val1 * Val2) # but grouped by Kat and Sex
I tried this:
aggregate((dat$Val1),
by = list(dat$Kat, dat$Sex),
function(x, y = dat$Val2){sum(x*y)})
Output:
Group.1 Group.2 x
a F 1710
b F 600
c F 900
a M 300
b M 1800
c M 2010
But my expected output would be:
Group.1 Group.2 x
a F 610
b F 120
c F 90
a M 20
b M 240
c M 370
Is there any way to do this with aggregate()?
As @jogo suggested:
aggregate(Val1 * Val2 ~ Kat + Sex, FUN = sum, data = dat)
Or in a tidyverse style
library(dplyr)
dat %>%
group_by(Kat, Sex) %>%
summarize(sum(Val1 * Val2))
Or with data.table
library(data.table)
setDT(dat)
dat[ , sum(Val1 * Val2), by = list(Kat, Sex)]
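For what it's worth, here is a sketch of why the original aggregate() attempt returns 1710 for the a/F group: the default argument y = dat$Val2 is the full, ungrouped column, so the grouped Val1 values are recycled against all of Val2 instead of being paired with the rows of that group. Illustrated for the a/F group (the explicit subsetting below is just for demonstration):
x <- dat$Val1[dat$Kat == "a" & dat$Sex == "F"]       # c(40, 70)
y <- dat$Val2                                        # full column, length 8
sum(x * y)                                           # 1710: x is recycled across all of y
sum(x * dat$Val2[dat$Kat == "a" & dat$Sex == "F"])   # 610: values paired within the group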
I have a large data table with over 300 columns. For each letter column I would like to get
-- the sum of (each observation in the column * the weight of that observation).
-- the sum of weight for observations where the letter column is greater than 0.
Here is an example for one column.
id <- c("0001", "0002", "0003", "0004")
a <- c(0, 9, 8, 5)
b <- c(0,5,5,0)
c <- c(1.5, 0.55, 0, 0.06)
weight <- c(102.354, 34.998, 84.664, .657)
data <- data.frame(id, a, b, c, weight)
data
id a b c weight
1 0001 0 0 1.50 102.354
2 0002 9 5 0.55 34.998
3 0003 8 5 0.00 84.664
4 0004 5 0 0.06 0.657
sum(data$a * data$weight)
[1] 995.579
sum(data$weight[data$a >0])
[1] 120.319
Any idea?
A possible data.table solution
You could define a helper function:
tempfunc <- function(x) c(sum(x * data$weight), sum(data$weight[x > 0]))
Then do either
library(data.table)
setDT(data)[, lapply(.SD, tempfunc), .SDcols = -c("id", "weight")]
# a b c
# 1: 995.579 598.310 172.8193
# 2: 120.319 119.662 138.0090
Or
library(dplyr)
setDT(data) %>% summarise_each(funs(tempfunc), -c(id, weight))
## a b c
## 1: 995.579 598.310 172.8193
## 2: 120.319 119.662 138.0090
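Note that summarise_each() and funs() are deprecated in current dplyr; a rough modern equivalent using across() might look like this (a sketch with made-up suffix names wsum/wpos, not part of the original answer):
library(dplyr)
data %>%
  summarise(across(a:c, list(wsum = ~ sum(.x * weight),
                             wpos = ~ sum(weight[.x > 0]))))
# one row with columns a_wsum, a_wpos, b_wsum, b_wpos, c_wsum, c_wpos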
The following code should solve your question:
my.names <- names(data)[names(data) %in% letters]
res <- lapply(my.names, function(x){
c(sum(data[[x]]*data[["weight"]]), sum(data[["weight"]][data[[x]]>0]))
})
names(res) <- my.names
Or directly to a data.frame:
do.call("rbind", lapply(my.names, function(letter){
data.frame(letter, "sum1_name" = sum(data[[letter]]*data[["weight"]]),
"sum2_name" = sum(data[["weight"]][data[[letter]]>0]))
}))
# letter sum1_name sum2_name
# 1 a 995.5790 120.319
# 2 b 598.3100 119.662
# 3 c 172.8193 138.009
I got a lot of good feedback on a question I recently asked and was guided to use dplyr to transform some data. I'm having an issue with lm() and trying to find a slope from this transformed data and thought I'd open up a new question.
First I have data that looks like this:
Var1 Var2 Var3 Time Temp
a w j 9/9/2014 20
a w j 9/9/2014 15
a w k 9/20/2014 10
a w j 9/10/2014 0
b x L 9/12/2014 30
b x L 9/12/2014 10
b y k 9/13/2014 20
b y k 9/13/2014 15
c z j 9/14/2014 20
c z j 9/14/2014 10
c z k 9/14/2014 11
c w l 9/10/2014 45
a d j 9/22/2014 20
a d k 9/15/2014 4
a d l 9/15/2014 23
a d k 9/15/2014 11
And I want it in the form of this (values for Slope and Pearson simulated for illustration):
V1 V2 V3 Slope Pearson
a w j -3 -0.9
a w k 2 0
a d j 1.5 0.6
a d k 0 0.5
a d l -0.5 -0.6
b x L 12 0.7
b y k 4 0.6
c z j -1 -0.5
c z k -3 -0.4
c w l -10 -0.9
The slope should be a linear least-squares slope. In theory, the script would look like this:
library(dplyr)
data <- read.table("clipboard",sep="\t",quote="",header=T)
newdata = summarise(group_by(data
,Var1
,Var2
,Var3
)
,Slope = lm(Temp ~ Time)$coeff[2]
,Pearson = cor(Time, Temp, method="pearson")
)
But R throws an error saying it can't find Time or Temp. It can run lm(data$Temp ~ data$Time)$coeff[2], but that returns the slope for the entire data set, not per group. cor() runs just fine inside summarise, so is there a specific syntax I need to pass to lm() to have it run per group, or should I use a different function entirely to get a slope for each subset?
You have several issues here.
If you group your data by 3 variables (or even 2), some groups don't have enough distinct observations to fit a linear regression model.
Pearson correlation requires two numeric variables, while Time is a factor, and converting it to numeric won't make much sense.
The third issue is that you will need to use do in order to run your linear model per group.
Here's an illustration grouping only on Var1:
data %>%
group_by(Var1) %>% # You can add here additional grouping variables if your real data set enables it
do(mod = lm(Temp ~ Time, data = .)) %>%
mutate(Slope = summary(mod)$coeff[2]) %>%
select(-mod)
# Source: local data frame [3 x 2]
# Groups: <by row>
#
# Var1 Slope
# 1 a 12.66667
# 2 b -2.50000
# 3 c -31.33333
If you do have two numeric variables, you can use do in order to calculate correlation too, for example (I will create some dummy numeric variables for illustration)
data %>%
mutate(test1 = sample(1:3, n(), replace = TRUE), # Creating some numeric variables
test2 = sample(1:3, n(), replace = TRUE)) %>%
group_by(Var1) %>%
do(mod = lm(Temp ~ Time, data = .),
mod2 = cor(.$test1, .$test2, method = "pearson")) %>%
mutate(Slope = summary(mod)$coeff[2],
Pearson = mod2[1]) %>%
select(-mod, -mod2)
# Source: local data frame [3 x 3]
# Groups: <by row>
#
# Var1 Slope Pearson
# 1 a 12.66667 0.25264558
# 2 b -2.50000 -0.09090909
# 3 c -31.33333 0.30151134
Bonus solution: you can do this quite efficiently and easily with the data.table package too:
library(data.table)
setDT(data)[, list(Slope = summary(lm(Temp ~ Time))$coeff[2]), Var1]
# Var1 Slope
# 1: a 12.66667
# 2: b -2.50000
# 3: c -31.33333
Or if we want to create some dummy variables too
library(data.table)
setDT(data)[, `:=`(test1 = sample(1:3, .N, replace = TRUE),
test2 = sample(1:3, .N, replace = TRUE))][,
list(Slope = summary(lm(Temp ~ Time))$coeff[2],
Pearson = cor(test1, test2, method = "pearson")), Var1]
# Var1 Slope Pearson
# 1: a 12.66667 -0.02159168
# 2: b -2.50000 -0.81649658
# 3: c -31.33333 -1.00000000
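As a side note on the Time issue mentioned above (a hedged sketch, not part of the original answer): if Time is parsed as a Date it becomes effectively numeric, so both a per-group slope against the actual dates and a Pearson correlation of Time with Temp can be computed directly:
library(data.table)
setDT(data)[, Time := as.Date(as.character(Time), format = "%m/%d/%Y")]
data[, .(Slope   = coef(lm(Temp ~ Time))[2],
         Pearson = cor(as.numeric(Time), Temp)),
     by = Var1]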