Related
I have the data below and want to count the non-missing observations per group for the selected columns:
library(data.table)
dat <- fread("col1 col2 col3 group
1 2 4 A
3 2 2 A
1 NA 1 B
3 2 1 B")
vars_of_interest <- c("col1", "col2")
vars_of_interest_obs <- paste0(vars_of_interest, "_obs_tot")
dat <- setDT(dat)[, (vars_of_interest_obs) := sum(!is.na(vars_of_interest)), by = c("group")]
However the outcome of this is:
col1 col2 col3 group col1_obs_tot col2_obs_tot
1: 1 2 4 A 2 2
2: 3 2 2 A 2 2
3: 1 NA 1 B 2 2
4: 3 2 1 B 2 2
Where the last column should be
col2_obs_tot
2
2
1
1
What am I doing wrong here?
This is because vars_of_interest is evaluated literally, as a character vector of column names rather than as the columns themselves:
sum(!is.na(c('col1', 'col2'))) = 2
You need to get their content:
# Count the non-missing values per group for each column of interest.
# .SDcols restricts .SD to those columns, so the function receives the column
# values themselves rather than their names (no get() lookup by string needed).
setDT(dat)[, (vars_of_interest_obs) := lapply(.SD, function(x) sum(!is.na(x))),
           by = "group", .SDcols = vars_of_interest][]
col1 col2 col3 group col1_obs_tot col2_obs_tot
<int> <int> <int> <char> <int> <int>
1: 1 2 4 A 2 2
2: 3 2 2 A 2 2
3: 1 NA 1 B 2 1
4: 3 2 1 B 2 1
# have
> aDT <- data.table(colA = c(1,1,1,1,2,2,2,2,3,3,3,3), colB = c(4,NA,NA,1,4,3,NA,NA,4,NA,2,NA))
> aDT
colA colB
1: 1 4
2: 1 NA
3: 1 NA
4: 1 1
5: 2 4
6: 2 3
7: 2 NA
8: 2 NA
9: 3 4
10: 3 NA
11: 3 2
12: 3 NA
# want
> bDT <- data.table(colA = c(1,1,1,1,2,2,2,2,3,3,3,3), colB = c(4,1,1,1,4,3,3,3,4,2,2,2))
> bDT
colA colB
1: 1 4
2: 1 1
3: 1 1
4: 1 1
5: 2 4
6: 2 3
7: 2 3
8: 2 3
9: 3 4
10: 3 2
11: 3 2
12: 3 2
Would like to fill missing values according to the algorithm below:
within each group ('colA'),
use the value from one row below, if it's still NA, keeps going until the last row within that group
if all NAs in rows below, look at rows above (go up 1 row at a time)
if all NAs, then NA
Since the dataset is quite large, algorithmic efficiency is part of consideration. Not sure if there's any package for this type of operation already. How to do it?
With data.table and zoo:
library(data.table)
library(zoo)
# Within each colA group, fill every NA from the nearest non-NA value below it.
# `:=` updates dt by reference, so no re-assignment is needed.
dt[, colB := na.locf0(colB, fromLast = TRUE), by = colA]
# Then fill the NAs still left at the end of a group from the value above.
# na.locf0() always returns a vector of the same length as its input, so a
# group that is entirely NA stays NA (na.locf() drops leading NAs by default).
dt[, colB := na.locf0(colB), by = colA][]
Or in a single chain:
# Fill upward first, then downward, per colA group. na.locf0() keeps the group
# length even when a group is entirely NA (na.locf() would drop leading NAs by
# default).
dt[, colB := na.locf0(colB, fromLast = TRUE), by = colA][
  , colB := na.locf0(colB), by = colA][]
Both return:
colA colB
1: 1 4
2: 1 1
3: 1 1
4: 1 1
5: 2 4
6: 2 3
7: 2 3
8: 2 3
9: 3 4
10: 3 2
11: 3 2
12: 3 2
Data:
text <- "colA colB
1 4
1 NA
1 NA
1 1
2 4
2 3
2 NA
2 NA
3 4
3 NA
3 2
3 NA"
dt <- fread(input = text, stringsAsFactors = FALSE)
Here is one way using tidyverse and zoo::na.locf:
library(tidyverse);
library(zoo);
# Within each colA group, fill NAs from the next non-NA value below, then fill
# whatever is still NA from the nearest value above. na.rm = FALSE keeps NAs
# that cannot be filled, so the column length is unchanged.
df %>%
  group_by(colA) %>%
  arrange(colA) %>%
  mutate(colB = na.locf(colB, na.rm = FALSE, fromLast = TRUE)) %>%
  mutate(colB = na.locf(colB, na.rm = FALSE))
## A tibble: 12 x 2
## Groups: colA [3]
# colA colB
# <dbl> <dbl>
# 1 1.00 4.00
# 2 1.00 1.00
# 3 1.00 1.00
# 4 1.00 1.00
# 5 2.00 4.00
# 6 2.00 3.00
# 7 2.00 3.00
# 8 2.00 3.00
# 9 3.00 4.00
#10 3.00 2.00
#11 3.00 2.00
#12 3.00 2.00
Or the data.table way:
library(data.table);
# Per colA group: fill from below (fromLast = TRUE), then from above.
# The unnamed result column is auto-named V1.
dt[, .(na.locf(na.locf(colB, na.rm = FALSE, fromLast = TRUE), na.rm = FALSE)),
   by = .(colA)]
# colA V1
# 1: 1 4
# 2: 1 1
# 3: 1 1
# 4: 1 1
# 5: 2 4
# 6: 2 3
# 7: 2 3
# 8: 2 3
# 9: 3 4
#10: 3 2
#11: 3 2
#12: 3 2
The key in both cases is to apply na.locf twice: First to replace NAs from the bottom, then replace the remaining NAs from the top.
Sample data
# As data.frame
df <- data.frame(colA = c(1,1,1,1,2,2,2,2,3,3,3,3), colB = c(4,NA,NA,1,4,3,NA,NA,4,NA,2,NA));
# As data.table
dt <- data.table(colA = c(1,1,1,1,2,2,2,2,3,3,3,3), colB = c(4,NA,NA,1,4,3,NA,NA,4,NA,2,NA));
library(tidyverse)
# Per colA group, fill NAs from the next value below and then from the value
# above; "updown" performs both passes, in that order, in a single call.
aDT %>%
  group_by(colA) %>%
  fill(colB, .direction = "updown")
# A tibble: 12 x 2
# Groups: colA [3]
colA colB
<dbl> <dbl>
1 1 4
2 1 1
3 1 1
4 1 1
5 2 4
6 2 3
7 2 3
8 2 3
9 3 4
10 3 2
11 3 2
12 3 2
This question already has answers here:
merge data.frames based on year and fill in missing values
(4 answers)
Closed 5 years ago.
I want to replace all values in one data.table column (col1).
I thought to try to do it over loop but only few values are replaced.
Example:
#Generate data.table
library(data.table)
dt2 <- data.table(col1=rep(c("a","b","c","d"),each=2), col2=rep(c(1,2,3,4),2),col3=rep(c(1,2,3,4),2))
> dt2
col1 col2 col3
1: a 1 1
2: a 2 2
3: b 3 3
4: b 4 4
5: c 1 1
6: c 2 2
7: d 3 3
8: d 4 4
Change values:
#new col1 elements
new_col1 = c("WT","m1","m9","m10")
#change col1
col1_names = unique(dt$col1)
for (i in 1:length(col1_names)){
dt2[col1==col1_names[i],col1:=new_col1[i]]
}
> dt2
col1 col2 col3
1: WT 1 1
2: WT 2 2
3: m1 3 3
4: m1 4 4
5: c 1 1
6: c 2 2
7: d 3 3
8: d 4 4
Values in col1 are only partially replaced.
Can anybody explain it or suggest better way how to replace?
I tried to replace them at once but this does not work either properly:
#generate data.table
dt2 <- data.table(col1=rep(c("a","b","c","d"),each=2), col2=rep(c(1,2,3,4),2),col3=rep(c(1,2,3,4),2))
col1 col2 col3
1: a 1 1
2: a 2 2
3: b 3 3
4: b 4 4
5: c 1 1
6: c 2 2
7: d 3 3
8: d 4 4
#replace values
col1_names = unique(dt$col1)
dt2[col1==col1_names,col1:=new_col1]
> dt2
col1 col2 col3
1: WT 1 1
2: m1 2 2
3: m9 3 3
4: m10 4 4
5: WT 1 1
6: m1 2 2
7: m9 3 3
8: m10 4 4
Values are just replaced without any condition.
I agree with the lookup-table answer. You might want to try:
library(data.table)
dt2 <- data.table(col1=rep(c("a","b","c","d"),each=2),
col2=rep(c(1,2,3,4),2),
col3=rep(c(1,2,3,4),2))
new_col1 <- c(a="WT",b="m1",c="m9",d="m10") # .. have to change slightly
dt2$col1<-unname(new_col1[dt2$col1])
Or with data.table "update join" syntax:
lookup = data.table(old = c("a","b","c","d"), new = c("WT","m1","m9","m10"))
dt2[lookup, on=.(col1 = old), col1 := i.new ]
How do I use one variable in a data.frame to refer to another?
say I have:
col col1 col2
"col1" 1 5
"col2" 2 6
"col1" 3 7
"col2" 4 8
and I want:
col col1 col2 answer
"col1" 1 5 1
"col2" 2 6 6
"col1" 3 7 3
"col2" 4 8 8
,
df$answer = df[,df$col]
isn't working, and a for loop is taking forever.
I know it's already answered, but I thought another approach might be useful:
# Read the example; `col` names, per row, the column whose value is wanted.
df <- read.table(text = 'col col1 col2
"col1" 1 5
"col2" 2 6
"col1" 3 7
"col2" 4 8', header = TRUE)
# Index the value columns with a two-column (row, column) matrix so that each
# row picks exactly one cell. Working on the numeric columns only avoids the
# character round trip that matrix-indexing the whole data frame would cause.
vals <- as.matrix(df[setdiff(names(df), "col")])
df$answer <- vals[cbind(seq_len(nrow(df)), match(df$col, colnames(vals)))]
df
# col col1 col2 answer
# 1 col1 1 5 1
# 2 col2 2 6 6
# 3 col1 3 7 3
# 4 col2 4 8 8
This doesn't look very hard, but the solution I found isn't very elegant, there are probably better ways. But you can use match and then subset according to the match:
dat <- read.table(text="col col1 col2
col1 1 5
col2 2 6
col1 3 7
col2 4 8", header = TRUE, stringsAsFactors = FALSE)
# Distinct target column names and, per row, the position of its target in them.
cols <- unique(dat$col)
matches <- match(dat$col, cols)
# For each row, pull the cell from the column that the row names in `col`.
# vapply() pins the result to exactly one number per row, whereas sapply()
# could silently change its return type on unexpected input.
dat$answer <- vapply(seq_along(matches), function(i) {
  dat[i, cols[matches[i]]]
}, numeric(1))
And the result:
> dat
col col1 col2 answer
1 col1 1 5 1
2 col2 2 6 6
3 col1 3 7 3
4 col2 4 8 8
Edit
Actually, here's an already much better approach:
# For each row r, take the value from the column that dat$col[r] names.
# seq_len() is safe for a zero-row data frame; vapply() fixes the result type.
dat$answer <- vapply(seq_len(nrow(dat)), function(r) {
  dat[r, dat$col[r]]
}, numeric(1))
This is apparently what you have tried, but using sapply instead of unlist(lapply, so yeah, not sure if this helps.
In this case, with only 2 columns, ifelse might be the fastest and most straightforward solution.
# Row-wise pick: col1's value where the first column says "col1", otherwise
# col2's value.
df$answer <- ifelse(df[, 1] == "col1", df[, "col1"], df[, "col2"])
col col1 col2 answer
1 col1 1 5 1
2 col2 2 6 6
3 col1 3 7 3
4 col2 4 8 8
Addition as N8TRO asked in his comment for a more general solution.
A simple switch might be all that is needed:
# switch() is given a single unnamed alternative, which acts as its default, so
# every row yields the cell from the column named in that row's first field.
# seq_len() keeps the loop from running when df has no rows.
for (i in seq_len(nrow(df))) {
  df$ans[i] <- switch(df[i, 1], df[i, df[i, 1]])
}
or without a "for" loop:
# Same lookup without an explicit loop: one value per row, taken from the
# column named in that row's first field. seq_len() handles a zero-row df.
df$ans <- sapply(
  seq_len(nrow(df)),
  function(i) switch(df[i, 1], df[i, df[i, 1]])
)
example:
# Ten rows whose `col` field names one of col1..col5 at random; strings are kept
# as character so they can be used directly as column names.
df <- data.frame(
  col = sample(paste0('col', 1:5), 10, replace = TRUE),
  col1 = 1:10, col2 = 11:20, col3 = 21:30, col4 = 31:40, col5 = 41:50,
  stringsAsFactors = FALSE
)
select the elements:
# For every row, return the value of the column whose name is stored in the
# row's first field. The lone unnamed switch() alternative is its default.
df$ans <- sapply(
  seq_len(nrow(df)),
  function(i) switch(df[i, 1], df[i, df[i, 1]])
)
df
col col1 col2 col3 col4 col5 ans
1 col1 1 11 21 31 41 1
2 col1 2 12 22 32 42 2
3 col5 3 13 23 33 43 43
4 col2 4 14 24 34 44 14
5 col3 5 15 25 35 45 25
6 col4 6 16 26 36 46 36
7 col5 7 17 27 37 47 47
8 col3 8 18 28 38 48 28
9 col1 9 19 29 39 49 9
10 col5 10 20 30 40 50 50
I wish to count the number of times each combination of two elements appears in the same group.
For example, with:
> dat = data.table(group = c(1,1,1,2,2,2,3,3), id=c(10,11,12,10,11,13,11,13))
> dat
group id
1: 1 10
2: 1 11
3: 1 12
4: 2 10
5: 2 11
6: 2 13
7: 3 11
8: 3 13
The expected result would be:
id.1 id.2 nb_common_appearances
10 11 2 (in group 1 and 2)
10 12 1 (in group 1)
11 12 1 (in group 1)
10 13 1 (in group 2)
11 13 2 (in group 2 and 3)
Here is a data.table approach (roughly the same as @josilber's from plyr):
# One row per unordered id pair within each group. combn() returns a 2-row
# matrix (one column per pair); split(..., 1:2) separates its rows into the
# columns id.1 and id.2. Groups with fewer than two ids are skipped: they have
# no pairs, and combn() would treat a single number x as seq_len(x).
pairs <- dat[, if (.N >= 2L) c(id = split(combn(id, 2), 1:2)), by = group]
# Count the occurrences of each pair across groups.
pairs[, .N, by = .(id.1, id.2)]
# id.1 id.2 N
# 1: 10 11 2
# 2: 10 12 1
# 3: 11 12 1
# 4: 10 13 1
# 5: 11 13 2
You might also consider viewing the results in a table:
pairs[, table(id.1,id.2) ]
# id.2
# id.1 11 12 13
# 10 2 1 1
# 11 0 1 2
You can use merges instead of combn:
setkey(dat, group)
dat[ dat, allow.cartesian=TRUE ][ id<i.id, .N, by=.(id,i.id) ]
Benchmarks. For large data, the merges can be a little faster (as hypothesized by @DavidArenburg). @Arun's answer is faster still:
DT <- data.table(g=1,id=1:(1.5e3),key="id")
system.time({a <- combn(DT$id,2)})
# user system elapsed
# 0.81 0.00 0.81
system.time({b <- DT[DT,allow.cartesian=TRUE][id<i.id]})
# user system elapsed
# 0.13 0.00 0.12
system.time({d <- DT[,.(rep(id,(.N-1L):0L),id[indices(.N-1L)])]})
# user system elapsed
# 0.01 0.00 0.02
(I left out the group-by operation as I don't think it will be important to the timings.)
In defense of combn. The combn approach extends nicely to larger combos, while merges and @Arun's answer, while much faster for pairs, do not (as far as I can see):
DT2 <- data.table(g=rep(1:2,each=5),id=1:5)
tuple_size <- 4
tuples <- DT2[, c(id=split(combn(id,tuple_size),1:tuple_size)), by=g ]
tuples[, .N, by=setdiff(names(tuples),"g")]
# id.1 id.2 id.3 id.4 N
# 1: 1 2 3 4 2
# 2: 1 2 3 5 2
# 3: 1 2 4 5 2
# 4: 1 3 4 5 2
# 5: 2 3 4 5 2
Another way using data.table:
require(data.table)
# For n + 1 items, return the positions of the second member of every unordered
# pair, ordered to line up with rep(id, n:0): 2..(n+1), 3..(n+1), ..., (n+1).
# Returns integer(0) when n < 1 (a single item has no pairs); without that
# guard, n:1L would count downwards from 0 and yield a spurious index.
indices <- function(n) {
  if (n < 1L) {
    return(integer(0))
  }
  sequence(n:1L) + rep(seq_len(n), n:1L)
}
dat[, .(id1 = rep(id, (.N-1L):0L),
id2 = id[indices(.N-1L)]),
by=group
][, .N, by=.(id1, id2)]
# id1 id2 N
# 1: 10 11 2
# 2: 10 12 1
# 3: 11 12 1
# 4: 10 13 1
# 5: 11 13 2
You could reshape your data to have each pair in each group in a separate row (I've used split-apply-combine for that step) and then use count from the plyr package to count the frequency of unique rows:
library(plyr)
count(do.call(rbind, lapply(split(dat, dat$group), function(x) t(combn(x$id, 2)))))
# x.1 x.2 freq
# 1 10 11 2
# 2 10 12 1
# 3 10 13 1
# 4 11 12 1
# 5 11 13 2
Here is a dplyr approach, using combn to make the combinations.
dat %>%
group_by(group) %>%
do(as.data.frame(t(combn(.[["id"]], 2)))) %>%
group_by(V1, V2) %>%
summarise(n( ))
Source: local data frame [5 x 3]
Groups: V1
V1 V2 n()
1 10 11 2
2 10 12 1
3 10 13 1
4 11 12 1
5 11 13 2