I have the following data frame:
dat <- read_csv(
"s1,s2,v1,v2
a,b,10,20
a,b,22,NA
a,b,13,33
c,d,3,NA
c,d,4.5,NA
c,d,10,20"
)
dat
#> # A tibble: 6 x 4
#> s1 s2 v1 v2
#> <chr> <chr> <dbl> <int>
#> 1 a b 10.0 20
#> 2 a b 22.0 NA
#> 3 a b 13.0 33
#> 4 c d 3.0 NA
#> 5 c d 4.5 NA
#> 6 c d 10.0 20
What I want to do is:
Filter rows based on v1 values
Group by s1 and s2
Count the total lines in every group
Count the lines in every group where v2 is not NA.
For example with v1_filter >= 0 we get this:
s1 s2 total_line non_na_line
a b 3 2
c d 3 1
And with v1_filter >= 10 we get this:
s1 s2 total_line non_na_line
a b 2 1
c d 1 1
How can I achieve that with data.table or dplyr?
In reality, dat has around 31M rows, so we need a fast method.
I'm stuck with this:
library(data.table)
dat <- data.table(dat)
v1_filter = 0
dat[, v1 >= v1_filter,
by=list(s1,s2)]
Using sum should help. Operating on a logical vector, it treats each TRUE as 1 and FALSE as 0, so you can easily do this:
library(dplyr)

dat %>%
group_by(s1, s2) %>%
summarise(total_lines = n(),
non_na_line = sum(!is.na(v2)))
# A tibble: 2 x 4
# Groups: s1 [?]
s1 s2 total_lines non_na_line
<chr> <chr> <int> <int>
1 a b 3 2
2 c d 3 1
You'll easily be able to add in a filter between group_by and summarise, to get what you want. Keep in mind that summarise will only retain columns that you group by.
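For instance, a minimal sketch of the filtered version (the v1_filter value here is just an illustrative threshold, not something fixed by the question):
v1_filter <- 10   # illustrative cutoff; use whatever threshold you need
dat %>%
  filter(v1 >= v1_filter) %>%
  group_by(s1, s2) %>%
  summarise(total_lines = n(),
            non_na_line = sum(!is.na(v2)))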
Benchmark
For what it's worth, I ran a quick benchmark with some test data of a similar size to yours.
s1charMix <- rep(letters[seq(from = 1, to = 10)], length.out = 30000000)
s2charMix <- rep(letters[seq(from = 11, to = 20)], length.out = 30000000)
s1chars <- sample(s1charMix, 30000000)
s2chars <- sample(s2charMix, 30000000)
v1Nums <- runif(30000000, min = 0, max = 20)
nomissing <- sample(1:200000,1)
int.mix <- rbinom(30000000 - nomissing, 30, 0.3)
nalist <- rep(NA, nomissing)
v2NumsNA <- sample(x = c(int.mix, nalist), 30000000)
df <- data_frame(s1 = s1chars, s2 = s2chars, v1 = v1Nums, v2 = v2NumsNA)
This should roughly replicate the size and type of the data you suggest:
df
# A tibble: 30,000,000 x 4
s1 s2 v1 v2
<chr> <chr> <dbl> <int>
1 d s 9.2123603 7
2 b q 16.6638639 11
3 g o 18.3682028 11
4 g s 0.8779067 9
5 a s 0.0719127 10
6 b q 16.8809193 12
7 h q 15.4382455 6
8 e k 2.3565489 11
9 h p 16.4508811 9
10 d n 2.7283823 11
# ... with 29,999,990 more rows
df %>%
filter(is.na(v2))
# A tibble: 116,924 x 4
s1 s2 v1 v2
<chr> <chr> <dbl> <int>
1 d r 13.1448988 NA
2 b o 0.2703848 NA
3 b t 18.8319385 NA
4 a s 11.6448437 NA
5 j m 0.5388760 NA
6 i k 8.7098427 NA
7 d s 6.1149735 NA
8 h p 2.5552694 NA
9 g r 0.9057442 NA
10 b s 19.8886830 NA
# ... with 116,914 more rows
Now, let's benchmark dplyr operations vs data.table:
### dplyr
df %>%
filter(v1 > 10) %>%
group_by(s1, s2) %>%
summarise(total_lines = n(),
non_na_line = sum(!is.na(v2)))
# A tibble: 100 x 4
# Groups: s1 [?]
s1 s2 total_lines non_na_line
<chr> <chr> <int> <int>
1 a k 150327 149734
2 a l 149655 149062
3 a m 149794 149200
4 a n 149771 149197
5 a o 149495 148942
...
> system.time(df %>% filter(v1 > 10) %>% group_by(s1, s2) %>% summarise(total_lines = n(), non_na_line = sum(!is.na(v2))))
user system elapsed
1.848 0.420 2.290
> system.time(for (i in 1:100) df %>% filter(v1 > 10) %>% group_by(s1, s2) %>% summarise(total_lines = n(), non_na_line = sum(!is.na(v2))))
user system elapsed
187.657 55.878 245.528
### data.table
library(data.table)
dat <- data.table(df)
> dat[v1 > 10, .N, by = .(s1, s2)][dat[v1 > 10 & !is.na(v2), .N, by = .(s1, s2)] , on = c("s1", "s2") , nomatch = 0]
s1 s2 N i.N
1: b q 149968 149348
2: g o 150411 149831
3: h q 150132 149563
4: h p 150786 150224
5: e o 149951 149353
...
> system.time(dat[v1 > 10, .N, by = .(s1, s2)][dat[v1 > 10 & !is.na(v2), .N, by = .(s1, s2)] , on = c("s1", "s2") , nomatch = 0])
user system elapsed
2.027 0.228 2.271
> system.time(for (i in 1:100) dat[v1 > 10, .N, by = .(s1, s2)][dat[v1 > 10 & !is.na(v2), .N, by = .(s1, s2)] , on = c("s1", "s2") , nomatch = 0])
user system elapsed
213.281 43.949 261.664
TL;DR: dplyr and data.table are similarly fast; if anything, dplyr is slightly faster.
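(Side note, not part of the original benchmark: if you want tighter timings than repeated system.time() loops, the microbenchmark package could be used; a sketch, assuming df and dat as created above:)
library(microbenchmark)
microbenchmark(
  dplyr = df %>%
    filter(v1 > 10) %>%
    group_by(s1, s2) %>%
    summarise(total_lines = n(), non_na_line = sum(!is.na(v2))),
  data.table = dat[v1 > 10, .N, by = .(s1, s2)][
    dat[v1 > 10 & !is.na(v2), .N, by = .(s1, s2)],
    on = c("s1", "s2"), nomatch = 0],
  times = 10
)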
> library(readr)
> dat <- read_csv(
+ "s1,s2,v1,v2
+ a,b,10,20
+ a,b,22,NA
+ a,b,13,33
+ c,d,3,NA
+ c,d,4.5,NA
+ c,d,10,20"
+ )
>
> dat
# A tibble: 6 x 4
s1 s2 v1 v2
<chr> <chr> <dbl> <int>
1 a b 10.0 20
2 a b 22.0 NA
3 a b 13.0 33
4 c d 3.0 NA
5 c d 4.5 NA
6 c d 10.0 20
Using data.table since you have big data:
> library(data.table)
> dat=data.table(dat)
Without removing NAs, and keeping the v1 filter at 0.1:
> dat1=dat[v1>0.1,.N,.(s1,s2)]
> dat1
s1 s2 N
1: a b 3
2: c d 3
Removing v2 NAs, and keeping the v1 filter at 0.1:
> dat2=dat[v1>0.1 & !is.na(v2), .N, .(s1,s2)]
> dat2
s1 s2 N
1: a b 2
2: c d 1
Merging the two, and keeping the v1 filter at 0:
> dat[v1 > 0, .N, by = .(s1, s2)][ dat[v1 > 0 & !is.na(v2), .N, by = .(s1, s2)] , on = c("s1", "s2") , nomatch = 0 ]
s1 s2 N i.N
1: a b 3 2
2: c d 3 1
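As a side note (a sketch, not from the original answer), data.table can also produce both counts in one grouped call instead of joining two aggregations:
v1_filter = 0
dat[v1 > v1_filter, .(total_line = .N, non_na_line = sum(!is.na(v2))), by = .(s1, s2)]
   s1 s2 total_line non_na_line
1:  a  b          3           2
2:  c  d          3           1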
Related
I need a function f(B,A) that, given a dataset with the following structure,
T1 T2 T3 T4 T5 ... P1 P2 P3 P4 P5 ...
1 2 5 8 9 ... A C B B A ...
1 3 4 6 6 ... C A C A B ...
finds the first time B and A appear in Pj columns (starting with j=1) and returns the value difference in the corresponding Ti columns.
For instance:
in line 1: B appears in P3 first, A appears in P1 first. Then:
f(B, A) = T3 - T1 = 5-1 = 4
in line 2: B appears in P5 first, A appears in P2 first. Then:
f(B, A) = T5 - T2 = 6-3 = 3
I can find in which Pj columns B and A appear using str_detect(), but I don't know how to "move" from P_j1, P_j2 to T_j1, T_j2.
A data.table (or base R) solution would be appreciated.
Here is a data.table approach.
library(data.table)
DT <- fread("T1 T2 T3 T4 T5 P1 P2 P3 P4 P5
1 2 5 8 9 A C B B A
1 3 4 6 6 C A C A B")
# Add row ID's
DT[, id := .I]
#melt to a long format
DT.melt <- data.table::melt(DT,
id.vars = "id",
measure.vars = patterns(T = "^T", P = "^P"))
# Find first B for each id
val1 <- DT.melt[P == "B", T[1], by = .(id)]$V1
# [1] 5 6
# Find first A for each id
val2 <- DT.melt[P == "A", T[1], by = .(id)]$V1
# [1] 1 3
val1 - val2
# [1] 4 3
base R
f <- function(l1, l2){
  apply(df, 1, function(x){
    # split each row into its P (letter) and T (time) parts
    dfP <- x[grepl("P", names(x))]
    dfT <- x[grepl("T", names(x))]
    # time of the first l1 minus time of the first l2
    as.numeric(dfT[which(dfP == l1)[1]]) - as.numeric(dfT[which(dfP == l2)[1]])
  })
}
f("B", "A")
[1] 4 3
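Note that the base R f() above assumes the question's table is also available as a plain data.frame named df; a minimal way to get that from the fread() result (dropping the helper id column) would be:
df <- as.data.frame(DT)
df$id <- NULL
f("B", "A") then returns 4 3 as shown.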
Tidyverse
With this type of data, it's usually best to pivot to long and then back to wide: here is a tidyverse solution, with diff being the desired output.
library(tidyverse)
df %>%
mutate(id = row_number()) %>%
pivot_longer(-id, names_pattern = "(\\D)(\\d)",
names_to = c(".value", "group")) %>%
group_by(id) %>%
mutate(diff = first(T[P == "B"]) - first(T[P == "A"])) %>%
pivot_wider(c(id, diff), names_from = group, values_from = c(T, P), names_sep = "")
output
id diff T1 T2 T3 T4 T5 P1 P2 P3 P4 P5
<int> <int> <int> <int> <int> <int> <int> <chr> <chr> <chr> <chr> <chr>
1 1 4 1 2 5 8 9 A C B B A
2 2 3 1 3 4 6 6 C A C A B
I have a data frame with 2 columns, mn (min) and mx (max). I want to generate a sequence between these numbers for each row.
df <- data.table("ID"=c("A","B","C"),
"mn" = c(1,2,3),
"mx" = c(10,5,10)
)
> df
ID mn mx
1: A 1 10
2: B 2 5
3: C 3 10
I want to take these 2 columns and generate 4 sequence numbers for each row
> df
ID mn mx s1   s2    s3    s4
A   1 10  1   4     7     10
B   2  5  2   3     4      5
C   3 10  3   5.33  7.66  10
I can iterate over rows and use seq(mn, mx, length.out = N), but I want to apply this operation to 1 million rows, so please suggest an efficient solution.
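For reference, the row-wise baseline mentioned in the question could be sketched like this (fine for small data, but it is exactly the per-row loop you want to avoid at around a million rows):
N <- 4
t(apply(df[, c("mn", "mx")], 1, function(x) seq(x[1], x[2], length.out = N)))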
Another vectorized option using data.table, which should be faster because it avoids going row by row:
df[, paste0("s", 1L:4L) := {
d <- (mx - mn) / 3
.(mn, mn + d, mn + 2*d, mx)
}]
To generalize, thelatemail has suggested:
N <- 4L
df[, paste0("s", seq(N)) := transpose(Map(`+`, mn,
lapply((mx - mn) / (N-1), `*`, seq(0, N-1))
))]
Or, equivalently but more long-winded (though I think it will be faster):
N <- 4L #assuming N >= 2
d <- df[, (mx - mn) / (N - 1)]
init <- df[["mn"]]
for (n in 1L:N) {
set(df, j=paste0("s", n), value=init + (n-1) * d)
}
One way using apply:
cbind(df, t(apply(df, 1, function(x) x[1]:x[2])))
# mn mx V1 V2 V3 V4
#1: 1 4 1 2 3 4
#2: 2 5 2 3 4 5
#3: 3 6 3 4 5 6
Or mapply:
cbind(df, t(mapply(`:`, df$mn, df$mx)))
One dplyr and tidyr solution could be:
df %>%
rowwise() %>%
mutate(cols = list(mn:mx)) %>%
unnest_wider(cols) %>%
rename_at(vars(-mn, -mx), ~ paste0("s", 1:length(.)))
mn mx s1 s2 s3 s4
<dbl> <dbl> <int> <int> <int> <int>
1 1 4 1 2 3 4
2 2 5 2 3 4 5
3 3 6 3 4 5 6
Or:
df %>%
mutate(cols = Map(`:`, mn, mx)) %>%
unnest_wider(cols) %>%
rename_at(vars(-mn, -mx), ~ paste0("s", 1:length(.)))
In data.table, loop over mn and mx with Map, then transpose the result and assign the new columns by reference (:=):
df[, paste0("s", seq(4)) := transpose(Map(seq, mn, mx, length.out=4))]
df
# ID mn mx s1 s2 s3 s4
#1: A 1 10 1 4.000000 7.000000 10
#2: B 2 5 2 3.000000 4.000000 5
#3: C 3 10 3 5.333333 7.666667 10
We can use map2:
library(dplyr)
library(purrr)
library(tidyr)
df %>%
mutate(cols = map2(mn, mx, `:`)) %>%
unnest_wider(cols)
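If exactly four evenly spaced values per row are wanted (as in the question's expected output) rather than a full integer sequence, the same pattern can call seq() with length.out; a sketch, assuming the column names cols1 to cols4 are acceptable:
df %>%
  mutate(cols = map2(mn, mx, seq, length.out = 4)) %>%
  unnest_wider(cols, names_sep = "")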
I have the following data frame:
library(dplyr)
library(tibble)
df <- tibble(
source = c("a", "b", "c", "d", "e"),
score = c(10, 5, NA, 3, NA ) )
df
It looks like this:
# A tibble: 5 x 2
source score
<chr> <dbl>
1 a 10   # current max value
2 b 5
3 c NA
4 d 3
5 e NA
What I want to do is replace the NAs in the score column with values of the form existing max + n, where n runs from 1 to the total number of NA rows.
Resulting in this (hand-coded):
source score
a 10
b 5
c 11 # obtained from 10 + 1
d 3
e 12 # obtained from 10 + 2
How can I achieve that?
Another option:
transform(df, score = pmin(max(score, na.rm = TRUE) +
cumsum(is.na(score)), score, na.rm = TRUE))
# source score
#1 a 10
#2 b 5
#3 c 11
#4 d 3
#5 e 12
If you want to do this in dplyr:
library(dplyr)
df %>% mutate(score = pmin(max(score, na.rm = TRUE) +
cumsum(is.na(score)), score, na.rm = TRUE))
A base R solution:
df$score[is.na(df$score)] <- seq(which(is.na(df$score))) + max(df$score,na.rm = TRUE)
such that
> df
# A tibble: 5 x 2
source score
<chr> <dbl>
1 a 10
2 b 5
3 c 11
4 d 3
5 e 12
Here is a dplyr approach,
df %>%
mutate(score = replace(score,
is.na(score),
(max(score, na.rm = TRUE) + (cumsum(is.na(score))))[is.na(score)])
)
which gives,
# A tibble: 5 x 2
source score
<chr> <dbl>
1 a 10
2 b 5
3 c 11
4 d 3
5 e 12
With dplyr:
library(dplyr)
df %>%
mutate_at("score", ~ ifelse(is.na(.), max(., na.rm = TRUE) + cumsum(is.na(.)), .))
Result:
# A tibble: 5 x 2
source score
<chr> <dbl>
1 a 10
2 b 5
3 c 11
4 d 3
5 e 12
A dplyr solution.
df %>%
mutate(na_count = cumsum(is.na(score)),
score = ifelse(is.na(score), max(score, na.rm = TRUE) + na_count, score)) %>%
select(-na_count)
## A tibble: 5 x 2
# source score
# <chr> <dbl>
#1 a 10
#2 b 5
#3 c 11
#4 d 3
#5 e 12
Another one, quite similar to ThomasIsCoding's solution:
> df$score[is.na(df$score)]<-max(df$score, na.rm=T)+(1:sum(is.na(df$score)))
> df
# A tibble: 5 x 2
source score
<chr> <dbl>
1 a 10
2 b 5
3 c 11
4 d 3
5 e 12
Not quite as elegant as the base R solutions, but still possible:
library(data.table)
setDT(df)
max.score = df[, max(score, na.rm = TRUE)]
df[is.na(score), score :=(1:.N) + max.score]
Or in one line but a bit slower:
df[is.na(score), score := (1:.N) + df[, max(score, na.rm = TRUE)]]
df
source score
1: a 10
2: b 5
3: c 11
4: d 3
5: e 12
Consider the noMissing data frame:
library(lubridate)
set.seed(123)
value <- rnorm(300)
value[sample(1:300,10)]<- NA
b <- rep(c("a","b", "c", "d","e", "f"), each=50)
b[sample(1:300,12)] <- NA
c <- rep(rep(as.character(1:2), each = 25) , 6)
c[sample(1:300,10)] <- NA
datee <- seq(lubridate::ymd("2012-01-01"),lubridate::ymd("2012-01-01") + 24 , by = "days")
datee <- rep(datee, 12)
datee[sample(1:300,20)] <- NA
dataframe <- cbind.data.frame( b, c, datee, value)
noMissing <- dataframe[complete.cases(dataframe),]
head(noMissing)
b c datee value
1 a 1 2012-01-01 -0.56047565
2 a 1 2012-01-02 -0.23017749
3 a 1 2012-01-03 1.55870831
4 a 1 2012-01-04 0.07050839
5 a 1 2012-01-05 0.12928774
6 a 1 2012-01-06 1.71506499
Now I want to group the data by columns b and c, and then calculate the correlation of each group with the reference group (the rows where b is "a" and c matches), using only the dates in datee that the two groups have in common.
For example, the correlation between group (b, 1) and the reference group (a, 1).
My initial solution:
b_unique <- unique(noMissing$b)
c_unique <- unique(noMissing$c)
out <- list()
v <- 0
for (i in 1:length(b_unique)) {
  for (k in 1:length(c_unique)) {
    v <- v + 1
    group <- noMissing[noMissing$b == b_unique[i] & noMissing$c == c_unique[k], ]
    ref <- noMissing[noMissing$b == "a" & noMissing$c == c_unique[k], ]
    inter <- ymd("1970-01-01") + intersect(ref$datee, group$datee)
    x <- cor(group$value[group$datee %in% inter], ref[ref$datee %in% inter, "value"])
    out[[v]] <- list(b = b_unique[i], c = c_unique[k], cor = x)
  }
}
dplyr::bind_rows(out)
b c cor
<fct> <fct> <dbl>
1 a 1 1.000
2 a 2 1
3 b 1 0.175
4 b 2 -0.247
5 c 1 0.216
6 c 2 0.101
7 d 1 0.159
8 d 2 -0.253
9 e 1 0.177
10 e 2 -0.528
11 f 1 0.179
12 f 2 -0.178
I am looking for clean, well-written solutions.
You can do the following:
library(data.table)
# convert the data shape to have datewise information across all groups
df <- dcast(data.table(noMissing), datee+c ~ b, value.var='value')
# rename c as c_1 column as there are multiple column with c name
setnames(df, old = 2, new = 'c_1')
# groupby 'c_1' and for each group calculate correlation between b-a, c-a, d-a, e-a and so on
df <- df[,
lapply(.SD[,-c('datee'), with=F], function(x) {
cols <- c('a','b','c','d','e','f')
vals <- vector(mode = 'numeric', length = 6)
for(i in seq(cols)) {vals[i] <- (cor(get(cols[i]), get(('a')), use = complete.obs'))}
return (vals)})
,c_1]
# finally reshape the table as you posted in solution above.
df <- melt(df, id.vars = c('c_1'))
colnames(df) <- c('c','b','cor')
c b cor
1: 1 a 1.00000000
2: 1 a -0.12499728
3: 1 a -0.13133257
4: 1 a 0.02573947
5: 1 a 0.07239559
6: 1 a -0.07421281
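Not from the answer above, but a shorter dplyr sketch of the same computation (assuming each (b, c) group has at most one value per date, as in the generated data) could be:
library(dplyr)
ref <- noMissing %>%
  filter(b == "a") %>%
  select(c, datee, ref_value = value)
noMissing %>%
  inner_join(ref, by = c("c", "datee")) %>%
  group_by(b, c) %>%
  summarise(cor = cor(value, ref_value), .groups = "drop")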
I want to take multiple lagged values of multiple columns in R.
How do I use mutate_at to get the same results as below? Let's say the real example has 30 columns, so it doesn't make sense to write out the lag formula 30 times for each time period.
df <- data_frame(time_col = 1:26, col_1 = letters, col_2 = rev(letters))
df %>% mutate(col_1_lag_1 = lag(col_1, n = 1, by = time_col),
col_2_lag_1 = lag(col_2, n = 1, by = time_col),
col_1_lag_2 = lag(col_1, n = 2, by = time_col),
col_2_lag_2 = lag(col_2, n = 2, by = time_col))
I think it should be something like this, but I don't know how to specify both sets of parameters:
df <- data_frame(time_col = 1:26, col_1 = letters, col_2 = rev(letters))
df %>% mutate_at(vars(col_1, col_2), funs(lag, lag), n = 1, n = 2, by = time_col)
A solution with help from purrr:
library(dplyr)
library(purrr)
df <- data_frame(time_col = 1:26, col_1 = letters, col_2 = rev(letters))
map_dfc(1:2, function(x){
df2 <- df %>% transmute_at(vars(starts_with("col")),
funs(lag(., n = x, by = time_col)))
return(df2)
}) %>%
bind_cols(df, .) %>%
set_names(c(names(df), paste0("col_", 1:2, "_lag_", rep(1:2, each = 2))))
# # A tibble: 26 x 7
# time_col col_1 col_2 col_1_lag_1 col_2_lag_1 col_1_lag_2 col_2_lag_2
# <int> <chr> <chr> <chr> <chr> <chr> <chr>
# 1 1 a z NA NA NA NA
# 2 2 b y a z NA NA
# 3 3 c x b y a z
# 4 4 d w c x b y
# 5 5 e v d w c x
# 6 6 f u e v d w
# 7 7 g t f u e v
# 8 8 h s g t f u
# 9 9 i r h s g t
# 10 10 j q i r h s
# # ... with 16 more rows
Here is an alternative purrr solution using a nested map_dfc and quasiquotation syntax:
bind_cols(
df,
map_dfc(c("col_1", "col_2"), function(i) map_dfc(c(1, 2), function(n)
df %>%
transmute(!!paste0(i, "_lag_", n, collapse = "") := lag(!!rlang::sym(i), n = n, by = time_col)))))
## A tibble: 26 x 7
# time_col col_1 col_2 col_1_lag_1 col_1_lag_2 col_2_lag_1 col_2_lag_2
# <int> <chr> <chr> <chr> <chr> <chr> <chr>
# 1 1 a z NA NA NA NA
# 2 2 b y a NA z NA
# 3 3 c x b a y z
# 4 4 d w c b x y
# 5 5 e v d c w x
# 6 6 f u e d v w
# 7 7 g t f e u v
# 8 8 h s g f t u
# 9 9 i r h g s t
#10 10 j q i h r s
## ... with 16 more rows
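(For completeness, not part of the original answers: on current dplyr, mutate_at()/funs() are superseded; a sketch of the same idea with across(), assuming only lag orders 1 and 2, could look like this.)
library(dplyr)
df %>%
  mutate(across(starts_with("col"),
                list(lag_1 = ~ lag(.x, n = 1, order_by = time_col),
                     lag_2 = ~ lag(.x, n = 2, order_by = time_col))))
# adds col_1_lag_1, col_1_lag_2, col_2_lag_1, col_2_lag_2 alongside the original columns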