Max of sum of combinations across columns in a dataframe - r

I have the dataframe
df <- data.frame(e_1=c(1,2,3,4,5), e_2=c(1,3,5,7,9), e_3=c(2,4,6,8,1),
e_4=c(1,2,4,5,7), e_5=c(1,8,9,6,4), Lanes=c(3,4,3,5,4))
I try to use:
max(combn(df[,(1:5)],df$Lanes,FUN = function(i) rowSums(df[,(1:5)][i])))
I get the error
Error in combn(df[, (1:5)], df$Lanes, FUN = function(i) rowSums(df[, (1:5)][i])) : length(m) == 1L is not TRUE

I guess you can try using combn row-wise, e.g.,
df$comb <- apply(df,1,function(v) max(combn(v[1:5],v["Lanes"],sum)))
such that
> df
e_1 e_2 e_3 e_4 e_5 Lanes comb
1 1 1 2 1 1 3 4
2 2 3 4 2 8 4 17
3 3 5 6 4 9 3 20
4 4 7 8 5 6 5 30
5 5 9 1 7 4 4 25

Using dplyr and purrr for this one could look as follows.
library(dplyr)
library(purrr)
library(rlang)
df <- data.frame(e_1=c(1,2,3,4,5), e_2=c(1,3,5,7,9), e_3=c(2,4,6,8,1),
e_4=c(1,2,4,5,7), e_5=c(1,8,9,6,4), Lanes=c(3,4,3,5,4))
df %>%
mutate(eSum = pmap(list(!!!parse_exprs(colnames(.))),
~ max(colSums(combn(c(..1, ..2, ..3, ..4, ..5), ..6)))))
# e_1 e_2 e_3 e_4 e_5 Lanes eSum
# 1 1 1 2 1 1 3 4
# 2 2 3 4 2 8 4 17
# 3 3 5 6 4 9 3 20
# 4 4 7 8 5 6 5 30
# 5 5 9 1 7 4 4 25

An option with c_across from dplyr
library(dplyr)
df %>%
rowwise %>%
mutate(Comb = max(combn(c_across(starts_with('e')), Lanes, FUN = sum)))
# A tibble: 5 x 7
# Rowwise:
# e_1 e_2 e_3 e_4 e_5 Lanes Comb
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 1 2 1 1 3 4
#2 2 3 4 2 8 4 17
#3 3 5 6 4 9 3 20
#4 4 7 8 5 6 5 30
#5 5 9 1 7 4 4 25

Related

How to rename last n columns of dataframe with vector of strings, using rename_with or rename?

I'm using rename_at, but since it is superseded, I need to find a way to rename n last columns with some vector of strings using rename_with() or rename()
library(tidyverse)
df <- tibble(
a = 1:10,
b = 1:10,
c = 1:10,
d = 1:10,
e = 1:10
)
new_names <- c("1", "2", "4", "5", "10")
df %>%
rename_at(vars(names(.) %>% tail(5)), funs(paste0("", new_names))) # only `funs(new_names)` won't work
Base R approach :
n <- ncol(df)
names(df)[(n-4):n] <- new_names
df
# A tibble: 10 x 5
# `1` `2` `4` `5` `10`
# <int> <int> <int> <int> <int>
# 1 1 1 1 1 1
# 2 2 2 2 2 2
# 3 3 3 3 3 3
# 4 4 4 4 4 4
# 5 5 5 5 5 5
# 6 6 6 6 6 6
# 7 7 7 7 7 7
# 8 8 8 8 8 8
# 9 9 9 9 9 9
#10 10 10 10 10 10
Using rename_with
library(dplyr)
library(stringr)
df %>%
rename_with(~ str_c(., new_names), tail(names(.), 5))
# A tibble: 10 x 5
# a1 b2 c4 d5 e10
# <int> <int> <int> <int> <int>
# 1 1 1 1 1 1
# 2 2 2 2 2 2
# 3 3 3 3 3 3
# 4 4 4 4 4 4
# 5 5 5 5 5 5
# 6 6 6 6 6 6
# 7 7 7 7 7 7
# 8 8 8 8 8 8
# 9 9 9 9 9 9
#10 10 10 10 10 10
Or with rename
df %>%
rename(!!! setNames(tail(names(.), 5), new_names))
Or using rename_at directly on the tail of names
df %>%
rename_at(vars(tail(names(.), 5)), ~ str_c(., new_names))
-output
# A tibble: 10 x 5
# a1 b2 c4 d5 e10
# <int> <int> <int> <int> <int>
# 1 1 1 1 1 1
# 2 2 2 2 2 2
# 3 3 3 3 3 3
# 4 4 4 4 4 4
# 5 5 5 5 5 5
# 6 6 6 6 6 6
# 7 7 7 7 7 7
# 8 8 8 8 8 8
# 9 9 9 9 9 9
#10 10 10 10 10 10
if it is to just replace the names
df %>%
rename_at(vars(tail(names(.), 5)), ~ new_names)
# A tibble: 10 x 5
# `1` `2` `4` `5` `10`
# <int> <int> <int> <int> <int>
# 1 1 1 1 1 1
# 2 2 2 2 2 2
# 3 3 3 3 3 3
# 4 4 4 4 4 4
# 5 5 5 5 5 5
# 6 6 6 6 6 6
# 7 7 7 7 7 7
# 8 8 8 8 8 8
# 9 9 9 9 9 9
#10 10 10 10 10 10
In the example, there are only 5 columns. Suppose, if it is only the last 3 columns
df %>%
rename_at(vars(tail(names(.), 3)), ~ str_c(., tail(new_names, 3)))
funs take a function, so wrapping with paste0 or as.character does that instead of just a input vector

Split information from two columns, R, tidyverse

i've got some data in two columns:
# A tibble: 16 x 2
code niveau
<chr> <dbl>
1 A 1
2 1 2
3 2 2
4 3 2
5 4 2
6 5 2
7 B 1
8 6 2
9 7 2
My desired output is:
A tibble: 16 x 3
code niveau cat
<chr> <dbl> <chr>
1 A 1 A
2 1 2 A
3 2 2 A
4 3 2 A
5 4 2 A
6 5 2 A
7 B 1 B
8 6 2 B
I there a tidy way to convert these data without looping through it?
Here some dummy data:
data<-tibble(code=c('A', 1,2,3,4,5,'B', 6,7,8,9,'C',10,11,12,13), niveau=c(1, 2,2,2,2,2,1,2,2,2,2,1,2,2,2,2))
desired_output<-tibble(code=c('A', 1,2,3,4,5,'B', 6,7,8,9,'C',10,11,12,13), niveau=c(1, 2,2,2,2,2,1,2,2,2,2,1,2,2,2,2),
cat=c(rep('A', 6),rep('B', 5), rep('C', 5)))
Nicolas
Probably, you can create a new column cat and replace code values with NA where there is a number. We can then use fill to replace missing values with previous non-NA value.
library(dplyr)
data %>% mutate(cat = replace(code, grepl('\\d', code), NA)) %>% tidyr::fill(cat)
# A tibble: 16 x 3
# code niveau cat
# <chr> <dbl> <chr>
# 1 A 1 A
# 2 1 2 A
# 3 2 2 A
# 4 3 2 A
# 5 4 2 A
# 6 5 2 A
# 7 B 1 B
# 8 6 2 B
# 9 7 2 B
#10 8 2 B
#11 9 2 B
#12 C 1 C
#13 10 2 C
#14 11 2 C
#15 12 2 C
#16 13 2 C
We can use str_detect from stringr
library(dplyr)
library(stringr)
library(tidyr)
data %>%
mutate(cat = replace(code, str_detect(code, '\\d'), NA)) %>%
fill(cat)

T-test for multiple rows in R

I have a table with 40+ columns and 200.000+ rows.
Something like this:
ID GROUP-A1 GROUP-A2 GROUP A3...A20 GROUP-B1 GROUP-B2 GROUP-B3...B20
1 5 6 3 5....3 10 21 9 15
2 3 4 6 2....13 23 42 34 23
3 5 3 1 0....12 10 12 43 15
4 0 0 2 5....3 10 21 23 15
I would like to run a t-test for the two groups A (1..20) and B (1..20) for every measurement I have (each row), which are independent. And possibly, have the resulting stats in the table next to each row or in a separate table, so I can easily select the significant ones.
I looked at few R packages but they mostly would require reformatting the table I have, to put measurements and groups in columns, and I would need 200.000+ separate tables in that case.
Any idea?
Something like this?
apply(df,1,function(x){t.test(x[2:21],x[22:41])})
To save the test statistic or p-value in a new column you could do
df$st=apply(df,1,function(x){t.test(x[2:21],x[22:41])$stat})
or $p.value
You can run all tests with the following code.
i_group_a <- grep("GROUP.A", names(df1), ignore.case = TRUE)
i_group_b <- grep("GROUP.B", names(df1), ignore.case = TRUE)
ttest_list <- lapply(seq_along(i_group_a), function(k){
i <- i_group_a[k]
j <- i_group_b[k]
t.test(df1[[i]], df1[[j]])
})
ttest_list[[1]]
#
# Welch Two Sample t-test
#
#data: df1[[i]] and df1[[j]]
#t = -2.8918, df = 3.7793, p-value = 0.04763
#alternative hypothesis: true difference in means is not equal to 0
#95 percent confidence interval:
# -19.826402 -0.173598
#sample estimates:
#mean of x mean of y
# 3.25 13.25
To extract, for instance, the p-values:
pval <- sapply(ttest_list, `[[`, 'p.value')
pval
#[1] 0.04762593 0.04449075 0.04390115 0.00192454
Data.
df1 <- read.table(text = "
ID GROUP-A1 GROUP-A2 GROUP-A3 GROUP-A20 GROUP-B1 GROUP-B2 GROUP-B3 GROUP-B20
1 5 6 3 5 10 21 9 15
2 3 4 6 2 23 42 34 23
3 5 3 1 0 10 12 43 15
4 0 0 2 5 10 21 23 15
", header = TRUE)
You can do this with tidyverse using purrr. It does however require to format your data differently. Here is an example:
require(tidyverse)
set.seed(314)
simulate your data
df <- data.frame(ID = rep(1:5,each = 20),
participant = rep(rep(1:10,2),5),
group = rep(rep(c('A','B'),each = 10),5),
answer = sample(1:10,100, replace = T))
dfflat <- df %>%
unite(column, group,participant) %>%
spread(column,answer)
dfflat:
ID A_1 A_10 A_2 A_3 A_4 A_5 A_6 A_7 A_8 A_9 B_1 B_10 B_2 B_3 B_4 B_5 B_6 B_7 B_8 B_9
1 1 1 8 3 8 3 3 4 3 4 6 4 4 2 3 3 6 4 8 6 1
2 2 7 6 5 6 3 1 6 4 1 3 3 6 7 1 5 5 2 10 10 6
3 3 4 3 8 5 9 7 9 7 3 1 8 2 7 6 8 3 5 6 9 4
4 4 5 4 8 2 4 1 4 6 2 2 1 1 7 10 6 9 7 7 10 1
5 5 4 1 5 10 3 5 3 10 8 3 7 3 4 6 6 9 10 7 4 5
the equivalent in long format:
dfflat %>%
gather(participant,answer,-ID) %>%
separate(participant,c('group','number'))
ID group number answer
1 1 A 1 1
2 2 A 1 7
3 3 A 1 4
4 4 A 1 5
5 5 A 1 4
6 1 A 10 8
7 2 A 10 6
8 3 A 10 3
9 4 A 10 4
10 5 A 10 1
11 1 A 2 3
12 2 A 2 5
13 3 A 2 8
14 4 A 2 8
15 5 A 2 5
16 1 A 3 8
17 2 A 3 6
18 3 A 3 5
19 4 A 3 2
20 5 A 3 10
...
Test the hypothesis with t.test per ID and extract the p.value
dfflat %>%
gather(participant,answer,-ID) %>%
separate(participant,c('group','number')) %>%
group_by(ID) %>%
nest() %>%
mutate(test = map(data, ~ with(.x, t.test(answer[group == 'A'],answer[group == 'B']))),
p.value = map_dbl(test,pluck,'p.value'))
results in:
# A tibble: 5 x 4
ID data test p.value
<int> <list> <list> <dbl>
1 1 <tibble [20 x 3]> <S3: htest> 0.841
2 2 <tibble [20 x 3]> <S3: htest> 0.284
3 3 <tibble [20 x 3]> <S3: htest> 0.863
4 4 <tibble [20 x 3]> <S3: htest> 0.137
5 5 <tibble [20 x 3]> <S3: htest> 0.469

R how to fill in NA with rules

data=data.frame(person=c(1,1,1,2,2,2,2,3,3,3,3),
t=c(3,NA,9,4,7,NA,13,3,NA,NA,12),
WANT=c(3,6,9,4,7,10,13,3,6,9,12))
So basically I am wanting to create a new variable 'WANT' which takes the PREVIOUS value in t and ADDS 3 to it, and if there are many NA in a row then it keeps doing this. My attempt is:
library(dplyr)
data %>%
group_by(person) %>%
mutate(WANT_TRY = fill(t) + 3)
Here's one way -
data %>%
group_by(person) %>%
mutate(
# cs = cumsum(!is.na(t)), # creates index for reference value; uncomment if interested
w = case_when(
# rle() gives the running length of NA
is.na(t) ~ t[cumsum(!is.na(t))] + 3*sequence(rle(is.na(t))$lengths),
TRUE ~ t
)
) %>%
ungroup()
# A tibble: 11 x 4
person t WANT w
<dbl> <dbl> <dbl> <dbl>
1 1 3 3 3
2 1 NA 6 6
3 1 9 9 9
4 2 4 4 4
5 2 7 7 7
6 2 NA 10 10
7 2 13 13 13
8 3 3 3 3
9 3 NA 6 6
10 3 NA 9 9
11 3 12 12 12
Here is another way. We can do linear interpolation with the imputeTS package.
library(dplyr)
library(imputeTS)
data2 <- data %>%
group_by(person) %>%
mutate(WANT2 = na.interpolation(WANT)) %>%
ungroup()
data2
# # A tibble: 11 x 4
# person t WANT WANT2
# <dbl> <dbl> <dbl> <dbl>
# 1 1 3 3 3
# 2 1 NA 6 6
# 3 1 9 9 9
# 4 2 4 4 4
# 5 2 7 7 7
# 6 2 NA 10 10
# 7 2 13 13 13
# 8 3 3 3 3
# 9 3 NA 6 6
# 10 3 NA 9 9
# 11 3 12 12 12
This is harder than it seems because of the double NA at the end. If it weren't for that, then the following:
ifelse(is.na(data$t), c(0, data$t[-nrow(data)])+3, data$t)
...would give you want you want. The simplest way, that uses the same logic but doesn't look very clever (sorry!) would be:
.impute <- function(x) ifelse(is.na(x), c(0, x[-length(x)])+3, x)
.impute(.impute(data$t))
...which just cheats by doing it twice. Does that help?
You can use functional programming from purrr and "NA-safe" addition from hablar:
library(hablar)
library(dplyr)
library(purrr)
data %>%
group_by(person) %>%
mutate(WANT2 = accumulate(t, ~.x %plus_% 3))
Result
# A tibble: 11 x 4
# Groups: person [3]
person t WANT WANT2
<dbl> <dbl> <dbl> <dbl>
1 1 3 3 3
2 1 NA 6 6
3 1 9 9 9
4 2 4 4 4
5 2 7 7 7
6 2 NA 10 10
7 2 13 13 13
8 3 3 3 3
9 3 NA 6 6
10 3 NA 9 9
11 3 12 12 12

count positive negative values in column by group

I want to create two variables giving me the total number of positive and negative values by id, hopefully using dplyr.
Example data:
library(dplyr)
set.seed(42)
df <- data.frame (id=rep(1:10,each=10),
ff=rnorm(100, 0,14 ))
> head(df,20)
id ff
1 1 19.1934183
2 1 -7.9057744
3 1 5.0837978
4 1 8.8600765
5 1 5.6597565
6 1 -1.4857432
7 1 21.1613080
8 1 -1.3252265
9 1 28.2579320
10 1 -0.8779974
11 2 18.2681752
12 2 32.0130355
13 2 -19.4440498
14 2 -3.9030427
15 2 -1.8664987
16 2 8.9033056
17 2 -3.9795409
18 2 -37.1903759
19 2 -34.1665370
20 2 18.4815868
the resulting dataset should look like:
> head(df,20)
id ff pos neg
1 1 19.1934183 6 4
2 1 -7.9057744 6 4
3 1 5.0837978 6 4
4 1 8.8600765 6 4
5 1 5.6597565 6 4
6 1 -1.4857432 6 4
7 1 21.1613080 6 4
8 1 -1.3252265 6 4
9 1 28.2579320 6 4
10 1 -0.8779974 6 4
11 2 18.2681752 4 6
12 2 32.0130355 4 6
13 2 -19.4440498 4 6
14 2 -3.9030427 4 6
15 2 -1.8664987 4 6
16 2 8.9033056 4 6
17 2 -3.9795409 4 6
18 2 -37.1903759 4 6
19 2 -34.1665370 4 6
20 2 18.4815868 4 6
I have thought something similar to this will work:
df<-df%>% group_by(id) %>% mutate(pos= nrow(ff>0)) %>% ungroup()
Any help would be great, thanks.
You need sum():
df %>% group_by(id) %>%
mutate(pos = sum(ff>0),
neg = sum(ff<0))
For a fun (and a fast) solution data.table can also be used:
library(data.table)
setDT(df)
df[, ":="(pos = sum(ff > 0), neg = sum(ff < 0)), by = id]
Here's an answer that add the ifelse part of your question:
df <- df %>% group_by(id) %>%
mutate(pos = sum(ff>0), neg = sum(ff<0)) %>%
group_by(id) %>%
mutate(any_neg=ifelse(any(ff < 0), 1, 0))
Output:
> head(df, 20)
Source: local data frame [20 x 5]
Groups: id [2]
id ff pos neg any_neg
<int> <dbl> <int> <int> <dbl>
1 1 19.1934183 6 4 1
2 1 -7.9057744 6 4 1
3 1 5.0837978 6 4 1
4 1 8.8600765 6 4 1
5 1 5.6597565 6 4 1
6 1 -1.4857432 6 4 1
7 1 21.1613080 6 4 1
8 1 -1.3252265 6 4 1
9 1 28.2579320 6 4 1
10 1 -0.8779974 6 4 1
11 2 18.2681752 4 6 1
12 2 32.0130355 4 6 1
13 2 -19.4440498 4 6 1
14 2 -3.9030427 4 6 1
15 2 -1.8664987 4 6 1
16 2 8.9033056 4 6 1
17 2 -3.9795409 4 6 1
18 2 -37.1903759 4 6 1
19 2 -34.1665370 4 6 1
20 2 18.4815868 4 6 1

Resources