join and sum columns together R - r

I have a dataframe:
df <- data.frame(ca = c("a","b","a","c","b", "b"),
f = c(3,4,0,NA,3, 4),
f2 = c(NA,5,6,1,9, 7),
f3 = c(3,0,6,3,0, 8))
I want join and sum my columns "f" and "f2" and rename it in "f_news"
exemple :
df <- data.frame(ca = c("a","b","a","c","b", "b"),
f_new = c(3,9,6,1,12, 11),
f3 = c(3,0,6,3,0, 8))
Do you have an idea of how to do this with summarise, spread, group_by?

Using plyr and dplyr you can do this:
df %>%
rowwise() %>%
mutate(f_new=sum(f, f2, na.rm = T))
# A tibble: 6 x 5
# ca f f2 f3 f_new
# <fct> <dbl> <dbl> <dbl> <dbl>
#1 a 3 NA 3 3
#2 b 4 5 0 9
#3 a 0 6 6 6
#4 c NA 1 3 1
#5 b 3 9 0 12
#6 b 4 7 8 11
This method will retain and NA values

Here is an answer using tidyverse methods from dplyr and tidyr
library(tidyverse)
df <- data.frame(ca = c("a","b","a","c","b", "b"),
f = c(3,4,0,NA,3, 4),
f2 = c(NA,5,6,1,9, 7),
f3 = c(3,0,6,3,0, 8))
df %>%
replace_na(list(f = 0, f2 = 0)) %>%
mutate(f_new = f + f2)
#> ca f f2 f3 f_new
#> 1 a 3 0 3 3
#> 2 b 4 5 0 9
#> 3 a 0 6 6 6
#> 4 c 0 1 3 1
#> 5 b 3 9 0 12
#> 6 b 4 7 8 11

Dplyr can do this quite nice with the following code. Rowwise allows you to consider each row separately. And the mutate command sums whatever columns you want. the na.rm=TRUE handles the issue when you have NA's and want to ignore them. As a comment mentioned, if you do not have this, it will give you an NA if it's in any of the summed values.
library(dplyr)
df %>%
rowwise() %>%
mutate(f_new = sum(f,f2, na.rm = TRUE))

Related

R: expand grid of all possible combinations within groups and apply functions across all the pairs

data <- tibble(time = c(1,1,2,2), a = c(1,2,3,4), b =c(4,3,2,1), c = c(1,1,1,1))
The result will look like this
result <- tibble(
t = c(1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2),
firm1 = c("a","a","a","b","b","b","c","c","c","a","a","a","b","b","b","c","c","c"),
firm2 = c("a","b","c","a","b","c","a","b","c","a","b","c","a","b","c","a","b","c"),
value = c(6,10,5,10,14,9,5,9,4,14,10,9,10,6,5,9,5,4))
result
The function could be
function(x, y){sum(x, y)}
Basically I am looking for a tidy solution to expand.grid data at each point of time and apply functions across columns. Can anyone help?
I tried this, but I could not have time in front of the pairs.
expected_result<-expand.grid(names(data[-1]), names(data[-1])) %>%
mutate(value = map2(Var1, Var2, ~ fun1(data[.x], data[.y])))
expected_result
Use exand.grid you get all possible combination of columns, split the data by time and apply fun for each row of tmp.
library(dplyr)
library(purrr)
tmp <- expand.grid(firm1 = names(data[-1]), firm2 = names(data[-1]))
fun <- function(x, y) sum(x, y)
result <- data %>%
group_split(time) %>%
map_df(~cbind(time = .x$time[1], tmp,
value = apply(tmp, 1, function(x) fun(.x[[x[1]]], .x[[x[2]]]))))
result
# time firm1 firm2 value
#1 1 a a 6
#2 1 b a 10
#3 1 c a 5
#4 1 a b 10
#5 1 b b 14
#6 1 c b 9
#7 1 a c 5
#8 1 b c 9
#9 1 c c 4
#10 2 a a 14
#11 2 b a 10
#12 2 c a 9
#13 2 a b 10
#14 2 b b 6
#15 2 c b 5
#16 2 a c 9
#17 2 b c 5
#18 2 c c 4
You may also do this in base R -
result <- do.call(rbind, by(data, data$time, function(x) {
cbind(time = x$time[1], tmp,
value = apply(tmp, 1, function(y) fun(x[[y[1]]], x[[y[2]]])))
}))
We may use
library(dplyr)
library(tidyr)
library(purrr)
data1 <- data %>%
group_by(time) %>%
summarise(across(everything(), sum, na.rm = TRUE), .groups = 'drop') %>%
pivot_longer(cols = -time) %>%
group_split(time)
map_dfr(data1, ~ {dat <- .x
crossing(firm1 = dat$name, firm2 = dat$name) %>%
mutate(value = c(outer(dat$value, dat$value, FUN = `+`))) %>%
mutate(time = first(dat$time), .before = 1)})
-output
# A tibble: 18 × 4
time firm1 firm2 value
<dbl> <chr> <chr> <dbl>
1 1 a a 6
2 1 a b 10
3 1 a c 5
4 1 b a 10
5 1 b b 14
6 1 b c 9
7 1 c a 5
8 1 c b 9
9 1 c c 4
10 2 a a 14
11 2 a b 10
12 2 a c 9
13 2 b a 10
14 2 b b 6
15 2 b c 5
16 2 c a 9
17 2 c b 5
18 2 c c 4

If a column is NA, calculate row mean on other columns using dplyR

In the example below how can I calculate the row mean when column A is NA? The row mean would replace the NA in column A. Using base R, I can use this:
foo <- tibble(A = c(3,5,NA,6,NA,7,NA),
B = c(4,5,4,5,6,4,NA),
C = c(6,5,2,8,8,5,NA))
foo
tmp <- rowMeans(foo[,-1],na.rm = TRUE)
foo$A[is.na(foo$A)] <- tmp[is.na(foo$A)]
foo$A[is.nan(foo$A)] <- NA
Curious how I might do this with dplyR?
You can use ifelse :
library(dplyr)
foo %>%
mutate(A = ifelse(is.na(A), rowMeans(., na.rm = TRUE), A),
A = replace(A, is.nan(A), NA))
# A B C
# <dbl> <dbl> <dbl>
#1 3 4 6
#2 5 5 5
#3 3 4 2
#4 6 5 8
#5 7 6 8
#6 7 4 5
#7 NA NA NA
Here is a solution that not only replace NA in column A, but for all columns in the data frame.
library(dplyr)
foo2 <- foo %>%
mutate(RowMean = rowMeans(., na.rm = TRUE)) %>%
mutate(across(-RowMean, .fns =
function(x) ifelse(is.na(x) & !is.nan(RowMean), RowMean, x))) %>%
select(-RowMean)
Use coalesce:
foo %>%
mutate(m = rowMeans(across(), na.rm = T),
A = if_else(is.na(A) & !is.na(m), m, A)) %>%
select(-m)
# # A tibble: 7 x 3
# A B C
# <dbl> <dbl> <dbl>
# 1 3 4 6
# 2 5 5 5
# 3 3 4 2
# 4 6 5 8
# 5 7 6 8
# 6 7 4 5
# 7 NA NA NA

How to replace NA with set of values

I have the following data frame:
library(dplyr)
library(tibble)
df <- tibble(
source = c("a", "b", "c", "d", "e"),
score = c(10, 5, NA, 3, NA ) )
df
It looks like this:
# A tibble: 5 x 2
source score
<chr> <dbl>
1 a 10 . # current max value
2 b 5
3 c NA
4 d 3
5 e NA
What I want to do is to replace NA in score column with values ranging for existing max + n onwards. Where n range from 1 to total number of rows of the df
Resulting in this (hand-coded) :
source score
a 10
b 5
c 11 # obtained from 10 + 1
d 3
e 12 # obtained from 10 + 2
How can I achieve that?
Another option :
transform(df, score = pmin(max(score, na.rm = TRUE) +
cumsum(is.na(score)), score, na.rm = TRUE))
# source score
#1 a 10
#2 b 5
#3 c 11
#4 d 3
#5 e 12
If you want to do this in dplyr
library(dplyr)
df %>% mutate(score = pmin(max(score, na.rm = TRUE) +
cumsum(is.na(score)), score, na.rm = TRUE))
A base R solution
df$score[is.na(df$score)] <- seq(which(is.na(df$score))) + max(df$score,na.rm = TRUE)
such that
> df
# A tibble: 5 x 2
source score
<chr> <dbl>
1 a 10
2 b 5
3 c 11
4 d 3
5 e 12
Here is a dplyr approach,
df %>%
mutate(score = replace(score,
is.na(score),
(max(score, na.rm = TRUE) + (cumsum(is.na(score))))[is.na(score)])
)
which gives,
# A tibble: 5 x 2
source score
<chr> <dbl>
1 a 10
2 b 5
3 c 11
4 d 3
5 e 12
With dplyr:
library(dplyr)
df %>%
mutate_at("score", ~ ifelse(is.na(.), max(., na.rm = TRUE) + cumsum(is.na(.)), .))
Result:
# A tibble: 5 x 2
source score
<chr> <dbl>
1 a 10
2 b 5
3 c 11
4 d 3
5 e 12
A dplyr solution.
df %>%
mutate(na_count = cumsum(is.na(score)),
score = ifelse(is.na(score), max(score, na.rm = TRUE) + na_count, score)) %>%
select(-na_count)
## A tibble: 5 x 2
# source score
# <chr> <dbl>
#1 a 10
#2 b 5
#3 c 11
#4 d 3
#5 e 12
Another one, quite similar to ThomasIsCoding's solution:
> df$score[is.na(df$score)]<-max(df$score, na.rm=T)+(1:sum(is.na(df$score)))
> df
# A tibble: 5 x 2
source score
<chr> <dbl>
1 a 10
2 b 5
3 c 11
4 d 3
5 e 12
Not quite elegant as compared to the base R solutions, but still possible:
library(data.table)
setDT(df)
max.score = df[, max(score, na.rm = TRUE)]
df[is.na(score), score :=(1:.N) + max.score]
Or in one line but a bit slower:
df[is.na(score), score := (1:.N) + df[, max(score, na.rm = TRUE)]]
df
source score
1: a 10
2: b 5
3: c 11
4: d 3
5: e 12

How to lag a specific column of a data frame in R

Input
(Say d is the data frame below.)
a b c
1 5 7
2 6 8
3 7 9
I want to shift the contents of column b one position down and put an arbitrary number in the first position in b. How do I do this? I would appreciate any help in this regard. Thank you.
I tried c(6,tail(d["b"],-1)) but it does not produce (6,5,6).
Output
a b c
1 6 7
2 5 8
3 6 9
Use head instead
df$b <- c(6, head(df$b, -1))
# a b c
#1 1 6 7
#2 2 5 8
#3 3 6 9
You could also use lag in dplyr
library(dplyr)
df %>% mutate(b = lag(b, default = 6))
Or shift in data.table
library(data.table)
setDT(df)[, b:= shift(b, fill = 6)]
A dplyr solution uses lag with an explicit default argument, if you prefer:
library(dplyr)
d <- tibble(a = 1:3, b = 5:7, c = 7:9)
d %>% mutate(b = lag(b, default = 6))
#> # A tibble: 3 x 3
#> a b c
#> <int> <dbl> <int>
#> 1 1 6 7
#> 2 2 5 8
#> 3 3 6 9
Created on 2019-12-05 by the reprex package (v0.3.0)
Here is a solution similar to the head approach by #Ronak Shah
df <- within(df,b <- c(runif(1),b[-1]))
where a uniformly random variable is added to the first place of b column:
> df
a b c
1 1 0.6644704 7
2 2 6.0000000 8
3 3 7.0000000 9
Best solution below will help in any lag or lead position
d <- data.frame(a=c(1,2,3),b=c(5,6,7),c=c(7,8,9))
d1 <- d %>% arrange(b) %>% group_by(b) %>%
mutate(b1= dplyr::lag(b, n = 1, default = NA))

Add together 2 dataframes in R without losing columns

I have 2 dataframes in R (df1, df2).
A C D
1 1 1
2 2 2
df2 as
A B C
1 1 1
2 2 2
How can I merge these 2 dataframes to produce the following output?
A B C D
2 1 2 1
4 2 4 2
Columns are sorted and column values are added. Both DFs have same number of rows. Thank you in advance.
Code to create DF:
df1 <- data.frame("A" = 1:2, "C" = 1:2, "D" = 1:2)
df2 <- data.frame("A" = 1:2, "B" = 1:2, "C" = 1:2)
nm1 = names(df1)
nm2 = names(df2)
nm = intersect(nm1, nm2)
if (length(nm) == 0){ # if no column names in common
cbind(df1, df2)
} else { # if column names in common
cbind(df1[!nm1 %in% nm2], # columns only in df1
df1[nm] + df2[nm], # add columns common to both
df2[!nm2 %in% nm1]) # columns only in df2
}
# D A C B
#1 1 2 2 1
#2 2 4 4 2
You can try:
library(tidyverse)
list(df2, df1) %>%
map(rownames_to_column) %>%
bind_rows %>%
group_by(rowname) %>%
summarise_all(sum, na.rm = TRUE)
# A tibble: 2 x 5
rowname A B C D
<chr> <int> <int> <int> <int>
1 1 2 1 2 1
2 2 4 2 4 2
By using left_join() from dplyr you won't lose the column
library(tidyverse)
dat1 <- tibble(a = 1:10,
b = 1:10,
c = 1:10)
dat2 <- tibble(c = 1:10,
d = 1:10,
e = 1:10)
left_join(dat1, dat2, by = "c")
#> # A tibble: 10 x 5
#> a b c d e
#> <int> <int> <int> <int> <int>
#> 1 1 1 1 1 1
#> 2 2 2 2 2 2
#> 3 3 3 3 3 3
#> 4 4 4 4 4 4
#> 5 5 5 5 5 5
#> 6 6 6 6 6 6
#> 7 7 7 7 7 7
#> 8 8 8 8 8 8
#> 9 9 9 9 9 9
#> 10 10 10 10 10 10
Created on 2019-01-16 by the reprex package (v0.2.1)
allnames <- sort(unique(c(names(df1), names(df2))))
df3 <- data.frame(matrix(0, nrow = nrow(df1), ncol = length(allnames)))
names(df3) <- allnames
df3[,allnames %in% names(df1)] <- df3[,allnames %in% names(df1)] + df1
df3[,allnames %in% names(df2)] <- df3[,allnames %in% names(df2)] + df2
df3
A B C D
1 2 1 2 1
2 4 2 4 2
Here is a fun base R method with Reduce.
Reduce(cbind,
list(Reduce("+", list(df1[intersect(names(df1), names(df2))],
df2[intersect(names(df1), names(df2))])), # sum results
df1[setdiff(names(df1), names(df2))], # in df1, not df2
df2[setdiff(names(df2), names(df1))])) # in df2, not df1
This returns
A C D B
1 2 2 1 1
2 4 4 2 2
This assumes that both df1 and df2 have columns that are not present in the other. If this is not true, you'd have to adjust the list.
Note also that you could replace Reduce with do.call in both places and you'd get the same result.

Resources