Merge columns of dataframe with all combinations of variables - r

"w" "n"
"1" 2 1
"2" 3 1
"3" 4 1
"4" 2 1
"5" 5 1
"6" 6 1
"7" 3 2
"8" 7 2
I tried the following command, but it didn't produce the change I expected.
w2 <- w1 %>%
expand(w,n)
My output should look like this
w n
2 1
2 2
3 1
3 2
4 1
4 2
5 1
5 2
6 1
6 2
7 1
7 2
data
w1 <- structure(list(w = c(2L, 3L, 3L, 4L, 5L, 6L, 7L), n = c(1L, 1L,
2L, 1L, 1L, 1L, 2L)), .Names = c("w", "n"), row.names = c(NA,
-7L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), groups = structure(list(
w = c(2L, 3L, 3L, 4L, 5L, 6L, 7L), n = c(1L, 1L, 2L, 1L,
1L, 1L, 2L), .rows = list(1L, 2L, 3L, 4L, 5L, 6L, 7L)), .Names = c("w",
"n", ".rows"), row.names = c(NA, -7L), class = c("tbl_df", "tbl",
"data.frame"), .drop = TRUE))

The issue is that your data frame is grouped; consider:
w1 %>%
  ungroup() %>%
  expand(w, n)
Output:
# A tibble: 12 x 2
w n
<int> <int>
1 2 1
2 2 2
3 3 1
4 3 2
5 4 1
6 4 2
7 5 1
8 5 2
9 6 1
10 6 2
11 7 1
12 7 2
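If ungrouping is undesirable, a hedged alternative is to build the grid directly from the distinct values with tidyr::crossing(), which never touches the grouping of w1:
library(tidyr)
# crossing() returns a tibble with every combination of its (sorted) inputs
crossing(w = unique(w1$w), n = unique(w1$n))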

We can use complete from tidyr.
library(dplyr)
library(tidyr)
dat2 <- dat %>%
  distinct(w, .keep_all = TRUE) %>%
  complete(w, n)
dat2
# # A tibble: 12 x 2
# w n
# <int> <int>
# 1 2 1
# 2 2 2
# 3 3 1
# 4 3 2
# 5 4 1
# 6 4 2
# 7 5 1
# 8 5 2
# 9 6 1
# 10 6 2
# 11 7 1
# 12 7 2
DATA
dat <- read.table(text = "w n
2 1
3 1
4 1
2 1
5 1
6 1
3 2
7 2",
header = TRUE)
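A note on the distinct() step: complete() keeps every existing row, so without it the duplicated w = 2, n = 1 row in dat would survive and the result should have 13 rows instead of 12:
# the 12-row grid plus the extra duplicated (2, 1) row
dat %>% complete(w, n) %>% nrow()
# [1] 13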

Using the original data frame df you can create a new data frame that repeats each unique value of w for every unique value of n (uniqueN() comes from data.table, so load it first):
library(data.table)

data.frame(w = rep(unique(df$w), each = uniqueN(df$n)),
           n = rep(unique(df$n), times = uniqueN(df$w)))
Output:
w n
1 2 1
2 2 2
3 3 1
4 3 2
5 4 1
6 4 2
7 5 1
8 5 2
9 6 1
10 6 2
11 7 1
12 7 2
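A dependency-free version of the same idea is base R's expand.grid() (a sketch on the same df; expand.grid() varies its first argument fastest, so n is listed first and the columns are reordered afterwards):
expand.grid(n = unique(df$n), w = unique(df$w))[, c("w", "n")]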

Replace NA values in dataframe with variables in the next column (R)

I am new to R and still learning, and I could not find the answer I am looking for in any other thread.
I have a dataset with (for simplicity) 5 columns. Columns 1, 2, and 4 always have values, but in some rows column 3 doesn't. Below is an example:
Current
A B C D E
1 1 2 3
1 2 NA 4 5
1 2 3 4
1 3 NA 9 7
1 2 NA 5 6
I want the NAs to be replaced by the value in column D, with the value in column E then shifted into D, and so on.
Desired output:
A B C D E
1 1 2 3 NA
1 2 4 5 NA
1 2 3 4 NA
1 3 9 7 NA
1 2 5 6 NA
I copied code from different Stack Overflow threads and none of it achieved what I wanted; na.omit just gets rid of the whole row. Any help is greatly appreciated.
Data
data <- structure(list(A = c(1L, 1L, 1L, 1L, 1L), B = c(1L, 2L, 2L, 3L,
2L), C = c(2L, NA, 3L, NA, NA), D = c(3L, 4L, 4L, 9L, 5L), E = c(NA,
5L, NA, 7L, 6L)), class = "data.frame", row.names = c(NA, -5L
))
Code
library(dplyr)
data %>%
  mutate(
    aux = C,                        # keep a copy of the original C
    C = if_else(is.na(aux), D, C),  # pull D into the missing C cells
    D = if_else(is.na(aux), E, D),  # pull E into D for those same rows
    E = NA
  ) %>%
  select(-aux)
Output
A B C D E
1 1 1 2 3 NA
2 1 2 4 5 NA
3 1 2 3 4 NA
4 1 3 9 7 NA
5 1 2 5 6 NA
Replacement operation all in one go — the right-hand side is a list of the D and E columns from the rows where C is NA, plus NA, and its three elements fill C, D and E in order:
dat[is.na(dat$C), c("C","D","E")] <- c(dat[is.na(dat$C), c("D","E")], NA)
dat
# A B C D E
#1 1 1 2 3 NA
#2 1 2 4 5 NA
#3 1 2 3 4 NA
#4 1 3 9 7 NA
#5 1 2 5 6 NA
Where dat was:
dat <- read.table(text="A B C D E
1 1 2 3
1 2 NA 4 5
1 2 3 4
1 3 NA 9 7
1 2 NA 5 6", fill=TRUE, header=TRUE)
Using shift_row_values
library(hacksaw)
shift_row_values(df1)
A B C D E
1 1 1 2 3 NA
2 1 2 4 5 NA
3 1 2 3 4 NA
4 1 3 9 7 NA
5 1 2 5 6 NA
data
df1 <- structure(list(A = c(1L, 1L, 1L, 1L, 1L), B = c(1L, 2L, 2L, 3L,
2L), C = c(2L, NA, 3L, NA, NA), D = c(3L, 4L, 4L, 9L, 5L), E = c(NA,
5L, NA, 7L, 6L)), class = "data.frame", row.names = c(NA, -5L
))
A general base R approach using order(), which pushes the NAs in each row to the end without needing to know where they are:
setNames(data.frame(t(apply(data, 1, function(x) x[order(is.na(x))]))),
         colnames(data))
A B C D E
1 1 1 2 3 NA
2 1 2 4 5 NA
3 1 2 3 4 NA
4 1 3 9 7 NA
5 1 2 5 6 NA
Using dplyr
library(dplyr)
t(data) %>%
  data.frame() %>%
  mutate(across(everything(), ~ .x[order(is.na(.x))])) %>%
  t() %>%
  as_tibble()
# A tibble: 5 × 5
A B C D E
<int> <int> <int> <int> <int>
1 1 1 2 3 NA
2 1 2 4 5 NA
3 1 2 3 4 NA
4 1 3 9 7 NA
5 1 2 5 6 NA
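One caveat with this transposed route: t(data) coerces the data frame to a matrix, so it only behaves well when every column shares one type; with mixed types the whole matrix would be coerced to character. Here all five columns are integer, which a quick check confirms:
sapply(data, class)
#         A         B         C         D         E
# "integer" "integer" "integer" "integer" "integer"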
Data
data <- structure(list(A = c(1L, 1L, 1L, 1L, 1L), B = c(1L, 2L, 2L, 3L,
2L), C = c(2L, NA, 3L, NA, NA), D = c(3L, 4L, 4L, 9L, 5L), E = c(NA,
5L, NA, 7L, 6L)), class = "data.frame", row.names = c(NA, -5L
))

R Recode variable for all observations that do not occur more than once

I have a simple dataframe that looks like the following:
Observation X1 X2 Group
1 2 4 1
2 6 3 2
3 8 4 2
4 1 3 3
5 2 8 4
6 7 5 5
7 2 4 5
How can I recode the group variable such that all non-recurrent observations are recoded as "unaffiliated"?
The desired output would be the following:
Observation X1 X2 Group
1 2 4 Unaffiliated
2 6 3 2
3 8 4 2
4 1 3 Unaffiliated
5 2 8 Unaffiliated
6 7 5 5
7 2 4 5
We may use duplicated() to create a logical vector identifying the values that occur only once and assign "Unaffiliated" to 'Group' for those rows:
df1$Group[with(df1, !(duplicated(Group) |
                        duplicated(Group, fromLast = TRUE)))] <- "Unaffiliated"
-output
> df1
Observation X1 X2 Group
1 1 2 4 Unaffiliated
2 2 6 3 2
3 3 8 4 2
4 4 1 3 Unaffiliated
5 5 2 8 Unaffiliated
6 6 7 5 5
7 7 2 4 5
data
df1 <- structure(list(Observation = 1:7, X1 = c(2L, 6L, 8L, 1L, 2L,
7L, 2L), X2 = c(4L, 3L, 4L, 3L, 8L, 5L, 4L), Group = c(1L, 2L,
2L, 3L, 4L, 5L, 5L)), class = "data.frame", row.names = c(NA,
-7L))
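To see what the logical index does on its own, here is a quick check on the Group vector from this example; the two duplicated() calls together flag every value that occurs more than once, and the negation picks out the single-occurrence rows:
g <- c(1L, 2L, 2L, 3L, 4L, 5L, 5L)
duplicated(g) | duplicated(g, fromLast = TRUE)
# [1] FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE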
unaffil takes a vector of Group numbers and returns "Unaffiliated" if it has one element; otherwise it returns the input. We then apply it by Group using ave. This does not overwrite the input. No packages are used, but if you use dplyr then transform can be replaced with mutate.
unaffil <- function(x) if (length(x) == 1) "Unaffiliated" else x
transform(dat, Group = ave(Group, Group, FUN = unaffil))
giving
Observation X1 X2 Group
1 1 2 4 Unaffiliated
2 2 6 3 2
3 3 8 4 2
4 4 1 3 Unaffiliated
5 5 2 8 Unaffiliated
6 6 7 5 5
7 7 2 4 5
Note
dat <- structure(list(Observation = 1:7, X1 = c(2L, 6L, 8L, 1L, 2L,
7L, 2L), X2 = c(4L, 3L, 4L, 3L, 8L, 5L, 4L), Group = c(1L, 2L,
2L, 3L, 4L, 5L, 5L)), class = "data.frame", row.names = c(NA,
-7L))
One way is to group first, check whether the group's highest row number is 1 (i.e. the group contains a single observation), and finish with an ifelse:
library(dplyr)
df %>%
  group_by(Group) %>%
  mutate(Group = ifelse(max(row_number()) == 1, "Unaffiliated", as.character(Group))) %>%
  ungroup()
Observation X1 X2 Group
<int> <int> <int> <chr>
1 1 2 4 Unaffiliated
2 2 6 3 2
3 3 8 4 2
4 4 1 3 Unaffiliated
5 5 2 8 Unaffiliated
6 6 7 5 5
7 7 2 4 5
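Equivalently, since n() gives the group size inside mutate(), the condition can be written a little more directly (same result as above):
df %>%
  group_by(Group) %>%
  mutate(Group = ifelse(n() == 1, "Unaffiliated", as.character(Group))) %>%
  ungroup()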

R Fill in missing rows

I have a question similar to this one: Fill in missing rows in R
However, the gaps I need to fill are not only months, but also missing years in between for one ID. This is an example:
structure(list(ID = c("A", "A", "A", "A", "A", "B", "B", "B",
"B"), A = c(1L, 1L, 3L, 3L, 3L, 2L, 2L, 2L, 3L), B = c(1L, 2L,
1L, 2L, 3L, 1L, 2L, 3L, 3L), Var1 = 12:4), class = "data.frame", row.names = c(NA,
-9L))
ID A B Var1
1 A 1 1 12
2 A 1 2 11
3 A 3 1 10
4 A 3 2 9
5 A 3 3 8
6 B 2 1 7
7 B 2 2 6
8 B 2 3 5
9 B 3 3 4
And this is what I want it to look like:
ID A B Var1
1 A 1 1 12
2 A 1 2 11
3 A 1 3 0
4 A 2 1 0
5 A 2 2 0
6 A 2 3 0
7 A 3 1 10
8 A 3 2 9
9 A 3 3 8
10 B 2 1 7
11 B 2 2 6
12 B 2 3 5
13 B 3 1 0
14 B 3 2 0
15 B 3 3 4
Does anyone have an idea how to solve this? I have already played around with the solutions mentioned above.
library(tidyverse)
df <- structure(list(ID = c("A", "A", "A", "A", "A", "B", "B", "B",
"B"), A = c(1L, 1L, 3L, 3L, 3L, 2L, 2L, 2L, 3L), B = c(1L, 2L,
1L, 2L, 3L, 1L, 2L, 3L, 3L), Var1 = 12:4), class = "data.frame", row.names = c(NA,
-9L))
df %>%
complete(ID, A, B, fill = list(Var1 = 0))
#> # A tibble: 18 x 4
#> ID A B Var1
#> <chr> <int> <int> <dbl>
#> 1 A 1 1 12
#> 2 A 1 2 11
#> 3 A 1 3 0
#> 4 A 2 1 0
#> 5 A 2 2 0
#> 6 A 2 3 0
#> 7 A 3 1 10
#> 8 A 3 2 9
#> 9 A 3 3 8
#> 10 B 1 1 0
#> 11 B 1 2 0
#> 12 B 1 3 0
#> 13 B 2 1 7
#> 14 B 2 2 6
#> 15 B 2 3 5
#> 16 B 3 1 0
#> 17 B 3 2 0
#> 18 B 3 3 4
Created on 2021-03-03 by the reprex package (v1.0.0)
You could use the solution described there, altering it slightly for your problem.
df
full <- with(df, unique(expand.grid(ID = ID, A = A, B = B)))
complete <- merge(df, full, by = c('ID', 'A', 'B'), all.y = TRUE)
complete$Var1[is.na(complete$Var1)] <- 0
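A slightly leaner way to build full is to expand only the unique values, which avoids creating and then deduplicating the much larger grid of all row combinations (a sketch on the same df):
full <- expand.grid(ID = unique(df$ID), A = unique(df$A), B = unique(df$B))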
Just in case somebody else has the same question, this is what I came up with, thanks to the answers provided:
library(tidyverse)
df %>% group_by(ID) %>% complete(ID, A = full_seq(A,1), B, fill = list(Var1 = 0))
This avoids producing combinations that never occur for a given ID (for example, the A = 1 rows for ID B that the plain complete(ID, A, B) call above generates).

Sum values larger than the current value, by group

I have measured basal area of trees in different plots. Here's a small example with two plots with 4 trees each:
Plot Tree BasalArea
1 1 4
1 2 5
1 3 7
1 4 3
2 1 4
2 2 6
2 3 9
2 4 5
Within each plot, I want to calculate the sum of the basal areas of the trees whose basal area is larger than that of the focal tree.
For example, Tree 1 in Plot 1 has an area of 4. Within that plot there are 2 trees with an area larger than tree 1: Tree 2 and Tree 3 with area 5 and 7, respectively. So, "BA_Larger" for tree 1 is 5 + 7 = 12.
Tree 2 in the same plot has basal area = 5. Within plot 1 there is only one tree with a larger area than tree 2: tree 3 with area 7. Thus, "BA_Larger" for tree 2 is 7.
Finally, the data frame should be like this:
Plot Tree BasalArea BA_Larger
1 1 4 12
1 2 5 7
1 3 7 0
1 4 3 16
2 1 4 20
2 2 6 9
2 3 9 0
2 4 5 15
The data set is very large. I have tried to calculate the "BA_Larger", without success. Any help is highly appreciated.
The base R solution with ave():
within(df, BA_Larger <- ave(BasalArea, Plot,
                            FUN = function(x) sapply(x, function(y) sum(x[x > y]))))
With a tidyverse style, you can also use map_int() or map_dbl() from purrr.
library(dplyr)
library(purrr)
df %>%
  group_by(Plot) %>%
  mutate(BA_Larger = map_int(BasalArea, ~ sum(BasalArea[BasalArea > .]))) %>%
  ungroup()
Output
# # A tibble: 8 x 4
# Plot Tree BasalArea BA_Larger
# <int> <int> <int> <int>
# 1 1 1 4 12
# 2 1 2 5 7
# 3 1 3 7 0
# 4 1 4 3 16
# 5 2 1 4 20
# 6 2 2 6 9
# 7 2 3 9 0
# 8 2 4 5 15
Data
df <- structure(list(Plot = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), Tree = c(1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L), BasalArea = c(4L, 5L, 7L, 3L, 4L,
6L, 9L, 5L)), class = "data.frame", row.names = c(NA, -8L))
Another solution
library(tidyverse)
df %>%
  group_by(Plot) %>%
  arrange(BasalArea, .by_group = TRUE) %>%
  mutate(res = sum(BasalArea) - cumsum(BasalArea)) %>%
  arrange(Tree, .by_group = TRUE) %>%
  ungroup()
# A tibble: 8 x 4
Plot Tree BasalArea res
<int> <int> <int> <int>
1 1 1 4 12
2 1 2 5 7
3 1 3 7 0
4 1 4 3 16
5 2 1 4 20
6 2 2 6 9
7 2 3 9 0
8 2 4 5 15
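A caveat on this shortcut: because equal values sort next to each other, "total minus cumulative sum" also counts ties as if they were larger, which differs from the strictly-greater definition in the question. The example data has no ties within a plot, so the results agree here; with ties, one of the strictly-greater approaches above is safer. A tiny illustration on a single plot with two equal areas:
x <- c(3, 5, 5, 7)
sum(x) - cumsum(sort(x))              # 17 12  7  0  <- the first 5 also counts the other 5
sapply(x, function(y) sum(x[x > y]))  # 17  7  7  0  <- strictly larger only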
Using a non-equi join with data.table, calculating the sum of BasalArea for each match:
library(data.table)
setDT(d)
d[ , ba2 := d[d, on = .(Plot, BasalArea > BasalArea), sum(x.BasalArea), by = .EACHI]$V1]
# Plot Tree BasalArea ba2
# 1: 1 1 4 12
# 2: 1 2 5 7
# 3: 1 3 7 NA
# 4: 1 4 3 16
# 5: 2 1 4 20
# 6: 2 2 6 9
# 7: 2 3 9 NA
# 8: 2 4 5 15
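The non-equi join leaves NA where a tree has no larger neighbour; if 0 is wanted instead, as in the expected output, it can be filled in afterwards:
d[is.na(ba2), ba2 := 0L]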
Actually you don't need a package to do this. Using by you can split the data on the Plot column, then, for each tree i, sum the basal areas in that subset that are larger than tree i's. Finally, unsplit the result according to df1$Plot.
res <- unsplit(by(df1, df1$Plot, function(x)
  transform(x, BA_Larger = sapply(1:nrow(x), function(i)
    sum(x[x[, 3] > x[i, 3], 3])))), df1$Plot)
res
# Plot Tree BasalArea BA_Larger
# 1 1 1 4 12
# 2 1 2 5 7
# 3 1 3 7 0
# 4 1 4 3 16
# 5 2 1 4 20
# 6 2 2 6 9
# 7 2 3 9 0
# 8 2 4 5 15
Data:
df1 <- structure(list(Plot = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), Tree = c(1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L), BasalArea = c(4L, 5L, 7L, 3L, 4L,
6L, 9L, 5L)), class = "data.frame", row.names = c(NA, -8L))

How to create a calculated column using a lookup table and a formula in R?

I have a dataframe that looks like this:
V1 V2 V3
3 4 3
2 4 3
4 4 3
4 4 4
1 4 2
4 2 4
4 4 1
4 4 2
3 4 1
4 4 4
4 4 2
4 4 2
2 1 2
3 2 3
3 4 3
3 4 2
4 4 2
4 4 4
2 3 3
3 4 1
I also have a lookup table like this:
V_id coeff weight
V1 0.82 4.77
V2 0.75 4.77
V3 0.67 4.77
I want to use these values from the lookup table to create a new calculated column in DF1, multiplying each column by its own coefficient:
(V1*coeff_V1 + V2*coeff_V2 + V3*coeff_V3) / weight
The final dataframe should look like this.
V1 V2 V3 new_column
3 4 3 1.566037736
2 4 3 1.394129979
4 4 3 1.737945493
4 4 4 1.878406709
1 4 2 1.081761006
4 2 4 1.5639413
4 4 1 1.457023061
4 4 2 1.597484277
3 4 1 1.285115304
4 4 4 1.878406709
4 4 2 1.597484277
4 4 2 1.597484277
2 1 2 0.78197065
3 2 3 1.251572327
3 4 3 1.566037736
3 4 2 1.42557652
4 4 2 1.597484277
4 4 4 1.878406709
2 3 3 1.236897275
3 4 1 1.285115304
I have to do this for a data frame with 1125 columns.
Edit: updated answer to the updated question (the data frame has 1,125 columns):
df1_V <- as.matrix(df1) # or select the "V" columns using df1[, 1:1125]
df1$new_column <- df1_V %*% df2$coeff / df2$weight[1]
This is a general solution that works for any number of columns, provided the columns of df1 are arranged in the same order (across the data frame) as the coeff values appear (row-wise) in df2, and the number of columns in df1 equals the number of rows in df2, i.e. ncol(df1_V) == nrow(df2).
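If the column order of df1 is not guaranteed to follow the row order of df2, the coefficients can be aligned by name before the multiplication; a hedged sketch, assuming the V_id values match the column names of df1:
coefs <- df2$coeff[match(colnames(df1_V), df2$V_id)] # one coefficient per column, in column order
stopifnot(!anyNA(coefs))                             # every column must have a matching coefficient
df1$new_column <- as.vector(df1_V %*% coefs) / df2$weight[1]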
Answer (to original question):
library(dplyr)
df %>%
mutate(new_column = (V1*0.82 + V2*0.75 + V3*0.67) / 4.77)
V1 V2 V3 new_column
1 3 4 3 1.5660377
2 2 4 3 1.3941300
3 4 4 3 1.7379455
4 4 4 4 1.8784067
5 1 4 2 1.0817610
6 4 2 4 1.5639413
...
Alternative:
df1$new_column <- as.matrix(df1) %*% c(0.82, 0.75, 0.67) / 4.77
We can convert the first dataset to long format, join the lookup table, and then group by row number to get the calculated column:
library(dplyr)
library(tidyr)
df1 %>%
  mutate(rn = row_number()) %>%
  pivot_longer(cols = -rn, names_to = "V_id") %>%
  left_join(df2) %>%
  group_by(rn) %>%
  summarise(new_column = sum(coeff * value) / weight[1]) %>%
  select(new_column) %>%
  bind_cols(df1, .)
# A tibble: 20 x 4
# V1 V2 V3 new_column
# <int> <int> <int> <dbl>
# 1 3 4 3 1.57
# 2 2 4 3 1.39
# 3 4 4 3 1.74
# 4 4 4 4 1.88
# 5 1 4 2 1.08
# 6 4 2 4 1.56
# 7 4 4 1 1.46
# 8 4 4 2 1.60
# 9 3 4 1 1.29
#10 4 4 4 1.88
#11 4 4 2 1.60
#12 4 4 2 1.60
#13 2 1 2 0.782
#14 3 2 3 1.25
#15 3 4 3 1.57
#16 3 4 2 1.43
#17 4 4 2 1.60
#18 4 4 4 1.88
#19 2 3 3 1.24
#20 3 4 1 1.29
In base R, we can also do it in one step with tcrossprod(), which multiplies the coefficient vector by the transposed data matrix:
df1$new_column <- c(tcrossprod(df2$coeff, as.matrix(df1)))/df2$weight[1]
data
df1 <- structure(list(V1 = c(3L, 2L, 4L, 4L, 1L, 4L, 4L, 4L, 3L, 4L,
4L, 4L, 2L, 3L, 3L, 3L, 4L, 4L, 2L, 3L), V2 = c(4L, 4L, 4L, 4L,
4L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 1L, 2L, 4L, 4L, 4L, 4L, 3L, 4L
), V3 = c(3L, 3L, 3L, 4L, 2L, 4L, 1L, 2L, 1L, 4L, 2L, 2L, 2L,
3L, 3L, 2L, 2L, 4L, 3L, 1L)), class = "data.frame", row.names = c(NA,
-20L))
df2 <- structure(list(V_id = c("V1", "V2", "V3"), coeff = c(0.82, 0.75,
0.67), weight = c(4.77, 4.77, 4.77)), class = "data.frame", row.names = c(NA,
-3L))
