Transpose specified columns to rows with grouped data - r

I have a dataframe like this:
household person R01 R02 R03 R04 R05
1 1 1 NA 1 7 7 NA
2 1 2 1 NA 7 7 NA
3 1 3 3 3 NA 11 NA
4 1 4 3 3 11 NA NA
5 2 1 NA 7 16 NA NA
6 2 2 3 NA 7 NA NA
7 2 3 15 3 NA NA NA
and I'm trying to add new columns which are the grouped transposed versions of columns R01 to R05, like this:
household person R01 R02 R03 R04 R05 R01x R02x R03x R04x R05x
1 1 1 NA 1 7 7 NA NA 1 3 3 NA
2 1 2 1 NA 7 7 NA 1 NA 3 3 NA
3 1 3 3 3 NA 11 NA 7 7 NA 11 NA
4 1 4 3 3 11 NA NA 7 7 11 NA NA
5 2 1 NA 7 16 NA NA NA 3 15 NA NA
6 2 2 3 NA 7 NA NA 7 NA 3 NA NA
7 2 3 15 3 NA NA NA 16 7 NA NA NA
I have tried various attempts using t() and reshaping using gather() and spread() but I don't think they are designed to do this as I'm moving the data around rather than just reshaping it.
Example Code
df <- data.frame(household = c(rep(1,4),rep(2,3)),
person = c(1:4,1:3),
R01 = c(NA,1,3,3,NA,3,15),
R02 = c(1,NA,3,3,7,NA,3),
R03 = c(7,7,NA,11,16,7,NA),
R04 = c(7,7,11,rep(NA,4)),
R05 = rep(NA,7))

Referring to my previous answer, you can transpose the matrix within group_modify():
library(dplyr)
df %>%
group_by(household) %>%
group_modify(~ {
mat <- t(.x[-1][1:nrow(.x)])
colnames(mat) <- paste0(rownames(mat), "x")
cbind(.x, mat)
}) %>%
ungroup()
# # A tibble: 7 × 11
# household person R01 R02 R03 R04 R05 R01x R02x R03x R04x
# <dbl> <int> <dbl> <dbl> <dbl> <dbl> <lgl> <dbl> <dbl> <dbl> <dbl>
# 1 1 1 NA 1 7 7 NA NA 1 3 3
# 2 1 2 1 NA 7 7 NA 1 NA 3 3
# 3 1 3 3 3 NA 11 NA 7 7 NA 11
# 4 1 4 3 3 11 NA NA 7 7 11 NA
# 5 2 1 NA 7 16 NA NA NA 3 15 NA
# 6 2 2 3 NA 7 NA NA 7 NA 3 NA
# 7 2 3 15 3 NA NA NA 16 7 NA NA

Partly using a previous answer, here's a way to do it.
Split the dataframe according to their group
Get their number of columns with at least one non-NA (important to do the transposition)
Reduce their size using the length size created in step 2, and do the transposition.
Swap (again) the colnames and rownames which were swapped (first) in the transposition.
Bind the columns with the original dataframe.
# Split the relationship columns (names starting with "R") by household.
l <- split(df[startsWith(colnames(df), "R")], df$household)
# Per household: number of columns that contain nothing but NA.
len <- lapply(
  l,
  \(g) ncol(g) - sum(vapply(g, \(x) any(!is.na(x)), logical(1)))
)
# Drop that many columns from the right-hand side and transpose the rest.
# NOTE(review): this assumes the all-NA columns are the trailing ones.
l <- mapply(\(x, y) t(x[1:(length(x) - y)]), l, len, SIMPLIFY = FALSE)
# t() swapped the dimnames; swap them back so that the new columns carry the
# original column names plus an "x" suffix.
l <- lapply(l, function(x) {
  new_cols <- paste0(rownames(x), "x")
  new_rows <- colnames(x)
  rownames(x) <- new_rows
  colnames(x) <- new_cols
  data.frame(x)
})
# bind_rows() fills the columns a household does not have with NA.
cbind(df, dplyr::bind_rows(l))
output
household person R01 R02 R03 R04 R05 R01x R02x R03x R04x
1 1 1 NA 1 7 7 NA NA 1 3 3
2 1 2 1 NA 7 7 NA 1 NA 3 3
3 1 3 3 3 NA 11 NA 7 7 NA 11
4 1 4 3 3 11 NA NA 7 7 11 NA
5 2 1 NA 7 16 NA NA NA 3 15 NA
6 2 2 3 NA 7 NA NA 7 NA 3 NA
7 2 3 15 3 NA NA NA 16 7 NA NA

# Reshape the R columns to long form (keeping the column number as `name`),
# widen again with the roles of `person` and column number swapped, and join
# the transposed values back onto the original rows.
df %>%
  left_join(
    pivot_longer(., starts_with('R'), names_to = 'name',
                 names_pattern = "(\\d+)", values_drop_na = TRUE,
                 names_transform = list(name = as.integer)) %>%
      # id_cols is passed by name: positional use is deprecated in tidyr.
      pivot_wider(id_cols = c(household, name), names_from = person,
                  names_glue = "R0{person}x"),
    by = c('household', person = 'name')
  )
household person R01 R02 R03 R04 R05 R01x R02x R03x R04x
1 1 1 NA 1 7 7 NA NA 1 3 3
2 1 2 1 NA 7 7 NA 1 NA 3 3
3 1 3 3 3 NA 11 NA 7 7 NA 11
4 1 4 3 3 11 NA NA 7 7 11 NA
5 2 1 NA 7 16 NA NA NA 3 15 NA
6 2 2 3 NA 7 NA NA 7 NA 3 NA
7 2 3 15 3 NA NA NA 16 7 NA NA
Another solution:
df %>%
left_join(
reshape2::recast(.,household+variable~person,id.var = c('household', 'person'))%>%
group_by(household) %>%
mutate(person = seq_along(variable), variable = NULL))
household person R01 R02 R03 R04 R05 1 2 3 4
1 1 1 NA 1 7 7 NA NA 1 3 3
2 1 2 1 NA 7 7 NA 1 NA 3 3
3 1 3 3 3 NA 11 NA 7 7 NA 11
4 1 4 3 3 11 NA NA 7 7 11 NA
5 2 1 NA 7 16 NA NA NA 3 15 NA
6 2 2 3 NA 7 NA NA 7 NA 3 NA
7 2 3 15 3 NA NA NA 16 7 NA NA

Here's a way to do it.
library(dplyr)
transposed_df <- df %>%
group_split(household) %>%
lapply(\(x){
select(x, -1:-2) %>%
t() %>%
head(nrow(x)) %>%
as_tibble() %>%
setNames(paste0(names(x)[-1:-2], 'x'))
}) %>%
bind_rows()
df %>%
bind_cols(transposed_df)
#> household person R01 R02 R03 R04 R05 R01x R02x R03x R04x
#> 1 1 1 NA 1 7 7 NA NA 1 3 3
#> 2 1 2 1 NA 7 7 NA 1 NA 3 3
#> 3 1 3 3 3 NA 11 NA 7 7 NA 11
#> 4 1 4 3 3 11 NA NA 7 7 11 NA
#> 5 2 1 NA 7 16 NA NA NA 3 15 NA
#> 6 2 2 3 NA 7 NA NA 7 NA 3 NA
#> 7 2 3 15 3 NA NA NA 16 7 NA NA

Related

Counting the Position of the First 0 in Each Row

I have a dataset that looks like this:
set.seed(999)
col1 = sample.int(10, 10)
col2 = sample.int(10, 10)
col3 = sample.int(10, 10)
col4 = sample.int(10, 10)
col5 = sample.int(10, 10)
col_data = data.frame(col1, col2, col3, col4, col5)
col1 col2 col3 col4 col5
1 4 8 3 9 8
2 7 5 9 7 10
3 1 7 7 8 2
4 6 6 5 5 4
5 8 10 8 3 7
6 2 3 1 2 6
7 5 9 2 1 1
8 10 2 4 4 3
9 9 1 10 6 9
10 3 4 6 10 5
I would like to create new columns in this dataset that :
Find out the position (i.e. column number) for the first "9" in each row
Find out the position (i.e. column number) for the first "7" in each row
Find out the position (i.e. column number) for the first "1" in each row
Find out the position (i.e. column number) for the first "10" in each row
Find out the position (i.e. column number) for the first "4" in each row
I thought this might be easier to do if the data was a matrix, and then convert it back to a data frame:
col_d = as.matrix(col_data)
first_4 = apply(col_d == 9, 1, which.max)
first_7 = apply(col_d == 7, 1, which.max)
first_1 = apply(col_d == 1, 1, which.max)
first_10 = apply(col_d == 10, 1, which.max)
first_4 = apply(col_d == 4, 1, which.max)
final = cbind(col_data, first_4, first_7, first_1, first_10, first_4)
But this does not appear to be working:
col1 col2 col3 col4 col5 first_4 first_7 first_1 first_10 first_4
1 4 8 3 9 8 1 1 1 1 1
2 7 5 9 7 10 1 1 1 5 1
3 1 7 7 8 2 1 2 1 1 1
4 6 6 5 5 4 5 1 1 1 5
5 8 10 8 3 7 1 5 1 2 1
6 2 3 1 2 6 1 1 3 1 1
7 5 9 2 1 1 1 1 4 1 1
8 10 2 4 4 3 3 1 1 1 3
9 9 1 10 6 9 1 1 2 3 1
10 3 4 6 10 5 2 1 1 4 2
For example: In the first row, there is no 10 - but the value of "first_10" is 1
Is there a way to resolve this error?
Thank you!
How about
apply(col_data == 7, 1, function(x) {ifelse(sum(x)==0, NA, which.max(x))})
[1] NA 1 2 NA 5 NA NA NA NA NA
apply(col_data == 10, 1, function(x) {ifelse(sum(x)==0, NA, which.max(x))})
[1] NA 5 NA NA 2 NA NA 1 3 4
You may replace NA with whatever value you want; it indicates that the number (e.g. 7 or 10) does not occur in that row.
get second one
apply(col_data == 7, 1, function(x) {ifelse(sum(x)==0, NA, which(x)[2])})
get last one
apply(col_data == 7, 1, function(x) {ifelse(sum(x)==0, NA, dplyr::last(which(x)))})
Use max.col:
nr <- c(9, 7, 1, 10, 4)
nr <- setNames(nr, paste0("first_", nr))
cbind(col_data, sapply(nr, function(x) {
. <- col_data == x
tt <- max.col(., "first")
is.na(tt) <- tt == 1 & !.[,1]
tt
}))
# col1 col2 col3 col4 col5 first_9 first_7 first_1 first_10 first_4
#1 4 8 3 9 8 4 NA NA NA 1
#2 7 5 9 7 10 3 1 NA 5 NA
#3 1 7 7 8 2 NA 2 1 NA NA
#4 6 6 5 5 4 NA NA NA NA 5
#5 8 10 8 3 7 NA 5 NA 2 NA
#6 2 3 1 2 6 NA NA 3 NA NA
#7 5 9 2 1 1 2 NA 4 NA NA
#8 10 2 4 4 3 NA NA NA 1 3
#9 9 1 10 6 9 1 NA 2 3 NA
#10 3 4 6 10 5 NA NA NA 4 2
For the last:
nr <- c(9, 7, 1, 10, 4)
nr <- setNames(nr, paste0("last_", nr))
cbind(col_data, sapply(nr, function(x) {
. <- col_data == x
tt <- max.col(., "last")
is.na(tt) <- rowSums(.) == 0
tt
}))
# col1 col2 col3 col4 col5 last_9 last_7 last_1 last_10 last_4
#1 4 8 3 9 8 4 NA NA NA 1
#2 7 5 9 7 10 3 4 NA 5 NA
#3 1 7 7 8 2 NA 3 1 NA NA
#4 6 6 5 5 4 NA NA NA NA 5
#5 8 10 8 3 7 NA 5 NA 2 NA
#6 2 3 1 2 6 NA NA 3 NA NA
#7 5 9 2 1 1 2 NA 5 NA NA
#8 10 2 4 4 3 NA NA NA 1 4
#9 9 1 10 6 9 5 NA 2 3 NA
#10 3 4 6 10 5 NA NA NA 4 2
And for the second match:
nr <- c(9, 7, 1, 10, 4)
nr <- setNames(nr, paste0("2nd_", nr))
cbind(col_data, sapply(nr, function(x) {
. <- which(col_data == x, TRUE)
. <- tapply(.[,2], .[,1], `[`, 2)
replace(rep(NA_integer_, nrow(col_data)), as.integer(names(.)), .)
}))
# col1 col2 col3 col4 col5 2nd_9 2nd_7 2nd_1 2nd_10 2nd_4
#1 4 8 3 9 8 NA NA NA NA NA
#2 7 5 9 7 10 NA 4 NA NA NA
#3 1 7 7 8 2 NA 3 NA NA NA
#4 6 6 5 5 4 NA NA NA NA NA
#5 8 10 8 3 7 NA NA NA NA NA
#6 2 3 1 2 6 NA NA NA NA NA
#7 5 9 2 1 1 NA NA 5 NA NA
#8 10 2 4 4 3 NA NA NA NA 4
#9 9 1 10 6 9 5 NA NA NA NA
#10 3 4 6 10 5 NA NA NA NA NA
Or using apply on one column.
#First
apply(col_data == 9, 1, function(x) if(any(x)) which.max(x) else NA)
# [1] 4 3 NA NA NA NA 2 NA 1 NA
#Last
apply(col_data == 9, 1, function(x) if(any(x)) tail(which(x), 1) else NA)
# [1] 4 3 NA NA NA NA 2 NA 5 NA
#Second
apply(col_data == 9, 1, function(x) if(any(x)) which(x)[2] else NA)
# [1] NA NA NA NA NA NA NA NA 5 NA

Identify "lone-parent" household from family relationship matrix

I have a survey dataset which includes intra-household relationships and I'm trying to write code to identify lone parent households, defined as a household where the parent of a dependent child does not have a cohabiting partner.
Intra-family relationships are coded as:
1 = Spouse, 2= Cohabiting partner, 3 = Son/daughter, 4 = Step son/daughter, 5 = Foster child, 6 = Son-in-law/daughter-in-law, 7 = Parent/guardian, 8 = Step-parent, 9 = Foster parent, 10 = Parent-in-law, 11 = Brother/sister, 12 = Step-brother/sister, 13 = Foster brother/sister, 14 = Brother/sister-in-law, 15 = Grand-child, 16 = Grand-parent, 17 = Other relative, 18 = Other non-relative.
The identifiers of a parent therefore are 7, 8, or 9 in any of a person's relationship columns, however whether their child is dependent (under18) is represented in the child's depchild column. Whether the parent of a depchild has a partner is identified by 1 or 2 in any of the parents relationship columns.
I can't preclude the possibility of multiple families within a given household, e.g. (two lone mothers independently living with two dependent children) therefore the presence of two parents within a household with dependent children does not automatically mean a non-lone-parent household. If there are any lone-parents in a household i.e. a parent of a dependent child who does not have a partner, the household should be tagged as lonepar = 1.
Example Data
household person depchild R01 R02 R03 R04 R05 R06
1 1 1 0 NA 1 7 7 NA NA
2 1 2 0 1 NA 7 7 NA NA
3 1 3 0 3 3 NA 11 NA NA
4 1 4 1 3 3 11 NA NA NA
5 2 1 0 NA 7 16 NA NA NA
6 2 2 0 3 NA 7 NA NA NA
7 2 3 1 15 3 NA NA NA NA
8 3 1 0 NA 18 NA NA NA NA
9 3 2 0 18 NA NA NA NA NA
10 4 1 0 NA NA NA NA NA NA
11 5 1 0 NA 9 NA NA NA NA
12 5 2 1 5 NA 18 NA NA NA
13 5 3 0 2 18 NA NA NA NA
In the above example, dependent children depchild are on rows 4, 7 and 12. The parents of the child on row 4 have spouses, indicated by 1 in R02 and R01 respectively; the household is therefore not a lone-parent household, so should be lonepar = 0. The parent of the depchild on row 7 however (row 6) does not have a spouse 1 or a cohabiting partner 2, the household should therefore be lonepar = 1
Output sought
household person depchild R01 R02 R03 R04 R05 R06 lonepar
1 1 1 0 NA 1 7 7 NA NA 0
2 1 2 0 1 NA 7 7 NA NA 0
3 1 3 0 3 3 NA 11 NA NA 0
4 1 4 1 3 3 11 NA NA NA 0
5 2 1 0 NA 7 16 NA NA NA 1
6 2 2 0 3 NA 7 NA NA NA 1
7 2 3 1 15 3 NA NA NA NA 1
8 3 1 0 NA 18 NA NA NA NA 0
9 3 2 0 18 NA NA NA NA NA 0
10 4 1 0 NA NA NA NA NA NA 0
11 5 1 0 NA 9 NA NA NA NA 0
12 5 2 1 5 NA 18 NA NA NA 0
13 5 3 0 2 18 NA NA NA NA 0
Example Code
df <- data.frame(household = c(1,1,1,1,2,2,2,3,3,4,5,5,5),
person = c(1,2,3,4,1,2,3,1,2,1,1,2,3),
depchild = c(0,0,0,1,0,0,1,0,0,0,0,1,0),
R01 = c(NA, 1, 3, 3, NA, 3, 15, NA, 18, NA, NA, 5,2),
R02 = c(1, NA, 3, 3, 7, NA, 3, 18, NA, NA, 9, NA, 18),
R03 = c(7, 7, NA, 11, 16, 7, rep(NA,5), 18, NA),
R04 = c(7, 7, 11, rep(NA, 10)),
R05 = rep(NA, 13),
R06 = rep(NA, 13))
Rather than concentrating on relationships of the parents, concentrate on the relationships of the dependent children. If a dependent child only has a single relation with a value of 3, 4, or 5, then that dependent child only has a single parent in the household.
Essentially, we count up the instances of 3, 4, and 5 in each row for every person in the data frame. Then we group by household. If anyone in that household is a dependent child who only had one instance of a 3, 4, or 5 relationship code, then that household contains a dependent child with only one parent. It is therefore a lone parent household.
library(tidyverse)
df %>%
rowwise() %>%
mutate(n = length(na.omit(match(c(R01, R02, R03, R04, R05, R06), 3:5)))) %>%
group_by(household) %>%
mutate(lonepar = as.numeric(any(depchild == 1 & n == 1))) %>%
select(-n)
#> # A tibble: 12 x 10
#> # Groups: household [5]
#> household person depchild R01 R02 R03 R04 R05 R06 lonepar
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <lgl> <dbl>
#> 1 1 1 0 NA 1 7 7 NA NA 0
#> 2 1 2 0 1 NA 7 7 NA NA 0
#> 3 1 3 0 3 3 NA 11 NA NA 0
#> 4 1 4 1 3 3 11 NA NA NA 0
#> 5 2 1 0 NA 7 16 NA NA NA 1
#> 6 2 2 0 3 NA 7 NA NA NA 1
#> 7 2 3 1 15 3 NA NA NA NA 1
#> 8 3 1 0 NA 18 NA NA NA NA 0
#> 9 3 2 0 18 NA NA NA NA NA 0
#> 10 4 1 0 NA NA NA NA NA NA 0
#> 11 5 1 0 NA 9 NA NA NA NA 1
#> 12 5 2 1 5 NA NA NA NA NA 1
Created on 2022-05-16 by the reprex package (v2.0.1)

How do I populate upper.tri of matrix with matched integers from the lower.tri?

Issue
I have a dataframe of familial relationships coded with integers, where R01 is the relationship of person N to person 1, R02 their relationship to person 2 etc.
However, only the lower.tri of each family matrix is coded, so I am trying to write a function to match the correct relationship in the upper.tri.
Relationships
The relationships are coded in integers as follows:
1 = Spouse, 2 = Cohabiting partner, 3 = Son/daughter, 4 = Step son/daughter, 5 = Foster child, 6 = Son-in-law/daughter-in-law, 7 = Parent/guardian, 8 = Step-parent, 9 = Foster parent, 10 = Parent-in-law, 11 = Brother/sister, 12 = Step-brother/sister, 13 = Foster brother/sister, 14 = Brother/sister-in-law, 15 = Grand-child, 16 = Grand-parent, 17 = Other relative, 18 = Other non-relative.
thus the relationships are:
rel = c("1" = 1, "2" = 2, "3" = 7, "4" = 8, "5" = 9, "6" = 10, "7" = 3, "8" = 4, "9" = 5, "10" = 6, "11" = 11, "12" = 12, "13" = 13, "14" = 14, "15" = 16, "16" = 15, "17" = 17, "18" = 18)
Example Data
household person R01 R02 R03 R04 R05 R06
1 1 1 NA NA NA NA NA NA
2 1 2 1 NA NA NA NA NA
3 1 3 3 3 NA NA NA NA
4 1 4 3 3 11 NA NA NA
5 2 1 NA NA NA NA NA NA
6 2 2 3 NA NA NA NA NA
7 2 3 15 3 NA NA NA NA
8 3 1 NA NA NA NA NA NA
9 3 2 18 NA NA NA NA NA
10 4 1 NA NA NA NA NA NA
11 5 1 NA NA NA NA NA NA
12 5 2 5 NA NA NA NA NA
Required Output
household person R01 R02 R03 R04 R05 R06
1 1 1 NA 1 7 7 NA NA
2 1 2 1 NA 7 7 NA NA
3 1 3 3 3 NA 11 NA NA
4 1 4 3 3 11 NA NA NA
5 2 1 NA 1 16 NA NA NA
6 2 2 3 NA 1 NA NA NA
7 2 3 15 3 NA NA NA NA
8 3 1 NA 18 NA NA NA NA
9 3 2 18 NA NA NA NA NA
10 4 1 NA NA NA NA NA NA
11 5 1 NA 9 NA NA NA NA
12 5 2 5 NA NA NA NA NA
Example Code
df <- data.frame(household = c(1,1,1,1,2,2,2,3,3,4,5,5),
person = c(1,2,3,4,1,2,3,1,2,1,1,2),
R01 = c(NA, 1, 3, 3, NA, 3, 15, NA, 18, NA, NA, 5),
R02 = c(NA, NA, 3, 3, NA, NA, 3, rep(NA, 5)),
R03 = c(rep(NA,3), 11, rep(NA, 8)),
R04 = rep(NA, 12),
R05 = rep(NA, 12),
R06 = rep(NA, 12))
I know it's possible to write a function to do the matrix match and then apply it to each household with dplyr, however I'm not great at functions yet so I'm running into issues in a few areas.
You can make the relationship matrix symmetric in each household, and at the same time recode the elements according to rel.
library(dplyr)
df %>%
group_by(household) %>%
group_modify(~ {
mat <- as.matrix(.x[-1][1:nrow(.x)])
mat[upper.tri(mat)] <- recode(t(mat)[upper.tri(mat)], !!!rel)
cbind(.x[1], mat)
}) %>%
ungroup()
# A tibble: 12 × 6
household person R01 R02 R03 R04
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 NA 1 7 7
2 1 2 1 NA 7 7
3 1 3 3 3 NA 11
4 1 4 3 3 11 NA
5 2 1 NA 7 16 NA
6 2 2 3 NA 7 NA
7 2 3 15 3 NA NA
8 3 1 NA 18 NA NA
9 3 2 18 NA NA NA
10 4 1 NA NA NA NA
11 5 1 NA 9 NA NA
12 5 2 5 NA NA NA
Here's a way to do it mostly using base R.
First, create f, a function that replaces the upper triangle of a matrix with the values from the rel vector that match the lower triangle of the same matrix.
Then, split your data according to the household, compute the lengths of each group so that the resulting matrix has the right number of columns, and then apply the function to each group. Finally, bind_rows and cbind with the original data set.
# Fill the upper triangle of `m` from its lower triangle: every mirrored
# lower-triangle value is replaced by its position in the `rel` vector.
f <- function(m) {
  upper <- upper.tri(m)
  mirrored <- t(m)[upper]
  m[upper] <- match(mirrored, rel)
  m
}
# Split the relationship columns (columns 3 to 6 of df) by household.
l <- split(df[3:6], df$household)
# Per household: number of surplus columns, i.e. the column count minus
# (number of columns holding at least one value, plus one).
len <- lapply(
  l,
  \(g) ncol(g) - (sum(vapply(g, \(x) any(!is.na(x)), logical(1))) + 1)
)
# Drop the surplus columns from the right-hand side of each household block.
l <- mapply(\(x, y) x[1:(length(x) - y)], l, len, SIMPLIFY = FALSE)
# Mirror every block with f(), stack the blocks, and put the household and
# person columns back in front.
cbind(df[1:2],
      dplyr::bind_rows(lapply(l, f)))
output
household person R01 R02 R03 R04
1 1 1 NA 1 7 7
2 1 2 1 NA 7 7
3 1 3 3 3 NA 11
4 1 4 3 3 11 NA
5 2 1 NA 7 16 NA
6 2 2 3 NA 7 NA
7 2 3 15 3 NA NA
8 3 1 NA 18 NA NA
9 3 2 18 NA NA NA
10 4 1 NA NA NA NA
11 5 1 NA 9 NA NA
12 5 2 5 NA NA NA

For loop using names of a dataframe in R

I am working with COVID-19 data from my country by regions (3) in a dataframe. I want to use those columns of positive cases to generate other columns in which I want to calculate the growth in between rows. The dataframe:
> df
Lima Arequipa Huánuco
1 1 NA NA
2 6 NA NA
3 6 1 NA
4 8 2 5
5 9 3 7
6 11 4 8
I want to use a for loop to calculate in a new column named as each df's column adding to its name "_dif" in which I have the row 1 - lag (row 1) for each column. So I used this code:
for(col in names(df)) {
df[paste0(col, "_dif")] = df[col] - lag(df[col])
}
The output I want is the next one:
Lima Arequipa Huánuco Lima_dif Arequipa_dif Huánuco_dif
1 1 NA NA NA NA NA
2 6 NA NA 5 NA NA
3 6 1 NA 0 NA NA
4 8 2 5 2 1 NA
5 9 3 7 1 1 2
6 11 4 8 2 1 1
But when I see the df after the for loop I got this (only NA in the new columns):
Lima Arequipa Huánuco Lima_dif Arequipa_dif Huánuco_dif
1 1 NA NA NA NA NA
2 6 NA NA NA NA NA
3 6 1 NA NA NA NA
4 8 2 5 NA NA NA
5 9 3 7 NA NA NA
6 11 4 8 NA NA NA
Thanks in advance.
We can just use mutate with across from dplyr, as the _all/_at suffixes are getting deprecated and, in the newer version, across is more generalized
library(dplyr)
# Subtract the previous row from every column. `.names` (with the leading dot)
# makes across() write the results to new "<column>_dif" columns instead of
# overwriting the input columns.
df %>%
  mutate(across(everything(), ~ . - lag(.), .names = "{.col}_dif"))
# Lima Arequipa Huánuco Lima_dif Arequipa_dif Huánuco_dif
#1 1 NA NA NA NA NA
#2 6 NA NA 5 NA NA
#3 6 1 NA 0 NA NA
#4 8 2 5 2 1 NA
#5 9 3 7 1 1 2
#6 11 4 8 2 1 1
Or in base R
df[paste0(names(df), "_dif")] <- lapply(df, function(x) c(NA, diff(x)))
Or another option is
df[paste0(names(df), "_dif")] <- rbind(NA, diff(as.matrix(df)))
The issue in the OP's for loop is that df[col] is still a data.frame with a single column; we need df[[col]] to extract it as a vector because lag needs a vector. According to ?lag
x - Vector of values
lag(df[1])
# Lima
#1 NA
returns NA and it gets recycled
while,
lag(df[[1]])
#[1] NA 1 6 6 8 9
therefore, if we change the code to
for(col in names(df)) {
df[paste0(col, "_dif")] = df[[col]] - lag(df[[col]])
}
df
# Lima Arequipa Huánuco Lima_dif Arequipa_dif Huánuco_dif
#1 1 NA NA NA NA NA
#2 6 NA NA 5 NA NA
#3 6 1 NA 0 NA NA
#4 8 2 5 2 1 NA
#5 9 3 7 1 1 2
#6 11 4 8 2 1 1
data
df <- structure(list(Lima = c(1L, 6L, 6L, 8L, 9L, 11L), Arequipa = c(NA,
NA, 1L, 2L, 3L, 4L), Huánuco = c(NA, NA, NA, 5L, 7L, 8L)),
class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6"))
In dplyr, you can use mutate_all :
library(dplyr)
df %>% mutate_all(list(diff = ~. - lag(.)))
# Lima Arequipa Huánuco Lima_diff Arequipa_diff Huánuco_diff
#1 1 NA NA NA NA NA
#2 6 NA NA 5 NA NA
#3 6 1 NA 0 NA NA
#4 8 2 5 2 1 NA
#5 9 3 7 1 1 2
#6 11 4 8 2 1 1
Or shift in data.table
library(data.table)
setDT(df)[, (paste0(names(df), '_diff')) := .SD - shift(.SD)]
You almost had it.
df <- read_table("V Lima Arequipa Huanuco
1 1 NA NA
2 6 NA NA
3 6 1 NA
4 8 2 5
5 9 3 7
6 11 4 8")
for(col in names(df)) {
df[paste0(col, "_dif")] <- df[col] - lag(df[col], default = 0)
}
df
# A tibble: 6 x 8
V Lima Arequipa Huanuco V_dif Lima_dif Arequipa_dif Huanuco_dif
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 NA NA 1 1 NA NA
2 2 6 NA NA 2 6 NA NA
3 3 6 1 NA 3 6 1 NA
4 4 8 2 5 4 8 2 5
5 5 9 3 7 5 9 3 7
6 6 11 4 8 6 11 4 8
You didn't set lag's default to 0, so it went to NA.

How to transfer values from one dataframe to another?

Consider the following code yielding the following dataframe
df1 <- data.frame("ID"=c("A", "A", "A", "A", "A", "B", "B", 'B', "B", "B"),
"X_A"=c(1,2,3,4,5,NA, NA, 8, 9,10), "X_B"=c(1,2,3,4,5,NA,NA, 8,9,10)
,"Y_A"=c(1,2,NA,NA, 10, 8,9,10,NA,NA), "Y_B"=c(1,2,NA, NA, 10,8,
9, 10, NA, NA))
it yields the following dataframe
ID X_A X_B Y_A Y_B
1 A 1 1 1 1
2 A 2 2 2 2
3 A 3 3 NA NA
4 A 4 4 NA NA
5 A 5 5 NA NA
6 B NA NA 8 8
7 B NA NA 9 9
8 B 8 8 10 10
9 B 9 9 NA NA
10 B 10 10 NA NA
I wish to transfer data from this dataframe to df2
ID X_A Y_A
1 A 1 1
2 A 2 2
3 A 3 3
4 A 4 4
5 A 5 5
6 A 6 6
7 A 7 7
8 A 8 8
9 A 9 9
10 A 10 10
11 B 1 1
12 B 2 2
13 B 3 3
14 B 4 4
15 B 5 5
16 B 6 6
17 B 7 7
18 B 8 8
19 B 9 9
20 B 10 10
The end data frame should be like this
ID X_A Y_A X_B Y_B
1 A 1 1 1 1
2 A 2 2 2 2
3 A 3 3 3 NA
4 A 4 4 4 NA
5 A 5 5 5 NA
6 A 6 6 NA NA
7 A 7 7 NA NA
8 A 8 8 NA NA
9 A 9 9 NA NA
10 A 10 10 NA NA
11 B 1 1 NA NA
12 B 2 2 NA NA
13 B 3 3 NA NA
14 B 4 4 NA NA
15 B 5 5 NA NA
16 B 6 6 NA NA
17 B 7 7 NA NA
18 B 8 8 8 8
19 B 9 9 9 9
20 B 10 10 10 10
The final output is like the result of a vlookup where the ID and X_A columns, and the ID and Y_A columns, of df1 and df2 are matched so that the corresponding values of X_B and Y_B are filled in df2. In case there is no match, NA should result. I have tried the following code
merge(df1, df2).
this however slows down my system. I have also tried
library(dplyr)
df2 %>% right_join(df1, by=c(ID, x_A, y_A).
This results in all the rows appearing. Can the expected output be achieved in R? I would appreciate any help.
Do you mean, join once on ID and X_A to get X_B, and afterwards ID and Y_A to get Y_B? Note that row 10 is different:
df2 %>%
left_join(select(df1, ID, X_A, X_B),
by = c("ID", "X_A")) %>%
left_join(select(df1, ID, Y_A, Y_B),
by = c("ID", "Y_A"))
# ID X_A Y_A X_B Y_B
# 1 A 1 1 1 1
# 2 A 2 2 2 2
# 3 A 3 3 3 NA
# 4 A 4 4 4 NA
# 5 A 5 5 5 NA
# 6 A 6 6 NA NA
# 7 A 7 7 NA NA
# 8 A 8 8 NA NA
# 9 A 9 9 NA NA
# 10 A 10 10 NA 10
# 11 B 1 1 NA NA
# 12 B 2 2 NA NA
# 13 B 3 3 NA NA
# 14 B 4 4 NA NA
# 15 B 5 5 NA NA
# 16 B 6 6 NA NA
# 17 B 7 7 NA NA
# 18 B 8 8 8 8
# 19 B 9 9 9 9
# 20 B 10 10 10 10
Base R:
want <- merge(df2, subset(df1, select = c(ID, X_A, X_B)), by = c("ID", "X_A"), all.x = TRUE)
(want <- merge(want, subset(df1, select = c(ID, Y_A, Y_B)), by = c("ID", "Y_A"), all.x = TRUE))

Resources