bind two dataframes that have different columns names - r

I have two data-frames
The first data
col1 col2 col3
A1 4 11 15
A2 2 9 17
A3 3 4 4
B1 10 5 4
B2 6 1 8
C1 12 1 12
C2 2 5 8
D1 4 1 6
D2 2 1 8
The second data
meancol1 meancol2 meancol3
meanA 3 8 12
meanB 8 3 6
meanC 7 3 10
meanD 3 1 7
I want to combine the two data frames and keep the column names of the first one,
so the result I want is:
col1 col2 col3
A1 4 11 15
A2 2 9 17
A3 3 4 4
B1 10 5 4
B2 6 1 8
C1 12 1 12
C2 2 5 8
D1 4 1 6
D2 2 1 8
meanA 3 8 12
meanB 8 3 6
meanC 7 3 10
meanD 3 1 7
I tried the following function:
data_all <- rbind(df1,df2)
but it didn't work.
I also tried the function bind_rows from the dplyr package, but this one creates new columns.
Thank you

You could always do:
colnames(df2) <- colnames(df1)
data_all <- rbind(df1, df2)
data_all

A possible solution:
library(tidyverse)
# Example data. The header line has one field fewer than the data lines, so
# read.table() uses the first field of each row as the row name.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
df1 <- read.table(text = " col1 col2 col3
A1 4 11 15
A2 2 9 17
A3 3 4 4
B1 10 5 4
B2 6 1 8
C1 12 1 12
C2 2 5 8
D1 4 1 6
D2 2 1 8
", header = TRUE)
df2 <- read.table(text = "meancol1 meancol2 meancol3
meanA 3 8 12
meanB 8 3 6
meanC 7 3 10
meanD 3 1 7
", header = TRUE)
df2 %>% rename_with(~ str_remove(.x, "mean")) %>%
bind_rows(df1, .)
#> col1 col2 col3
#> A1 4 11 15
#> A2 2 9 17
#> A3 3 4 4
#> B1 10 5 4
#> B2 6 1 8
#> C1 12 1 12
#> C2 2 5 8
#> D1 4 1 6
#> D2 2 1 8
#> meanA 3 8 12
#> meanB 8 3 6
#> meanC 7 3 10
#> meanD 3 1 7

You could do:
rbind(df1, setNames(df2, names(df1)))
col1 col2 col3
A1 4 11 15
A2 2 9 17
A3 3 4 4
B1 10 5 4
B2 6 1 8
C1 12 1 12
C2 2 5 8
D1 4 1 6
D2 2 1 8
meanA 3 8 12
meanB 8 3 6
meanC 7 3 10
meanD 3 1 7

Use mapply:
# Concatenate the two data frames column by column (by position); the result
# takes its column names from df1. Map() returns a list, so every column keeps
# its own type. mapply() would simplify the result to a single character
# matrix here, because the `id` column is character, and all numeric columns
# would silently become text.
as.data.frame(Map(c, df1, df2))
output
id col1 col2 col3
1 A1 4 11 15
2 A2 2 9 17
3 A3 3 4 4
4 B1 10 5 4
5 B2 6 1 8
6 C1 12 1 12
7 C2 2 5 8
8 D1 4 1 6
9 D2 2 1 8
10 meanA 3 8 12
11 meanB 8 3 6
12 meanC 7 3 10
13 meanD 3 1 7
data
# Example data with the row labels stored in a regular `id` column.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
df1 <- read.table(text = "id col1 col2 col3
A1 4 11 15
A2 2 9 17
A3 3 4 4
B1 10 5 4
B2 6 1 8
C1 12 1 12
C2 2 5 8
D1 4 1 6
D2 2 1 8
", header = TRUE)
df2 <- read.table(text = "id meancol1 meancol2 meancol3
meanA 3 8 12
meanB 8 3 6
meanC 7 3 10
meanD 3 1 7
", header = TRUE)

Another option is to use rbindlist:
library(data.table)
# Convert both data frames to data.tables (row names become the `rn` column)
# and stack them by position. The column names differ (col1 vs meancol1), so
# use.names = FALSE states the positional bind explicitly instead of relying
# on the "check" default, which reports the mismatched names before binding.
rbindlist(
  lapply(list(df1, df2), setDT, keep.rownames = TRUE),
  use.names = FALSE
)
# Or using `add_rownames` from `dplyr`
# rbindlist(lapply(list(df1, df2), add_rownames), use.names = F)
Output
rn col1 col2 col3
1: A1 4 11 15
2: A2 2 9 17
3: A3 3 4 4
4: B1 10 5 4
5: B2 6 1 8
6: C1 12 1 12
7: C2 2 5 8
8: D1 4 1 6
9: D2 2 1 8
10: meanA 3 8 12
11: meanB 8 3 6
12: meanC 7 3 10
13: meanD 3 1 7
Data
df1 <- structure(list(col1 = c(4L, 2L, 3L, 10L, 6L, 12L, 2L, 4L, 2L),
col2 = c(11L, 9L, 4L, 5L, 1L, 1L, 5L, 1L, 1L), col3 = c(15L,
17L, 4L, 4L, 8L, 12L, 8L, 6L, 8L)), class = "data.frame", row.names = c("A1",
"A2", "A3", "B1", "B2", "C1", "C2", "D1", "D2"))
df2 <- structure(list(meancol1 = c(3L, 8L, 7L, 3L), meancol2 = c(8L,
3L, 3L, 1L), meancol3 = c(12L, 6L, 10L, 7L)), class = "data.frame", row.names = c("meanA",
"meanB", "meanC", "meanD"))

Related

taking sum of rows in R based on conditions

I have a data in this format
ColA
ColB
ColC
A
2
1
A
1
1
B
3
2
B
5
2
C
2
3
C
5
3
A
1
1
A
3
1
B
7
2
B
1
2
I want to get a new column with the sum of the rows of ColB, something like this:
ColA
ColB
ColC
ColD
A
2
1
3
A
1
1
3
B
3
2
8
B
5
2
8
C
2
3
7
C
5
3
7
A
1
1
4
A
3
1
4
B
7
2
8
B
1
2
8
Thanks much for your help!
I tried
df$ColD <- with(df, sum(ColB[ColC == 1]))
It seems to me that you want ColD to have the sum of ColB for each consecutive group defined by the values in ColA. In which case, we may do:
library(dplyr)
df %>%
mutate(group = data.table::rleid(ColA)) %>%
group_by(group) %>%
mutate(ColD = sum(ColB)) %>%
ungroup() %>%
select(-group)
#> # A tibble: 10 x 4
#> ColA ColB ColC ColD
#> <chr> <int> <int> <int>
#> 1 A 2 1 3
#> 2 A 1 1 3
#> 3 B 3 2 8
#> 4 B 5 2 8
#> 5 C 2 3 7
#> 6 C 5 3 7
#> 7 A 1 1 4
#> 8 A 3 1 4
#> 9 B 7 2 8
#> 10 B 1 2 8
This, at any rate, is the same as the expected output.
Created on 2023-01-16 with reprex v2.0.2
Data from question in reproducible format
df <- structure(list(ColA = c("A", "A", "B", "B", "C", "C", "A", "A",
"B", "B"), ColB = c(2L, 1L, 3L, 5L, 2L, 5L, 1L, 3L, 7L, 1L),
ColC = c(1L, 1L, 2L, 2L, 3L, 3L, 1L, 1L, 2L, 2L)),
class = "data.frame", row.names = c(NA, -10L))
Base R
# Sum ColB within each run of consecutive identical ColA values.
# A new run starts wherever ColA differs from the previous row; the running
# count of those change points is the run id that ave() groups by. Comparing
# neighbours directly works for any ColA values, whereas looking them up in
# LETTERS yields NA for anything that is not a single capital letter.
df$ColD <- ave(
  df$ColB,
  cumsum(c(TRUE, df$ColA[-1L] != df$ColA[-nrow(df)])),
  FUN = sum
)
ColA ColB ColC ColD
1 A 2 1 3
2 A 1 1 3
3 B 3 2 8
4 B 5 2 8
5 C 2 3 7
6 C 5 3 7
7 A 1 1 4
8 A 3 1 4
9 B 7 2 8
10 B 1 2 8
A base solution:
df |>
transform(ColD = ave(ColB, with(rle(ColA), rep(seq_along(values), lengths)), FUN = sum))
ColA ColB ColC ColD
1 A 2 1 3
2 A 1 1 3
3 B 3 2 8
4 B 5 2 8
5 C 2 3 7
6 C 5 3 7
7 A 1 1 4
8 A 3 1 4
9 B 7 2 8
10 B 1 2 8
Another base solution using ave.
df$ColD <- ave(df$ColB, c(0, cumsum(diff(df$ColC) != 0)), FUN=sum) #Using ColC
#df$ColD <- ave(df$ColB, c(0, cumsum(df$ColA[-nrow(df)] != df$ColA[-1])), FUN=sum) #Using ColA
df
# ColA ColB ColC ColD
#1 A 2 1 3
#2 A 1 1 3
#3 B 3 2 8
#4 B 5 2 8
#5 C 2 3 7
#6 C 5 3 7
#7 A 1 1 4
#8 A 3 1 4
#9 B 7 2 8
#10 B 1 2 8

R Create multiple rows from 1 row based on presence of values in certain columns

I have a data frame that looks like the following:
ID Date Participant_1 Participant_2 Participant_3 Covariate 1 Covariate 2 Covariate 3
1 9/1 A B 16 2 1
2 5/4 B 4 2 2
3 6/3 C A B 8 3 6
4 2/8 A 7 8 4
5 9/3 C A 7 1 3
I need to expand this data frame so that a row is present for all of the participants present at each event "ID", with the date and all other variables in all the created rows. The multiple participant columns would now only be one column for participant. The output would therefore be:
ID Date Participant Covariate 1 Covariate 2 Covariate 3
1 9/1 A 16 2 1
1 9/1 B 16 2 1
2 5/4 B 4 2 2
3 6/3 C 8 3 6
3 6/3 A 8 3 6
3 6/3 B 8 3 6
4 2/8 A 7 8 4
5 9/3 C 7 1 3
5 9/3 A 7 1 3
Is there a way to do this efficiently? Perhaps with a pivot function?
We can use pivot_longer and then some formatting
library(dplyr) # select() and relocate() come from dplyr, not tidyr
library(tidyr)
# One row per (event, participant): stack the Participant_* columns into a
# single Participant column. values_drop_na = TRUE removes only the empty
# participant slots; a trailing drop_na() would also discard rows that have a
# missing value in any other column (e.g. a covariate).
df %>%
  pivot_longer(
    starts_with("Participant"),
    values_to = "Participant",
    values_drop_na = TRUE
  ) %>%
  select(-name) %>%
  relocate(Participant, .before = Covariate_1)
# A tibble: 9 × 6
ID Date Participant Covariate_1 Covariate_2 Covariate_3
<int> <chr> <chr> <int> <int> <int>
1 1 9/1 A 16 2 1
2 1 9/1 B 16 2 1
3 2 5/4 B 4 2 2
4 3 6/3 C 8 3 6
5 3 6/3 A 8 3 6
6 3 6/3 B 8 3 6
7 4 2/8 A 7 8 4
8 5 9/3 C 7 1 3
9 5 9/3 A 7 1 3
Here's the example data used:
df <- structure(list(ID = 1:5, Date = c("9/1", "5/4", "6/3", "2/8",
"9/3"), Participant_1 = c("A", "B", "C", "A", "C"), Participant_2 = c("B",
NA, "A", NA, "A"), Participant_3 = c(NA, NA, "B", NA, NA), Covariate_1 = c(16L,
4L, 8L, 7L, 7L), Covariate_2 = c(2L, 2L, 3L, 8L, 1L), Covariate_3 = c(1L,
2L, 6L, 4L, 3L)), class = "data.frame", row.names = c(NA, -5L
))

Find the value in one group that corresponds to the maximum value in another, then subtract that value from another column

I have data like these:
A1 A2 A3 A4 B C1 C2 C3 C4
1 3 2 2 7 2 NA 6 9
4 6 12 1 3 1 6 5 2
6 1 NA 1 7 3 2 2 1
I want to take the maximum value of the columns starting with "C" and then subtract the "A" column ending with the same number from "B". For example, the max of the "C"s in the first row would be 9 and so I would want to subtract A4 from B (7-2)
A1 A2 A3 A4 B C1 C2 C3 C4 new
1 11 2 2 7 2 NA 6 9 5
4 6 12 1 3 1 6 5 2 -3
6 1 NA 1 7 3 2 2 1 1
Is this possible? Maybe using "starts_with"?
Here is a base solution:
# Positions of the C* and A* columns. The patterns are anchored so that a
# column that merely contains "A" or "C" later in its name is not picked up.
cs <- grep("^C", names(df))
as <- grep("^A", names(df))
# For each row, the position (within the C columns) of the largest C value;
# which.max() discards NA entries.
c_max <- apply(df[cs], 1, which.max)
# Pick, for every row, the A value at that same position using (row, column)
# matrix indexing, and subtract it from B.
# NOTE(review): assumes the A and C columns appear in the same order.
df$new <- df$B - as.matrix(df[as])[cbind(seq_len(nrow(df)), c_max)]
df
# A1 A2 A3 A4 B C1 C2 C3 C4 new
# 1 1 3 2 4 7 2 NA 6 9 3
# 2 4 6 12 1 3 1 6 5 2 -3
# 3 6 1 NA 1 7 3 2 2 1 1
Using this data (note that in your input your A values in input are different from the A values in the output... I used the input.)
# Example data; `header = TRUE` is spelled out because `T` is an ordinary
# variable that can be reassigned.
df <- read.table(text = "A1 A2 A3 A4 B C1 C2 C3 C4
1 3 2 4 7 2 NA 6 9
4 6 12 1 3 1 6 5 2
6 1 NA 1 7 3 2 2 1", header = TRUE)
Maybe this could help:
library(dplyr)
df %>%
rowwise() %>%
mutate(idx = which.max(c_across(starts_with('C'))),
new = B - get(paste0('A', idx))) %>%
select(-idx)
# A tibble: 3 × 10
# Rowwise:
A1 A2 A3 A4 B C1 C2 C3 C4 new
<int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 1 3 2 4 7 2 NA 6 9 3
2 4 6 12 1 3 1 6 5 2 -3
3 6 1 NA 1 7 3 2 2 1 1
Here is one with max.col
library(dplyr)
library(tidyr)
df1 %>%
mutate( new = B- as.data.frame(across(starts_with('A')))[
cbind(row_number(), max.col(across(starts_with('C'),
replace_na, -5), 'first'))])
-output
A1 A2 A3 A4 B C1 C2 C3 C4 new
1 1 3 2 2 7 2 NA 6 9 5
2 4 6 12 1 3 1 6 5 2 -3
3 6 1 NA 1 7 3 2 2 1 1
data
df1 <- structure(list(A1 = c(1L, 4L, 6L), A2 = c(3L, 6L, 1L), A3 = c(2L,
12L, NA), A4 = c(2L, 1L, 1L), B = c(7L, 3L, 7L), C1 = c(2L, 1L,
3L), C2 = c(NA, 6L, 2L), C3 = c(6L, 5L, 2L), C4 = c(9L, 2L, 1L
)), class = "data.frame", row.names = c(NA, -3L))

Remove duplicate among consecutive values within a dataframe in R

I have a data frame such as
COL1 COL2 COL3
G1 1 6
G1 2 6
G1 3 7
G1 4 9
G1 5 9
G1 6 9
G1 7 6
G1 8 6
G1 9 7
G1 10 7
G1 11 7
G1 12 8
G1 13 7
and I would like to remove consecutive duplicates of the COL3 values, keeping the first row of each run.
Here I should then get:
COL1 COL2 COL3
G1 1 6
G1 3 7
G1 4 9
G1 7 6
G1 9 7
G1 12 8
G1 13 7
Here is the dput format, if it helps:
structure(list(COL1 = c("G1", "G1", "G1", "G1", "G1", "G1", "G1",
"G1", "G1", "G1", "G1", "G1", "G1"), COL2 = 1:13, COL3 = c(6L,
6L, 7L, 9L, 9L, 9L, 6L, 6L, 7L, 7L, 7L, 8L, 7L)), class = "data.frame", row.names = c(NA,
-13L))
In base R, you can use run-length encoding (rle):
df[nrow(df) - rev(cumsum(rle(rev(df$COL3))$lengths) - 1),]
#> COL1 COL2 COL3
#> 1 G1 1 6
#> 3 G1 3 7
#> 4 G1 4 9
#> 7 G1 7 6
#> 9 G1 9 7
#> 12 G1 12 8
#> 13 G1 13 7
Or rleid from data.table
df[c(TRUE, diff(data.table::rleid(df$COL3)) == 1),]
#> COL1 COL2 COL3
#> 1 G1 1 6
#> 3 G1 3 7
#> 4 G1 4 9
#> 7 G1 7 6
#> 9 G1 9 7
#> 12 G1 12 8
#> 13 G1 13 7
Or only keep rows that don't match their own lag:
# Compare each value with the one before it and keep the rows that differ.
# The first row has no predecessor and is always kept. Handling it explicitly
# avoids needing a sentinel `default`: a non-integer one such as pi is
# rejected by dplyr >= 1.1.0 when COL3 is an integer column.
df[c(TRUE, df$COL3[-1L] != df$COL3[-nrow(df)]), ]
#> COL1 COL2 COL3
#> 1 G1 1 6
#> 3 G1 3 7
#> 4 G1 4 9
#> 7 G1 7 6
#> 9 G1 9 7
#> 12 G1 12 8
#> 13 G1 13 7
Another possible solution, based on dplyr:
library(dplyr)
# Keep a row only when its COL3 differs from the previous row's COL3.
# lag() looks backwards, so the FIRST row of every run is kept; lead() would
# compare with the next row and keep the last row of each run instead.
# The first row has no predecessor (lag() gives NA there), so it is kept
# explicitly.
df %>%
  filter(row_number() == 1L | COL3 != lag(COL3))
#> COL1 COL2 COL3
#> 1 G1 1 6
#> 2 G1 3 7
#> 3 G1 4 9
#> 4 G1 7 6
#> 5 G1 9 7
#> 6 G1 12 8
#> 7 G1 13 7
using data.table,
# NOTE(review): `temp` is presumably the question's data as a data.table,
# e.g. temp <- as.data.table(df) -- confirm.
# rleid() numbers the runs of consecutive equal COL3 values; !duplicated()
# keeps the first row of each run. (lag() is not a data.table function: with
# only data.table attached it resolves to stats::lag(), which does not shift
# the values.)
temp[!duplicated(rleid(COL3))]
output
COL1 COL2 COL3
1: G1 1 6
2: G1 3 7
3: G1 4 9
4: G1 7 6
5: G1 9 7
6: G1 12 8
7: G1 13 7

Select i-th element if a condition occurs with for loop

I have a dataframe (df) like this:
Rif dd A A A A A B B B B B C C C C C
a1 10 5 8 10 2 6 9 6 5 7 9 1 5 6 4 5
b1 20 12 7 1 5 9 10 5 3 8 7 3 6 1 9 8
c1 100 11 6 8 1 14 1 11 9 3 6 10 8 13 8 4
d1 70 4 3 7 8 11 19 2 6 7 1 20 18 7 10 7
I have a vector
rif <- c(0, 15, 50, 90, 110)
I would like to add to the df a column V1 such that, if dd(i) >= rif(i-1) & dd(i) < rif(i), V1 takes the value of the i-th A column of that row. The expected result is:
Rif dd A A A A A B B B B B C C C C C V1
a1 10 5 8 10 2 6 9 6 5 7 9 1 5 6 4 5 8
b1 20 12 7 1 5 9 10 5 3 8 7 3 6 1 9 8 1
c1 100 11 6 8 1 14 1 11 9 3 6 10 8 13 8 4 14
d1 70 4 3 7 8 11 19 2 6 7 1 20 18 7 10 7 8
The same should be done for V2 and V3 with respect to Bs and Cs columns.
ref <- c(0, 15, 50, 90, 110)
for (i in 2:length(ref)) {
for (j in 1:nrow(df)) {
if (df$dd >= ref[i-1] && df$dd< ref[i]) {
df[,"V1"] <- df[j,i]
}
}
}
I get the following error:
Error in if (..) :
missing value where TRUE/FALSE needed
Probably the if command is not the correct one.
could you help me?
I think you just need to better specify the rows and columns:
# Example data from the question: an id column, the lookup value `dd`, and
# three blocks (A, B, C) of five value columns each. The column names are
# repeated, so the value columns are addressed by position below.
df <- data.frame(
  c("a1", "b1", "c1", "d1"),
  c(10, 20, 100, 70),
  c(5, 12, 11, 4), c(8, 7, 6, 3), c(10, 1, 8, 7), c(2, 5, 1, 8), c(6, 9, 14, 11),
  c(9, 10, 1, 19), c(6, 5, 11, 2), c(5, 3, 9, 6), c(7, 8, 3, 7), c(9, 7, 6, 1),
  c(1, 3, 10, 20), c(5, 6, 8, 18), c(6, 1, 13, 7), c(4, 9, 8, 10), c(5, 8, 4, 7)
)
colnames(df) <- c("Rif", "dd", rep(c("A", "B", "C"), each = 5))
ref <- c(0, 15, 50, 90, 110)

# Create the result columns up front. Assigning df$V1[j] while V1 does not
# exist yet builds the column from a partial vector, which errors or recycles
# wrongly depending on which row happens to match first. Rows whose dd lies
# outside [ref[1], ref[length(ref)]) simply stay NA.
df$V1 <- NA_real_
df$V2 <- NA_real_
df$V3 <- NA_real_

# k[j] is the interval of row j: ref[k] <= dd < ref[k + 1].
k <- findInterval(df$dd, ref)
for (j in which(k >= 1L & k < length(ref))) {
  # Columns 3-7 are A, 8-12 are B, 13-17 are C; interval k selects the
  # (k + 1)-th column of each block.
  df$V1[j] <- df[j, k[j] + 3L]
  df$V2[j] <- df[j, k[j] + 3L + 5L]
  df$V3[j] <- df[j, k[j] + 3L + 10L]
}
which gives:
Rif dd A A A A A B B B B B C C C C C V1 V2 V3
1 a1 10 5 8 10 2 6 9 6 5 7 9 1 5 6 4 5 8 6 5
2 b1 20 12 7 1 5 9 10 5 3 8 7 3 6 1 9 8 1 3 1
3 c1 100 11 6 8 1 14 1 11 9 3 6 10 8 13 8 4 14 6 4
4 d1 70 4 3 7 8 11 19 2 6 7 1 20 18 7 10 7 8 7 10
Another option in base R:
lters <- c(A="A", B="B", C="C")
firstcol <- lapply(lters, function(x) match(x, colnames(DF)))
idx <- findInterval(DF$dd, rif)
for (l in lters)
DF[, paste0("V_", l)] <- as.integer(DF[cbind(seq_len(nrow(DF)), idx + firstcol[[l]])])
DF
output:
Rif dd A A.1 A.2 A.3 A.4 B B.1 B.2 B.3 B.4 C C.1 C.2 C.3 C.4 V_A V_B V_C
1 a1 10 5 8 10 2 6 9 6 5 7 9 1 5 6 4 5 8 6 5
2 b1 20 12 7 1 5 9 10 5 3 8 7 3 6 1 9 8 1 3 1
3 c1 100 11 6 8 1 14 1 11 9 3 6 10 8 13 8 4 14 6 4
4 d1 70 4 3 7 8 11 19 2 6 7 1 20 18 7 10 7 8 7 10
data:
DF <- structure(list(Rif = c("a1", "b1", "c1", "d1"), dd = c(10L, 20L,
100L, 70L), A = c(5L, 12L, 11L, 4L), A = c(8L, 7L, 6L, 3L), A = c(10L,
1L, 8L, 7L), A = c(2L, 5L, 1L, 8L), A = c(6L, 9L, 14L, 11L),
B = c(9L, 10L, 1L, 19L), B = c(6L, 5L, 11L, 2L), B = c(5L,
3L, 9L, 6L), B = c(7L, 8L, 3L, 7L), B = c(9L, 7L, 6L, 1L),
C = c(1L, 3L, 10L, 20L), C = c(5L, 6L, 8L, 18L), C = c(6L,
1L, 13L, 7L), C = c(4L, 9L, 8L, 10L), C = c(5L, 8L, 4L, 7L
)), class = "data.frame", row.names = c(NA, -4L))
rif <- c(0, 15, 50, 90, 110)
Another way is to reorganize the data by separating the lookup values into another table and performing an update join using data.table:
library(data.table)
setDT(DF)
out <- DF[, .(rn=.I, Rif, dd)]
#reorganizing data
lc <- grepl("A|B|C", names(DF))
lutbl <- data.table(COL=names(DF)[lc], transpose(DF[, ..lc]))
lutbl <- melt(lutbl, measure.vars=patterns("V"), variable.name="rn")[,
c("rn", "rif") := .(as.integer(gsub("V", "", rn)), rep(rif, sum(lc)*nrow(DF)/length(rif)))]
#lookup and update
for (l in lters)
out[, paste0("NEW", l) := lutbl[COL==l][out, on=c("rn", "rif"="dd"), roll=-Inf, value]]
out:
rn Rif dd NEWA NEWB NEWC
1: 1 a1 10 8 6 5
2: 2 b1 20 1 3 1
3: 3 c1 100 14 6 4
4: 4 d1 70 8 7 10

Resources