Adding columns and inserting info from a second dataframe in R

Hello everyone, I have two dataframes and I'd like to join information from one df to the other in a specific way; let me explain. Here is my first df, to which I'd like to add 6 columns (generic columns named col1, col2, and so on):
res1 res4 aa1234
1 AAAAAA 1 4 IVGG
2 AAAAAA 8 11 RPRQ
3 AAAAAA 10 13 RQFP
4 AAAAAA 12 15 FPFL
5 AAAAAA 20 23 NQGR
6 AAAAAA 32 35 HARF
Here is the 2nd df:
res1 dist
1 3.711846
1 3.698985
2 4.180874
2 3.112819
3 3.559737
3 3.722107
4 3.842375
4 3.914970
5 3.361647
5 2.982788
6 3.245118
6 3.224230
7 3.538315
7 3.602273
8 3.185184
8 2.771583
9 4.276871
9 3.157737
10 3.933783
10 2.956738
Considering "res1" I'd like to add to the 1st df in my new 6 columns the first 6th values contained in "dist" of second df corresponding to res1 = 1.
After, in the 1st df I have res1 = 8, so I'd like to add in the new 6 columns the 6 values from res1 = 8 contained in "dist" of 2nd df.
I'd like to obtain something like this
res1 res4 aa1234 col1 col2 col3 col4 col5 col6
1 4 IVGG 3.71 3.79 4.18 3.11 3.55 3.72
8 11 RPRQ 3.18 2.77 4.27 3.15 3.93 2.95
10 13 RQFP
12 15 FPFL
20 23 NQGR
32 35 HARF
Please consider that I have to do this on a large dataset and for 1000+ files... thanks!

You could create a sequence from res1 to res4 and then join the data with pdb.
library(tidyverse)
turn %>%
  # expand each row into one row per residue between res1 and res4
  mutate(res = map2(res1, res4, seq)) %>%
  unnest(res) %>%
  # attach the distances for every residue in that range
  left_join(pdb, by = c('res' = 'res1')) %>%
  group_by(res1 = as.character(res1)) %>%
  # number the distances within each original row to build col1, col2, ...
  mutate(col = paste0('col', row_number())) %>%
  select(-res4, -res, -eleno) %>%  # eleno is a column in the OP's full pdb data, not shown here
  pivot_wider(names_from = col, values_from = dist)

We can use rowid from data.table
library(dplyr)
library(tidyr)
library(data.table)
library(stringr)
df2 %>%
  # number each occurrence of a res1 value to create col1, col2, ...
  mutate(col = str_c("col", rowid(res1))) %>%
  pivot_wider(names_from = col, values_from = dist) %>%
  right_join(df1, by = 'res1')
Output:
# A tibble: 6 x 4
# res1 col1 col2 res4
# <int> <dbl> <dbl> <int>
#1 1 3.71 3.70 4
#2 8 3.19 2.77 11
#3 10 3.93 2.96 13
#4 12 NA NA 15
#5 20 NA NA 23
#6 32 NA NA 35
data
df1 <- structure(list(res1 = c(1L, 8L, 10L, 12L, 20L, 32L), res4 = c(4L,
11L, 13L, 15L, 23L, 35L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6"))
df2 <- structure(list(res1 = c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L,
6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L), dist = c(3.711846,
3.698985, 4.180874, 3.112819, 3.559737, 3.722107, 3.842375, 3.91497,
3.361647, 2.982788, 3.245118, 3.22423, 3.538315, 3.602273, 3.185184,
2.771583, 4.276871, 3.157737, 3.933783, 2.956738)), class = "data.frame",
row.names = c(NA,
-20L))
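Since this has to run over 1000+ files, a minimal sketch of scaling it up, assuming each pair of files can be read into data frames shaped like df1/df2 above (the folder names and read.csv calls below are placeholders for however the real files are stored):
library(dplyr)
library(tidyr)
library(data.table)
library(stringr)
library(purrr)

# Wrap the join logic from the answer above in a function of the two data frames.
add_dist_cols <- function(df1, df2) {
  df2 %>%
    mutate(col = str_c("col", rowid(res1))) %>%
    pivot_wider(names_from = col, values_from = dist) %>%
    right_join(df1, by = "res1")
}

# Hypothetical file lists; replace with however the real files are located and read.
df1_files <- list.files("turns",     pattern = "\\.csv$", full.names = TRUE)
df2_files <- list.files("distances", pattern = "\\.csv$", full.names = TRUE)

results <- map2(df1_files, df2_files,
                ~ add_dist_cols(read.csv(.x), read.csv(.y)))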

Related

Assign established values randomly by ID in R

I have this file:
ID P
1 10
1 12
1 11
2 9
2 8
2 10
3 11
3 12
3 14
4 15
4 16
4 8
5 11
5 13
5 10
6 14
6 16
6 11
And I would like to randomly assign the values (a, b, c) to the file, like this:
ID P Group
1 10 a
1 12 b
1 11 c
2 9 c
2 8 a
2 10 b
3 11 a
3 12 c
3 14 b
4 15 c
4 16 a
4 8 b
5 11 b
5 13 c
5 10 a
6 14 b
6 16 c
6 11 a
I need to do this several times, each time randomly. I tried this:
df %>% group_by(ID) %>% replicate(1, sample(df$group))
but, of course, it didn't work. Any suggestions?
Here is an option with sample:
library(dplyr)
df1 %>%
  group_by(ID) %>%
  # draw a, b or c for every row within each ID
  mutate(Group = sample(c('a', 'b', 'c'), n(), replace = TRUE))
data
df1 <- structure(list(ID = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L), P = c(10L, 12L, 11L, 9L, 8L,
10L, 11L, 12L, 14L, 15L, 16L, 8L, 11L, 13L, 10L, 14L, 16L, 11L
)), class = "data.frame", row.names = c(NA, -18L))
Two solutions, one without grouping and one with grouping by ID:
library(tidyverse)
df <- dplyr::tribble(
~ID, ~P,
1,10,
1,12,
1,11,
2,9,
2,8,
2,10,
3,11,
3,12,
3,14,
4,15,
4,16,
4,8,
5,11,
5,13,
5,10,
6,14,
6,16,
6,11
)
sample_vector <- c("a", "b", "c")
## Without grouping by ID
df_2 <- df %>%
  mutate(Group = sample(sample_vector, nrow(df), replace = TRUE))
## With grouping by ID
df_2 <- df %>%
  group_by(ID) %>%
  mutate(Group = sample(sample_vector, n(), replace = TRUE))
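Both solutions above draw with replace = TRUE, so a group can receive the same letter twice. If each ID should get each of a, b and c exactly once, as in the expected output, a small variant (assuming every ID has exactly three rows) samples without replacement within each group; wrapping it in a function also makes it easy to repeat the assignment several times, as the question asks:
library(dplyr)
library(purrr)

assign_groups <- function(data) {
  data %>%
    group_by(ID) %>%
    # a random permutation of a/b/c per ID (assumes exactly 3 rows per ID)
    mutate(Group = sample(c("a", "b", "c"))) %>%
    ungroup()
}

# repeat the random assignment, e.g., 5 times
assignments <- map(1:5, ~ assign_groups(df))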

Aggregating columns based on column names in R

I have this dataframe in R
Party Pro2005 Anti2005 Pro2006 Anti2006 Pro2007 Anti2007
R 1 18 0 7 2 13
R 1 19 0 7 1 14
D 13 7 3 4 10 5
D 12 8 3 4 9 6
I want to aggregate it so that all the Pro and Anti columns are combined by party, for example:
Party ProSum AntiSum
R 234 245
D 234 245
How would I do that in R?
You can use:
library(tidyverse)
df %>%
  # split e.g. Pro2005 into a Pro/Anti part (kept as the value name) and a year part (dropped)
  pivot_longer(-Party,
               names_to = c(".value", NA),
               names_pattern = "([a-zA-Z]*)([0-9]*)") %>%
  group_by(Party) %>%
  summarise(across(where(is.numeric), sum, na.rm = T))
# A tibble: 2 x 3
Party Pro Anti
<chr> <int> <int>
1 D 50 34
2 R 5 78
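A small aside, assuming dplyr 1.1.0+ and R 4.1+: passing extra arguments such as na.rm through across()'s ... is deprecated there, so the equivalent with an anonymous function would be:
library(tidyverse)

df %>%
  pivot_longer(-Party,
               names_to = c(".value", NA),
               names_pattern = "([a-zA-Z]*)([0-9]*)") %>%
  group_by(Party) %>%
  # anonymous function instead of passing na.rm through across()'s ...
  summarise(across(where(is.numeric), \(x) sum(x, na.rm = TRUE)))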
I would suggest a tidyverse approach, reshaping the data and then computing the sum of values:
library(tidyverse)
#Data
df <- structure(list(Party = c("R", "R", "D", "D"), Pro2005 = c(1L,
1L, 13L, 12L), Anti2005 = c(18L, 19L, 7L, 8L), Pro2006 = c(0L,
0L, 3L, 3L), Anti2006 = c(7L, 7L, 4L, 4L), Pro2007 = c(2L, 1L,
10L, 9L), Anti2007 = c(13L, 14L, 5L, 6L)), class = "data.frame", row.names = c(NA,
-4L))
The code:
df %>%
  pivot_longer(cols = -1) %>%
  # Format strings: drop the year from the column names
  mutate(name = gsub('\\d+', '', name)) %>%
  # Aggregate
  group_by(Party, name) %>%
  summarise(value = sum(value, na.rm = T)) %>%
  pivot_wider(names_from = name, values_from = value)
The output:
# A tibble: 2 x 3
# Groups: Party [2]
Party Anti Pro
<chr> <int> <int>
1 D 34 50
2 R 78 5
Splitting by party and looping the sums over the Pro/Anti columns using sapply, finally rbind-ing the results.
res <- data.frame(Party=sort(unique(d$Party)), do.call(rbind, by(d, d$Party, function(x)
sapply(c("Pro", "Anti"), function(y) sum(x[grep(y, names(x))])))))
res
# Party Pro Anti
# D D 50 34
# R R 5 78
An outer solution is also suitable.
t(outer(c("Pro", "Anti"), c("R", "D"),
Vectorize(function(x, y) sum(d[d$Party %in% y, grep(x, names(d))]))))
# [,1] [,2]
# [1,] 5 78
# [2,] 50 34
Data:
d <- read.table(header=T, text="Party Pro2005 Anti2005 Pro2006 Anti2006 Pro2007 Anti2007
R 1 18 0 7 2 13
R 1 19 0 7 1 14
D 13 7 3 4 10 5
D 12 8 3 4 9 6 ")

Merging two datasets by an ID without adding new columns that say ".x" or ".y"

Suppose I have two datasets. One main dataset, with many columns of metadata, and one new dataset which will be used to fill in some of the gaps in concentrations in the main dataset:
Main dataset:
study_id timepoint age occupation concentration1 concentration2
1 1 21 0 3 7
1 2 21 0 4 6
1 3 22 0 NA NA
1 4 22 0 NA NA
2 1 36 3 0 4
2 2 36 3 2 11
2 3 37 3 NA NA
2 4 37 3 NA NA
New data set to merge:
study_id timepoint concentration1 concentration2
1 3 11 20
1 4 21 35
2 3 7 17
2 4 14 25
Whenever I merge by "study_id" and "timepoint", I get two new columns that are "concentration1.y" and "concentration2.y" while the original columns get renamed as "concentration1.x" and "concentration2.x". I don't want this.
This is what I want:
study_id timepoint age occupation concentration1 concentration2
1 1 21 0 3 7
1 2 21 0 4 6
1 3 22 0 11 20
1 4 22 0 21 35
2 1 36 3 0 4
2 2 36 3 2 11
2 3 37 3 7 17
2 4 37 3 14 25
In other words, I want to merge by "study_id" and "timepoint" AND merge the two concentration columns so the data are within the same columns. Please note that both datasets do not have identical columns (dataset 1 has 1000 columns with metadata while dataset2 just has study id, timepoint, and concentration columns that match the concentration columns in dataset1).
Thanks so much in advance.
Using coalesce (from the dplyr package) is one option. The join still adds the two concentration columns from the second data frame; these are removed after the NAs are filled in.
library(tidyverse)
df1 %>%
  left_join(df2, by = c("study_id", "timepoint")) %>%
  mutate(concentration1 = coalesce(concentration1.x, concentration1.y),
         concentration2 = coalesce(concentration2.x, concentration2.y)) %>%
  select(-concentration1.x, -concentration1.y, -concentration2.x, -concentration2.y)
Or to generalize with multiple concentration columns:
df1 %>%
  left_join(df2, by = c("study_id", "timepoint")) %>%
  split.default(str_remove(names(.), "\\.x|\\.y")) %>%
  map_df(reduce, coalesce)
Edit: To prevent the resulting column names from being alphabetized by split.default, you can add an intermediate step that sorts the list based on the first data frame's column order.
df3 <- df1 %>%
  left_join(df2, by = c("study_id", "timepoint")) %>%
  split.default(str_remove(names(.), "\\.x|\\.y"))
df3[names(df1)] %>%
  map_df(reduce, coalesce)
Output
study_id timepoint age occupation concentration1 concentration2
1 1 1 21 0 3 7
2 1 2 21 0 4 6
3 1 3 22 0 11 20
4 1 4 22 0 21 35
5 2 1 36 3 0 4
6 2 2 36 3 2 11
7 2 3 37 3 7 17
8 2 4 37 3 14 25
Data
df1 <- structure(list(study_id = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L),
timepoint = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), age = c(21L,
21L, 22L, 22L, 36L, 36L, 37L, 37L), occupation = c(0L, 0L,
0L, 0L, 3L, 3L, 3L, 3L), concentration1 = c(3L, 4L, NA, NA,
0L, 2L, NA, NA), concentration2 = c(7L, 6L, NA, NA, 4L, 11L,
NA, NA)), class = "data.frame", row.names = c(NA, -8L))
df2 <- structure(list(study_id = c(1L, 1L, 2L, 2L), timepoint = c(3L,
4L, 3L, 4L), concentration1 = c(11L, 21L, 7L, 14L), concentration2 = c(20L,
35L, 17L, 25L)), class = "data.frame", row.names = c(NA, -4L))
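If dplyr 1.0.0 or newer is available, rows_patch() is another option worth knowing: it fills the NA values of the first data frame with the matching values from the second, keyed on the join columns, so no .x/.y columns appear at all. A minimal sketch with the data above:
library(dplyr)

# Replace the NA concentrations in df1 with the values from df2,
# matching rows on study_id and timepoint; non-NA values are left untouched.
df1 %>%
  rows_patch(df2, by = c("study_id", "timepoint"))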

Dynamically subset and mutate data.table?

I have 2 separate DFs. I want to mutate 2 new columns in dat2 ('Avg_of_nonNA', and 'Cols' to track which columns it is using) based on the non-NA columns in dat1. I need to take a subset of dat2 because that matrix is dense whereas dat1 is sparse (so I can take advantage of the sparseness). The only way to match the columns is via the common elements in their names: (0-1, 1-2, 2-3, 3-4) in my case; the rest of each column name is gibberish. This requires string splitting and matching, which causes many problems because I can't chain things together, since each row has a different combination of columns to average (the dummy example is simplified). I do have a working solution, but it is painfully slow across my 1M+ rows. Here is that solution:
I'm looking for a way to get rid of the for loop. Any suggestions?
for (z in 1:5) {
  # tokens ("0-1", "1-2", ...) of the non-NA columns in row z of dat1
  relevant_cols <- dat1[z, ] %>%
    select_if(~ !all(is.na(.))) %>%
    names %>% strsplit(., '_') %>% map(., 2) %>% unlist()
  id <- dat1[z, 'ID']$`ID`
  # average the matching columns of dat2 for this ID and record which tokens were used
  dat2[`ID` == id,
       `:=`(Avg_of_nonNA = mean(as.numeric(.SD)),
            Cols = paste0(relevant_cols, collapse = '/')),
       .SDcols = names(dat2) %like% paste0(relevant_cols, collapse = '|')]
}
Data Below
> dat1
ID gjfkg_0-1_fkjdk_fjdkd jdfsje_1-2_fhks_ejfskj dfjs_2-3_vjskf_wqew gdlkrzc_3-4_rjrkj Avg_of_nonNA_otherDT
1: 1 2.23 1.37 NA NA 1.5
2: 2 1.98 NA NA 1.760 6.5
3: 3 NA 4.45 9.350 3.320 11.0
4: 4 NA NA 6.642 2.019 15.5
5: 5 NA 3.21 3.677 NA 18.5
> dat2
ID ewrwer_0-1_iopi_opop erewtt_1-2_rueiwu_vcvbc erewr_2-3_iirew_rewr mnmn_3-4_cxzxzc_gjd
1: 1 1 2 3 4
2: 2 5 6 7 8
3: 3 9 10 11 12
4: 4 13 14 15 16
5: 5 17 18 19 20
dput(dat1)
structure(list(ID = 1:5, `gjfkg_0-1_fkjdk_fjdkd` = c(2.23, 1.98,
NA, NA, NA), `jdfsje_1-2_fhks_ejfskj` = c(1.37, NA, 4.45, NA,
3.21), `dfjs_2-3_vjskf_wqew` = c(NA, NA, 9.35, 6.642, 3.677),
`gdlkrzc_3-4_rjrkj` = c(NA, 1.76, 3.32, 2.019, NA)), row.names = c(NA, -5L), class = c("data.table",
"data.frame"))
dput(dat2)
structure(list(ID = 1:5, `ewrwer_0-1_iopi_opop` = c(1L, 5L, 9L,
13L, 17L), `erewtt_1-2_rueiwu_vcvbc` = c(2L, 6L, 10L, 14L, 18L
), `erewr_2-3_iirew_rewr` = c(3L, 7L, 11L, 15L, 19L), `mnmn_3-4_cxzxzc_gjd` = c(4L,
8L, 12L, 16L, 20L)), row.names = c(NA, -5L), class = c("data.table",
"data.frame"))
Expected output: dat2 with two new columns, Avg_of_nonNA (matching the Avg_of_nonNA_otherDT column shown in dat1 above) and Cols (the tokens of the columns that were averaged).
Here is an option:
setDT(dat1)
setDT(dat2)
# the "0-1", "1-2", ... token of each non-ID column of dat1
nm <- sapply(strsplit(names(dat1[, -"ID"]), "_"), `[[`, 2L)
dat2[, c("Avg_of_nonNA", "Cols") := {
  nas <- is.na(dat1[, -"ID"])
  m <- col(nas)
  m[] <- nm[m]
  m[nas] <- ""
  # NA^nas is 1 where dat1 has a value and NA where it doesn't, masking those cells of dat2
  .(rowMeans(.SD * NA^nas, na.rm = TRUE),
    gsub("\\s+", "/", trimws(do.call(paste, as.data.frame(m)))))
}, .SDcols = -"ID"]
output:
ID ewrwer_0-1_iopi_opop erewtt_1-2_rueiwu_vcvbc erewr_2-3_iirew_rewr mnmn_3-4_cxzxzc_gjd Avg_of_nonNA_otherDT Cols
1: 1 1 2 3 4 1.5 0-1/1-2
2: 2 5 6 7 8 6.5 0-1/3-4
3: 3 9 10 11 12 11.0 1-2/2-3/3-4
4: 4 13 14 15 16 15.5 2-3/3-4
5: 5 17 18 19 20 18.5 1-2/2-3
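The NA^nas step is the trick that makes this work: is.na() gives a 0/1 matrix, and in R NA^0 is 1 while NA^1 is NA, so multiplying dat2 by NA^nas blanks out exactly the cells that were NA in dat1 before rowMeans(..., na.rm = TRUE) averages what is left. A tiny illustration:
NA^c(0, 1)                      # 1 NA
x <- c(10, 20, 30)
nas <- c(FALSE, TRUE, FALSE)    # pretend the middle column was NA in dat1
x * NA^nas                      # 10 NA 30
mean(x * NA^nas, na.rm = TRUE)  # 20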

How to sum rows based on exact conditions on multiple columns and save edited rows in original dataset? [duplicate]

This question already has answers here:
Find nearest matches for each row and sum based on a condition
(4 answers)
Closed 3 years ago.
There are 3 parts to this problem:
1) I want to sum the values in columns b, c, d for any two adjacent rows which have the same values in columns (b, c, d).
2) I would like to keep the values in the other columns the same. (Some other column (e.g. a) may contain character data.)
3) I would like to keep the changes by replacing the original values in columns b, c, d of the first row (of the 2 identical rows) with the new values (the sums) and deleting the second row (of the 2 identical rows).
Time a b c d id
1 2014/10/11 A 40 20 10 1
2 2014/10/12 A 40 20 10 2
3 2014/10/13 B 9 10 9 3
4 2014/10/14 D 16 5 12 4
5 2014/10/15 D 1 6 5 5
6 2014/10/16 B 20 7 8 6
7 2014/10/17 B 20 7 8 7
8 2014/10/18 A 11 9 5 8
9 2014/10/19 C 31 20 23 9
Expected outcome:
Time a b c d id
1 2014/10/11 A 80 40 20 1 *
3 2014/10/13 B 9 10 9 3
4 2014/10/14 D 16 5 12 4
5 2014/10/15 D 1 6 5 5
6 2014/10/16 B 40 14 16 6 *
8 2014/10/18 A 11 9 5 8
9 2014/10/19 C 31 20 23 9
id 1 and 2 combined to become id 1; id 6 and 7 combined to become id 6.
Thank you. Any contribution is greatly appreciated.
Using dplyr functions along with data.table::rleid. To find adjacent rows with the same values in the b, c and d columns we paste them together and use rleid to create groups. For each group we sum the values in the b, c and d columns and keep only the 1st row.
library(dplyr)
df %>%
  mutate(temp_col = paste(b, c, d, sep = "-")) %>%
  group_by(group = data.table::rleid(temp_col)) %>%
  mutate_at(vars(b, c, d), sum) %>%
  slice(1L) %>%
  ungroup %>%
  select(-temp_col, -group)
# Time a b c d id
# <fct> <fct> <int> <int> <int> <int>
#1 2014/10/11 A 80 40 20 1
#2 2014/10/13 B 9 10 9 3
#3 2014/10/14 D 16 5 12 4
#4 2014/10/15 D 1 6 5 5
#5 2014/10/16 B 40 14 16 6
#6 2014/10/18 A 11 9 5 8
#7 2014/10/19 C 31 20 23 9
data
df <- structure(list(Time = structure(1:9, .Label = c("2014/10/11",
"2014/10/12", "2014/10/13", "2014/10/14", "2014/10/15", "2014/10/16",
"2014/10/17", "2014/10/18", "2014/10/19"), class = "factor"),
a = structure(c(1L, 1L, 2L, 4L, 4L, 2L, 2L, 1L, 3L), .Label = c("A",
"B", "C", "D"), class = "factor"), b = c(40L, 40L, 9L, 16L,
1L, 20L, 20L, 11L, 31L), c = c(20L, 20L, 10L, 5L, 6L, 7L,
7L, 9L, 20L), d = c(10L, 10L, 9L, 12L, 5L, 8L, 8L, 5L, 23L
), id = 1:9), class = "data.frame", row.names = c("1", "2",
"3", "4", "5", "6", "7", "8", "9"))
