I need to create an id that defines the relationship between contact_id and relationship_id into a common household_id if and where the combination of contact_id and relationship_id are the same.
Sample Data
account_id <- c(1,1,1,1)
contact_id <- c(1234,2345,3456,4567)
relationship_id <- c(2345,1234,NA,"")
ownership_percent <- c(26,22,40,12)
score <- c(500,300,700,600)
testdata <- data.frame(account_id,contact_id,relationship_id,ownership_percent,score)
Have been using combinations of mutate, paste0, min, max, group_indices - have not found the right combination, getting tripped up by NA and order output of new household_id
Approach 1
library(dplyr)
testdata %>%
mutate(col1 = pmin(contact_id, relationship_id),
col2 = pmax(contact_id, relationship_id),
household_id = paste0(col1,col2)) %>%
Approach 2
testdata %>%
mutate(household_id = sort(paste0(c(contact_id, relationship_id))), collapse = "")
Error: Column household_id must be length 4 (the number of rows) or one, not 8
Expected Outcome
library(dplyr)
# replace missing or NA values with 1
testdata$relationship_id <- type.convert(testdata$relationship_id)
testdata$relationship_id[relationship_id == ""] <- 1
testdata$relationship_id[is.na(testdata$relationship_id)] <- 1
# Create household_id
testdata %>%
mutate(group = paste0(pmin(contact_id, relationship_id), pmax(contact_id, relationship_id)),
household_id = match(group, unique(group)))
You can use -
library(dplyr)
testdata %>%
mutate(col1 = pmin(contact_id, relationship_id, na.rm = TRUE),
col2 = pmax(contact_id, relationship_id, na.rm = TRUE)) %>%
rowwise() %>%
mutate(household_id = paste0(unique(c(col1, col2)), collapse = '')) %>%
ungroup %>%
select(-col1, -col2)
# account_id contact_id relationship_id ownership_percent score household_id
# <dbl> <dbl> <chr> <dbl> <dbl> <chr>
#1 1 1234 "2345" 26 500 12342345
#2 1 2345 "1234" 22 300 12342345
#3 1 3456 NA 40 700 3456
#4 1 4567 "" 12 600 4567
Related
I need to summarize one variable/column of a long table after aggregating (group_by()) by another variable/column, I need to have the summarized value by all values of other variables/columns.
Here is test data:
library(tidyverse)
set.seed(123)
Site <- str_c("S", 1:5)
Species <- str_c("Sps", 1:6)
print(Species_tbl <- bind_cols(Species = Species,
Exotic = rbinom(length(Species), 1, .3),
Migrant = rbinom(length(Species), 2, .3)))
Data_tbl <- expand.grid(Site = Site,
Species = Species) %>%
left_join(Species_tbl)
Data_tbl$Presence <- rbinom(nrow(Data_tbl), 1, .5)
And here is my best effort:
print(Data_tbl %>%
group_by(Site) %>%
summarise(N_sp = sum(Presence),
N_sp_Exo = sum(Presence[Exotic == 1]),
N_sp_Nat = sum(Presence[Exotic == 0]),
N_sp_M0 = sum(Presence[Migrant == 0]),
N_sp_M1 = sum(Presence[Migrant == 1]),
N_sp_M2 = sum(Presence[Migrant == 2])))
You can get the data in long format for your columns of interest c(Exotic, Migrant) and take sum of Presence columns for each unique column names and it's values. This can be merged with sum of each Site.
library(dplyr)
library(tidyr)
data1 <- Data_tbl %>%
group_by(Site) %>%
summarise(N_sp = sum(Presence))
data2 <- Data_tbl %>%
pivot_longer(cols = c(Exotic, Migrant)) %>%
group_by(Site, name, value) %>%
summarise(result = sum(Presence), .groups = "drop") %>%
pivot_wider(names_from = c(name, value), values_from = result)
inner_join(data1, data2, by = 'Site')
# Site N_sp Exotic_0 Exotic_1 Migrant_0 Migrant_1 Migrant_2
# <fct> <int> <int> <int> <int> <int> <int>
#1 S1 4 2 2 1 2 1
#2 S2 3 2 1 0 2 1
#3 S3 2 1 1 0 2 0
#4 S4 4 2 2 1 3 0
#5 S5 4 1 3 1 2 1
The answer has been divided in two steps for ease of readability. If you would like to do this in a single chain without creating temporary variables that can be done as well.
Sorry I didn't clarify my question,
my aim is if dt$id %in% df$id , extract df$score add to new column at dt,
I have a dataframe like this :
df <- tibble(
score = c(2587,002,885,901,2587,3371,3372,002),
id = c("AR01.0","AR01.1","AR01.12","ERS02.00","ERS02.01","ERS02.02","QR01","QR01.03"))
And I have another dataframe like
dt <- tibble(
id = c("AR01","QR01","KVC"),
city = c("AM", "Bis","CHB"))
I want to mutate a new column "score"
I want to got output like below :
id
city
score
AR01
AM
2587/2/885
ERS02
Bis
901/3371
KVC
CHB
NA
or
id
city
score
score2
score3
AR01
AM
2587
2
885
ERS02
Bis
901
3371
NA
KVC
CHB
NA
NA
NA
I tried to use ifelse to achieve but always got error,
do any one can provide ideas? Thank you.
A simple left_join (after mutateing id values in df) is required:
library(dplyr)
library(stringr)
left_join(df %>% mutate(id = str_extract(id, "[\\w]+")), dt, by = "id") %>%
group_by(id) %>%
summarise(across(city,first),
score = paste(score, collapse = "/"))
# A tibble: 3 × 3
id city score
<chr> <chr> <chr>
1 AR01 AM 2587/2/885
2 ERS02 NA 901/2587/3371
3 QR01 Bis 3372/2
For the second solution you can use separate:
library(dyplr)
library(stringr)
library(tidyr)
left_join(df %>% mutate(id = str_extract(id, "[\\w]+")), dt, by = "id") %>%
group_by(id) %>%
summarise(across(city,first),
score = paste(score, collapse = "/")) %>%
separate(score,
into = paste("score", 1:3),
sep = "/" )
# A tibble: 3 × 5
id city `score 1` `score 2` `score 3`
<chr> <chr> <chr> <chr> <chr>
1 AR01 AM 2587 2 885
2 ERS02 NA 901 2587 3371
3 QR01 Bis 3372 2 NA
You could create groups by extracting everything before the . using sub to group_by on and merge the rows with paste separated with / and right_join them by id like this:
library(tibble)
df <- tibble(
score = c(2587,002,885,901,2587,3371,3372,002),
id = c("AR01.0","AR01.1","AR01.12","ERS02.00","ERS02.01","ERS02.02","QR01","QR01.03"))
dt <- tibble(
id = c("AR01","QR01","KVC"),
city = c("AM", "Bis","CHB"))
library(dplyr)
df %>%
mutate(id = sub('\\..*', "", id)) %>%
group_by(id) %>%
mutate(score = paste(score, collapse = '/')) %>%
distinct(id, .keep_all = TRUE) %>%
ungroup() %>%
right_join(., dt, by = 'id')
#> # A tibble: 3 × 3
#> score id city
#> <chr> <chr> <chr>
#> 1 2587/2/885 AR01 AM
#> 2 3372/2 QR01 Bis
#> 3 <NA> KVC CHB
Created on 2022-10-01 with reprex v2.0.2
df <- data.frame(
code1 = c ("ZAZ","ZAZ","ZAZ","ZAZ","ZAZ","ZAZ","JOZ","JOZ","JOZ","JOZ","JOZ","JOZ","TSV","TSV"),
code2 = c("NAN","NAN","NAN","NAN","NAN","NAN","NAN","NAN","NAN","NAN","NAN","NAN","TSA","TSA"),
start = c("Date1.1","Date1.1","Date1.3","Date1.3","Date1.5","Date1.5","Date3.1","Date3.1","Date3.3","Date3.3","Date3.5","Date3.5","Date 5.1","Date 5.1"),
end = c("Date2.1","Date2.1","Date2.3","Date2.3","Date2.5","Date2.5","Date4.1","Date4.1","Date4.3","Date4.3","Date4.5","Date4.5","Date6.1","Date6.1"),
price = c(1,2,3,4,5,6,1,2,3,4,5,6,1,2))
I'm trying to achieve:
I have so far done:
df <- df %>%
group_by(code1, code2,start,end) %>%
slice_min(price) #%>%
group_modify()
df <- df[order(df$price),]
All well explained in the image but in brief:
To group by code1,code2,start,end and select smallest price for each
Reshape sending start,end,price to different columns (max 3 start,end,price per key code1,code2
I understand that this can be done within group_modify() but unsure how
Any help so much appreciated!
Brian
Here is one way using dplyr and tidyr libraries.
For each group (code1, code2, start and end) calculate the minimum value of price.
Create an index column for code1 and code2. This is to name start, end and price as start_1, start_2 etc.
Get the data in wide format using pivot_wider.
library(dplyr)
library(tidyr)
df %>%
group_by(code1, code2, start, end) %>%
summarise(price = min(price, na.rm = TRUE)) %>%
group_by(code1, code2) %>%
mutate(index = row_number()) %>%
ungroup() %>%
pivot_wider(names_from = index, values_from = c(start, end, price),
names_vary = "slowest")
# code1 code2 start_1 end_1 price_1 start_2 end_2 price_2 start_3 end_3 price_3
# <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <dbl> <chr> <chr> <dbl>
#1 JOZ NAN Date3.1 Date4.1 1 Date3.3 Date4.3 3 Date3.5 Date4.5 5
#2 TSV TSA Date 5.1 Date6.1 1 NA NA NA NA NA NA
#3 ZAZ NAN Date1.1 Date2.1 1 Date1.3 Date2.3 3 Date1.5 Date2.5 5
Note that names_vary = "slowest" allows to have columns in an orderly fashion (start_1, end_1, price_1... instead of start_1, start_2 ..., end_1, end_2... etc. )
I guess you can try aggregate + reshape + ave (all from base R)
reshape(
transform(
aggregate(price ~ ., df, min),
id = ave(seq_along(price), code1, code2, FUN = seq_along)
),
direction = "wide",
idvar = c("code1", "code2"),
timevar = "id"
)
which gives
code1 code2 start.1 end.1 price.1 start.2 end.2 price.2 start.3 end.3
1 ZAZ NAN Date1.1 Date2.1 1 Date1.3 Date2.3 3 Date1.5 Date2.5
4 JOZ NAN Date3.1 Date4.1 1 Date3.3 Date4.3 3 Date3.5 Date4.5
7 TSV TSA Date5.1 Date6.1 1 <NA> <NA> NA <NA> <NA>
price.3
1 5
4 5
7 NA
I am trying to construct a control group. ID_1 is the original participant, ID_2 is the control. For simplicity sake they are matched by sex and age. I received a dataframe that looks like this:
ID_1 <- c(1,1,1,2,2,3,3,4,4,4)
Sex <- c("M","M","M","F","F","M","M","F","F","F")
Age <- c(23,23,23,35,35,44,44,35,35,35)
ID_2 <- c(321,322,323,630,631,502,503,630,631,632)
df <- data.frame(ID_1, Sex, Age, ID_2)
So I have several matches for each ID_1 and I want to sample within each group to get just one. I got that with:
library(dplyr)
random_ID_2 <- df %>% group_by(ID_1) %>% sample_n(size = 1, replace = F)
The problem is that I do not want to get any repeats of ID_2. So by random chance I could end up pairing ID_1 = 2 and ID_1 = 4 to the same control ID_2 = 630
How i can make sure this does not happen?
Thanks in advance.
If you can use a data.table solution:
dt <- setnames(
unique(
setorder(
setDT(copy(df))[, idx := 1:.N, by = ID_1], # add an index column for each ID_1 group
idx, ID_1) # sort by idx, ID_1
# for each Sex/Age group, sample unique values of ID_2 withouth replacement (pad with NA)
[, ID_3 := c(sample(unique(ID_2)), rep(NA, .N - uniqueN(ID_2))), by = c("Sex", "Age")],
by = "ID_1") # get the first row for each ID_1 group
[, c(1:3, 6)], "ID_3", "ID_2") # remove helper columns and rename "ID_3" to "ID_2"
Here is one potential option that samples and if there is a duplicate it will resample:
# handles case where no samples left
my_sample <- function(x, ...){
if (length(x) == 0L) return(NA) else sample(x, ...)
}
df %>%
group_by(ID_1) %>%
slice_sample(n = 1) %>%
ungroup() %>%
mutate(resample = duplicated(ID_2)) %>%
rowwise() %>%
mutate(ID_2 = if (resample) my_sample(df[df$ID_1 == ID_1 & df$ID_2 != ID_2, "ID_2"], 1) else ID_2) %>%
ungroup() %>%
select(-resample)
One thing to note is that rows further down your data frame with duplicate ID_2 are conditionally sampling.
Output
set.seed(17) is a case where the same ID_2 is sampled:
df %>%
group_by(ID_1) %>%
slice_sample(n = 1)
ID_1 Sex Age ID_2
<dbl> <chr> <dbl> <dbl>
1 1 M 23 322
2 2 F 35 631
3 3 M 44 502
4 4 F 35 631
And to test the above code:
set.seed(17)
df %>%
group_by(ID_1) %>%
slice_sample(n = 1) %>%
ungroup() %>%
mutate(resample = duplicated(ID_2)) %>%
rowwise() %>%
mutate(ID_2 = if (resample) my_sample(df[df$ID_1 == ID_1 & df$ID_2 != ID_2, "ID_2"], 1) else ID_2) %>%
ungroup() %>%
select(-resample)
ID_1 Sex Age ID_2
<dbl> <chr> <dbl> <dbl>
1 1 M 23 322
2 2 F 35 631
3 3 M 44 502
4 4 F 35 632
>
Again to emphasize my point above ID_1 == 4 is conditionally sampling since we allow ID_1 == 2 to remain matched to ID_2 == 631 and change the match for ID_1 == 4.
How it works
Sample your data as you normally would.
Then we we check for duplicates in ID_2. Note: duplicated returns TRUE for all subsequent duplicated IDs.
If a row needs to be resampled then we subset and sample from the original data frame with the line mutate(ID_2 = if ...)
I had a dataframe, whose ID column had many duplicated names. So I used table() function to get the frequency of IDs. like this:
library(dplyr)
id <- runif(1000,1000,3000) %>% round() %>% as.character()
freq <- rep(1:50,20)
data <- data.frame(id,freq)
GetID <- function(a){
if (a[2]==1) newid <- a[1] else newid <- paste(a[1],1:a[2],sep = "-");
return(newid)}
idlist <- data %>% apply(., 1, GetID)
idlist2 <- unlist(idlist) %>% as.data.frame()
I wanted to get a new ID vector. If the freq equals 1, the new ID equals the old one. If the freq is larger than 1, the new ID is the old id combined with its order.
However, it seems the if statement didn't work correctly. All of new id had order number.
do you have to use a function? if not:
id <- runif(1000,1000,3000)
freq <- rep(1:50,20)
num <- 1:length(id)
data <- data.frame(num,id,freq)
data2 <- data %>% filter(freq == 1) %>% mutate(newid = id)
data3 <- data %>% filter(freq != 1) %>% mutate(newid = paste(id,freq,sep = "-"))
result <- rbind(data2,data3) %>% arrange(num)
You can group_by id and if number of rows is greater than 1 then paste row_number() with id or just use id.
library(dplyr)
data %>%
group_by(id) %>%
mutate(newID = if(n() > 1) paste(id, row_number(), sep = '-')
else as.character(id)) %>%
arrange(id)
# id freq newID
# <chr> <int> <chr>
# 1 1002 49 1002-1
# 2 1002 31 1002-2
# 3 1003 26 1003
# 4 1005 11 1005-1
# 5 1005 28 1005-2
# 6 1007 37 1007
# 7 1013 33 1013
# 8 1016 7 1016
# 9 1020 11 1020
#10 1024 28 1024
# … with 990 more rows