I have 2 R data frames that look like this:
DATA FRAME 1:
identifier
ef_posterior
position_no
classification
11111
0.260
1
yes
11111
0.0822
2
yes
11111
0.00797
3
yes
11111
0.04
4
no
11111
0.245
5
yes
11111
0.432
6
yes
11112
0.342
1
maybe
11112
0.453
2
yes
11112
0.0032
3
yes
11112
0.241
5
no
11112
0.0422
6
yes
11112
0.311
4
no
DATAFRAME 2:
study_identifier
%LVEF
11111
62
11112
76
I want to merge and rearrange these two data frames into something like this:
Study_identifier and identifier are the same thing (just different column names). Additionally, I would like to recode the classification so that yes = 0, no = 1, maybe = 2
identifier
pos_1
pos_1_class
pos_2
pos_2_class
pos_3
pos_3_class
pos_4
pos_4_class
pos_5
pos_5_class
pos_6
pos_6_class
%LVEF
11111
0.260
0
0.0822
0
0.00797
0
0.04
1
0.245
0
0.432
0
62
11112
0.342
2
0.453
0
0.0032
0
0.311
1
0.241
1
0.0422
0
76
# Attempt so far: prefix each position number with "position_", spread
# ef_posterior into one column per position, then attach df2 by matching
# identifier to study_identifier (converted to numeric first). The
# classification column is not carried through by this version.
df1 %>% mutate(position_no = paste0("position_", position_no)) %>%
pivot_wider(id_cols = identifier, names_from = position_no, values_from = ef_posterior) %>%
left_join(df2 %>% mutate(study_identifier = as.numeric(as.character(study_identifier))), by = c("identifier" = "study_identifier"))
This is the code I have right now, but I can't figure out where to put in the code for the classification column
How would I go about doing this?
Any help would be very much appreciated!
You can recode quite easily with dplyr and case_when:
# Recode classification to the numeric codes requested in the question:
# yes = 0, no = 1, maybe = 2. A value matching none of the three conditions
# becomes NA.
df1 %>%
  mutate(
    classification = case_when(
      classification == "yes" ~ 0,
      classification == "no" ~ 1,
      classification == "maybe" ~ 2
    )
  )
I would solve it the following way:
library(tidyverse)
# Example input 1: one row per identifier/position with its ef_posterior value
# and a yes/no/maybe classification (kept as character, not factor).
df1 <- data.frame(
stringsAsFactors = FALSE,
identifier = c(11111L,11111L,11111L,11111L,
11111L,11111L,11112L,11112L,11112L,11112L,11112L,
11112L),
ef_posterior = c(0.26,0.0822,0.00797,0.04,
0.245,0.432,0.342,0.453,0.0032,0.241,0.0422,0.311),
position_no = c(1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 5L, 6L, 4L),
classification = c("yes","yes","yes","no",
"yes","yes","maybe","yes","yes","no","yes","no")
)
# Example input 2: one %LVEF value per study_identifier. check.names = FALSE
# keeps the non-syntactic column name `%LVEF` exactly as written.
df2 <- data.frame(
check.names = FALSE,
study_identifier = c(11111L, 11112L),
`%LVEF` = c(62L, 76L)
)
# Recode classification with the codes requested in the question
# (yes = 0, no = 1, maybe = 2), spread both value columns into one column per
# position_no, then attach %LVEF by matching identifier to study_identifier.
df1 %>%
  mutate(
    classification = case_when(
      classification == "yes" ~ 0,
      classification == "no" ~ 1,
      classification == "maybe" ~ 2
    )
  ) %>%
  pivot_wider(
    id_cols = c(identifier),
    names_from = c(position_no),
    values_from = c(classification, ef_posterior)
  ) %>%
  left_join(df2, by = c("identifier" = "study_identifier"))
#> # A tibble: 2 x 14
#> identifier classification_1 classification_2 classification_3 classification_4
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 11111 0 0 0 1
#> 2 11112 2 0 0 1
#> # … with 9 more variables: classification_5 <dbl>, classification_6 <dbl>,
#> # ef_posterior_1 <dbl>, ef_posterior_2 <dbl>, ef_posterior_3 <dbl>,
#> # ef_posterior_4 <dbl>, ef_posterior_5 <dbl>, ef_posterior_6 <dbl>,
#> # `%LVEF` <int>
Created on 2021-04-12 by the reprex package (v0.3.0)
Related
Say I want to join these 3 dataframes with dplyr. How do I do it? I know I should use some combination of pivots and joins, but I can't figure out how to get it right.
My goal is to have the df something like this:
mpg_deciles mean_mpg mean_price production coefficient
1 13.5 12990 Foreign 12990
2 16 10874 Domestic 10874.8571428572
Here's the data:
library(dplyr)
# a: mean mpg per mpg decile (deciles written as integer literals).
a <- tibble::tribble(
~mpg_deciles, ~mean_mpg,
1L, 13.5,
2L, 16,
3L, 17.75,
4L, 18.625,
5L, 19.7142857142857)
# b: coefficient and mean price for foreign cars. Note that mpg_deciles is
# written as plain numbers (doubles) here, unlike the integer literals in a and c.
b <- tibble::tribble(
~coeff_foreign, ~mpg_deciles, ~mean_p_foreign, ~foreign,
12990, 2, 12990, "Foreign",
-2147.49999999997, 3, 10842.5, "Foreign",
-7180.99999999996, 4, 5809.00000000003, "Foreign",
-6777.49999999999, 6, 6212.5, "Foreign",
-6435.3333333333, 7, 6554.66666666669, "Foreign")
# c: coefficient and mean price for domestic cars. The object name `c` is the
# same as base R's c() function; calls like c(...) still resolve to the function.
c <- tibble::tribble(
~coeff_domestic, ~mpg_deciles, ~mean_p_domestic, ~foreign,
10874.8571428572, 1L, 10874.8571428572, "Domestic",
-3697.73214285716, 2L, 7177.125, "Domestic",
-6031.19047619049, 3L, 4843.66666666666, "Domestic",
-6365.35714285716, 4L, 4509.5, "Domestic",
-4650.42857142859, 5L, 6224.42857142857, "Domestic")
I think you need to pre-process b and c and then use a left_join:
library(dplyr)
# Rename the foreign/domestic-specific columns of b and c to a shared layout
# (coefficient, mean_price, production), stack the two tables, and attach the
# stacked rows to a by mpg_deciles.
left_join(
  a,
  bind_rows(
    rename(
      b,
      coefficient = coeff_foreign,
      mean_price = mean_p_foreign,
      production = foreign
    ),
    rename(
      c,
      coefficient = coeff_domestic,
      mean_price = mean_p_domestic,
      production = foreign
    )
  ),
  by = "mpg_deciles"
)
This returns
# A tibble: 8 x 5
mpg_deciles mean_mpg coefficient mean_price production
<dbl> <dbl> <dbl> <dbl> <chr>
1 1 13.5 10875. 10875. Domestic
2 2 16 12990 12990 Foreign
3 2 16 -3698. 7177. Domestic
4 3 17.8 -2147. 10842. Foreign
5 3 17.8 -6031. 4844. Domestic
6 4 18.6 -7181. 5809. Foreign
7 4 18.6 -6365. 4510. Domestic
8 5 19.7 -4650. 6224. Domestic
The pre-processing changes the coeff_foreign and coeff_domestic (same for mean_p_) columns into columns of the same name. If now the two data.frames are appended to each other, all values with the same column names go into the respective (same) columns. Without this pre-processing the columns with different names (e.g. coeff_foreign and coeff_domestic) would not end in the same column, but two columns are created (coeff_foreign and coeff_domestic) where the values are stored. In this case left_join would not achieve the desired result.
Updated version, thanks to @Martin Gal's input:
We could use a nested left_join:
library(dplyr)
library(tidyr) # pivot_longer() comes from tidyr, not dplyr

# Join b and c onto a by decile and drop the two joined `foreign` label
# columns. Then reshape: the prefix of each remaining column name
# (coeff / mean_p) becomes an output column and the suffix
# (foreign / domestic) becomes `production`. Rows whose values are all NA
# (no data for that decile/production pair) are dropped.
left_join(a, b, by = "mpg_deciles") %>%
  left_join(c, by = "mpg_deciles") %>%
  select(-starts_with("foreign")) %>%
  pivot_longer(
    -c("mpg_deciles", "mean_mpg"),
    names_pattern = "(coeff|mean_p)_(.*)",
    names_to = c(".value", "production"),
    values_drop_na = TRUE
  )
mpg_deciles mean_mpg production coeff mean_p
<dbl> <dbl> <chr> <dbl> <dbl>
1 1 13.5 domestic 10875. 10875.
2 2 16 foreign 12990 12990
3 2 16 domestic -3698. 7177.
4 3 17.8 foreign -2147. 10842.
5 3 17.8 domestic -6031. 4844.
6 4 18.6 foreign -7181. 5809.
7 4 18.6 domestic -6365. 4510.
8 5 19.7 domestic -4650. 6224.
I have a dataframe:
> print(merged)
AgeGroup values ind
1 1 0.2449762 diff_v.ownhigh_avg
2 1 0.2598964 diff_v.ownhigh_avg
3 1 0.2519043 diff_v.ownhigh_avg
4 1 0.2452479 diff_v.ownhigh_avg
5 1 0.2840650 diff_v.ownhigh_avg
6 1 0.2589341 diff_v.ownhigh_avg
7 1 0.3201843 diff_v.ownhigh_avg
8 1 0.3218865 diff_v.ownhigh_avg
9 1 0.2822984 diff_v.ownhigh_avg
10 1 0.3313962 diff_v.ownhigh_avg
There are 8 different types of ind, and there are 2 AgeGroup types.
I am creating a new dataframe that summarises the means and credible intervals based on 2 grouping factors (AgeGroup and ind).
This is the code that I have:
# Per-group mean plus the intended interval bounds. Note that `means` is
# created earlier in this same summarise() call, so quantile(means, ...) sees
# only that one per-group mean rather than the underlying `values`.
meansCIs <- merged %>%
group_by(AgeGroup, ind) %>%
summarise(means = mean(values), .groups = "keep",
lower_bound = quantile(means,.025),
upper_bound = quantile(means,.975))
This is the output it gives:
# A tibble: 16 x 5
# Groups: AgeGroup, ind [16]
AgeGroup ind means lower_bound upper_bound
<dbl> <fct> <dbl> <dbl> <dbl>
1 1 diff_v.ownhigh_avg 0.290 0.290 0.290
2 1 diff_v.ownlow_avg 0.272 0.272 0.272
3 1 diff_v.otherhigh_avg 0.274 0.274 0.274
4 1 diff_v.otherlow_avg 0.388 0.388 0.388
5 1 diff_v.own_avg 0.281 0.281 0.281
As you can see, something has gone wrong with computing the credible intervals. It is just replicating the mean for each condition. Does anyone know how I could fix this?
The quantile function is operating on just the single mean value. I think you need to substitute in the “values” variable.
# Reproducible copy of the ten rows printed in the question: a single AgeGroup
# (1) and a single ind level ("diff_v.ownhigh_avg").
merged<- structure(list(AgeGroup = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
values = c(0.2449762, 0.2598964, 0.2519043, 0.2452479, 0.284065,
0.2589341, 0.3201843, 0.3218865, 0.2822984, 0.3313962),
ind = c("diff_v.ownhigh_avg", "diff_v.ownhigh_avg", "diff_v.ownhigh_avg", "diff_v.ownhigh_avg",
"diff_v.ownhigh_avg", "diff_v.ownhigh_avg", "diff_v.ownhigh_avg",
"diff_v.ownhigh_avg", "diff_v.ownhigh_avg", "diff_v.ownhigh_avg" )),
class = "data.frame", row.names = c(NA, -10L))
# For every (AgeGroup, ind) group: the mean of `values` together with the
# 2.5% and 97.5% quantiles of the same raw `values`. The grouping is kept on
# the result.
merged %>%
  group_by(AgeGroup, ind) %>%
  summarise(
    means = mean(values),
    lower_bound = quantile(values, probs = 0.025),
    upper_bound = quantile(values, probs = 0.975),
    .groups = "keep"
  )
# A tibble: 1 × 5
# Groups: AgeGroup, ind [1]
AgeGroup ind means lower_bound upper_bound
<int> <chr> <dbl> <dbl> <dbl>
1 1 diff_v.ownhigh_avg 0.280 0.245 0.329
I currently have a list with columns as individual elements.
I would like to combine list elements with the same column names (i.e. bind rows) and merge across the different columns (i.e. bind columns) into a single data frame. I'm having difficulty finding examples of how to do this.
# Input list: the element names est, se, rf, n, model repeat three times, once
# per model ("Crude", "Model 1", "Model 2"); every element has length 4.
l = list(est = c(0, 0.062220390087795, 1.1020213968139, 0.0359939361491544
), se = c(0.0737200634874046, 0.237735179934829, 0.18105632705918,
0.111359438298789), rf = structure(c(NA, NA, NA, 4L), levels = c("Never\nsmoker",
"Occasional\nsmoker", "Ex-regular\nsmoker", "Smoker"), class = "factor"),
n = c(187L, 18L, 32L, 82L), model = c("Crude", "Crude", "Crude",
"Crude"), est = c(0, 0.112335510453586, 0.867095253670329,
0.144963556944891), se = c(0.163523775933409, 0.237039485900481,
0.186247776987999, 0.119887623484768), rf = structure(c(NA,
NA, NA, 4L), levels = c("Never\nsmoker", "Occasional\nsmoker",
"Ex-regular\nsmoker", "Smoker"), class = "factor"), n = c(187L,
18L, 32L, 82L), model = c("Model 1", "Model 1", "Model 1",
"Model 1"), est = c(0, 0.107097305324242, 0.8278765140371,
0.0958220447859447), se = c(0.164787596943329, 0.237347836229364,
0.187201880036661, 0.120882616647714), rf = structure(c(NA,
NA, NA, 4L), levels = c("Never\nsmoker", "Occasional\nsmoker",
"Ex-regular\nsmoker", "Smoker"), class = "factor"), n = c(187L,
18L, 32L, 82L), model = c("Model 2", "Model 2", "Model 2",
"Model 2"))
I would like the data to have the following format:
data.frame(
est = c(),
se = c(),
rf = c(),
model = c()
)
Any help would be appreciated. Thank you!
In this solution, first the elements of l are grouped by name and then are combined using c. Finally, the resulting list is converted to a dataframe using map_dfc.
library(dplyr)
library(purrr)
# Columns to keep in the result; each name is also used as the name of the
# corresponding output column.
cols <- c("est", "se", "rf", "model")
map_dfc(
  # for every wanted name, collect all list elements carrying that name ...
  map(setNames(cols, cols), function(nm) l[names(l) == nm]),
  # ... and concatenate them end to end into a single column
  function(pieces) do.call(c, pieces)
)
#> # A tibble: 12 × 4
#> est se rf model
#> <dbl> <dbl> <fct> <chr>
#> 1 0 0.0737 NA Crude
#> 2 0.0622 0.238 NA Crude
#> 3 1.10 0.181 NA Crude
#> 4 0.0360 0.111 Smoker Crude
#> 5 0 0.164 NA Model 1
#> 6 0.112 0.237 NA Model 1
#> 7 0.867 0.186 NA Model 1
#> 8 0.145 0.120 Smoker Model 1
#> 9 0 0.165 NA Model 2
#> 10 0.107 0.237 NA Model 2
#> 11 0.828 0.187 NA Model 2
#> 12 0.0958 0.121 Smoker Model 2
another option
library(purrr)
# Assign consecutive list elements to chunks, one chunk per repetition of the
# column set. The chunk width is the number of distinct element names (5 for
# this `l`) instead of a hard-coded constant; this assumes the same names
# repeat in the same order throughout `l`.
n_cols <- length(unique(names(l)))
grp <- (seq_along(l) - 1) %/% n_cols
l_split <- split(l, grp)
# Each chunk is a named list of equal-length columns; map_df() row-binds them.
map_df(l_split, c)
#> # A tibble: 12 × 5
#> est se rf n model
#> <dbl> <dbl> <fct> <int> <chr>
#> 1 0 0.0737 <NA> 187 Crude
#> 2 0.0622 0.238 <NA> 18 Crude
#> 3 1.10 0.181 <NA> 32 Crude
#> 4 0.0360 0.111 Smoker 82 Crude
#> 5 0 0.164 <NA> 187 Model 1
#> 6 0.112 0.237 <NA> 18 Model 1
#> 7 0.867 0.186 <NA> 32 Model 1
#> 8 0.145 0.120 Smoker 82 Model 1
#> 9 0 0.165 <NA> 187 Model 2
#> 10 0.107 0.237 <NA> 18 Model 2
#> 11 0.828 0.187 <NA> 32 Model 2
#> 12 0.0958 0.121 Smoker 82 Model 2
I am using the separate function from tidyverse to split the first column of this tibble :
# A tibble: 6,951 x 9
Row.names Number_of_analysis~ DL_Minimum DL_Mean DL_Maximum Number_of_measur~ Measure_Minimum Measure_Mean Measure_Maximum
<I<chr>> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2011.FACILITY.PONT-À-CELLES 52 0.6 1.81 16 0 0 0 0
2 2011.FACILITY.PONT-À-CELLES 52 0.07 0.177 1.3 0 0 0 0
3 2011.FACILITY.CHARLEROI 52 0.07 0.212 1.9 0 0 0 0
4 2011.FACILITY.CHARLEROI 52 0.08 0.209 2 0 0 0 0
# Split Row.names on literal dots into Year / Catchment / Locality;
# extra = "drop" discards any pieces beyond the third.
Merge_splitnames <- Merge %>%
separate(Row.names,sep = "\\.",into = c("Year", "Catchment", "Locality"), extra = "drop")
While everything seems correct, the output is a tibble without the first 2 columns (the ones which have a name comprising an accent in French) :
# A tibble: 6,951 x 9
Year Catchment Locality Number_of_analysis~ DL_Minimum DL_Mean DL_Maximum Number_of_measur~ Measure_Minimum Measure_Mean Measure_Maximum
<I<chr>> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
3 2011 FACILITY CHARLEROI 52 0.07 0.212 1.9 0 0 0 0
4 2011 FACILITY CHARLEROI 52 0.08 0.209 2 0 0 0 0
Any idea how to deal with this issue ? I wish to keep the real name in French (with the accent). This is quite surprising for me, I've never got any issue with all the other functions from tidyverse.
NB : this is a simple and reproducible example, my real tibble is about 100 times bigger
separate is retaining the accent for me:
library(tidyverse)
# Minimal check: build a one-column tibble holding the four example names and
# split each on literal dots into three columns.
tibble(
  names = c(
    "2011.FACILITY.PONT-À-CELLES",
    "2011.FACILITY.PONT-À-CELLES",
    "2011.FACILITY.CHARLEROI",
    "2011.FACILITY.CHARLEROI"
  )
) %>%
  separate(
    col = names,
    into = c("Year", "Catchment", "Locality"),
    sep = "\\."
  )
#> # A tibble: 4 × 3
#> Year Catchment Locality
#> <chr> <chr> <chr>
#> 1 2011 FACILITY PONT-À-CELLES
#> 2 2011 FACILITY PONT-À-CELLES
#> 3 2011 FACILITY CHARLEROI
#> 4 2011 FACILITY CHARLEROI
Created on 2022-05-06 by the reprex package (v2.0.1)
Assuming DF is as shown reproducibly in the Note at the end, use extra = "merge" in separate. (It is possible that you may need to change your locale, but I did not need to do that. Some things to try are shown in "How to change the locale of R?" or "Using weekdays with any locale under Windows".)
library(tidyr)
# No `sep` is supplied, so separate() falls back to its default separator.
# extra = "merge" splits at most into the three named columns, so whatever
# follows the second split (hyphens included) stays together in Locality.
DF %>%
separate(Row.names, c("Year", "Catchment", "Locality"), extra = "merge")
giving:
Year Catchment Locality Number_of_analysis~ DL_Minimum DL_Mean
1 2011 FACILITY PONT-À-CELLES 52 0.60 1.810
2 2011 FACILITY PONT-À-CELLES 52 0.07 0.177
3 2011 FACILITY CHARLEROI 52 0.07 0.212
4 2011 FACILITY CHARLEROI 52 0.08 0.209
DL_Maximum Number_of_measur~ Measure_Minimum Measure_Mean Measure_Maximum
1 16.0 0 0 0 0
2 1.3 0 0 0 0
3 1.9 0 0 0 0
4 2.0 0 0 0 0
Note
# Reproducible copy of the four example rows as a plain data.frame; the
# truncated column names from the printed tibble (ending in "~") are kept
# verbatim via backticks.
DF <-
structure(list(Row.names = c("2011.FACILITY.PONT-À-CELLES", "2011.FACILITY.PONT-À-CELLES",
"2011.FACILITY.CHARLEROI", "2011.FACILITY.CHARLEROI"), `Number_of_analysis~` = c(52L,
52L, 52L, 52L), DL_Minimum = c(0.6, 0.07, 0.07, 0.08), DL_Mean = c(1.81,
0.177, 0.212, 0.209), DL_Maximum = c(16, 1.3, 1.9, 2), `Number_of_measur~` = c(0L,
0L, 0L, 0L), Measure_Minimum = c(0L, 0L, 0L, 0L), Measure_Mean = c(0L,
0L, 0L, 0L), Measure_Maximum = c(0L, 0L, 0L, 0L)), class = "data.frame", row.names = c("1",
"2", "3", "4"))
I have lab records of 30,000 unique ID's. I need to convert my data from long to wider format for each ID and TEST_DATE related to that unique ID.
Example for one ID :
I need to convert this to a wider format like this:
I have a dataset with 30,000 ID's and I need to do this for each ID. The ID with the maximum number of tests will determine our number of columns.
I will appreciate any ideas that you might have to solve this problem! Thank you
Try this:
library(dplyr)
library(tidyr)
#Code
# Average RESULT per account and test date (rounded to 2 decimals), then lay
# the date/result pairs out side by side as TEST_DATE1, RESULT1, TEST_DATE2,
# RESULT2, ... with one row per ACCT.
new <- df %>%
  group_by(ACCT, TEST_DATE) %>%
  summarise(
    RESULT = round(mean(RESULT, na.rm = TRUE), 2),
    .groups = "drop" # same effect as a following ungroup(), without the message
  ) %>%
  # store dates and results as text so both fit in one `value` column
  mutate(across(-ACCT, ~ as.character(.))) %>%
  pivot_longer(-ACCT) %>%
  group_by(ACCT, name) %>%
  # number each variable's occurrences within an account: TEST_DATE1, RESULT1, ...
  mutate(name = paste0(name, row_number())) %>%
  pivot_wider(names_from = name, values_from = value) %>%
  # results were turned into text above; convert them back to numbers
  mutate(across(starts_with("RESULT"), ~ as.numeric(.)))
Output:
# A tibble: 2 x 7
# Groups: ACCT [2]
ACCT TEST_DATE1 RESULT1 TEST_DATE2 RESULT2 TEST_DATE3 RESULT3
<int> <chr> <dbl> <chr> <dbl> <chr> <dbl>
1 37733 9/1/2016 3 10/18/2016 2 11/1/2016 1
2 37734 9/1/2016 5 10/18/2016 4 11/1/2016 3
Some data used:
#Data
# Example data: two accounts with three tests each; TEST_DATE is stored as
# character in m/d/Y form, RESULT as integer.
df <- structure(list(ACCT = c(37733L, 37733L, 37733L, 37734L, 37734L,
37734L), TEST_DATE = c("9/1/2016", "10/18/2016", "11/1/2016",
"9/1/2016", "10/18/2016", "11/1/2016"), RESULT = c(3L, 2L, 1L,
5L, 4L, 3L)), class = "data.frame", row.names = c(NA, -6L))
Here is a data.table option with dcast that might help (borrowing data from @Duck)
> dcast(setDT(df)[, Q := seq(.N), ACCT], ACCT ~ Q, value.var = c("TEST_DATE", "RESULT"))
ACCT TEST_DATE_1 TEST_DATE_2 TEST_DATE_3 RESULT_1 RESULT_2 RESULT_3
1: 37733 9/1/2016 10/18/2016 11/1/2016 3 2 1
2: 37734 9/1/2016 10/18/2016 11/1/2016 5 4 3
Another option is using melt along with dcast, where the resulting format might be the one you are exactly after
# Long-then-wide reshape: number each account's rows (Q), melt TEST_DATE and
# RESULT into a single `value` column, order by account and Q, then cast to
# one column per Q/variable pair. type.convert(as.is = TRUE) re-derives the
# column types of the cast result.
# setDT(df) and `:=` modify df by reference, so df gains a Q column.
# NOTE(review): suppressWarnings() presumably hides melt()'s warning about
# measure columns of different types being coerced — confirm.
suppressWarnings({
type.convert(
dcast(
melt(
setDT(df)[, Q := seq(.N), ACCT],
id = c("ACCT", "Q"),
measure = c("TEST_DATE", "RESULT")
)[order(ACCT, Q)],
ACCT ~ Q + variable,
value.var = "value"
),
as.is = TRUE
)
})
which gives
ACCT 1_TEST_DATE 1_RESULT 2_TEST_DATE 2_RESULT 3_TEST_DATE 3_RESULT
1: 37733 9/1/2016 3 10/18/2016 2 11/1/2016 1
2: 37734 9/1/2016 5 10/18/2016 4 11/1/2016 3
Take this simple route
library(tidyverse)
# Mean RESULT per account and test date, numbered within each account, then
# one TEST_DATE_<i>_<i> / RESULT_<i>_<i> column pair per numbered test.
df %>%
  group_by(ACCT, TEST_DATE) %>%
  summarise(RESULT = mean(RESULT)) %>%
  group_by(ACCT) %>%
  # both counters hold the same within-account row number
  mutate(testno = row_number(), resultno = testno) %>%
  pivot_wider(
    id_cols = ACCT,
    names_from = c("testno", "resultno"),
    values_from = c(TEST_DATE, RESULT)
  )
# A tibble: 2 x 9
# Groups: ACCT [2]
ACCT TEST_DATE_1_1 TEST_DATE_2_2 TEST_DATE_3_3 TEST_DATE_4_4 RESULT_1_1 RESULT_2_2 RESULT_3_3 RESULT_4_4
<int> <date> <date> <date> <date> <dbl> <dbl> <dbl> <dbl>
1 37733 2016-01-07 2016-01-09 2016-01-11 2016-08-10 5 4.5 1 2
2 37734 2016-01-21 2016-08-20 NA NA 3 4 NA NA
data (dput) used
> dput(df)
structure(list(ACCT = c(37733L, 37733L, 37733L, 37733L, 37734L,
37734L, 37733L), TEST_DATE = structure(c(16809, 17023, 16811,
16807, 17033, 16821, 16809), class = "Date"), RESULT = c(3L,
2L, 1L, 5L, 4L, 3L, 6L)), row.names = c(NA, -7L), class = "data.frame")
df
> df
ACCT TEST_DATE RESULT
1 37733 2016-01-09 3
2 37733 2016-08-10 2
3 37733 2016-01-11 1
4 37733 2016-01-07 5
5 37734 2016-08-20 4
6 37734 2016-01-21 3
7 37733 2016-01-09 6