Related
I am attempting to output a fairly long gt table using gtsave. It keeps getting truncated, where the last few columns are missing. The scrollbar also shows up in the outputted image.
Here is the outputted table:
I want it to look like my 5% variation table, which is below:
Here is my code for both tables (including R Markdown headers):
3% Table
{r hotspot 3% 100 coverage reads table, fig.dim = c(12, 6)}
setwd(output_hotspot)
hotspot_3pct_table <- hotspot_3pct_longer %>%
ungroup() %>%
gt() %>%
tab_header(title = "Variation Across Lots at Select Base Pairs", subtitle = paste0("At least 100 coverage reads with at least 3% variation")) %>%
cols_label(
lot = "Lot"
) %>%
fmt_number(columns = 2:24, decimals = 2) %>%
fmt_missing(columns = everything(), missing_text = "--") %>%
tab_source_note(source_note = paste0("Dash indicates no variation present in lot")) %>%
tab_options(
table.width = pct(100)
)
gtsave(hotspot_3pct_table, "hotspot_3pct.png")
5% Table
{r hotspot 5% 100 coverage reads table, fig.dim = c(12, 6)}
setwd(output_hotspot)
hotspot_5pct_table <- hotspot_5pct_longer %>%
ungroup() %>%
gt() %>%
tab_header(title = "Variation Across Lots at Select Base Pairs", subtitle = paste0("At least 100 coverage reads with at least 5% variation")) %>%
cols_label(
lot = "Lot"
) %>%
fmt_number(columns = 2:18, decimals = 2) %>%
fmt_missing(columns = everything(), missing_text = "--") %>%
tab_source_note(source_note = paste0("Dash indicates no variation present in lot"))
gtsave(hotspot_5pct_table, "hotspot_5pct.png", expand = 10)
I have tried different fig.dim settings and different expand settings. I haven't encountered this issue before, so I am not sure how to approach this.
Reprex of each dataset:
3% Dataset
hotspot_3pct_longer = structure(list(lot = c("ABL GMP1", "MVS", "Tox Lot", "MVB1",
"MVB2", "CTM2", "CTM1", "Fuji 30k", "Fuji 7.5k"), `12` = c(0.0382775119617225,
0.0390625, 0.034883720930233, NA, NA, NA, NA, NA, NA), `13` = c(0.0588235294117647,
NA, NA, 0.048076923076924, 0.0714285714285714, 0.0417789757412399,
NA, NA, NA), `253` = c(NA, NA, 0.03360709902766, NA, NA, NA,
NA, NA, NA), `1266` = c(0.0646451454923886, NA, NA, NA, NA, NA,
NA, NA, NA), `1820` = c(1, NA, NA, NA, NA, NA, NA, NA, NA), `1821` = c(1,
NA, NA, NA, NA, NA, NA, NA, NA), `2861` = c(0.0434994715017482,
NA, NA, NA, NA, NA, NA, NA, NA), `3031` = c(0.183159188690842,
NA, NA, NA, NA, NA, NA, NA, NA), `3252` = c(0.0521527362955475,
NA, NA, NA, NA, NA, NA, NA, NA), `3368` = c(0.107515576323988,
NA, NA, NA, NA, NA, NA, NA, NA), `3512` = c(0.345980014097939,
NA, 0.064333937531195, NA, NA, 0.0822086320821032, 0.078818748712571,
0.089279658964298, NA), `3527` = c(0.17209788747124, NA, 0.0377838832455329,
NA, NA, 0.0471288691223414, 0.044333490343853, 0.059236465044716,
NA), `3554` = c(0.250983372072233, NA, 0.05112660944206, NA,
NA, 0.0639663737103554, 0.055374526495866, 0.0861535232698471,
0.031875819851334), `4752` = c(NA, NA, 0.04827943749595, NA,
NA, 0.0498005129666572, 0.049766115231033, 0.052495800335974,
0.0519236625723281), `4761` = c(NA, NA, 0.038136808232708, NA,
NA, 0.0317014863319821, 0.036080058906219, 0.034794423440454,
0.033717392388648), `7078` = c(NA, NA, 0.032269021739131, NA,
NA, NA, NA, NA, NA), `7299` = c(0.0830269157229004, 0.083128195417535,
0.361278273500727, 0.375946173254836, 0.216166788588149, 0.110078513058805,
0.10393717387867, 0.355137204850032, 0.310679611650486), `7300` = c(0.0525369400359628,
0.050222762251924, 0.245149911816579, 0.232037691401649, 0.148067737733391,
0.0629358437935844, 0.063008245663919, 0.236435818262021, 0.20123839009288
), `7301` = c(NA, NA, 0.0519736842105269, 0.038054968287527,
NA, NA, NA, 0.037803780378038, 0.034240150093809), `7315` = c(NA,
NA, NA, 0.037735849056604, 0.0406386066763426, NA, NA, NA, 0.036363636363637
), `7318` = c(0.0474754244861484, 0.07482430756511, NA, NA, 0.0369206598586017,
0.0493811726465808, 0.046463780540078, NA, NA), `7319` = c(0.0623240852432649,
0.083063994828701, NA, 0.0326086956521739, 0.058765915768854,
0.0560072267389341, 0.0604447228311939, 0.053601340033501, 0.039495798319328
), `7320` = c(0.0808298755186722, 0.10897808803568, NA, 0.045643153526971,
0.0581113801452785, 0.0764283011729096, 0.081006685017696, 0.04,
0.031458531935177)), row.names = c(NA, -9L), groups = structure(list(
lot = c("ABL GMP1", "CTM1", "CTM2", "Fuji 30k", "Fuji 7.5k",
"MVB1", "MVB2", "MVS", "Tox Lot"), .rows = structure(list(
1L, 7L, 6L, 8L, 9L, 4L, 5L, 2L, 3L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
5% Dataset
hotspot_5pct_longer = structure(list(lot = c("MVB2", "ABL GMP1", "CTM1", "CTM2", "Tox Lot",
"Fuji 30k", "Fuji 7.5k", "MVB1", "MVS"), `13` = c(0.0714285714285714,
0.0588235294117647, NA, NA, NA, NA, NA, NA, NA), `1266` = c(NA,
0.0646451454923886, NA, NA, NA, NA, NA, NA, NA), `1820` = c(NA,
1, NA, NA, NA, NA, NA, NA, NA), `1821` = c(NA, 1, NA, NA, NA,
NA, NA, NA, NA), `3031` = c(NA, 0.183159188690842, NA, NA, NA,
NA, NA, NA, NA), `3252` = c(NA, 0.0521527362955475, NA, NA, NA,
NA, NA, NA, NA), `3368` = c(NA, 0.107515576323988, NA, NA, NA,
NA, NA, NA, NA), `3512` = c(NA, 0.345980014097939, 0.078818748712571,
0.0822086320821032, 0.064333937531195, 0.089279658964298, NA,
NA, NA), `3527` = c(NA, 0.17209788747124, NA, NA, NA, 0.059236465044716,
NA, NA, NA), `3554` = c(NA, 0.250983372072233, 0.055374526495866,
0.0639663737103554, 0.05112660944206, 0.0861535232698471, NA,
NA, NA), `4752` = c(NA, NA, NA, NA, NA, 0.052495800335974, 0.0519236625723281,
NA, NA), `7299` = c(0.216166788588149, 0.0830269157229004, 0.10393717387867,
0.110078513058805, 0.361278273500727, 0.355137204850032, 0.310679611650486,
0.375946173254836, 0.083128195417535), `7300` = c(0.148067737733391,
0.0525369400359628, 0.063008245663919, 0.0629358437935844, 0.245149911816579,
0.236435818262021, 0.20123839009288, 0.232037691401649, 0.050222762251924
), `7301` = c(NA, NA, NA, NA, 0.0519736842105269, NA, NA, NA,
NA), `7318` = c(NA, NA, NA, NA, NA, NA, NA, NA, 0.07482430756511
), `7319` = c(0.058765915768854, 0.0623240852432649, 0.0604447228311939,
0.0560072267389341, NA, 0.053601340033501, NA, NA, 0.083063994828701
), `7320` = c(0.0581113801452785, 0.0808298755186722, 0.081006685017696,
0.0764283011729096, NA, NA, NA, NA, 0.10897808803568)), row.names = c(NA,
-9L), groups = structure(list(lot = c("ABL GMP1", "CTM1", "CTM2",
"Fuji 30k", "Fuji 7.5k", "MVB1", "MVB2", "MVS", "Tox Lot"), .rows = structure(list(
2L, 3L, 4L, 6L, 7L, 8L, 1L, 9L, 5L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
Pass the options to webshot() by using vwidth and vheight. Refer the documentation,
https://www.rdocumentation.org/packages/webshot/versions/0.5.2/topics/webshot
gtsave(hotspot_3pct_table, "hotspot_3pct.png", vwidth = 1500, vheight = 1000)
I want to delete rows only when selected columns are NA.
Data here:
dput(df)
structure(list(record_id = c("BIV-1601-1250-E1", "BIV-1601-1250-E1",
"BIV-1601-1250-E1", "BIV-1601-1250-E1", "BIV-1601-1250-E1", "BIV-1601-1719-E1",
"BIV-1601-1719-E1", "BIV-1601-1719-E1", "BIV-1601-1719-E1", "BIV-1601-1719-E1",
"BIV-1402-1368-E1", "BIV-1402-1368-E1", "BIV-1402-1368-E1", "BIV-1402-1368-E1",
"BIV-1402-1368-E1", "BIV-1101-1038-E1", "BIV-1101-1038-E1", "BIV-1101-1038-E1",
"BIV-1101-1038-E1", "BIV-1101-1038-E1", "BIV-1701-1145-E1", "BIV-1701-1145-E1",
"BIV-1701-1145-E1", "BIV-1701-1145-E1", "BIV-1701-1145-E1", "BIV-1102-2040-E1",
"BIV-1102-2040-E1", "BIV-1102-2040-E1", "BIV-1102-2040-E1", "BIV-1102-2040-E1"
), DATE = structure(c(NA, 17478, 17480, 17479, NA, 18295, NA,
18296, 18296, NA, NA, 17912, 17914, 17934, NA, 17221, 17221,
17223, 17224, NA, NA, 17820, 17822, 17823, NA, NA, 18359, 18361,
18361, NA), class = "Date"), haemoglobin = structure(c(NA, 101,
NA, NA, NA, 100, NA, NA, NA, NA, NA, 97.6, NA, NA, NA, NA, 109,
NA, NA, NA, NA, 120, NA, NA, NA, NA, 205, NA, NA, NA), label = "g/L", class = c("labelled",
"numeric")), WBC = structure(c(NA, NA, "5", NA, NA, NA, "27.6",
NA, NA, NA, NA, NA, "8.8", NA, NA, NA, NA, "10.3", NA, NA, NA,
NA, "23.5", NA, NA, NA, NA, "11.81", NA, NA), label = "10^9/L", class = c("labelled",
"character")), CRP = c(NA, NA, "9", NA, NA, NA, "499", NA, NA,
NA, NA, NA, "7", NA, NA, NA, "43", "54.4", NA, NA, NA, NA, "37",
NA, NA, NA, NA, "<4.0", NA, NA), admission_day = c(NA, 0L, 2L,
1L, NA, 1L, NA, 2L, 2L, NA, NA, 1L, 3L, 23L, NA, 0L, 0L, 2L,
3L, NA, NA, 0L, 2L, 3L, NA, NA, 0L, 2L, 2L, NA)), row.names = c(NA,
-30L), groups = structure(list(record_id = c("BIV-1101-1038-E1",
"BIV-1102-2040-E1", "BIV-1402-1368-E1", "BIV-1601-1250-E1", "BIV-1601-1719-E1",
"BIV-1701-1145-E1"), .rows = structure(list(16:20, 26:30, 11:15,
1:5, 6:10, 21:25), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 6L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
I only want to drop the lines when the following columns DATE, haemoglobin, CRP, WBC, and admission_day all equal NA. My thoughts were something like this:
library(dplyr)
cols_to_drop <- c("DATE", "haemoglobin", "CRP", "WBC", "admission_day")
df <- df %>% mutate(case_when(is.na(cols_to_drop) ~ drop_na(DATE)))
Obviously (as usual for me) this doesn't work... II think it's something to do with needing to make case_when equal to a particular variable... but I want it to apply across the whole dataframe.
If someone can help, I'd be grateful!
You can use if_all/if_any -
library(dplyr)
cols_to_drop <- c("DATE", "haemoglobin", "CRP", "WBC", "admission_day")
df %>% filter(!if_all(cols_to_drop, is.na))
With if_any -
df %>% filter(if_any(cols_to_drop, Negate(is.na)))
I've got a two columns that contain information about sequences' starts and ends. I want to create a sequence column from that, i.e. each sequence starts when a seq_start is 1 and ends in first row appearing after seq_start = 1 in which seq_end = 1. How can I do it with tidyverse? The data is shown below, where seq is expected output. Please note that when seq_end = 1 and seq_start = 1 within the same rows this produces the sequence of length one.
structure(list(seq_start = c(NA, NA, NA, NA, NA, 1, NA, NA, NA,
NA, NA, 1, NA, 1, NA, NA, NA, NA, NA, NA, 1, 1, NA, NA, NA, NA,
NA, 1, 1, NA, NA, 1, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, 1,
NA), seq_end = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L,
1L, 1L, 1L, NA, NA, 1L, 1L, 1L, NA, 1L, NA, NA, NA, NA, NA, 1L,
1L, NA, NA, 1L, 1L, NA, 1L, 1L, 1L, 1L, NA, NA, NA, 1L, 1L, NA,
NA, NA, NA, NA, NA, 1L, NA, 1L, 1L, NA, 1L, 1L, NA, NA, 1L, 1L,
1L), seq = c(NA, NA, NA, NA, NA, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
NA, 3L, NA, NA, NA, NA, NA, NA, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 6L,
7L, 7L, 7L, 8L, NA, NA, NA, 9L, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 10L, 10L, NA, NA, NA, NA, NA, NA, NA, 11L,
NA)), .Names = c("seq_start", "seq_end", "seq"), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -60L))
Here's a solution that makes heavy use of dplyr package's lag() function, along with cumsum() from the base package, to produce the expected result. It's probably not the most succinct solution out there, but I do think it's reasonably intuitive to understand:
d <- d %>%
# new.seq.starts starts from 0, and increments by 1 every time seq_starts takes on
# the value 1, like this: 0, 0, 0, 1, 1, 1, 1, 2, 2, ...
# Rows with the same new.seq.starts value are thus part of the same "run".
mutate(new.seq.starts = cumsum(!is.na(seq_start))) %>%
# group by each "run"
group_by(new.seq.starts) %>%
# any.ending.so.far counts whether there has been ANY seq_end == 1 within the run yet.
# first.ending is TRUE only if it's the first row (within the run) to have an ending.
mutate(any.ending.so.far = cumsum(!is.na(seq_end)),
first.ending = any.ending.so.far == 1 &
(is.na(lag(any.ending.so.far)) | lag(any.ending.so.far) < 1)) %>%
ungroup() %>%
# result keeps the new.seq.starts values only if there's no ending yet (i.e.
# any.ending.so.far == 0), or only just ended (first.ending == TRUE). Otherwise,
# it takes on the value NA.
mutate(result = ifelse(new.seq.starts > 0 &
(any.ending.so.far == 0 | first.ending),
new.seq.starts, NA)) %>%
# Remove helper variables as they are no longer needed.
select(-c(new.seq.starts, any.ending.so.far, first.ending))
> all.equal(d$seq, d$result)
[1] TRUE
I have a survey question in which respondents could select multiple answers (for 16 possible combinations, e.g. "Which color do you like?" can result in responses "red, blue, green, yellow" or "red, blue, green, black" etc.
These 16 possible combinations are contained in a spreadsheet:
Image 1: First two rows of the spreadsheet (full spreadsheet has 16 rows)
Example 1:
structure(list(V1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("red", "ruby"), class = "factor"),
V2 = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L), .Label = c("blue", "violet"), class = "factor"),
V3 = structure(c(1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 2L, 1L, 1L, 2L, 2L), .Label = c("green", "turqoise"), class = "factor"),
V4 = structure(c(2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L), .Label = c("black", "yellow"), class = "factor")), .Names = c("V1",
"V2", "V3", "V4"), class = "data.frame", row.names = c(NA, -16L
))
The dataframe with responses has sixteen columns for this question (one column per every simple combination of colors). If respondent 1 selected the first combination, only the first column contains data; similarly, if respondent 2 selected the second combination, the second column contains data. The other are empty:
Image 2: The first two columns of the dataframe
Example 2:
structure(list(respondentID = 1:16, v1 = c(1L, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), v2 = c(NA, 1L, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), v3 = c(NA,
NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA),
v4 = c(NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, 1L, 1L, NA,
NA, NA, NA), v5 = c(NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), v6 = c(NA, 1L, NA, NA, NA, NA, NA,
NA, NA, 1L, NA, NA, NA, NA, NA, NA), v7 = c(NA, NA, NA, NA,
1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), v8 = c(NA,
NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), v9 = c(NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA,
NA, NA, NA, NA), v10 = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), v11 = c(NA, NA, NA, NA,
NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA), v12 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA
), v13 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 1L, NA, NA), v14 = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), v15 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), v16 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L
)), .Names = c("respondentID", "v1", "v2", "v3", "v4", "v5",
"v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
"v16"), class = "data.frame", row.names = c(NA, -16L))
(Of course, in practice respondent 1 didn't necessarily choose combination 1).
All the information in the dataframe is the number "1", which corresponds to appropriate combination in the spreadsheet.
In order to analyze responses to the question, I need to extract the combination from the spreadsheet and import it into the dataframe with responses, so that I get four new columns in the dataframe with the combination of colors chosen by a respondent (e.g. red, blue, green, yellow for respondent 1).
I don't think there's any way to do this using apply, so I guess I need to write a for loop to extract and import the data. Any advice on how to do this?
If you put the second data frame into a long shape, you can filter for just the combinations each person chose, and then join the second data frame with the first. The two data frames have combination labels that can be reconciled between the two to join on.
Note that I changed the column names in the first data frame, df1_with_id, to be color1, etc, only because otherwise you would have v1, v2, ... in one data frame, and V1, V2, ... representing something different in the other. Not a necessary change, but it's good to keep from confusing what different variables mean.
library(tidyverse)
df1 <- structure(list(V1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("red", "ruby"), class = "factor"),V2 = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,1L, 1L, 2L, 2L, 2L, 2L), .Label = c("blue", "violet"), class = "factor"),V3 = structure(c(1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,2L, 2L, 1L, 1L, 2L, 2L), .Label = c("green", "turqoise"), class = "factor"),V4 = structure(c(2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,2L, 1L, 2L, 1L, 2L, 1L), .Label = c("black", "yellow"), class = "factor")), .Names = c("V1","V2", "V3", "V4"), class = "data.frame", row.names = c(NA, -16L))
df2 <- structure(list(respondentID = 1:16, v1 = c(1L, NA, NA, NA, NA,NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), v2 = c(NA, 1L, NA,NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), v3 = c(NA,NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA),v4 = c(NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, 1L, 1L, NA,NA, NA, NA), v5 = c(NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA,NA, NA, NA, NA, NA, NA), v6 = c(NA, 1L, NA, NA, NA, NA, NA,NA, NA, 1L, NA, NA, NA, NA, NA, NA), v7 = c(NA, NA, NA, NA,1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), v8 = c(NA,NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), v9 = c(NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA,NA, NA, NA, NA), v10 = c(NA, NA, NA, NA, NA, NA, NA, NA,NA, NA, NA, NA, NA, NA, NA, NA), v11 = c(NA, NA, NA, NA,NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA), v12 = c(NA,NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA), v13 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,NA, 1L, NA, NA), v14 = c(NA, NA, NA, NA, NA, NA, NA, NA,NA, NA, NA, NA, NA, NA, NA, NA), v15 = c(NA, NA, NA, NA,NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), v16 = c(NA,NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L)), .Names = c("respondentID", "v1", "v2", "v3", "v4", "v5","v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15","v16"), class = "data.frame", row.names = c(NA, -16L))
df1_with_id <- df1 %>%
setNames(paste0("color", 1:4)) %>%
mutate(combo = paste0("v", row_number()))
head(df1_with_id)
#> color1 color2 color3 color4 combo
#> 1 red blue green yellow v1
#> 2 red blue green black v2
#> 3 red blue turqoise yellow v3
#> 4 red blue turqoise black v4
#> 5 red violet green yellow v5
#> 6 red violet green black v6
df2 %>%
gather(key = combo, value = val, -respondentID) %>%
filter(!is.na(val)) %>%
left_join(df1_with_id, by = "combo")
#> respondentID combo val color1 color2 color3 color4
#> 1 1 v1 1 red blue green yellow
#> 2 2 v2 1 red blue green black
#> 3 7 v3 1 red blue turqoise yellow
#> 4 4 v4 1 red blue turqoise black
#> 5 11 v4 1 red blue turqoise black
#> 6 12 v4 1 red blue turqoise black
#> 7 3 v5 1 red violet green yellow
#> 8 2 v6 1 red violet green black
#> 9 10 v6 1 red violet green black
#> 10 5 v7 1 red violet turqoise yellow
#> 11 6 v8 1 red violet turqoise black
#> 12 8 v9 1 ruby blue green yellow
#> 13 9 v11 1 ruby blue turqoise yellow
#> 14 13 v12 1 ruby blue turqoise black
#> 15 14 v13 1 ruby violet green yellow
#> 16 16 v16 1 ruby violet turqoise black
Created on 2018-05-08 by the reprex package (v0.2.0).
I'm not sure to understand what you want to do. Using the tidyverse packages and the melt() function of the reshape2 package, you might try
df_respondent_combination <-
df_respondent %>%
melt(measure.vars = c(2:ncol(.)), na.rm = T) %>%
cbind(df_combination) %>%
select(-variable, -value) %>%
arrange(respondentID)
With df_respondent_combination the new dataframe expected, df_respondent your example 2 and df_combination your example 1.
I've encountered a rather puzzling tidyr::spread() error. When I tried to run code (example below) in the full dataframe, I got the "Duplicate identifiers for rows" error.
I subset the (very large) dataframe to investigate and re-ran the code. This time it worked (see subset1df dput below). Then I tried it again with a different subset (subset2df), and I got the error message again. I honestly have no idea how to make sense of this and will greatly appreciate any help.
Reproducible code below:
subset1df:
structure(list(v1 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 1L, NA, NA, NA, NA), .Label = "2", class = "factor"),
v2 = structure(c(1L, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 1L, NA, NA), .Label = "2", class = "factor"),
v3 = structure(c(NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_), .Label = character(0), class = "factor"),
v4 = structure(c(NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v5 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA,
NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v6 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 1L), .Label = "2", class = "factor"),
v7 = structure(c(NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_), .Label = character(0), class = "factor"),
v8 = structure(c(NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_), .Label = character(0), class = "factor"),
v9 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1L, NA, NA, NA, NA, NA), .Label = "1", class = "factor"),
v10 = structure(c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_), .Label = character(0), class = "factor"),
v11 = structure(c(NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA,
NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v12 = structure(c(NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v13 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L,
NA, NA, NA, NA, NA, NA), .Label = "1", class = "factor"),
v14 = structure(c(NA, NA, NA, NA, NA, NA, 2L, NA, NA, NA,
NA, NA, 1L, NA, NA, NA), .Label = c("1", "2"), class = "factor"),
v15 = structure(c(NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v16 = structure(c(NA, NA, 2L, NA, NA, 2L, NA, NA, NA, NA,
NA, NA, NA, NA, 1L, NA), .Label = c("1", "2"), class = "factor"),
respondentID = structure(c(7L, 7L, 7L, 5L, 6L, 6L, 4L, 4L,
4L, 3L, 3L, 3L, 2L, 2L, 2L, 1L), .Label = c("EO15", "EO17",
"EO19", "EO21", "Eo23", "EO23", "EO24"), class = "factor")), .Names = c("v1",
"v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
"v12", "v13", "v14", "v15", "v16", "respondentID"), row.names = c(NA,
-16L), class = "data.frame")
subset2df:
structure(list(v2 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 1L, NA, NA, 1L, NA, 1L, NA, 1L, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v4 = structure(c(NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v5 = structure(c(NA, NA, NA, 2L, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA), .Label = c("1",
"2"), class = "factor"), v6 = structure(c(NA, 1L, NA, NA,
2L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), .Label = c("1", "2"), class = "factor"),
v9 = structure(c(NA, NA, NA, NA, NA, 1L, NA, NA, NA, 1L,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v11 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA,
NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "1", class = "factor"),
v12 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 2L, NA), .Label = c("1",
"2"), class = "factor"), v13 = structure(c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA,
NA, NA, 2L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 2L, NA, NA, NA), .Label = c("1", "2"), class = "factor"),
v14 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v15 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1L, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v16 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, 1L, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
respondentID = structure(c(21L, 20L, 20L, 19L, 18L, 18L,
1L, 1L, 16L, 16L, 16L, 10L, 10L, 17L, 15L, 15L, 15L, 14L,
14L, 14L, 13L, 12L, 12L, 11L, 11L, 11L, 8L, 9L, 9L, 6L, 7L,
7L, 3L, 2L, 3L, 4L, 4L, 4L, 5L), .Label = c("EO11", "Eo14",
"EO14", "EO16", "EO18", "EO26", "EO27", "Eo28", "EO28", "EO3",
"Eo30", "EO32", "EO331", "EO35", "EO37", "EO4", "EO41", "EO6",
"EO6 ", "EO7", "EO7 "), class = "factor")), .Names = c("v2",
"v4", "v5", "v6", "v9", "v11", "v12", "v13", "v14", "v15", "v16",
"respondentID"), row.names = c(NA, -39L), class = "data.frame")
combodf_id (needed to execute code):
structure(list(color1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("red", "ruby"
), class = "factor"), color2 = structure(c(1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("blue",
"violet"), class = "factor"), color3 = structure(c(1L, 1L, 2L,
2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L), .Label = c("green",
"turqoise"), class = "factor"), color4 = structure(c(2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L), .Label = c("black",
"yellow"), class = "factor"), combo = c("v1", "v2", "v3", "v4",
"v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
"v15", "v16")), class = "data.frame", .Names = c("color1", "color2",
"color3", "color4", "combo"), row.names = c(NA, -16L))
code:
result_df <- subset1df %>%
gather(key = combo, value = val, -respondentID) %>%
filter(!is.na(val)) %>%
left_join(combodf_id, by = "combo") %>%
arrange(respondentID) %>%
rename(rose_color1 = color1, rose_color2 = color2,
tulip_color1 = color3, tulip_color2 = color4) %>%
gather(color, value, rose_color1:tulip_color2) %>%
separate(color, into = c('flower', 'color')) %>%
spread(color, value) %>%
mutate(val = if_else(val == 1, 'rose', 'tulip')) %>%
mutate(val = if_else(val == flower, 1, 0)) %>%
select(respondentID, flower, color1, color2, choice = val)
The solution by #Tung below is very similar to the one in Spread with duplicate identifiers (using tidyverse and %>%)
, but neither of them quite solves the problem.
In your subset2df...
you have removed some of the response columns (e.g. v1, v3, v7, etc.)... that's probably why you have a bunch of rows/respondants without any response (all NA)
your respondentID column has levels with whitespace in some values that will probably mess things up later on (eg. "EO7" and "EO7 ")
there are duplicated rows, e.g. subset2df[c(15, 17), ]
all of your columns are factors... particularly with the response columns with integer values, I find that strange. tidyr functions gather and spread will take into account the levels of a factor, which can seem especially strange when you have subset a factor variable and not all of the levels are represented in the data.
You should probably fix those problems first, because they are what likely lead to your problems later, however... the reason you get the error "Duplicate identifiers for rows" is because the data frame you're passing to spread(color, value) has duplicate rows. You can force this to work by adding distinct() %>% one line before it, but be aware that the only reason you have to do that is because of the other problems before.
subset2df %>%
as_tibble() %>%
mutate_at(vars(v2:v16), as.integer) %>%
gather(key = combo, value = val, -respondentID, na.rm = T) %>%
filter(!is.na(val)) %>%
left_join(combodf_id, by = "combo") %>%
arrange(respondentID) %>%
rename(rose_color1 = color1, rose_color2 = color2,
tulip_color1 = color3, tulip_color2 = color4) %>%
gather(color, value, rose_color1:tulip_color2) %>%
separate(color, into = c('flower', 'color')) %>%
distinct() %>%
spread(color, value) %>%
mutate(val = if_else(val == 1, 'rose', 'tulip')) %>%
mutate(val = if_else(val == flower, 1, 0)) %>%
select(respondentID, flower, color1, color2, choice = val)
I would strongly suggest fixing all the above problems first though, like so (notice you won't need the distinct command further down in the chain because you will have already applied that to the original data)...
subset2df %>%
as_tibble() %>% # tibble has better printing methods
mutate_at(vars(-respondentID), as.integer) %>% # convert response to numeric
mutate(respondentID = as.character(respondentID)) %>% # convert to char
mutate(respondentID = trimws(respondentID)) %>% # remove whitespace
distinct() %>% # remove duplicate rows
gather(key = combo, value = val, -respondentID, na.rm = T) %>%
left_join(combodf_id, by = "combo") %>%
mutate_at(vars(color1:color4), as.character) %>% # convert colors to char
rename(rose_color1 = color1, rose_color2 = color2,
tulip_color1 = color3, tulip_color2 = color4) %>%
gather(color, value, rose_color1:tulip_color2) %>%
separate(color, into = c('flower', 'color')) %>%
spread(color, value) %>%
mutate(val = if_else(val == 1, 'rose', 'tulip')) %>%
mutate(val = if_else(val == flower, 1L, 0L)) %>%
select(respondentID, flower, color1, color2, choice = val)