Reshaping data to wide without quantitative data - r

I think I understand general reshaping. However, I have data that needs to be reshaped to wide format, but I don't want to show scores or quantitative data indexed by another variable.
Instead, I want to switch one variable from a single variable to five variables based on its values. No other variables should be indexed. I want the values of the one variable to form the other five variables, and values of those variables should simply be the same as their variable names.
I've included an example of a before and after.
Data:
> dput(ansscales3)
structure(list(ATID = c(33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
33, 33, 33), AnswerTypeDesc = c("VS|S|SD|VD", "VS|S|SD|VD", "VS|S|SD|VD",
"VS|S|SD|VD", "VS|S|SD|VD", "VS|S|SD|VD", "VS|S|SD|VD", "VS|S|SD|VD",
"VS|S|SD|VD", "VS|S|SD|VD", "VS|S|SD|VD", "VS|S|SD|VD", "VS|S|SD|VD"
), AValue = c(4, 3, 2, 1, 4, 3, 2, 1, 2, 1, 4, 3, 4), ScaleValue = c(1,
2, 3, 4, 1, 2, 3, 4, 3, 4, 1, 2, 1), ADesc = c("Very Satisfied",
"Satisfied", "Somewhat Dissatisfied", "Very Dissatisfied", "Very Satisfied",
"Satisfied", "Somewhat Dissatisfied", "Very Dissatisfied", "Somewhat Dissatisfied",
"Very Dissatisfied", "Very Satisfied", "Satisfied", "Very Satisfied"
), AOrder = c(1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 1, 2, 1), StatGroup = c("AdjN",
"AdjN", "AdjN", "AdjN", "N", "N", "N", "N", "PctNeg", "PctNeg",
"PctPos", "PctPos", "TopBox"), Cycles = c(11, 11, 11, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11)), .Names = c("ATID", "AnswerTypeDesc",
"AValue", "ScaleValue", "ADesc", "AOrder", "StatGroup", "Cycles"
), row.names = c(NA, -13L), class = "data.frame")
Reshape into:
> dput(atids1)
structure(list(ATID = c(33, 33, 33, 33), AnswerTypeDesc = structure(c(1L,
1L, 1L, 1L), .Label = "VS|S|SD|VD", class = "factor"), AValue = c(4,
3, 2, 1), ScaleValue = c(1, 2, 3, 4), ADesc = c("Very Satisfied",
"Satisfied", "Somewhat Dissatisfied", "Very Dissatisfied"), AOrder = c(1,
2, 3, 4), Cycles = c(11, 11, 11, 11), N = c("N", "N", "N", "N"
), AdjN = c("AdjN", "AdjN", "AdjN", "AdjN"), PctPos = c("PctPos",
"PctPos", "", ""), PctNeg = c("", "", "PctNeg", "PctNeg"), TopBox = c("TopBox",
"", "", "")), .Names = c("ATID", "AnswerTypeDesc", "AValue",
"ScaleValue", "ADesc", "AOrder", "Cycles", "N", "AdjN", "PctPos",
"PctNeg", "TopBox"), row.names = c(NA, -4L), class = "data.frame")
I'm sure this is simple but unfortunately I haven't been able to figure it out using the reshape method.

Here is a solution with reshape2
(note: I've loaded dplyr to trigger the %>% operator but this is just a personal choice of styling code)
# reshape2 supplies dcast(); dplyr is attached only for the %>% pipe.
library(reshape2)
library(dplyr)
# Cast to wide: one column per StatGroup value, each cell holding the
# StatGroup label itself and "" where that combination does not occur.
# `...` on the left of the formula keeps all remaining columns as identifiers.
# NOTE(review): dat1 is presumably the question's ansscales3 data frame -- confirm.
dat1 %>% dcast(... ~ StatGroup, value.var = "StatGroup", fill = "")
ATID AnswerTypeDesc AValue ScaleValue ADesc AOrder Cycles AdjN N PctNeg PctPos TopBox
1 33 VS|S|SD|VD 1 4 Very Dissatisfied 4 11 AdjN N PctNeg
2 33 VS|S|SD|VD 2 3 Somewhat Dissatisfied 3 11 AdjN N PctNeg
3 33 VS|S|SD|VD 3 2 Satisfied 2 11 AdjN N PctPos
4 33 VS|S|SD|VD 4 1 Very Satisfied 1 11 AdjN N PctPos TopBox
another solution with tidyr
library(tidyr)
# StatGroup is used as both key and value, so every new column is filled with
# its own name; fill = "" is used where a combination is absent.
dat1 %>% spread(StatGroup, StatGroup, fill = "")
ATID AnswerTypeDesc AValue ScaleValue ADesc AOrder Cycles AdjN N PctNeg PctPos TopBox
1 33 VS|S|SD|VD 1 4 Very Dissatisfied 4 11 AdjN N PctNeg
2 33 VS|S|SD|VD 2 3 Somewhat Dissatisfied 3 11 AdjN N PctNeg
3 33 VS|S|SD|VD 3 2 Satisfied 2 11 AdjN N PctPos
4 33 VS|S|SD|VD 4 1 Very Satisfied 1 11 AdjN N PctPos TopBox

Related

Calculate changes in columns of daily tidy data

Each day a company creates a value for category_1 and category_2.
A new company may enter the survey midway as company E appears on Dec 25.
Here are three days of data. So, two intervals: Dec 24-25 and Dec 25-26.
Question
For each category how many increase/decreases/no change were there over the 3 days?
For example, in cat1 A goes from a 2 to 1, B goes from a 3 to a 4, etc.
By hand I get:
cat1 - Up: 2, Down: 5, No change: 2
cat2 - Up: 6, Down: 2, No change: 1
How do I calculate the number of up/downs/no changes in an R Script?
library("tidyverse")
d1 <- as.Date("2022-12-24")
d2 <- as.Date("2022-12-25")
d3 <- as.Date("2022-12-26")
df <- tibble(
company = c(LETTERS[1:4], LETTERS[1:5], LETTERS[1:5]),
cat1 = c(2, 3, 4, 5, 1, 4, 5, 3, 2, 1, 4, 4, 2, 1),
cat2 = c(6, 7, 8, 9, 5, 5, 9, 10, 11, 6, 5, 10, 12, 13),
date = c(rep(d1, 4), rep(d2, 5), rep(d2, 5))
)
df
One approach using dplyr, assuming arranged data. Note: I changed the typo in date 3 to d3.
library(dplyr)

# Count, per category, how many consecutive-row moves were up, down or flat.
# Rows must already be in date order within each company, because lag()
# compares each row with the one before it.
df %>%
  group_by(company) %>%
  # The first row of each company has no predecessor, so its change is NA.
  mutate(
    cat1_change = cat1 - lag(cat1),
    cat2_change = cat2 - lag(cat2)
  ) %>%
  ungroup() %>%
  # reframe() (dplyr >= 1.1.0) is the supported verb for summaries that return
  # more than one row; summarise() warns in that situation.
  reframe(
    type = c("up", "down", "no-change"),
    across(
      ends_with("change"),
      ~ c(
        sum(.x > 0, na.rm = TRUE),
        sum(.x < 0, na.rm = TRUE),
        sum(.x == 0, na.rm = TRUE)
      )
    )
  )
# A tibble: 3 × 3
type cat1_change cat2_change
<chr> <int> <int>
1 up 2 6
2 down 5 2
3 no-change 2 1
Data
df <- structure(list(company = c("A", "B", "C", "D", "A", "B", "C",
"D", "E", "A", "B", "C", "D", "E"), cat1 = c(2, 3, 4, 5, 1, 4,
5, 3, 2, 1, 4, 4, 2, 1), cat2 = c(6, 7, 8, 9, 5, 5, 9, 10, 11,
6, 5, 10, 12, 13), date = structure(c(19350, 19350, 19350, 19350,
19351, 19351, 19351, 19351, 19351, 19352, 19352, 19352, 19352,
19352), class = "Date")), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -14L))
An option with data.table - grouped by company, loop over the 'cat' column, get the diff of adjacent elements, convert to sign, and rename with factor labels, melt to long format and reshape back to 'wide' format with dcast
library(data.table)

# Per company, classify each consecutive difference of every cat* column as
# "down", "no-change" or "up". setDT() converts df to a data.table by reference.
changes <- setDT(df)[, lapply(.SD, \(x) factor(
  sign(diff(x)),
  levels = c(-1, 0, 1),
  labels = c("down", "no-change", "up")
)), by = company, .SDcols = patterns("^cat")]

# Long format: one row per (company, category, change type).
# id.vars is spelled out in full rather than relying on partial matching.
changes_long <- melt(changes, id.vars = "company", value.name = "type")

# Count each change type per category, one output column per category.
dcast(changes_long, type ~ paste0(variable, "_change"), length)
-output
type cat1_change cat2_change
1: down 5 2
2: no-change 2 1
3: up 2 6

Tried code in R with mutate_at and max() functions with own data. Warning messages come up: no non-missing arguments to max

I'm currently learning R with a book and was trying the mutate_at function from dplyr. In this example I want to standardize the survey items on a scale from 0 to 1. To do this, we can divide each value by the (theoretical) maximum value of the scale.
The book example stats_test from the package "pradadata" works perfectly fine:
data(stats_test, package = "pradadata")
stats_test %>%
drop_na() %>%
mutate_at(.vars = vars(study_time, self_eval, interest),
.funs = funs(prop = ./max(.))) %>%
select(contains("_prop"))
Output:
study_time_prop self_eval_prop interest_prop
<dbl> <dbl> <dbl>
1 0.6 0.7 0.667
2 0.8 0.8 0.833
3 0.6 0.4 0.167
4 0.8 0.7 0.833
5 0.4 0.6 0.5
6 0.4 0.6 0.667
7 0.8 0.6 0.5
8 0.2 0.7 0.667
9 0.6 0.8 0.833
10 0.6 0.7 0.833
# ... with 1,617 more rows
Tried the same code with my own data but it doesn't work and I can't figure out why. The variable RG04 from my data has a range from 1-5. I tried to transform the variable from numeric to integer, because the variables from the data stats_test are integers too:
df_literacy_2 <- transform(df_literacy, RG04 = as.integer(RG04))
df_literacy_2 <- tibble(df_literacy_2)
df_literacy_2 %>%
drop_na() %>%
mutate_at(.vars = vars(RG04),
.funs = funs(prop = ./max(.))) %>%
select(contains("_prop"))
Output:
# A tibble: 0 x 0
Warning messages:
1: Problem with `mutate()` input `prop`.
i no non-missing arguments to max; returning -Inf
i Input `prop` is `RG04/max(RG04)`.
2: In base::max(x, ..., na.rm = na.rm) :
no non-missing arguments to max; returning -Inf
str(df_literacy_2$RG04)
int [1:630] 2 4 2 1 2 2 1 3 1 3 ...
Why doesn't it work on my data?
Thank you for your help.
Edit with sample of df_literacy:
> dput(head(df_literacy,20))
structure(list(CASE = c(40, 41, 44, 45, 48, 49, 54, 55, 56, 57,
58, 61, 62, 63, 64, 65, 66, 67, 68, 69), SERIAL = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), REF = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), QUESTNNR = c("base", "base",
"base", "base", "base", "base", "base", "base", "base", "base",
"base", "base", "base", "base", "base", "base", "base", "base",
"base", "base"), MODE = c("interview", "interview", "interview",
"interview", "interview", "interview", "interview", "interview",
"interview", "interview", "interview", "interview", "interview",
"interview", "interview", "interview", "interview", "interview",
"interview", "interview"), STARTED = structure(c(1607290462,
1607290608, 1607291086, 1607291118, 1607291265, 1607291793, 1607294071,
1607294336, 1607294337, 1607294419, 1607294814, 1607296474, 1607301809,
1607329348, 1607333933, 1607335996, 1607336207, 1607336378, 1607343194,
1607343414), tzone = "UTC", class = c("POSIXct", "POSIXt")),
EI01 = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("Ja",
"Nein", "Nicht beantwortet"), class = "factor"), EI02 = c(2,
2, 2, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 2, 3),
RF01 = c(4, 2, 4, 3, 4, 4, 1, 3, 2, 3, 4, 3, 2, 3, 2, 2,
4, 2, 5, 3), RF02 = c(1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 2, 1,
1, 1, 2, 2, 2, 2, 2, 2), RF03 = c(1, 2, 2, 2, 1, 2, 1, 1,
1, 1, 2, 1, 1, 2, 2, 2, 1, 2, 1, 2), RG01 = c(2, 2, 2, 2,
2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2), RG02 = c(3,
3, 3, 3, 4, 3, 4, 2, 4, 2, 3, 4, 4, 2, 4, 3, 4, 3, 4, 4),
RG03 = c(3, 2, 2, 3, 3, 3, 1, 3, 1, 2, 3, 1, 2, 2, 1, 3,
2, 3, 2, 2), RG04 = c(2, 4, 2, 1, 2, 2, 1, 3, 1, 3, 2, 4,
1, 1, 1, 1, 1, 2, 4, 1), RG05 = c(1, 1, 1, 1, 1, 1, 1, 2,
1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1), SD01 = structure(c(2L,
1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 1L, 1L), .Label = c("weiblich", "männlich", "divers",
"nicht beantwortet"), class = "factor"), SD03 = c(4, 3, 2,
2, 1, 2, 4, 4, 1, 4, 3, 1, 2, 3, 2, 4, 2, 3, 1, 3), SD05_01 = c(23,
22, 22, 21, 18, 22, 21, 27, 17, 22, 17, 21, 21, 22, 50, 25,
23, 20, 23, 23), TIME001 = c(2, 3, 23, 73, 29, 2, 3, 3, 29, 7,
50, 55, 3, 2, 10, 2, 1, 5, 7, 35), TIME002 = c(2, 2, 16,
34, 12, 14, 2, 2, 21, 2, 30, 24, 21, 3, 3, 2, 3, 2, 3, 22
), TIME003 = c(34, 8, 12, 15, 13, 12, 12, 7, 13, 11, 16,
10, 11, 16, 8, 8, 7, 8, 11, 14), TIME004 = c(60, 33, 25,
31, 45, 25, 14, 13, 38, 35, 50, 50, 37, 32, 32, 25, 72, 55,
28, 29), TIME005 = c(84, 21, 29, 41, 54, 33, 30, 22, 32,
42, 44, 23, 65, 30, 28, 32, 51, 31, 27, 44), TIME006 = c(14,
9, 27, 11, 24, 8, 8, 9, 18, 12, 35, 33, 27, 46, 11, 15, 8,
14, 12, 14), TIME007 = c(3, 18, 3, 5, 6, 2, 9, 2, 3, 3, 6,
7, 3, 13, 4, 4, 378, 3, 4, 10), TIME_SUM = c(199, 94, 135,
142, 183, 96, 78, 58, 154, 112, 186, 152, 167, 142, 96, 88,
146, 118, 92, 168), MAILSENT = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
LASTDATA = structure(c(1607290661, 1607290702, 1607291221,
1607291328, 1607291448, 1607291889, 1607294149, 1607294394,
1607294491, 1607294531, 1607295045, 1607296676, 1607301976,
1607329490, 1607334030, 1607336084, 1607336727, 1607336496,
1607343286, 1607343582), tzone = "UTC", class = c("POSIXct",
"POSIXt")), FINISHED = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1), Q_VIEWER = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), LASTPAGE = c(7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7),
MAXPAGE = c(7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7), MISSING = c(7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 0, 7, 7, 7), MISSREL = c(1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1), TIME_RSI = c("46023",
"14246", "0.75", "0.63", "0.54", "12055", "17533", "30682",
"0.7", "44197", "0.45", "0.58", "0.83", "44378", "44501",
"18629", "46753", "46388", "44197", "0.57"), DEG_TIME = c(27,
27, 3, 1, 0, 23, 30, 42, 2, 17, 0, 2, 7, 18, 10, 27, 43,
18, 8, 0)), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
Edit with TRUE and FALSE NAs:
> sapply(df_literacy, function(a) table(c(T,F,is.na(a)))-1)
CASE SERIAL REF QUESTNNR MODE STARTED EI01 EI02 RF01 RF02 RF03 RG01 RG02 RG03 RG04 RG05 SD01 SD03 SD05_01 TE03_01 TIME001 TIME002 TIME003
FALSE 630 0 0 630 630 630 630 630 630 630 630 630 630 630 630 630 629 629 615 99 630 630 630
TRUE 0 630 630 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 15 531 0 0 0
TIME004 TIME005 TIME006 TIME007 TIME_SUM MAILSENT LASTDATA FINISHED Q_VIEWER LASTPAGE MAXPAGE MISSING MISSREL TIME_RSI DEG_TIME
FALSE 630 630 629 625 630 0 630 630 630 630 630 630 630 630 630
TRUE 0 0 1 5 0 630 0 0 0 0 0 0 0 0 0
There are a few things to correct here.
drop_na() is removing all of your data.
drop_na(df_literacy)
# # A tibble: 0 x 37
# # ... with 37 variables: CASE <dbl>, SERIAL <lgl>, REF <lgl>, QUESTNNR <chr>,
# # MODE <chr>, STARTED <dttm>, EI01 <fct>, EI02 <dbl>, RF01 <dbl>, RF02 <dbl>,
# # RF03 <dbl>, RG01 <dbl>, RG02 <dbl>, RG03 <dbl>, RG04 <dbl>, RG05 <dbl>,
# # SD01 <fct>, SD03 <dbl>, SD05_01 <dbl>, TIME001 <dbl>, TIME002 <dbl>,
# # TIME003 <dbl>, TIME004 <dbl>, TIME005 <dbl>, TIME006 <dbl>, TIME007 <dbl>,
# # TIME_SUM <dbl>, MAILSENT <lgl>, LASTDATA <dttm>, FINISHED <dbl>,
# # Q_VIEWER <dbl>, LASTPAGE <dbl>, MAXPAGE <dbl>, MISSING <dbl>,
# # MISSREL <dbl>, TIME_RSI <chr>, DEG_TIME <dbl>
The problem is that you have several columns that are completely NA, namely SERIAL, REF, and MAILSENT.
sapply(df_literacy, function(a) table(c(T,F,is.na(a)))-1)
# CASE SERIAL REF QUESTNNR MODE STARTED EI01 EI02 RF01 RF02 RF03 RG01 RG02
# FALSE 20 0 0 20 20 20 20 20 20 20 20 20 20
# TRUE 0 20 20 0 0 0 0 0 0 0 0 0 0
# RG03 RG04 RG05 SD01 SD03 SD05_01 TIME001 TIME002 TIME003 TIME004 TIME005
# FALSE 20 20 20 20 20 20 20 20 20 20 20
# TRUE 0 0 0 0 0 0 0 0 0 0 0
# TIME006 TIME007 TIME_SUM MAILSENT LASTDATA FINISHED Q_VIEWER LASTPAGE
# FALSE 20 20 20 0 20 20 20 20
# TRUE 0 0 0 20 0 0 0 0
# MAXPAGE MISSING MISSREL TIME_RSI DEG_TIME
# FALSE 20 20 20 20 20
# TRUE 0 0 0 0 0
Drop the drop_na(), or at least drop_na(-SERIAL, -REF, -MAILSENT).
Your code is using funs, which has been deprecated since dplyr-0.8.0.
# Warning: `funs()` is deprecated as of dplyr 0.8.0.
# Please use a list of either functions or lambdas:
# # Simple named list:
# list(mean = mean, median = median)
# # Auto named with `tibble::lst()`:
# tibble::lst(mean, median)
# # Using lambdas
# list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
While this isn't causing an error, it is causing a warning (and will likely stop working at some point). Change your mutate_at to be:
mutate_at(.vars = vars(RG04, RF02),
.funs = list(prop = ~ . / max(.)))
You are using a single variable within .vars and a single function within .funs, so the column names are preserved as-is (and you will not see a _prop column). From ?mutate_at:
The names of the new columns are derived from the names of the
input variables and the names of the functions.
• if there is only one unnamed function (i.e. if '.funs' is an
unnamed list of length one), the names of the input variables
are used to name the new columns;
• for _at functions, if there is only one unnamed variable
(i.e., if '.vars' is of the form 'vars(a_single_column)') and
'.funs' has length greater than one, the names of the
functions are used to name the new columns;
• otherwise, the new names are created by concatenating the
names of the input variables and the names of the functions,
separated with an underscore '"_"'.
If you aren't going to add more variables and functions, then you need to self-name it in the call, as in mutate_at(.vars = vars(RG04 = RG04), ...). Oddly enough, this causes it to produce RG04_prop.
If we fix all of those, then it works.
df_literacy %>%
# Leave the all-NA columns out of the completeness check so rows survive.
drop_na(-SERIAL, -REF, -MAILSENT) %>%
# Naming the single variable makes mutate_at emit the RG04_prop column.
mutate_at(.vars = vars(RG04 = RG04),
.funs = list(prop = ~ ./max(.))) %>%
select(contains("_prop")) %>%
head(3)
# A tibble: 3 x 1
# RG04_prop
# <dbl>
# 1 0.5
# 2 1
# 3 0.5

Move data from one set of columns to another for a subset of rows

I have a dataframe...
df <- tibble(
id = 1:10,
family = c("a","a","b","b","c", "d", "e", "f", "g", "h"),
col1_a = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
col1_b = c(1, 2, 3, 4, NA, NA, NA, NA, NA, NA),
col2_a = c(11, 12, 13, 14, 15, 16, 17, 18, 19, 20),
col2_b = c(11, 12, 13, 14, NA, NA, NA, NA, NA, NA),
)
Families will only contain 2 members at most (so they're either individuals or pairs).
For individuals (families with only one row, i.e. id = 5:10), I want to randomly move 50% of the data from columns ending in 'a' to columns ending in 'b'.
By the end, the data should look like the following (depending on which 50% of rows are used)...
df <- tibble(
id = 1:10,
family = c("a","a","b","b","c", "d", "e", "f", "g", "h"),
col1_a = c(1, 2, 3, 4, 5, NA, 7, NA, 9, NA),
col1_b = c(1, 2, 3, 4, NA, 6, NA, 8, NA, 10),
col2_a = c(11, 12, 13, 14, NA, NA, 17, 18, NA, 20),
col2_b = c(11, 12, 13, 14, 15, 16, NA, NA, 19, NA),
)
I would like to be able to do this with a combination of group_by and mutate since I am mostly using Tidyverse.
Update: I forgot to mention that values in columns ending 'a' should be replaced with NA if they are moved across to 'b'.
I would do this in two primary steps, first create the fam_count column to determine which families only have 1 person. Then, create two rand columns, to determine whether or not we use the values in the b columns.
library(tidyverse)
# Fix the RNG seed so the random selection below is reproducible.
set.seed(1)
# fam_count = number of rows in each family; 1 marks an individual.
df %>% group_by(family) %>%
mutate(fam_count = n()) %>%
ungroup() %>%
mutate(
# rand1/rand2 are NA or 1 per row, drawn independently for each column pair.
# Each row is picked with probability 1/2, so the split is 50% only on average.
rand1 = sample(c(NA, 1), nrow(.), replace = TRUE),
rand2 = sample(c(NA, 1), nrow(.), replace = TRUE),
# For individuals, col*_b receives the col*_a value when the draw is 1 and NA
# when the draw is NA; other families keep their existing col*_b.
col1_b = ifelse(fam_count == 1, rand1 * col1_a, col1_b),
col2_b = ifelse(fam_count == 1, rand2 * col2_a, col2_b)
) %>%
mutate(
# Blank out col*_a wherever its value was moved across to col*_b.
col1_a = ifelse(fam_count == 1 & !is.na(col1_b), NA, col1_a),
col2_a = ifelse(fam_count == 1 & !is.na(col2_b), NA, col2_a)
) %>%
# Drop the helper columns.
select(-rand1, -rand2, - fam_count)
# A tibble: 10 x 6
id family col1_a col1_b col2_a col2_b
<int> <chr> <int> <dbl> <int> <dbl>
1 1 a 1 1 11 11
2 2 a 2 2 12 12
3 3 b 3 3 13 13
4 4 b 4 4 14 14
5 5 c 5 NA NA 15
6 6 d 6 NA NA 16
7 7 e NA 7 17 NA
8 8 f 8 NA NA 18
9 9 g NA 9 19 NA
10 10 h 10 NA 20 NA

Replacing NAs in columns with values from rows in a different dataframe in R that have the same ID

I have two dataframes:
deploy.info <- data.frame(Echo_ID = c("20180918_7.5Fa_1", "20180918_Sebre_3", "20190808_Bake_2", "20190808_NH_2"),
uppermost_bin = c(2, 7, 8, 12))
spc <- data.frame(species = c("RS", "GS", "YG", "RR", "BR", "GT", "CB"),
percent_dist = c(0, 25, 80, 100, 98, 60, 100),
percent_dist_from_surf = c(0, 25, 80, 100, 98, 60, 100),
'20180918_7.5Fa_1' = c(1, 1, 1, "NA", "NA", 1, "NA"),
'20180918_Sebre_3' = c(1, 2, "NA", "NA", "NA", 4, "NA"),
'20190808_Bake_2' = c(1, 3, 7, "NA", "NA", 6, "NA"),
'20190808_NH_2' = c(1, 2, 8, "NA", "NA", 6, "NA"))
The last four columns in the spc data frame refer to each Echo_ID that I am dealing with in the deploy.info data frame. I want to replace the NAs in the spc data frame with the uppermost_bin values for each of the Echo_IDs. Does anyone know how to go about doing this?
My desired end product would look like:
i.want.this <- data.frame(species = c("RS", "GS", "YG", "RR", "BR", "GT", "CB"),
percent_dist = c(0, 25, 80, 100, 98, 60, 100),
percent_dist_from_surf = c(0, 25, 80, 100, 98, 60, 100),
'20180918_7.5Fa_1' = c(1, 1, 1, 2, 2, 1, 2),
'20180918_Sebre_3' = c(1, 2, 7, 7, 7, 4, 7),
'20190808_Bake_2' = c(1, 3, 7, 8, 8, 6, 8),
'20190808_NH_2' = c(1, 2, 8, 12, 12, 6, 12))
I have over 100 columns like this and would rather not go in and have to do this change by hand. Any ideas are greatly appreciated.
We can use Map to replace the NA elements in the columns of 'Echo_ID' by the corresponding values of 'uppermost_bin'. In the OP's dataset, the columns were factor, so it was converted to the correct type with type.convert
# The Echo_ID columns of spc carry an "X" prefix (their names start with a
# digit), so build the matching column names from deploy.info.
nm1 <- paste0("X", deploy.info$Echo_ID)

# Re-infer column types so the Echo_ID columns can hold real missing values.
spc <- type.convert(spc, as.is = TRUE)

# Fill the missing entries of one column with that deployment's uppermost_bin.
fill_with_bin <- function(values, bin) {
  values[is.na(values)] <- bin
  values
}
spc[nm1] <- Map(fill_with_bin, spc[nm1], deploy.info$uppermost_bin)
spc
# species percent_dist percent_dist_from_surf X20180918_7.5Fa_1 X20180918_Sebre_3 X20190808_Bake_2 X20190808_NH_2
#1 RS 0 0 1 1 1 1
#2 GS 25 25 1 2 3 2
#3 YG 80 80 1 7 7 8
#4 RR 100 100 2 7 8 12
#5 BR 98 98 2 7 8 12
#6 GT 60 60 1 4 6 6
#7 CB 100 100 2 7 8 12

Enter value from df based on condition across multiple columns into new variable

I am sure I am not the only person who has asked this but after hours of searching with no luck I need to ask the question myself.
I have a df (rp) like so:
# Example data: one row per parent; agec1..agec8 hold the ages of up to eight
# children, NA where there is no such child.
rp <- structure(
  list(
    agec1 = c(7, 16, 11, 11, 17, 17),
    agec2 = c(6, 12, 9, 9, 16, 15),
    agec3 = c(2, 9, 9, 9, 14, NA),
    agec4 = c(NA, 7, 9, 9, 13, NA),
    agec5 = c(NA, 4, 7, 7, 10, NA),
    agec6 = c(NA, NA, 6, 6, 9, NA),
    agec7 = c(NA, NA, NA, NA, 7, NA),
    agec8 = c(NA, NA, NA, NA, 5, NA)
  ), # closes list(); row.names and class are attributes of structure()
  row.names = c(NA, -6L),
  class = c("tbl_df", "tbl", "data.frame")
)
Where each obs in agecX refers to the age of a parent's children up to 8 children. I would like to create a new column "agec5_12" that contains the age of the oldest child aged 5-12. So my df would look like this:
# Desired result: the same data plus agec5_12, the age of the oldest child
# aged 5-12 for each parent (NA when there is none).
rpage <- structure(
  list(
    agec1 = c(7, 16, 11, 11, 17, 17),
    agec2 = c(6, 12, 9, 9, 16, 15),
    agec3 = c(2, 9, 9, 9, 14, NA),
    agec4 = c(NA, 7, 9, 9, 13, NA),
    agec5 = c(NA, 4, 7, 7, 10, NA),
    agec6 = c(NA, NA, 6, 6, 9, NA),
    agec7 = c(NA, NA, NA, NA, 7, NA),
    agec8 = c(NA, NA, NA, NA, 5, NA),
    agec5_12 = c(7, 12, 11, 11, 10, NA)
  ), # closes list(); row.names and class are attributes of structure()
  row.names = c(NA, -6L),
  class = c("tbl_df", "tbl", "data.frame")
)
Notes about my data:
Ages are not always in the same chronological order i.e. youngest to oldest or oldest to youngest
It is possible for a row to have no children aged within this range (in which case I would like NA to be returned)
I have tried writing a function and applying it using rowwise and mutate:
# Asker's attempt at "age of the oldest child aged 5-12" for a vector of ages.
# It selects every element of x equal to the largest value strictly between
# 4 and 13, so ties yield more than one element.
# NOTE(review): when no value is in range, max() runs on an empty vector
# (warning, -Inf) and nothing matches, giving a zero-length result.
fun.age5_12 <- function(x){
x[which(x == max(x[(x > 4) & (x < 13)], na.rm = TRUE))]
}
rpage <- rp %>%
select(-c(20:21, 199:200)) %>%
rowwise() %>%
mutate(agec5_12 = fun.age5_12(c(1:8)))
However, this returns all obs as "12". Ideally I would like to do this using dplyr. Any suggestions using mutate or ifelse and not necessarily with functions are fine.
Thank you
I know you wanted tidyverse but here's one base R way:
rp <- data.frame(
  agec1 = c(7, 16, 11, 11, 17, 17),
  agec2 = c(6, 12, 9, 9, 16, 15),
  agec3 = c(2, 9, 9, 9, 14, NA),
  agec4 = c(NA, 7, 9, 9, 13, NA),
  agec5 = c(NA, 4, 7, 7, 10, NA),
  agec6 = c(NA, NA, 6, 6, 9, NA),
  agec7 = c(NA, NA, NA, NA, 7, NA),
  agec8 = c(NA, NA, NA, NA, 5, NA),
  stringsAsFactors = FALSE
)

# Only the agec<N> columns are scanned, so the result column added below is
# never read back in as if it were a child's age.
age_cols <- grep("^agec[0-9]+$", names(rp), value = TRUE)

# Start with "no child aged 5-12" everywhere and fill in rows that have one.
rp$agec5_12 <- NA_real_
for (i in seq_len(nrow(rp))) {
  ages <- unlist(rp[i, age_cols], use.names = FALSE)
  ages <- ages[!is.na(ages) & ages >= 5 & ages <= 12]
  if (length(ages) > 0) {
    rp$agec5_12[i] <- max(ages)
  }
}
rp
## agec1 agec2 agec3 agec4 agec5 agec6 agec7 agec8 agec5_12
## 1 7 6 2 NA NA NA NA NA 7
## 2 16 12 9 7 4 NA NA NA 12
## 3 11 9 9 9 7 6 NA NA 11
## 4 11 9 9 9 7 6 NA NA 11
## 5 17 16 14 13 10 9 7 5 10
## 6 17 15 NA NA NA NA NA NA NA
The for loop shows the idiom, but an sapply() solution is a lot faster:
# Same per-row calculation without an explicit loop. The result is stored on
# rp itself. Only the agec<N> columns are scanned, so an existing agec5_12
# column is not read back in as an age.
age_cols <- grep("^agec[0-9]+$", names(rp), value = TRUE)
rp$agec5_12 <- sapply(seq_len(nrow(rp)), function(i) {
  ages <- unlist(rp[i, age_cols], use.names = FALSE)
  ages <- ages[!is.na(ages) & ages >= 5 & ages <= 12]
  # NA when the row has no child aged 5-12.
  if (length(ages) > 0) max(ages) else NA_real_
})
I think an apply solution for such a problem will always be simpler and more readable than a dplyr (I am assuming you meant tidyverse) solution, but since you asked, here is one way -
library(dplyr)
library(tidyr)
library(tibble) # rownames_to_column() is exported by tibble

rp %>%
  # Keep a per-parent key so the long data can be spread back afterwards.
  rownames_to_column("parent_id") %>%
  gather(variable, value, -parent_id) %>%
  group_by(parent_id) %>%
  # Oldest first within each parent; arrange() places NA last.
  arrange(parent_id, desc(value)) %>%
  mutate(
    # First in-range age after the descending sort, i.e. the oldest child
    # aged 5-12; NA when the parent has none.
    agec5_12 = value[between(value, 5, 12)][1]
  ) %>%
  ungroup() %>%
  spread(variable, value) %>%
  # Age columns first, the new column last.
  select(3:10, 2)
# A tibble: 6 x 9
agec1 agec2 agec3 agec4 agec5 agec6 agec7 agec8 agec5_12
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 7 6 2 NA NA NA NA NA 7
2 16 12 9 7 4 NA NA NA 12
3 11 9 9 9 7 6 NA NA 11
4 11 9 9 9 7 6 NA NA 11
5 17 16 14 13 10 9 7 5 10
6 17 15 NA NA NA NA NA NA NA
Another base R solution. We can use replace to replace numbers outside the range of 5 to 12 with NA, and then use apply and function(x) ifelse(all(is.na(x)), NA, max(x, na.rm = TRUE)) to find the maximum for each row. You could also use max directly, but for rows where all elements are NA, the max function would return -Inf.
# Blank out ages outside 5-12, then take each row's maximum; rows left with
# only NA return NA instead of the -Inf that max() would give.
rp$agec5_12 <- apply(replace(rp, rp > 12 | rp < 5, NA), 1,
function(x) ifelse(all(is.na(x)), NA, max(x, na.rm = TRUE)))
Or use do.call and pmax.
# pmax() takes the element-wise maximum across the masked columns, which is
# each row's maximum; na.rm = TRUE skips the blanked-out entries.
rp$agec5_12 <- do.call(pmax, c(replace(rp, rp > 12 | rp < 5, NA), na.rm = TRUE))
Here is a performance comparison of the three base R methods so far. do.call with pmax seems to be the fastest one.
library(microbenchmark)

# Compare the three base R approaches. Every expression only computes the
# result; none assigns into rp, so each run sees the same input and no method
# is timed with extra assignment work.
perf <- microbenchmark(
  m1 = {
    sapply(seq_len(nrow(rp)), function(i) {
      agec5_12 <- unlist(rp[i, ], use.names = FALSE)
      agec5_12 <- agec5_12[agec5_12 >= 5 & agec5_12 <= 12 & !is.na(agec5_12)]
      if (length(agec5_12) > 0) max(agec5_12) else NA_integer_
    })
  },
  m2 = {
    apply(
      replace(rp, rp > 12 | rp < 5, NA), 1,
      function(x) ifelse(all(is.na(x)), NA, max(x, na.rm = TRUE))
    )
  },
  m3 = {
    do.call(pmax, c(replace(rp, rp > 12 | rp < 5, NA), na.rm = TRUE))
  },
  times = 1000L
)
perf
# Unit: microseconds
# expr min lq mean median uq max neval cld
# m1 505.318 559.2935 860.3941 608.386 1231.937 9844.699 1000 b
# m2 526.394 568.0325 831.6851 629.205 1207.262 4748.342 1000 b
# m3 384.514 425.1250 635.3154 465.736 918.362 8992.393 1000 a
DATA
rp <- data.frame(
agec1 = c(7, 16, 11, 11, 17, 17),
agec2 = c(6, 12, 9, 9, 16, 15),
agec3 = c(2, 9, 9, 9, 14, NA),
agec4 = c(NA, 7, 9, 9, 13, NA),
agec5 = c(NA, 4, 7, 7, 10, NA),
agec6 = c(NA, NA, 6, 6, 9, NA),
agec7 = c(NA, NA, NA, NA, 7, NA),
agec8 = c(NA, NA, NA, NA, 5, NA)
)
Since you asked for it, here's a pure dplyr way to do this -
# Return the largest value of x that lies in [5, 12], or NA if there is none.
# sort() drops NA by default, so missing ages are ignored.
max5_12 <- function(x) {
  a <- sort(x, decreasing = TRUE)
  # After the descending sort the first in-range element is the maximum;
  # indexing an empty selection with [1] yields NA.
  a[a >= 5 & a <= 12][1]
}
# Transpose so each parent becomes a column, apply max5_12 to every column,
# append that one-row summary, then transpose back so the summary ends up as
# the last column.
rp %>%
t() %>%
as.data.frame() %>%
bind_rows(
summarise_all(., max5_12)
) %>%
t() %>%
as.data.frame() %>%
# Restore the original column names and name the appended column.
setNames(c(names(rp), "agec5_12"))
agec1 agec2 agec3 agec4 agec5 agec6 agec7 agec8 agec5_12
V1 7 6 2 NA NA NA NA NA 7
V2 16 12 9 7 4 NA NA NA 12
V3 11 9 9 9 7 6 NA NA 11
V4 11 9 9 9 7 6 NA NA 11
V5 17 16 14 13 10 9 7 5 10
V6 17 15 NA NA NA NA NA NA NA
The most straightforward way I can think of to accomplish this uses dplyr, purrr and tidyr:
library(dplyr)
library(purrr)
library(tidyr)

rp %>%
  # nest() needs a key per parent; rp has no id column, so create one.
  mutate(id = row_number()) %>%
  group_by(id) %>%
  nest() %>%
  mutate(
    # One number per parent: the oldest age within 5-12 among that row's
    # values. The original age columns are left untouched.
    agec5_12 = map_dbl(data, function(ages) {
      ages <- unlist(ages, use.names = FALSE)
      in_range <- ages[!is.na(ages) & ages >= 5 & ages <= 12]
      # Return NA directly instead of letting max() produce -Inf with a warning.
      if (length(in_range) > 0) max(in_range) else NA_real_
    })
  ) %>%
  unnest(cols = data) %>%
  ungroup() %>%
  # The helper key is no longer needed once the rows are unnested.
  select(-id)

Resources