Suppose I have data like the following:
# A tibble: 10 x 4
# Groups:   a.month, a.group [10]
   a.month    a.group other.group amount
   <date>     <chr>   <chr>        <dbl>
 1 2016-02-01 A       X            15320
 2 2016-05-01 A       Z            50079
 3 2016-06-01 A       Y            60564
 4 2016-08-01 A       X            10540
 5 2017-01-01 B       X            30020
 6 2017-03-01 B       X            76310
 7 2017-04-01 B       Y            44215
 8 2017-05-01 A       Y            67241
 9 2017-06-01 A       Z            17180
10 2017-07-01 B       Z            31720
And I want to produce rows for every possible combination of a.group and other.group, for every month in between, with amount being zero if not present in the data above.
I managed to produce a tibble with the default amounts through:
another.tibble <- as_tibble(expand.grid(
  a.month = months.list,
  a.group = unique.a.groups,
  other.group = unique.o.groups,
  amount = 0
));
How should I proceed to populate another.tibble with the values from the first one?
It is important to invoke expand.grid() with stringsAsFactors = FALSE. Then we simply use left_join() to fill in the amounts for the combinations where we have data:
library(tidyverse)
df <- tribble(
~a.month, ~a.group, ~other.group, ~amount,
'2016-02-01', 'A', 'X', 15320,
'2016-05-01', 'A', 'Z', 50079,
'2016-06-01', 'A', 'Y', 60564,
'2016-08-01', 'A', 'X', 10540,
'2017-01-01', 'B', 'X', 30020,
'2017-03-01', 'B', 'X', 76310,
'2017-04-01', 'B', 'Y', 44215,
'2017-05-01', 'A', 'Y', 67241,
'2017-06-01', 'A', 'Z', 17180,
'2017-07-01', 'B', 'Z', 31720
)
another.tibble <- as_tibble(expand.grid(
  a.month = unique(df$a.month),
  a.group = unique(df$a.group),
  other.group = unique(df$other.group),
  amount = 0, stringsAsFactors = FALSE
))
another.tibble %>%
  left_join(df, by = c("a.month", "a.group", "other.group")) %>%
  mutate(amount.x = ifelse(is.na(amount.y), 0, amount.y)) %>%
  rename(amount = amount.x) %>%
  select(1:4)
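An alternative sketch (assuming the same df as above, not part of the answer itself): tidyr::complete() can build the missing combinations and fill amount with 0 in one step, and converting a.month to Date lets seq() cover every month in between, as the question asks.
library(tidyverse)
df %>%
  mutate(a.month = as.Date(a.month)) %>%
  # expand to every month from the first to the last, crossed with all groups;
  # combinations absent from df get amount = 0
  complete(a.month = seq(min(a.month), max(a.month), by = "month"),
           a.group, other.group,
           fill = list(amount = 0))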
How can I convert my data from this:
example <- data.frame(RTD_1_LOC = c('A', 'B'), RTD_2_LOC = c('C', 'D'),
RTD_3_LOC = c('E', 'F'), RTD_4_LOC = c('G', 'H'),
RTD_5_LOC = c('I', 'J'),RTD_1_OFF = c('1', '2'), RTD_2_OFF = c('3', '4'),
RTD_3_OFF = c('5', '6'), RTD_4_OFF = c('7', '8'),
RTD_5_OFF = c('9', '10'))
to this:
example2 <- data.frame(RTD = c(1,1,2,2,3,3,4,4,5,5),LOC = c('A', 'B','C','D','E','F','G','H','I','J'),
OFF = c(1,2,3,4,5,6,7,8,9,10))
I have been using tidyverse gather, but I end up with about 50 columns
ex <- gather(example, RTD, Location, RTD_1_LOC:RTD_5_LOC)
ex$RTD <- sub('_LOC', "", ex$RTD)
ex2 <- gather(ex, RTD, Offset, RTD_1_OFF:RTD_5_OFF)
ex2$RTD <- sub('_OFF', "", ex2$RTD)
We can use pivot_longer from tidyr and specify names_pattern to capture the groups from the column names. Because the digits should go into an 'RTD' column while 'LOC' and 'OFF' should become value columns, pass names_to a vector of 'RTD' and '.value': the first capture group ((\\d+)) fills 'RTD', and the second ((\\w+)), i.e. 'LOC' or 'OFF', names the new value columns.
library(dplyr)
library(tidyr)
example %>%
pivot_longer(cols = everything(),
names_to = c("RTD", ".value"), names_pattern = "\\w+_(\\d+)_(\\w+)")
-output
# A tibble: 10 x 3
RTD LOC OFF
<chr> <chr> <chr>
1 1 A 1
2 2 C 3
3 3 E 5
4 4 G 7
5 5 I 9
6 1 B 2
7 2 D 4
8 3 F 6
9 4 H 8
10 5 J 10
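The output above is all character; if numeric RTD and OFF columns are wanted (as in example2), a conversion step can be appended. A minimal sketch, assuming dplyr >= 1.0 for across():
example %>%
  pivot_longer(cols = everything(),
               names_to = c("RTD", ".value"),
               names_pattern = "\\w+_(\\d+)_(\\w+)") %>%
  # convert the character columns produced by pivot_longer to integers
  mutate(across(c(RTD, OFF), as.integer))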
I am working on a function that outputs a data frame which currently omits trials where there is missing data. However, I would like the full trial count to be added back into the file, with the other data columns left blank for these instances (reflecting the missing data).
Example Data Frames:
Df1withTrialCount <- data.frame(Participant = c('A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A' ),
Trial = c(1,1,2,2,3,3,4,5,6,7,8,9,10,10,10),
NotRelevantVariable = c(1,2,3,4,5,6,4,3,2,1,1,2,3,4,5))
Df2NeedsTrialsAddedIn <- data.frame(Participant = c('A', 'A', 'A', 'A', 'A'),
Trial = c(1,3,5,6,10),
EyeGaze = c(.4, .2, .2, .1, .1))
So I would end up with something that had one row each for Trials 1-10, but blanks in EyeGaze when there was no data (e.g., Trial 2 would have a blank for EyeGaze but Trial 3 would have .2).
Any help or insights would be greatly appreciated.
With base::merge:
merge(unique(Df1withTrialCount[, c("Participant", "Trial")]),
Df2NeedsTrialsAddedIn,
all.x = TRUE)
We can use complete
library(tidyr)
complete(Df2NeedsTrialsAddedIn, Participant,
Trial = seq_len(max(Df1withTrialCount$Trial)))
-output
# A tibble: 10 x 3
# Participant Trial EyeGaze
# <chr> <dbl> <dbl>
# 1 A 1 0.4
# 2 A 2 NA
# 3 A 3 0.2
# 4 A 4 NA
# 5 A 5 0.2
# 6 A 6 0.1
# 7 A 7 NA
# 8 A 8 NA
# 9 A 9 NA
#10 A 10 0.1
If we need both the min and max from the first dataset:
complete(Df2NeedsTrialsAddedIn, Participant,
Trial = seq(min(Df1withTrialCount$Trial), max(Df1withTrialCount$Trial), by = 1))
library(tidyverse)
Df1withTrialCount %>%
left_join(Df2NeedsTrialsAddedIn, by=c('Participant', 'Trial')) %>%
distinct(Trial, .keep_all = TRUE)
I'm looking for an automated way of converting this:
dat = tribble(
~a, ~b, ~c
, 'x', 1, 'y'
, 'y', 2, NA
, 'q', 4, NA
, 'z', 3, 'q'
)
to:
tribble(
~a, ~b, ~d
, 'x', 1, 2
, 'z', 3, 4
)
So, the column c in dat encodes which row in dat to look at to grab a value for a new column d, and if c is NA, toss that row from the output. Any tips?
We can join dat with itself using the c and a columns.
library(dplyr)
dat %>%
inner_join(dat %>% select(-c) %>% rename(d = 'b'),
by = c('c' = 'a'))
# A tibble: 2 x 4
# a b c d
# <chr> <dbl> <chr> <dbl>
#1 x 1 y 2
#2 z 3 q 4
In base R, we can do this with merge:
merge(dat, dat[-3], by.x = 'c', by.y = 'a')
We create 'd' with lead of 'b', filter out the NA rows of 'c', and remove the 'c' column with select
library(dplyr)
dat %>%
mutate(d = lead(b)) %>%
filter(!is.na(c)) %>%
select(-c)
# A tibble: 2 x 3
# a b d
# <chr> <dbl> <dbl>
#1 x 1 2
#2 z 3 NA
Or more compactly
dat %>%
mutate(d = replace(lead(b), is.na(c), NA), c = NULL) %>%
na.omit
Or with fill
library(tidyr)
dat %>%
mutate(c1 = c) %>%
fill(c1) %>%
group_by(c1) %>%
mutate(d = lead(b)) %>%
ungroup %>%
filter(!is.na(c)) %>%
select(-c, -c1)
Or in data.table
library(data.table)
setDT(dat)[, d := shift(b, type = 'lead')][!is.na(c)][, c := NULL][]
# a b d
#1: x 1 2
#2: z 3 NA
NOTE: the lead/shift-based solutions are simple and don't require any joins, but they only give the expected value when the row referenced by 'c' is the next row in the data; in dat above, 'z' refers back to the earlier 'q' row, so these variants leave its d as NA (and the na.omit version drops that row), which makes the match()/join approaches the general answer.
Or using match from base R
cbind(na.omit(dat), d = with(dat, b[match(c, a, nomatch = 0)]))[, -3]
# a b d
#1 x 1 2
#2 z 3 4
I would like to understand how to verify several conditions within groups in R. For example, if I have:
x <- data.frame("id" = c('A12', 'A12', 'A13', 'A13', 'A14', 'A14'),
"var1" = c('a', 'b', 'b', 'c', 'b', 'a'),
"var2" = c('x', 'y', 'z', 'z', 'y', 'x'),
"var3" = c('h', 'l', 'l', 'h', 'q', 'q),
stringsAsFactors = FALSE)
For the group with ID A12, are 'a', 'x' and 'h' present in the same row?
After grouping by 'id', we wrap the condition in any() so that the flag is TRUE for the whole group if at least one of its rows satisfies the condition
library(dplyr)
x %>%
group_by(id) %>%
mutate(flag = any(var1 == 'a' & var2 == 'x' & var3 == 'h'))
# A tibble: 6 x 5
# Groups: id [3]
# id var1 var2 var3 flag
# <chr> <chr> <chr> <chr> <lgl>
#1 A12 a x h TRUE
#2 A12 b y l TRUE
#3 A13 b z l FALSE
#4 A13 c z h FALSE
#5 A14 b y q FALSE
#6 A14 a x q FALSE
Or another option is to paste the columns, and then do a string match
library(stringr)
x %>%
group_by(id) %>%
mutate(flag = any(str_c(var1, var2, var3) == 'axh'))
If it is just to create a row-wise column of TRUE/FALSE, then remove the any() and the group_by step, as in the sketch below.
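A minimal sketch of that ungrouped, row-wise version (using the same x as above):
x %>%
  # each row is checked on its own, no grouping by id
  mutate(flag = var1 == 'a' & var2 == 'x' & var3 == 'h')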
Suppose I have this data set:
df <- data.frame(c('a', 'a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b', 'b', 'b'),
c('c', 'c', 'd', 'e', 'f', 'c', 'e', 'f', 'f', 'f', 'g', 'h', 'f')
) %>% setNames(c('type', 'value'))
type value
1 a c
2 a c
3 a d
4 a e
5 a f
6 a c
7 b e
8 b f
9 b f
10 b f
11 b g
12 b h
13 b f
I'd like to perform some kind of command as follows:
df %>% group_by(type) %>%
summarise_all(funs(largest_group_size))
This would ideally produce a table with the size of the largest value group for a and for b.
type largest_group_size
1 a 3
2 b 4
This table would have:
3 for a, because there are 3 values of c for a, and c is the largest group for a
4 for b, because there are 4 values of f for b, and f is the largest group for b
Ideally, I'd like to go a step further and calculate the percentage that the largest group is of the whole by type. So (largest_group_size / n()).
In two group_by steps:
df %>%
group_by(type, value) %>%
summarise(groups = n()) %>%
group_by(type) %>%
summarise(largest_group = max(groups),
as_percentage = largest_group / sum(groups))
This gives:
type largest_group as_percentage
<fct> <dbl> <dbl>
1 a 3 0.5
2 b 4 0.571
There is probably a more efficient way, but this is how I would do this in a hurry.
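A slightly more compact variant of the same idea (a sketch, assuming a recent dplyr for count()'s name argument):
library(dplyr)
df %>%
  # count rows per type/value pair, then keep the largest count per type
  count(type, value, name = "groups") %>%
  group_by(type) %>%
  summarise(largest_group = max(groups),
            as_percentage = largest_group / sum(groups))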