This is related to my question in this post but in a way needs the opposite output.
I have the same dataframe:
# Example data: 72 rows, one per subj.no / trial.no / item / rep.no
# combination, with a binary outcome DV.
# (The closing call must not end in a dangling comma: an empty trailing
# argument makes data.frame() fail.)
df <- data.frame(
  "subj.no" = rep(1:3, each = 24),
  "trial.no" = rep(1:3, each = 8, length.out = 72),
  "item" = c(
    rep(c("ball", "book"), 4), rep(c("doll", "rope"), 4),
    rep(c("fish", "box"), 4), rep(c("paper", "candle"), 4),
    rep(c("horse", "marble"), 4), rep(c("doll", "rope"), 4),
    rep(c("tree", "dog"), 4), rep(c("ball", "book"), 4),
    rep(c("horse", "marble"), 4)
  ),
  "rep.no" = rep(1:4, each = 2, length.out = 72),
  "DV" = c(
    1, 0, 1, 0, 1, 0, 0, 1,
    1, 0, 1, 0, 0, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 0, 0,
    0, 1, 1, 1, 1, 0, 0, 1,
    0, 1, 1, 0, 0, 1, 0, 1,
    1, 1, 0, 1, 0, 0, 1, 0,
    0, 1, 1, 0, 1, 0, 0, 1,
    1, 1, 1, 0, 0, 0, 0, 0,
    0, 1, 0, 1, 0, 1, 1, 0
  )
)
I now want to create a column where 1 is entered in every row with DV == 0 iff there is a lower rep.no of the same subj.no-trial.no-item group with DV == 1. 0 should be entered in all other rows.
How can this be done? I assume, like in my last post, df %>% group_by(subj.no, trial.no, item) is the first step. But I am stuck at the conditional statement.
Like this?
library(dplyr)

# Flag rows with DV == 0 whose rep.no is higher than the first rep.no at
# which DV == 1 occurred in the same subj.no / trial.no / item group.
df %>%
  group_by(subj.no, trial.no, item) %>%
  mutate(
    # Inf when the group never has DV == 1, so no row can exceed it; the
    # explicit branch avoids the warning min() raises on an empty subset.
    min_rep_no = if (any(DV == 1)) min(rep.no[DV == 1]) else Inf,
    new_col = if_else(DV == 0 & rep.no > min_rep_no, 1, 0)
  ) %>%
  # Drop the grouping so later verbs operate on the whole table again.
  ungroup()
Related
I have a data table that looks like this:
https://i.stack.imgur.com/HIKPM.png
# One row per student, one column per question (answers coded 1 or 2).
# All question columns share the "question" prefix so they can be selected
# together (the second column was previously misspelled "qusetion2").
df <- data.frame(
  id = c("Student 1", "Student 2", "Student 3", "Student 4"),
  question1 = c(1, 2, 1, 2),
  question2 = c(2, 2, 1, 1),
  question3 = c(1, 1, 2, 2),
  question4 = c(2, 1, 2, 1)
)
where 1 = True and 2 = False. I want to calculate each student's score, i.e. the number of questions where their answer choice (1 or 2) matches the correct answer choice (1 or 2). After that, I want to calculate the class average.
Is this what you want? You can calculate the class average with sum(df.longer$right.answers) divided by the total number of questions.
# Toy answer sheet: four students, four questions, answers coded 1 or 2.
student.name <- paste0("student", 1:4)
Question1 <- c(1, 2, 1, 2)
Question2 <- c(2, 2, 1, 1)
Question3 <- c(1, 1, 2, 2)
Question4 <- c(2, 1, 2, 1)

# Assemble the wide table; column names are taken from the vector names.
df <- data.frame(
  student.name = student.name,
  Question1 = Question1,
  Question2 = Question2,
  Question3 = Question3,
  Question4 = Question4
)
# %>%, group_by() and summarise() come from dplyr; pivot_longer() from tidyr.
library(dplyr)
library(tidyr)

# Reshape to one row per student/question, then count, per student, how many
# answers are equal to 1.
df.longer <- df %>%
  pivot_longer(
    cols = starts_with("Question"),
    names_to = "Question.id",
    values_to = "T/F"
  ) %>%
  group_by(student.name) %>%
  # Summing the logical mask counts the matches directly; na.rm = TRUE keeps
  # missing answers from being counted (subsetting with an NA mask would
  # retain them and inflate length()).
  summarise(right.answers = sum(`T/F` == 1, na.rm = TRUE))
I have two large data tables with the coordinates of different sequences. For example:
library(data.table)
# First interval table: a category label plus the start/end coordinates of
# each sequence.
dt1 <- data.table(
  cat = rep(c("A", "B"), each = 2),
  start = c(1, 4, 2, 15),
  end = c(6, 9, 5, 20)
)

# Second interval table, same layout.
dt2 <- data.table(
  cat = rep(c("A", "B"), each = 2),
  start = c(2, 1, 10, 17),
  end = c(7, 3, 12, 20)
)
I need to create a data table of the coordinates for the overlapping sequences (i.e., the integers that occur in the sequences given in both data tables, for each category). I can currently do this using a for loop. For example:
# Expand paired from/to vectors into one integer sequence per pair.
seq2 <- Vectorize(seq.default, vectorize.args = c("from", "to"))

# The set of categories is loop-invariant, so compute it once, and
# preallocate one result slot per category instead of growing the list.
cats <- unique(dt1$cat)
out_list <- vector("list", length(cats))

# seq_along() is safe when there are zero categories (1:length() is not).
for (i in seq_along(cats)) {
  this_cat <- cats[i]
  sub1 <- dt1[cat == this_cat]
  sub2 <- dt2[cat == this_cat]

  # Every integer position covered by each table for this category.
  vec1 <- unique(unlist(c(seq2(from = sub1$start, to = sub1$end))))
  vec2 <- unique(unlist(c(seq2(from = sub2$start, to = sub2$end))))

  # Positions present in both tables, in ascending order.
  vec <- sort(intersect(vec1, vec2))

  # Nothing shared (or the category is absent from dt2): leave this slot
  # NULL. rbindlist() skips NULL entries, and the run-splitting below would
  # otherwise fail on a zero-row table.
  if (length(vec) == 0L) {
    next
  }

  # Collapse runs of consecutive integers back into start/end pairs: a new
  # run begins wherever the gap to the previous position exceeds 1.
  output <- data.table(V1 = vec)[
    , .(start = min(V1), end = max(V1)),
    by = .(grp = cumsum(c(TRUE, diff(V1) > 1)))
  ]
  output[, grp := NULL]
  output[, cat := this_cat]
  out_list[[i]] <- output
}

# Stack the per-category results into one table.
output_dt <- rbindlist(out_list)
However, the data sets I need to apply this to are very large (both in the number of rows and the size of the vectors). Is anyone able to suggest a way to improve performance?
Thanks
You could (a) convert your start/end variables to a sequence, (b) do an inner join, (c) convert back to start/end.
library(data.table)
# Example input: two tables of start/end coordinates, two rows per category.
dt1 <- data.table(
  cat = c("A", "A", "B", "B"),
  start = c(1, 4, 2, 15),
  end = c(6, 9, 5, 20)
)
dt2 <- data.table(
  cat = c("A", "A", "B", "B"),
  start = c(2, 1, 10, 17),
  end = c(7, 3, 12, 20)
)
# convert to sequence: one row per integer position covered by an interval.
# row_id puts every original row in its own group so that start:end is
# evaluated one interval at a time; it is dropped again afterwards.
dt1 <- dt1[
  , .(sequence = start:end),
  by = .(cat, row_id = seq_len(nrow(dt1)))
][, row_id := NULL]
dt2 <- dt2[
  , .(sequence = start:end),
  by = .(cat, row_id = seq_len(nrow(dt2)))
][, row_id := NULL]
# de-duplicate before joining: a position covered by several intervals of the
# same table would otherwise multiply rows in the join
dt1 <- unique(dt1)
dt2 <- unique(dt2)
# inner join: positions present in both tables
overlap <- merge(dt1, dt2, by = c("cat", "sequence"))
# convert to start/end: within the rows ordered by cat and sequence, a new
# run starts wherever the gap to the previous position exceeds 1, so disjoint
# overlaps in the same category stay separate instead of being merged into a
# single min-max range. The [seq_along()] trims the run ids to zero length
# when there is no overlap at all.
overlap <- overlap[
  order(cat, sequence),
  .(start = min(sequence), end = max(sequence)),
  by = .(
    cat,
    run = cumsum(c(TRUE, diff(sequence) > 1L))[seq_along(sequence)]
  )
][, run := NULL]
# result
overlap
#> cat start end
#> 1: A 1 7
#> 2: B 17 20
I have a data set containing the step count of cows from a 4 week trial where each animal was exposed to treatment A or treatment B at the beginning of week 2, and want to know how the step rate of the two treatment groups changed each week compared to week 1.
How do I add an offset to my model to do this?
The model I am running before adding the offset is this:
# Poisson mixed model: Week, Treatment and their interaction as fixed
# effects, plus a random intercept for each cow.
mod.1 <- glmmTMB(
  Step.count ~ Week * Treatment + (1 | Cow.ID),
  data = data.df,
  family = poisson
)
Here is an example of my data
# Simulate 20 rows for one cow: random week (1-4), a constant treatment
# label, and Poisson step counts. The random draws happen in the order
# Week, Treatment, Step.count.
simulate_cow <- function(cow_id, treatment) {
  data.frame(
    Cow.ID = rep(cow_id, 20),
    Week = sample(c(1, 2, 3, 4), 20, replace = TRUE),
    Treatment = sample(c(treatment), 20, replace = TRUE),
    Step.count = rpois(20, 60.1)
  )
}

# Two infected and two non-infected cows.
data.1 <- simulate_cow(1, "infected")
data.2 <- simulate_cow(2, "infected")
data.3 <- simulate_cow(3, "non-infected")
data.4 <- simulate_cow(4, "non-infected")

# Stack the per-cow tables into the example data set.
sample.df <- rbind(data.1, data.2, data.3, data.4)
Hard to say without an example of your data, but assuming that you have a data frame something like this:
library(dplyr)
# Made-up example: 4 cows observed in each of 5 weeks (20 rows), with random
# step counts and an alternating A/B treatment label.
cows <- tibble(
  Cow.Id = rep_len(1:4, 20),
  Week = rep(1:5, each = 4),
  Step.count = floor(runif(20, min = 100, max = 200)),
  Treatment = rep(c("A", "B"), times = 10)
)
Then, you can easily calculate a column of Step.count.offset for each cow like this:
# Sort by week, then, per cow, subtract the step count of that cow's first
# row (its earliest week after sorting) from every row.
cows.clean <- cows %>%
  arrange(Week) %>%
  group_by(Cow.Id) %>%
  mutate(Step.count.offset = Step.count - Step.count[1]) %>%
  ungroup()
I have a dataframe like this:
# Example data: 3 subjects x 3 trials, 2 items per trial, 4 repetitions per
# item (72 rows) with a binary outcome DV.
# The call previously ended in ",)" — an empty trailing argument that makes
# data.frame() fail — so the dangling comma is removed.
df <- data.frame(
  "subj.no" = rep(1:3, each = 24),
  "trial.no" = rep(1:3, each = 8, length.out = 72),
  "item" = c(
    rep(c("ball", "book"), 4),
    rep(c("doll", "rope"), 4),
    rep(c("fish", "box"), 4),
    rep(c("paper", "candle"), 4),
    rep(c("horse", "marble"), 4),
    rep(c("doll", "rope"), 4),
    rep(c("tree", "dog"), 4),
    rep(c("ball", "book"), 4),
    rep(c("horse", "marble"), 4)
  ),
  "rep.no" = rep(1:4, each = 2, length.out = 72),
  "DV" = c(
    1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
    0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0,
    0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0
  )
)
I now want to create another column DV.no which, for every row with DV == 1, records how many times the value 1 has occurred so far (including that row) within that combination of subj.no, trial.no and item. For rows with DV == 0, the value in the new column should be 0.
So the resulting vector should look like this:
# Expected result, written out in rows of eight values.
DV.no <- c(
  1, 0, 2, 0, 3, 0, 0, 1,
  1, 0, 2, 0, 0, 0, 3, 0,
  1, 0, 2, 0, 3, 0, 0, 0,
  0, 1, 1, 2, 2, 0, 0, 3,
  0, 1, 1, 0, 0, 2, 0, 3,
  1, 1, 0, 2, 0, 0, 2, 0,
  0, 1, 1, 0, 2, 0, 0, 2,
  1, 1, 2, 0, 0, 0, 0, 0,
  0, 1, 0, 2, 0, 3, 1, 0
)
So basically, for each unique combination of values in subj.no, trial.no and item, whenever the value of DV is 1, then 1 should be added to the count in the new variable.
(Remark: The column rep.no is not part of the relevant value combination. But it's in the df anyway, and since I didn't know if it's useful for the solution, I left it there.)
How can this be done in R?
We can do a group by cumsum on the 'DV' column
library(dplyr)
# Running count of 1s within each subj.no / trial.no / item group;
# multiplying by DV resets the rows where DV == 0 back to 0.
df %>%
  group_by(subj.no, trial.no, item) %>%
  mutate(V.no = cumsum(DV) * DV) %>%
  # Return an ungrouped table so later verbs are not silently applied
  # per group.
  ungroup()
Or in base R with ave
# ave() applies cumsum within each subj.no / trial.no / item combination;
# multiplying by DV zeroes the rows where DV == 0.
df$V.no <- df$DV * ave(df$DV, df$subj.no, df$trial.no, df$item, FUN = cumsum)
I'm trying to load data from Quandl with collapse = "monthly".
Some of the values are only available on a yearly or half-yearly basis.
Some other values are only available within certain periods of time.
This leaves me with a lot of inhomogeneous data.
How can I fill the yearly and half-yearly data in a "last observation carried forward" fashion and the remaining NAs with 0?
Here is my idea of the data I got and the one I want to have at the end:
library(tibble)
set.seed(4711)
# How do I get from:
#
# Monthly dates for 1990-1999 (120 rows) with differently sparse columns.
# Built with tibble(): data_frame() is deprecated and only forwards to it.
df.start <- tibble(
  Date = seq.Date(as.Date("1990-01-01"), as.Date("1999-12-01"), "1 month"),
  B = rep(NA, 120),
  C = c(rep(NA, 50), rnorm(120 - 50)),
  D = rep(c(rnorm(1), rep(NA, 11)), 10),
  E = c(rep(NA, 24), rep(c(rnorm(1), rep(NA, 11)), 8)),
  F = c(rep(NA, 45), rnorm(50), rep(NA, 25)),
  G = c(rep(NA, 24), rep(c(rnorm(1), rep(NA, 11)), 6), rep(NA, 24)),
  H = c(rep(NA, 10), rnorm(20), rep(NA, 16), rnorm(37), rep(NA, 37)),
  I = rep(c(rnorm(1), rep(NA, 5)), 20)
)
#
# To:
#
# Target layout: no NAs left; sparse values repeated forward, everything
# else 0. The numbers come from fresh rnorm() draws, so this shows the
# intended shape rather than the exact values derived from df.start.
# Built with tibble(): data_frame() is deprecated and only forwards to it.
df.end <- tibble(
  Date = seq.Date(as.Date("1990-01-01"), as.Date("1999-12-01"), "1 month"),
  B = rep(0, 120),
  C = c(rep(0, 50), rnorm(120 - 50)),
  D = rep(rnorm(10), each = 12),
  E = c(rep(0, 24), rep(rnorm(8), each = 12)),
  F = c(rep(0, 45), rnorm(50), rep(0, 25)),
  G = c(rep(0, 24), rep(rnorm(6), each = 12), rep(0, 24)),
  H = c(rep(0, 10), rnorm(20), rep(0, 16), rnorm(37), rep(0, 37)),
  I = rep(rnorm(20), each = 6)
)
#
# Automatically?
#
You can use fill to fill the NAs with the last non-empty value (except for the Date column), and then replace the remaining NAs by 0. We do these operations grouped by year.
library(tidyverse)
library(lubridate)
# Same steps as a pipeline, written with named intermediates; local() keeps
# the helpers out of the global environment.
df.end <- local({
  # Tag every row with its calendar year and group on it, so values are only
  # carried forward within a year.
  by_year <- group_by(mutate(df.start, year = year(Date)), year)
  # Last observation carried forward in every column except the first.
  carried <- fill(by_year, colnames(df.start[-1]))
  # Whatever is still NA becomes 0.
  carried <- replace(carried, is.na(carried), 0)
  # Drop the grouping and the helper column again.
  select(ungroup(carried), -year)
})