I need to label a lot of variables with sjlabelled::set_label. Here is a reproducible example and what already works:
library(data.table)
library(sjlabelled)
lookup <- data.table(id = paste0("q", 1:5),
question = paste0("qtext", 1:5))
data <- data.table(q1 = sample(1:5, 10, replace = TRUE),
q2 = sample(1:5, 10, replace = TRUE),
q3 = sample(1:5, 10, replace = TRUE),
q4 = sample(1:5, 10, replace = TRUE),
q5 = sample(1:5, 10, replace = TRUE))
data[, q1 := set_label(data[, q1], lookup[id == "q1", question])]
get_label(data$q1)
So I am able to label specific variables if I call them by their id, but I am struggling when I want to "loop" through all variables. I tried a for loop with no success.
The goal is to be able to export the data.table (or data.frame) as an SPSS file. If it works with other packages, I would also be happy.
You can use set_label directly on a dataframe.
library(sjlabelled)
data <- set_label(data, lookup$question[match(names(data), lookup$id)])
get_label(data)
# q1 q2 q3 q4 q5
#"qtext1" "qtext2" "qtext3" "qtext4" "qtext5"
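Since the stated goal is an SPSS export, a hedged sketch (not part of the answer above): once the variable labels are attached, the data can be written to a .sav file with haven::write_sav, which preserves the label attributes; sjlabelled::write_spss is a thin wrapper that does the same. The file name below is just a placeholder.
library(haven)
# write the labelled data to an SPSS .sav file; variable labels are kept
write_sav(as.data.frame(data), "labelled_data.sav")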
I need to label the values in a lot of variables with sjlabelled::set_labels. Here is a reproducible example and what already works:
library(data.table)
library(sjlabelled)
lookup <- data.table(id = paste0("q", 1:5),
answers = paste(paste0("atext", 1:5), paste0("btext", 1:5)
, paste0("ctext", 1:5), sep = ";"))
data <- data.table(q1 = sample(1:3, 10, replace = TRUE),
q2 = sample(1:3, 10, replace = TRUE),
q3 = sample(1:3, 10, replace = TRUE),
q4 = sample(1:3, 10, replace = TRUE),
q5 = sample(1:3, 10, replace = TRUE))
data$q1 <- set_labels(data$q1, labels = unlist(strsplit(lookup[id == "q1", answers], split = ";")))
get_labels(data$q1)
So the labels for the different answers (= values) are separated by a semicolon. As you can see in the example code, I am able to make it work if I call the variables by id, but I am struggling when I want to "loop" through all variables.
The goal is to be able to export the data.table (or data.frame) as an SPSS file. If it works with other packages, I would also be happy.
Match the column names of data with id, split the answers on ; and pass the labels as a list.
library(sjlabelled)
data <- set_labels(data, labels = strsplit(lookup$answers[match(names(data), lookup$id)], ';'))
get_labels(data)
#$q1
#[1] "atext1" "btext1" "ctext1"
#$q2
#[1] "atext2" "btext2" "ctext2"
#$q3
#[1] "atext3" "btext3" "ctext3"
#$q4
#[1] "atext4" "btext4" "ctext4"
#$q5
#[1] "atext5" "btext5" "ctext5"
I have a huge dataset with several groups (factors with 2 to 6 levels) and dichotomous variables (0, 1).
example data
DF <- data.frame(
group1 = sample(x = c("A","B","C","D"), size = 100, replace = T),
group2 = sample(x = c("red","blue","green"), size = 100, replace = T),
group3 = sample(x = c("tiny","small","big","huge"), size = 100, replace = T),
var1 = sample(x = 0:1, size = 100, replace = T),
var2 = sample(x = 0:1, size = 100, replace = T),
var3 = sample(x = 0:1, size = 100, replace = T),
var4 = sample(x = 0:1, size = 100, replace = T),
var5 = sample(x = 0:1, size = 100, replace = T))
I want to do a chi-square test for every group, across all the variables.
library(tidyverse)
library(rstatix)
chisq_test(DF$group1, DF$var1)
chisq_test(DF$group1, DF$var2)
chisq_test(DF$group1, DF$var3)
...
etc
I managed to make it work by using two nested for loops, but I'm sure there is a better solution:
groups <- c("group1","group2","group3")
vars <- c("var1","var2","var3","var4","var5")
results <- data.frame()
for(i in groups){
for(j in vars){
test <- chisq_test(DF[,i], DF[,j])
test <- mutate(test, group=i, var=j)
results <- rbind(results, test)
}
}
results
I think I need some kind of apply function, but I can't figure it out.
Here is one way to do it with apply. I am sure there is an even more elegant way to do it with dplyr. (Note that here I extract the p.value of the test, but you can extract something else or the whole test result if you prefer).
res <- apply(DF[, 1:3], 2, function(x) {    # loop over the three group columns
  apply(DF[, 4:8], 2,                       # loop over all five var columns (4:7 would skip var5)
        function(y) {chisq.test(x, y)$p.value})
})
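If you want more than the p-value, a hedged variant of the same nested-loop idea (assuming the group columns are 1:3 and the var columns 4:8, as in the example; res_long is just an illustrative name) collects the statistic and p-value of each test in long form:
res_long <- do.call(rbind, lapply(names(DF)[1:3], function(g) {
  do.call(rbind, lapply(names(DF)[4:8], function(v) {
    tst <- chisq.test(DF[[g]], DF[[v]])
    data.frame(group = g, var = v,
               statistic = unname(tst$statistic),  # X-squared
               p.value = tst$p.value)
  }))
}))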
Here's a quick and easy dplyr solution that involves transforming the data into long format keyed by group and var, then running the chi-square test on each combination of group and var.
DF %>%
pivot_longer(starts_with("group"), names_to = "group", values_to = "group_val") %>%
pivot_longer(starts_with("var"), names_to = "var", values_to = "var_val") %>%
group_by(group, var) %>%
summarise(chisq_test(group_val, var_val)) %>%
ungroup()
I have a data set containing the step counts of cows from a 4-week trial where each animal was exposed to treatment A or treatment B at the beginning of week 2, and I want to know how the step rate of the two treatment groups changed each week compared to week 1.
How do I add an offset to my model to do this?
The model I am running before adding the offset is this:
mod.1 <- glmmTMB(Step.count ~ Week*Treatment + (1|Cow.ID), data = data.df, family = poisson)
Here is an example of my data
data.1 <- data.frame(Cow.ID = rep(1, 20),
Week = sample(c(1,2,3,4), 20, replace = TRUE),
Treatment = sample(c("infected"), 20, replace = TRUE),
Step.count = rpois(20, 60.1))
data.2 <- data.frame(Cow.ID = rep(2, 20),
Week = sample(c(1,2,3,4), 20, replace = TRUE),
Treatment = sample(c("infected"), 20, replace = TRUE),
Step.count = rpois(20, 60.1))
data.3 <- data.frame(Cow.ID = rep(3, 20),
Week = sample(c(1,2,3,4), 20, replace = TRUE),
Treatment = sample(c("non-infected"), 20, replace = TRUE),
Step.count = rpois(20, 60.1))
data.4 <- data.frame(Cow.ID = rep(4, 20),
Week = sample(c(1,2,3,4), 20, replace = TRUE),
Treatment = sample(c("non-infected"), 20, replace = TRUE),
Step.count = rpois(20, 60.1))
sample.df <- rbind(data.1, data.2, data.3, data.4)
Hard to say without an example of your data, but assuming that you have a data frame something like this:
library(dplyr)
cows <- tibble(
Cow.Id = rep(1:4, times = 5),
Week = rep(1:5, each = 4),
Step.count = floor(runif(20, 100,200)),
Treatment = rep(c('A','B','A','B'), times = 5)
)
Then, you can easily calculate a column of Step.count.offset for each cow like this:
cows.clean <- cows %>%
group_by(Cow.Id) %>%
arrange(Week) %>%
mutate(
Step.count.offset = Step.count - first(Step.count)
) %>%
ungroup()
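If what you are after is an actual offset term in the Poisson model rather than a differenced column, here is one possible sketch (not from the answer above): carry each cow's week-1 count along as a baseline and pass it as log(baseline) via offset() in the glmmTMB formula. This assumes every cow has a week-1 record with a positive count; cows.base, baseline and mod.2 are illustrative names.
library(glmmTMB)
cows.base <- cows %>%
  group_by(Cow.Id) %>%
  mutate(baseline = Step.count[Week == 1][1]) %>%   # week-1 count per cow (assumed present and > 0)
  ungroup()
mod.2 <- glmmTMB(Step.count ~ Week * Treatment + (1 | Cow.Id) + offset(log(baseline)),
                 data = cows.base, family = poisson)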
I'd like to compare, element by element, two data.frames called df1 and df2, and build a new data.frame called out from them. If the elements are equal, then the element in out is 1, otherwise it is 0.
MWE
set.seed(1)
df1 <- data.frame(Q1 = sample(letters[1:5], 2, replace = TRUE),
Q2 = sample(letters[1:5], 2, replace = TRUE))
set.seed(2)
df2 <- data.frame(Q1 = sample(letters[1:5], 2, replace = TRUE),
Q2 = sample(letters[1:5], 2, replace = TRUE))
Expected output
out <- data.frame(Q1 = c(0, 0), Q2 = c(1, 0))
If the datasets are created with stringsAsFactors = FALSE (factor columns make it difficult, as their attributes get in the way of the comparison), a direct comparison works:
+(df1 == df2)
Or, if the columns are factors, convert them with type.convert:
+(type.convert(df1, as.is = TRUE) == type.convert(df2, as.is = TRUE))
Or make use of the matrix hack of coercing to character:
+(as.matrix(df1) == as.matrix(df2))
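Any of these returns a 0/1 matrix rather than a data.frame; if you need out exactly as a data.frame, as in the expected output, a small sketch (assuming both data sets have the same dimensions and column order) is to wrap the result:
out <- as.data.frame(+(as.matrix(df1) == as.matrix(df2)))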
I am working with a large health insurance dataset and I am interested in participants with certain claims codes. One of my inclusion criteria is that the participant has to have been insured for one year before and one year after the claim date. E.g., if they were injured 9/27/2017, they need insurance from 9/27/2016-9/27/2018.
I have tried doing a simple rowSums, and using apply, but both have the same issue: in from:to : numerical expression has # elements: only the first used. Right now, I have the range saved as variables in the dataframe. I think I understand why I am having the issue: from:to expects a single number and is receiving a vector. How can I get it to conditionally select which columns to sum? I will include my code below.
In my example, I am just trying to count the number of months a participant was insured in the 6 months before and after their accident. The ins_#_# variables are a simple YES/NO for whether or not participants were insured that month. Any guidance is appreciated!
library(tidyverse)
set.seed(1)
df <- data.frame(id= seq(1,100),
injury_date = sample(seq(as.Date('2017/01/01'), as.Date('2017/12/31'), by="day"), 100),
ins_07_16 = sample(c(0,1), 100, replace = TRUE),
ins_08_16 = sample(c(0,1), 100, replace = TRUE),
ins_09_16 = sample(c(0,1), 100, replace = TRUE),
ins_10_16 = sample(c(0,1), 100, replace = TRUE),
ins_11_16 = sample(c(0,1), 100, replace = TRUE),
ins_12_16 = sample(c(0,1), 100, replace = TRUE),
ins_01_17 = sample(c(0,1), 100, replace = TRUE),
ins_02_17 = sample(c(0,1), 100, replace = TRUE),
ins_03_17 = sample(c(0,1), 100, replace = TRUE),
ins_04_17 = sample(c(0,1), 100, replace = TRUE),
ins_05_17 = sample(c(0,1), 100, replace = TRUE),
ins_06_17 = sample(c(0,1), 100, replace = TRUE),
ins_07_17 = sample(c(0,1), 100, replace = TRUE),
ins_08_17 = sample(c(0,1), 100, replace = TRUE),
ins_09_17 = sample(c(0,1), 100, replace = TRUE),
ins_10_17 = sample(c(0,1), 100, replace = TRUE),
ins_11_17 = sample(c(0,1), 100, replace = TRUE),
ins_12_17 = sample(c(0,1), 100, replace = TRUE),
ins_01_18 = sample(c(0,1), 100, replace = TRUE),
ins_02_18 = sample(c(0,1), 100, replace = TRUE),
ins_03_18 = sample(c(0,1), 100, replace = TRUE),
ins_04_18 = sample(c(0,1), 100, replace = TRUE),
ins_05_18 = sample(c(0,1), 100, replace = TRUE),
ins_06_18 = sample(c(0,1), 100, replace = TRUE))
df <- df %>%
mutate(month = as.numeric(format(as.Date(injury_date), "%m")), #pulling month of injury
low_mo = month + 2,
high_mo = month + 14)
df$insured <- rowSums(df[df$low_mo:df$high_mo]) #only uses first element
df$insured <- apply(df[df$low_mo:df$high_mo], 1, sum) #only uses first element
Edit:
Although I did not specify that I wanted a fast solution, I am working with a lot of data so I tested which of #akrun's solutions was the fastest. I changed the dataframe so it was 1e5 (100,000) rows. The results are below in case anyone is curious.
library(microbenchmark)
microbenchmark(o1 <- sapply(seq_len(nrow(df)), function(i) sum(df[i, df$low_mo[i]:df$high_mo[i]])),
o2 <- {colInd <- Map(`:`, df$low_mo, df$high_mo);
rowInd <- rep(seq_len(nrow(df)), lengths(colInd));
as.vector(tapply(df[-(1:2)][cbind(rowInd, unlist(colInd)-2)],
rowInd, FUN = sum))},
o3 <- {colInd1 <- Map(function(x, y) which(!seq_along(df) %in% x:y), df$low_mo, df$high_mo);
rowInd1 <- rep(seq_len(nrow(df)), lengths(colInd1));
rowSums(replace(df, cbind(rowInd1, unlist(colInd1)), NA)[-(1:2)], na.rm = TRUE)},
times = 5)
Unit: milliseconds
expr min lq mean median uq max neval
o1 20408.5072 20757.0285 20903.9386 20986.2275 21069.3163 21298.6137 5
o2 433.5463 436.3066 448.6448 455.6551 456.8836 460.8325 5
o3 470.6834 482.4449 492.9594 485.6210 504.1353 521.9122 5
> identical(o1, o2)
[1] TRUE
> identical(o2, o3)
[1] TRUE
There are a couple of ways to do this. Loop through the sequence of rows, subset the dataset by the row index and the columns obtained by taking the sequence from 'low_mo' to 'high_mo' for each row, and get the sum:
o1 <- sapply(seq_len(nrow(df)), function(i) sum(df[i, df$low_mo[i]:df$high_mo[i]]))
Or, another option is to extract the elements based on a row/column index and then do a group-by sum:
colInd <- Map(`:`, df$low_mo, df$high_mo)
rowInd <- rep(seq_len(nrow(df)), lengths(colInd))
o2 <- as.vector(tapply(df[-(1:2)][cbind(rowInd, unlist(colInd)-2)],
rowInd, FUN = sum))
identical(o1, o2)
#[1] TRUE
Or, another approach is to change the column values that are not in the sequence to NA and use rowSums:
colInd1 <- Map(function(x, y) which(!seq_along(df) %in% x:y), df$low_mo, df$high_mo)
rowInd1 <- rep(seq_len(nrow(df)), lengths(colInd1))
o3 <- rowSums(replace(df, cbind(rowInd1, unlist(colInd1)),
NA)[-(1:2)], na.rm = TRUE)
identical(o1, o3)
#[1] TRUE
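Whichever of the three you use (they are identical), the result can then be attached back to the data as the column the question was after, e.g.:
df$insured <- o2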