Related
I have a dataset of 100 patients (7 are shown here), 2 covariates and 50 phenotypes(5 are shown here). I want to perform a multivariable logistic regression for each phenotype using Covariate1 and Covariate2 as covariates to predict the Outcome, I would like to get a table like this, where I have the p-value, OR and confidence interval(CI)per each of the covariates.
I tried:
for (i in df) {
print(i)
model <-glm(Outcome~ x[i] +Covariate1 +Covariate2, family = binomial(link = "logit"), data=df)
I also tried the solution for this question. But x and y a reversed in my question, so it did not work:
R: automate table for results of several multivariable logistic regressions
Thanks very much for your help!
This is an example dataset
df<-structure(list(ID = c(1, 2, 3, 4, 5, 6, 7), Outcome = c(0, 0,
1, 1, 0, 1, 0), Covariate1 = c(1, 2, 3, 4, 5, 6, 7), Covariate2 = c(0,
0, 0, 1, 1, 1, 1), P1 = c(1, 0, 0, 1, 1, 1, 2), P2 = c(0, 2,
0, 1, 1, 1, 1), P3 = c(0, 0, 0, 1, 1, 1, 1), P4 = c(0, 0, 0,
1, 2, 1, 1), P5 = c(0, 0, 0, 1, 1, 1, 2)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -7L))
if I understood correctly
df <- structure(
list(
ID = c(1, 2, 3, 4, 5, 6, 7),
Outcome = c(0, 0, 1, 1, 0, 1, 0),
Covariate1 = c(1, 2, 3, 4, 5, 6, 7),
Covariate2 = c(0, 0, 0, 1, 1, 1, 1),
P1 = c(1, 0, 0, 1, 1, 1, 2),
P2 = c(0, 2, 0, 1, 1, 1, 1),
P3 = c(0, 0, 0, 1, 1, 1, 1),
P4 = c(0, 0, 0, 1, 2, 1, 1),
P5 = c(0, 0, 0, 1, 1, 1, 2)
),
class = c("tbl_df",
"tbl", "data.frame"),
row.names = c(NA,-7L)
)
library(tidyverse)
first_tables <- map(
.x = select(df, starts_with("P")),
.f = ~ glm(
Outcome ~ .x + Covariate1 + Covariate2,
family = binomial(link = "logit"),
data = df
)
) %>%
map(broom::tidy)
map_df(
.x = first_tables,
.f = ~ .x %>% mutate(
p = p.value,
OR = exp(estimate),
CI5 = exp(estimate - 1.96 * std.error),
CI95 = exp(estimate + 1.96 * std.error),
.keep = "unused"
) %>%
select(-statistic),
.id = "phenotype"
) %>%
filter(term == ".x") %>%
select(-term)
#> # A tibble: 5 x 5
#> phenotype p OR CI5 CI95
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 P1 0.997 5.84e-10 0 Inf
#> 2 P2 0.996 1.53e- 4 0 Inf
#> 3 P3 0.824 2.00e+ 0 0.00442 904.
#> 4 P4 0.998 3.66e- 9 0 Inf
#> 5 P5 0.997 2.72e-10 0 Inf
Created on 2023-01-11 with reprex v2.0.2
I am trying to get a bar plot for sentiment scores corrected as per the following order and put into two separate colors:
(NEGATIVE) anger, disgust, fear, sadness, negative --- (POSITIVE) anticipation, joy, surprise, trust, positive.
Below is the code which only gives a decreasing plot.
barplot(sort(colSums(s), decreasing = TRUE),
las = 2,
col = rainbow(2),
ylab = 'Count',
main = 'User Synergies')
> dput(head(s))
structure(list(anger = c(1, 0, 0, 0, 0, 0), anticipation = c(0,
0, 5, 0, 0, 0), disgust = c(0, 0, 0, 0, 0, 0), fear = c(1, 0,
2, 1, 0, 0), joy = c(1, 0, 1, 0, 0, 0), sadness = c(1, 0, 2,
1, 0, 0), surprise = c(0, 0, 2, 1, 0, 0), trust = c(4, 2, 3,
1, 0, 1), negative = c(2, 0, 3, 2, 1, 1), positive = c(4, 4,
7, 1, 0, 2)), row.names = c(NA, 6L), class = "data.frame")
Another way:
positive <- c("anticipation", "joy", "surprise", "trust", "positive")
negative <- c("anger", "disgust", "fear", "sadness", "negative")
barplot(colSums(s[,c(negative, positive)]),
las = 2,
col = c(rep("red", length(negative)), rep("cyan", length(positive))),
ylab = 'Count', ylim = c(0, 20),
main = 'User Synergies')
The result:
Try this ,
df <- structure(list(anger = c(1, 0, 0, 0, 0, 0),
anticipation = c(0, 0, 5, 0, 0, 0),
disgust = c(0, 0, 0, 0, 0, 0),
fear = c(1, 0,2, 1, 0, 0),
joy = c(1, 0, 1, 0, 0, 0),
sadness = c(1, 0, 2, 1, 0, 0),
surprise = c(0, 0, 2, 1, 0, 0),
trust = c(4, 2, 3, 1, 0, 1),
negative = c(2, 0, 3, 2, 1, 1),
positive = c(4, 4,7, 1, 0, 2)),
row.names = c(NA, 6L), class = "data.frame")
pn <- rainbow(2) # "#FF0000" "#00FFFF" one for positive and the other for negative
s <- sort(colSums(df) , decreasing = TRUE)
names(s)
#> [1] "positive" "trust" "negative" "anticipation" "fear"
#> [6] "sadness" "surprise" "joy" "anger" "disgust"
# arrange colors based on names of sorted columns
col <- c(pn[1] , pn[1] , pn[2] , pn[1] , pn[2] ,
pn[2] , pn[1] , pn[1] , pn[2] , pn[2])
barplot(s ,
las = 2,
col = col,
ylab = 'Count',
main = 'User Synergies')
Created on 2022-05-31 by the reprex package (v2.0.1)
You may try
library(dplyr)
library(reshape2)
df <- data.frame(
anger = 200,
disgust = 100,
fear = 900,
sadness = 400,
negative = 1500,
anticipation = 2000,
joy = 1200,
surprise = 300,
trust = 2500,
positive = 5000
)
pall <- c("red", "blue")
colSums(df) %>%
melt %>%
tibble::rownames_to_column(., "sentiments") %>%
mutate(sentiments = factor(sentiments, levels = c("anger", "disgust", "fear", "sadness", "negative", "anticipation", "joy", "surprise", "trust", "positive"))) %>%
mutate(colo = ifelse(sentiments %in% c("anger", "disgust", "fear", "sadness", "negative"), 0, 1) %>% as.factor) %>%
barplot(data = ., value ~ sentiments, col = pall[.$colo], las = 2, xlab = "")
Another approach :
df <- structure(list(anger = c(1, 0, 0, 0, 0, 0),
anticipation = c(0, 0, 5, 0, 0, 0),
disgust = c(0, 0, 0, 0, 0, 0),
fear = c(1, 0,2, 1, 0, 0),
joy = c(1, 0, 1, 0, 0, 0),
sadness = c(1, 0, 2, 1, 0, 0),
surprise = c(0, 0, 2, 1, 0, 0),
trust = c(4, 2, 3, 1, 0, 1),
negative = c(2, 0, 3, 2, 1, 1),
positive = c(4, 4,7, 1, 0, 2)),
row.names = c(NA, 6L), class = "data.frame")
s <- sort(colSums(df) , decreasing = TRUE)
pos <- c("positive" , "trust" , "anticipation" ,
"surprise" , "joy")
col <- names(s)
col <- ifelse(col %in% pos , "cyan" , "red")
barplot(s ,
las = 2,
col = col,
ylab = 'Count',
main = 'User Synergies')
Created on 2022-05-31 by the reprex package (v2.0.1)
I would like to subset my data frame based on the index column; I would like to keep those cases whose index is saved in myvar (eg. 110, 111). I don't understand why I receive 0 observations when running this code:
newdata <- df[ which(df$index=="myvars"), ]
Sample data:
df<-structure(list(index = c(111, 110, 101, 111), et = c(1, 1, 1,
1), d1_t2 = c(0, 1, 1, 1), d1_t3 = c(0, 0, 1, 1), d1_t4 = c(0,
1, 0, 1), d2_t1 = c(0, 0, 1, 1), d2_t2 = c(0, 1, 1, 1), d2_t3 = c(0,
0, 0, 1), d2_t4 = c(1, 0, 1, 1), d3_t1 = c(1, 0, 1, 1), d3_t2 = c(1,
1, 0, 1), d3_t3 = c(1, 0, 1, 1), d3_t4 = c(1, 1, 0, 1), d4_t1 = c(0,
0, 1, 1), d4_t2 = c(1, 1, 0, 1), d4_t3 = c(0, 0, 1, 1), d4_t4 = c(1,
0, 1, 1), d5_t1 = c(1, 0, 0, 1), d5_t2 = c(0, 1, 1, 1), d5_t3 = c(1,
0, 1, 1), d5_t4 = c(0, 0, 1, 1), d6_t1 = c(1, 0, 0, 1), d6_t2 = c(0,
0, 1, 1), d6_t3 = c(1, 0, 1, 1), d6_t4 = c(1, 0, 1, 1), d7_t1 = c(1,
1, 1, 1), d7_t2 = c(1, 1, 1, 1), d7_t3 = c(1, 0, 1, 1), d7_t4 = c(1,
0, 1, 1)), row.names = c(NA, 4L), class = "data.frame")
Code:
myvars<-c("110", "111")
try
myvars<-c(110, 111) # <-- !! no quotes !!
df[ which(df$index %in% myvars ), ] #also, no quotes round myvars
There are several basic problems with what you are trying to do.
You are not using the variable 'myvars' -- you are using a string with the value "myvars". None of your rows has the index "myvars".
You are using == which is good for one value (e.g. values==4), but myvars has multiple values in it. Instead, you could use df$index %in% myvars
This does work, but you have integer indices, and are accessing them with strings. This is unnecessary, and could lead to problems in other places.
You may be confused because of your very large and complex example data. You only need one column to test -- not twenty.
Though this problem has been 'solved' many times, it turns out there's always another problem.
Without the print function it runs with no errors, but with it I get the following:
Error in .subset2(x, i) : recursive indexing failed at level 2
Which I'm taking to mean it doesn't like graphs being created in two layers of iteration? Changing the method to 'qplot(whatever:whatever)' has the exact same problem.
It's designed to print a graph for every pairing of the variables I'm looking at. There's too many for them to fit in a singular picture, such as for the pairs function, and I need to be able to see the actual variable names in the axes.
load("Transport_Survey.RData")
variables <- select(Transport, "InfOfReceievingWeather", "InfOfReceievingTraffic", "InfOfSeeingTraffic", "InfWeather.Ice", "InfWeather.Rain", "InfWeather.Wind", "InfWeather.Storm", "InfWeather.Snow", "InfWeather.Cold", "InfWeather.Warm", "InfWeather.DarkMorn", "InfWeather.DarkEve", "HomeParking", "WorkParking", "Disability", "Age", "CommuteFlexibility", "Gender", "PassionReduceCongest")
varnames <- list("InfOfReceivingWeather", "InfOfReceivingTraffic", "InfOfSeeingTraffic", "InfWeather.Ice", "InfWeather.Rain", "InfWeather.Wind", "InfWeather.Storm", "InfWeather.Snow", "InfWeather.Cold", "InfWeather.Warm", "InfWeather.DarkMorn", "InfWeather.DarkEve", "HomeParking", "WorkParking", "Disability", "Age", "CommuteFlexibility", "Gender", "PassionReduceCongest")
counterx = 1
countery = 1
for (a in variables) {
for (b in variables) {
print(ggplot(variables, mapping=aes(x=variables[[a]], y=variables[[b]],
xlab=varnames[counterx], ylab=varnames[countery]))+
geom_point())
countery = countery+1
counterx = counterx+1
}
}
#variables2 <- select(Transport, one_of(InfOfReceivingWeather, InfOfReceivingTraffic, InfOfSeeingTraffic, InfWeather.Ice, InfWeather.Rain, InfWeather.Wind, InfWeather.Storm, InfWeather.Snow, InfWeather.Cold, InfWeather.Warm, InfWeather.DarkMorn, InfWeather.DarkEve, HomeParking, WorkParking, Disability, Age, CommuteFlexibility, Gender, PassionReduceCongest))
Here is a mini-data frame for reference, sampled from the columns I'm using:
structure(list(InfOfReceievingWeather = c(1, 1, 1, 1, 4), InfOfReceievingTraffic = c(1,
1, 1, 1, 4), InfOfSeeingTraffic = c(1, 1, 1, 1, 4), InfWeather.Ice = c(3,
1, 3, 5, 5), InfWeather.Rain = c(1, 1, 2, 2, 4), InfWeather.Wind = c(1,
1, 2, 2, 4), InfWeather.Storm = c(1, 1, 1, 2, 5), InfWeather.Snow = c(1,
1, 2, 5, 5), InfWeather.Cold = c(1, 1, 1, 2, 5), InfWeather.Warm = c(1,
1, 1, 1, 3), InfWeather.DarkMorn = c(1, 1, 1, 1, 1), InfWeather.DarkEve = c(1,
1, 1, 1, 1), HomeParking = c(1, 1, 3, 1, 1), WorkParking = c(1,
4, 4, 5, 4), Disability = c(1, 1, 1, 1, 1), Age = c(19, 45, 35,
40, 58), CommuteFlexibility = c(2, 1, 5, 1, 2), Gender = c(2,
2, 2, 2, 1), PassionReduceCongest = c(0, 0, 2, 0, 2)), row.names = c(NA,
-5L), class = c("tbl_df", "tbl", "data.frame"))
You get an error in the assignment of your a and b. Basically, when defining a and b in variables, they become the vector of values contained in columns of variables. Thus, in your aes mapping, when you are calling variables[[a]], basically, you are writing (for the first iteration of a in variables):
variables[[c(1, 1, 1, 1, 4)]] instead of variables[["InfOfReceievingWeather"]]. So, it can't work.
To get over this issue, you have to either choose between:
for (a in variables) {
for (b in variables) {
print(ggplot(variables, mapping=aes(x=a, y=b)) ...
or
for (a in 1:ncol(variables)) {
for (b in 1:ncol(variables)) {
print(ggplot(variables, mapping=aes(x=variables[[a]], y=variables[[b]])) ...
Despite the first one seems to be simpler, I will rather prefere the second option because it will allow you to recycle a and b as column indicator to extract colnames of variables for xlab and ylab.
At the end, writing something like this should work:
for (a in 1:ncol(variables)) {
for (b in 1:ncol(variables)) {
print(ggplot(variables, mapping=aes(x=variables[[a]], y=variables[[b]])) +
xlab(colnames(variables)[a])+
ylab(colnames(variables)[b])+
geom_point())
}
}
Does it answer your question ?
I would like to convert wide data to long data in R, and my data set is for cross-classified models, exploring participants’ response to each target item that has different characteristics.
condition is one of the two conditions where participants were
assigned to.
The participants were tested twice: t1 and t2.
As for item-level predictor variables, x1 and x2, are coded.
As for response, whether participants’ response to the item was right or wrong was coded.
two test formats were administered, test1 and test2.
Although there are so many tutorials for a wide to long conversion, I could not find a one specifically explaining conversion for cross-classified models.
I would like to use tidyverse if possible for the sake of consistency.
My sample data is the following:
structure(list(item_name = c("x1", "x2", "participant_id", "1",
"2", "3", "4", "5", "6", "7"), participant_variable_1 = c(NA,
NA, NA, 20, 23, 21, 20, 19, 22, 30), condition = c(NA, NA, NA,
"A", "B", "A", "B", "A", "B", "A"), t1.item1.test1 = c(1, 3,
NA, 0, 1, 0, 1, 0, 0, 1), t1.item2.test1 = c(2, 2, NA, 0, 0,
0, 1, 1, 0, 1), t1.item3.test1 = c(1, 3, NA, 0, 0, 0, 1, 0, 0,
0), t1.item4.test1 = c(3, 1, NA, 1, 0, 0, 0, 1, 1, 0), t2.item1.test1 = c(1,
3, NA, 0, 1, 1, 0, 1, 1, 1), t2.item2.test1 = c(2, 2, NA, 1,
0, 1, 0, 1, 0, 1), t2.item3.test1 = c(1, 3, NA, 0, 0, 0, 1, 0,
0, 0), t2.item4.test1 = c(3, 1, NA, 1, 1, 0, 1, 1, 1, 0), t1.item1.test2 = c(1,
3, NA, 0, 1, 0, 1, 0, 0, 1), t1.item2.test2 = c(2, 2, NA, 0,
0, 0, 1, 1, 0, 1), t1.item3.test2 = c(1, 3, NA, 0, 0, 0, 1, 0,
0, 0), t1.item4.test2 = c(3, 1, NA, 1, 0, 0, 0, 1, 1, 0), t2.item1.test2 = c(1,
3, NA, 0, 1, 1, 0, 1, 1, 1), t2.item2.test2 = c(2, 2, NA, 1,
0, 1, 0, 1, 0, 1), t2.item3.test2 = c(1, 3, NA, 0, 0, 0, 1, 0,
0, 0), t2.item4.test2 = c(3, 1, NA, 1, 1, 0, 1, 1, 1, 0)), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
I would like to have a long data, which looks like the following:
Please and thank you for your guidance!
This answer requires heavy use of the new pivot_ functions in the dev version of tidyr. You can install that with devtools::install_github("tidyverse/tidyr") if you're willing to run the dev version.
First we split the data into item and participant info - you're not really getting any benefit from storing both in the same table:
item_info = dat[1:2, ]
participant_info = dat[4:nrow(dat), ] %>%
rename(participant_id = item_name)
Then it's time for a lot of pivoting:
# I have the dev version of tidyr so that is being loaded
library(tidyverse)
item_long = item_info %>%
select(-participant_variable_1, -condition) %>%
pivot_longer(
cols = t1.item1:t2.item4,
names_to = c("time", "item"),
names_pattern = "t(\\d)\\.(item\\d)",
) %>%
pivot_wider(names_from = item_name, values_from = value)
participant_long = participant_info %>%
pivot_longer(
cols = t1.item1:t2.item4,
names_to = c("time", "item"),
names_pattern = "t(\\d)\\.(item\\d)",
values_to = "response"
)
combined = participant_long %>%
left_join(item_long, by = c("item", "time"))
Result:
> combined
# A tibble: 56 x 8
participant_id participant_variable_1 condition time item response x1 x2
<chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 1 20 A 1 item1 0 1 3
2 1 20 A 1 item2 0 2 2
3 1 20 A 1 item3 0 1 3
4 1 20 A 1 item4 1 3 1