structure(list(Number = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15), age = c(25, 26, 27, 28, 29, 30, 31, 32, 33,
34, 35, 36, 37, 38, 39), sex = c(0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
0, 1, 0, 1, 0), bmi = c(35, 32, 29, 26, 23, 20, 17, 35, 32, 29,
26, 23, 20, 17, 21), Phenotype1 = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 1), `Phenotype 2` = c(0, 1, 0, 1, 0, 1, 0, 1,
0, 1, 0, 1, 1, 1, 1), `Phenotype 3` = c(1, 0, 1, 0, 1, 1, 1,
1, 1, 1, 1, 0, 0, 0, 0), `Phenotype 4` = c(0, 0, 0, 0, 1, 1,
0, 1, 0, 1, 1, 1, 1, 1, 1)), row.names = c(NA, -15L), class = c("tbl_df",
"tbl", "data.frame"))
# A tibble: 15 x 8
Number age sex bmi Phenotype1 `Phenotype 2` `Phenotype 3` `Phenotype 4`
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 25 0 35 0 0 1 0
2 2 26 1 32 0 1 0 0
3 3 27 0 29 0 0 1 0
4 4 28 1 26 0 1 0 0
5 5 29 0 23 0 0 1 1
6 6 30 1 20 0 1 1 1
7 7 31 0 17 0 0 1 0
8 8 32 1 35 0 1 1 1
9 9 33 0 32 0 0 1 0
10 10 34 1 29 0 1 1 1
11 11 35 0 26 0 0 1 1
12 12 36 1 23 0 1 0 1
13 13 37 0 20 1 1 0 1
14 14 38 1 17 1 1 0 1
15 15 39 0 21 1 1 0 1
Hi all,
I have a dataset of 100 patients (15 are shown here), 3 covariates and 50 phenotypes(4 are shown here).
I want to perform a multivariable logistic regression for each phenotype using age, sex and BMI as covariates,
I would like to get a table like this, where I have the p-value, OR and confidence interval(CI)per each of the covariates.
I just don't know how to start.
Thank you very much for your help!
Best,
Caro
I wrote a function that should accomplish what you need. There are likely more elegant and more R-like ways of doing this, but this approach worked in my testing:
## Load libraries
library(broom)
library(tidyr)
library(dplyr)
## Define a function to create your summary table
summary_table <- function(x) {
# Capture number of columns passed to the function
num_vars <- ncol(x)
# Pre-define lists that will be populated and then collapsed by rest of function
models <- vector("list", length = num_vars)
first_tables <- vector("list", length = num_vars)
second_tables <- vector("list", length = num_vars)
# Loop to create each row for the final table
for (i in 1:num_vars) {
models[[i]] <- glm(x[[i]] ~ age + sex + bmi, family = "binomial", data = df)
first_tables[[i]] <- broom::tidy(models[[i]])
first_tables[[i]]$OR <- exp(first_tables[[i]]$estimate)
first_tables[[i]]$CI1 <- exp(first_tables[[i]]$estimate - (1.96 * first_tables[[i]]$std.error))
first_tables[[i]]$CI2 <- exp(first_tables[[i]]$estimate + (1.96 * first_tables[[i]]$std.error))
first_tables[[i]] <- as.data.frame(first_tables[[i]][first_tables[[i]]$term != "(Intercept)", c("term", "p.value", "OR", "CI1", "CI2")])[1:3,]
second_tables[[i]] <- first_tables[[i]] %>%
pivot_wider(names_from = term, values_from = c("p.value", "OR", "CI1", "CI2")) %>%
select("p.value_age", "OR_age", "CI1_age", "CI2_age", "p.value_bmi", "OR_bmi", "CI1_bmi", "CI2_bmi",
"p.value_sex", "OR_sex", "CI1_sex", "CI2_sex")
}
# Combine the rows together into a final table
final_table <- do.call("rbind", second_tables)
final_table <- round(final_table, 3)
row.names(final_table) <- rep(paste0("Phenotype", 1:num_vars))
return(final_table)
}
## Let "df" be your data.frame with 100 rows and 54 columns
## Use the summary_table() function, passing in the 50 columns containing your Phenotype outcome vars (I assumed they're in columns 5:54)
final_table <- summary_table(df[5:54])
## Write the final table to your working directory as a CSV
write.csv(final_table, "final_table.csv")
Related
I would like to calculate Day.Before_nextCLS with 3 columns below
tibble::tribble(
~Day, ~CLS, ~BAL.D,
0, 0, NA,
3, 0, 15000,
6, 0, 10000,
20, 0, 2000,
25, 0, -4771299,
26, 0, -1615637,
27, 0, -920917,
31, 1, -923089,
32, 1, -81863,
33, 1, 19865,
34, 1, 9865,
37, 1, 609865
)
Desired output is below tribble.
For Day27, Day.Before_nextCLS is 4,
because when CLS is 2, Day is 31, and interval between 27 and 31 is 4.
tibble::tribble(
~Day, ~CLS, ~BAL.D, ~Day.Before_nextCLS
0, 0, NA, 31,
3, 0, 15000, 28,
6, 0, 10000, 25,
20, 0, 2000, 11,
25, 0, -4771299, 6,
26, 0, -1615637, 5,
27, 0, -920917, 4,
31, 1, -923089, NA, (for we don't have date when CLS ==2)
32, 1, -81863, NA,
33, 1, 19865, NA,
34, 1, 9865, NA,
37, 1, 609865, NA,
)
How can I achieve this?
Thank you very much!!
We create a lead column and then do a group by subtract from the last value of lead column with the Day column
library(dplyr)
df1 %>%
mutate(DayLead = lead(Day)) %>%
group_by(CLS) %>%
mutate(Day.Before_nextCLS = last(DayLead) - Day, DayLead = NULL) %>%
ungroup
-output
# A tibble: 12 × 4
Day CLS BAL.D Day.Before_nextCLS
<dbl> <dbl> <dbl> <dbl>
1 0 0 NA 31
2 3 0 15000 28
3 6 0 10000 25
4 20 0 2000 11
5 25 0 -4771299 6
6 26 0 -1615637 5
7 27 0 -920917 4
8 31 1 -923089 NA
9 32 1 -81863 NA
10 33 1 19865 NA
11 34 1 9865 NA
12 37 1 609865 NA
I have the following two datasets:
df1 <- data.frame(
"group" = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5),
"numbers" = c(55, 75, 60, 55, 75, 60, 55, 75, 60, 55, 75, 60, 55, 75, 60))
df2 <- data.frame(
"group" = c(1, 1, 2, 2, 2, 3, 3, 4, 5),
"P1" = c(55, NA, 60, 55, 75, 75, 55, 55, 60),
"P2" = c(55, 75, 55, 60, NA, 75, 55, NA, 60),
"P3" = c(75, 55, 60, 75, NA, 75, 60, 55, 60))
In df1 each group has the same three numbers (in reality there are around 500 numbers).
I want to check whether the values in the column "numbers" in df1 are contained in the columns P1, P2, and P3 of df2. There are two problems I am stuck with. 1. the values in the numbers column of df1 can occur in different groups in df2 (defined by the group column in df1 and df2). 2. the datasets have different lengths. Is there a way to merge both datasets and have the following dataset:
df3 <- data.frame(
"group" = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5),
"numbers" = c(55, 75, 60, 55, 75, 60, 55, 75, 60, 55, 75, 60, 55, 75, 60,),
"P1new" = c(1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1),
"P2new" = c(1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1),
"P3new" = c(1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1))
where P1new (P2new and P3new respectively) contain the value 1 if df2$P1 contains the value in df1$numbers within the correct group (as I said numbers can reoccur in different groups). For example, P3 has the value 75 in group 1 but not in group 5. So in group 1 P3new would have a 1 and in group 5 P3new would have a 0.
This question is similar to Find matching values in different datasets by groups in R
but I could not adapt the code according to my objectives. So, I would really appreciate any help.
Interesting question. Here's a way with dplyr functions:
library(dplyr)
df2 %>%
group_by(group) %>%
summarise(across(P1:P3, ~ list(unique(na.omit(.x))))) %>%
inner_join(df1, .) %>%
rowwise() %>%
mutate(across(P1:P3, ~ +(numbers %in% .x)))
group numbers P1 P2 P3
<dbl> <dbl> <int> <int> <int>
1 1 55 1 1 1
2 1 75 0 1 1
3 1 60 0 0 0
4 2 55 1 1 0
5 2 75 1 0 1
6 2 60 1 1 1
7 3 55 1 1 0
8 3 75 1 1 1
9 3 60 0 0 1
10 4 55 1 0 1
11 4 75 0 0 0
12 4 60 0 0 0
13 5 55 0 0 0
14 5 75 0 0 0
15 5 60 1 1 1
Another possible solution:
library(tidyverse)
map_dfc(names(df2[-1]),
~ df1 %>%
group_by(group) %>%
mutate(!!.x := +(numbers %in% df2[df2$group == cur_group_id(), .x])) %>%
ungroup %>%
select(all_of(.x))) %>%
bind_cols(df1, .)
#> group numbers P1 P2 P3
#> 1 1 55 1 1 1
#> 2 1 75 0 1 1
#> 3 1 60 0 0 0
#> 4 2 55 1 1 0
#> 5 2 75 1 0 1
#> 6 2 60 1 1 1
#> 7 3 55 1 1 0
#> 8 3 75 1 1 1
#> 9 3 60 0 0 1
#> 10 4 55 1 0 1
#> 11 4 75 0 0 0
#> 12 4 60 0 0 0
#> 13 5 55 0 0 0
#> 14 5 75 0 0 0
#> 15 5 60 1 1 1
Or, without purrr, another possibility:
library(dplyr)
df1 %>%
inner_join(df2) %>%
group_by(group) %>%
mutate(across(starts_with("P"), ~ +(numbers %in% .x))) %>%
ungroup %>%
distinct
I got two data sets of different lengths. I want to create a new column in the dataset which got more rows based on filtering a specific column from the shorter df. I am getting a waring " Longer object length is not a multiple of shorter object length". And the result is also not correct. I tried to created a smaller example datasets and tried the same code and its working with correct results. I am not sure why on my original data the results are not correct and I am getting the warning.
The example datasets are
structure(list(id = 1:10, activity = c(0, 0, 0, 0, 1, 0, 0, 1,
0, 0), code = c(2, 5, 11, 15, 3, 18, 21, 3, 27, 55)), class = "data.frame", row.names = c(NA,
-10L))
the second df
structure(list(id2 = 1:20, code2 = c(2, 5, 11, 15, 9, 18, 21,
3, 27, 55, 2, 5, 11, 15, 3, 18, 21, 3, 27, 55), d_Activity = c(0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0)), class = "data.frame", row.names = c(NA,
-20L))
I tried this on both my original datasets where I get the warning and these dummy dfs where no warning and correct results.
data2 <- data2 %>%
mutate(d_Activity = ifelse(code2 %in% data1$code & activity == 1, 1,0))
Actually, you are doing it wrong way. Let me explain-
In sample data it is working because larger df have rows (20) which is multiple of rows in smaller df (10).
So in you syntax what you are doing is, to check one complete vector with another complete vector (column of another df), because R normally works in vectorised way of operations.
the correct way of matching one to many is through purrr::map where each individual value in first argument (code2 here) operates with another vector i.e. df1$code which is not in argument of map.
df1 <- structure(list(id = 1:10, activity = c(0, 0, 0, 0, 1, 0, 0, 1,
0, 0), code = c(2, 5, 11, 15, 3, 18, 21, 3, 27, 55)), class = "data.frame", row.names = c(NA,
-10L))
df2 <- structure(list(id2 = 1:20, code2 = c(2, 5, 11, 15, 9, 18, 21,
3, 27, 55, 2, 5, 11, 15, 3, 18, 21, 3, 27, 55), d_Activity = c(0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0)), class = "data.frame", row.names = c(NA,
-20L))
library(tidyverse)
df2 %>%
mutate(d_Activity = map(code2, ~ +(.x %in% df1$code[df1$activity == 1])))
#> id2 code2 d_Activity
#> 1 1 2 0
#> 2 2 5 0
#> 3 3 11 0
#> 4 4 15 0
#> 5 5 9 0
#> 6 6 18 0
#> 7 7 21 0
#> 8 8 3 1
#> 9 9 27 0
#> 10 10 55 0
#> 11 11 2 0
#> 12 12 5 0
#> 13 13 11 0
#> 14 14 15 0
#> 15 15 3 1
#> 16 16 18 0
#> 17 17 21 0
#> 18 18 3 1
#> 19 19 27 0
#> 20 20 55 0
Created on 2021-06-17 by the reprex package (v2.0.0)
I would like to create four data sets from the following given data frame by multiple conditions in x1 and x2
mydata=structure(list(y = c(-3, 24, 4, 5, 3, -3, -3, 24, 5, 4, 8, 7,
9, 2, 4, 8, 7, 3, 8, 12, 9, 10, 12, 11, 2),
x1 = c(0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
0, 1, 0, 1, 1, 0, 0, 1, 1, 1
),
x2 = c(1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0,
0, 1, 0, 0, 1, 1, 1, 0)), class = "data.frame",
row.names = c(NA,25L))
The first data set is mydata00 which is constructed with these conditions x1=0 and x2=0,
mydata00=filter(mydata, c(mydata$x1==0 & mydata$x2==0))
> mydata00
y x1 x2
1 -3 0 0
2 -3 0 0
3 8 0 0
4 3 0 0
5 9 0 0
Now, I need only the unique values of y and corresponding x1 and x2. Finally, I would like to sort y. So my final data set must look like
y x1 x2
1 -3 0 0
2 3 0 0
3 8 0 0
4 9 0 0
I would like to do the job for mydata11, mydata10, mydata01, where ,
mydata11=filter(mydata, c(mydata$x1==1 & mydata$x2==1))
mydata10=filter(mydata, c(mydata$x1==1 & mydata$x2==0))
mydata01=filter(mydata, c(mydata$x1==0 & mydata$x2==1))
Can I use any for loop or builtin functionn to create these data sets?
Any help is appreciated.
We can split the data based on unique values of x1 and x2 and get unique rows in each list after ordering it by y.
temp <- lapply(split(mydata, list(mydata$x1, mydata$x2)), function(x)
unique(x[order(x$y), ]))
temp
#$`0.0`
# y x1 x2
#6 -3 0 0
#18 3 0 0
#16 8 0 0
#21 9 0 0
#$`1.0`
# y x1 x2
#14 2 1 0
#5 3 1 0
#10 4 1 0
#4 5 1 0
#...
If we need data as a separate dataframe, we can name them appropriately and use list2env.
names(temp) <- paste0("mydata", names(temp))
list2env(temp, .GlobalEnv)
tidyverse way of doing this would be :
library(tidyverse)
mydata %>% group_split(x1, x2) %>% map(~.x %>% arrange(y) %>% distinct)
I know that there are more questions like this. However, I can not get it done.
I have a very large data.table with about 800.000 rows. Every row contains some specific data, then some demand data for 2 years (from column 12 onward), and in the last column is a number. This number determines how many demand columns should be replaced.
Example with one row:
ITEM COUNTRY Q1 Q2 Q3 Q4 ---- Q24 NUMBER
1 1 0 0 0 0 ---- 2 2
If this is a vector called x for example, I would do this:
x[2:(2+x$NUMBER-1)] <- NA
Now I want to do this for all rows. A for loop is way too slow. So I was thinking of apply.
fun_NA <- function(x){ #x is then a row of the data.table
#still use that with x[,] as x is still a data.table and not a vector
if(x[,60]>0){
x[,12:(12+as.numeric(x[,60])-1)] <- NA
}
}
dt = apply(dt, 1, fun_NA)
where dtis my data.table, which has 60 columns. The first demand value starts in column 12. It does not work.. Now it gives the error "Error in x[,60] : incorrect number of dimensions", but I tried other ways too and received other errors.
Reason why I want to do this:
The number represent the month (after first month of my data set) when the item was able to sell, so in the months before, the demand was not 0, but just did not exist. I need it to be NA, as 0 will cause wrong calculations later on.
EDIT:
Removed the comma's so this is the new code,
fun_NA <- function(x){ #x is then a row of the data.table
if(x[60]>0){
x[12:(12+as.numeric(x[60])-1)] <- NA
}
}
dt = apply(dt, 1, fun_NA)
However, this returns a large list with all NULL and NA elements..
Edit: the head of the data-table is as follows: (dput)
structure(list(ITEM = c(1, 1, 2, 2, 2, 2), COUNTRY = c(1, 2,
3, 4, 5, 2), DATE = c("2015-02-02", "2015-02-02", "2014-09-27",
"2014-09-27", "2014-09-27", "2014-09-27"), q_1 = c(0, 0, 2, 0,
0, 133), q_2 = c(0, 0, 24, 0, 9, 119), q_3 = c(0, 0, 15, 0, 13,
121), q_4 = c(0, 0, 7, 0, 2, 51), q_5 = c(0, 0, 12, 0, 6, 59),
q_6 = c(0, 0, 3, 0, 0, 36), q_7 = c(0, 0, 6, 0, 6, 41), q_8 = c(0,
0, 19, 0, 4, 42), q_9 = c(0, 0, 3, 0, 5, 48), q_10 = c(0,
0, 5, 0, 11, 49), q_11 = c(0, 0, 6, 0, 1, 42), q_12 = c(0,
0, 0, 0, 8, 70), q_13 = c(0, 0, 1, 0, 19, 81), q_14 = c(0,
0, 5, 0, 98, 86), q_15 = c(0, 0, 12, 0, 10, 152), q_16 = c(0,
0, 7, 0, 8, 95), q_17 = c(0, 0, 30, 0, 5, 62), q_18 = c(0,
0, 6, 0, 10, 47), q_19 = c(0, 0, 7, 0, 1, 35), q_20 = c(2,
0, 7, 0, 0, 47), q_21 = c(0, 2, 16, 5, 4, 70), q_22 = c(0,
0, 7, 0, 7, 46), q_23 = c(0, 0, 8, 0, 79, 20), q_24 = c(0,
0, 5, 0, 26, 45), NUMBER = c(13, 13, 8, 8, 8, 8)), .Names = c("ITEM",
"COUNTRY", "DATE", "q_1", "q_2", "q_3", "q_4", "q_5", "q_6",
"q_7", "q_8", "q_9", "q_10", "q_11", "q_12", "q_13", "q_14",
"q_15", "q_16", "q_17", "q_18", "q_19", "q_20", "q_21", "q_22",
"q_23", "q_24", "NUMBER"), class = c("data.table", "data.frame"
), row.names = c(NA, -6L), .internal.selfref = <pointer: 0x0000000004490788>)
Usually, it is more efficient to work column-wise.
lapply(1:24, function(i) dt[i <= NUMBER, (paste0("q_", i)) := NA])
ITEM COUNTRY DATE q_1 q_2 q_3 q_4 q_5 q_6 q_7 q_8 q_9 q_10 q_11 q_12 q_13 q_14 q_15 q_16 q_17 q_18 q_19 q_20
1: 1 1 2015-02-02 NA NA NA NA NA NA NA NA NA NA NA NA NA 0 0 0 0 0 0 2
2: 1 2 2015-02-02 NA NA NA NA NA NA NA NA NA NA NA NA NA 0 0 0 0 0 0 0
3: 2 3 2014-09-27 NA NA NA NA NA NA NA NA 3 5 6 0 1 5 12 7 30 6 7 7
4: 2 4 2014-09-27 NA NA NA NA NA NA NA NA 0 0 0 0 0 0 0 0 0 0 0 0
5: 2 5 2014-09-27 NA NA NA NA NA NA NA NA 5 11 1 8 19 98 10 8 5 10 1 0
6: 2 2 2014-09-27 NA NA NA NA NA NA NA NA 48 49 42 70 81 86 152 95 62 47 35 47
q_21 q_22 q_23 q_24 NUMBER
1: 0 0 0 0 13
2: 2 0 0 0 13
3: 16 7 8 5 8
4: 5 0 0 0 8
5: 4 7 79 26 8
6: 70 46 20 45 8
Explanation
We loop over all 24 columns. For each row it is tested whether the value in the current column needs to be replaced by NA according to its column number. The values are updated in place which saves time and memory.
I've tested the solution with a sample data set of 1 million rows which took less than 0.2 seconds.
Since your input to the function is a vector, you should get rid of all the commas, because it only has one dimension.
fun_NA <- function(x){ #x is then a row of the data.table
#still use that with x[,] as x is still a data.table and not a vector
if(x[60]>0){
x[12:(12+as.numeric(x[60])-1)] <- NA
}
}
dt = apply(dt, 1, fun_NA)