I write a function to count daily number of people in hospital but it can't work when the number is 0 in some days.
my function:
tsdata2 <- function(df){
t.f <- as.data.frame(table(df$DATE_INHOSPITAL2)) %>% rename(whole =Freq)
## sex
man.d <- df %>% filter(GENDER == 1)
man.f <- as.data.frame(table(man.d$DATE_INHOSPITAL2)) %>% rename(man =Freq)
woman.d <- df %>% filter(GENDER == 2)
woman.f <- as.data.frame(table(woman.d$DATE_INHOSPITAL2)) %>% rename(woman =Freq)
## age 65
agelo65.d <- df %>% filter(age_group65 == 1)
agelo65.f <- as.data.frame(table(agelo65.d$DATE_INHOSPITAL2)) %>% rename(agelo65 =Freq)
ageup65.d <- df %>% filter(age_group65 == 2)
ageup65.f <- as.data.frame(table(ageup65.d$DATE_INHOSPITAL2)) %>% rename(ageupwith65 =Freq)
## age 10
age10.1.d <- df %>% filter(age_group10 == 1)
age10.1.d.f <- as.data.frame(table(age10.1.d$DATE_INHOSPITAL2)) %>% rename(agelo40 =Freq)
age10.2.d <- df %>% filter(age_group10 == 2)
age10.2.d.f <- as.data.frame(table(age10.2.d$DATE_INHOSPITAL2)) %>% rename(age41_50 =Freq)
age10.3.d <- df %>% filter(age_group10 == 3)
age10.3.d.f <- as.data.frame(table(age10.3.d$DATE_INHOSPITAL2)) %>% rename(age51_60 =Freq)
age10.4.d <- df %>% filter(age_group10 == 4)
age10.4.d.f <- as.data.frame(table(age10.4.d$DATE_INHOSPITAL2)) %>% rename(age61_70 =Freq)
age10.5.d <- df %>% filter(age_group10 == 5)
age10.5.d.f <- as.data.frame(table(age10.5.d$DATE_INHOSPITAL2)) %>% rename(age71_80 =Freq)
age10.6.d <- df %>% filter(age_group10 == 6)
age10.6.d.f <- as.data.frame(table(age10.6.d$DATE_INHOSPITAL2)) %>% rename(ageup80 =Freq)
datebreaks<-seq(as.Date("2014-01-01"),as.Date("2018-12-31"),by="1 day")
full <- data.frame(Var1 = as.character(datebreaks) )
result <- full %>%
left_join(t.f) %>%
left_join(man.f) %>%
left_join(woman.f) %>%
left_join(agelo65.f) %>%
left_join(ageup65.f) %>%
left_join(age10.1.d.f) %>%
left_join(age10.2.d.f) %>%
left_join(age10.3.d.f) %>%
left_join(age10.4.d.f) %>%
left_join(age10.5.d.f) %>%
left_join(age10.6.d.f) %>% replace(., is.na(.), 0)
return(result)
}
list <- split(total,total$DISEASE_CODE1_2to3)
test <- map(list,tsdata2)
I think the error was because the number of hospital admissions on a given day was zero.
How can I improve this code that it can work even the number is zero.
test <- map(list,tsdata2)
Joining, by = "Var1"
Joining, by = "Var1"
Joining, by = "Var1"
Joining, by = "Var1"
Joining, by = "Var1"
Joining, by = "Var1"
Error: `by` required, because the data sources have no common variables
The reason of such an error thrown is that you are applying left_join on empty data frames with no columns to join. Along your data frame filtering and contingency table creation, data frames with no columns to join were generated. Please see below the simulation :
library(dplyr)
df1 <- data.frame(a = 1:10, b = letters[1:10])
df2 <- data.frame(a = 1:10, c = letters[11:20])
df2 <- df2[,-c(1, 2)]
str(df2)
# 'data.frame': 10 obs. of 0 variables
df2 %>% left_join(df1)
The code above throws an error:
Error: by required, because the data sources have no common
variables Call rlang::last_error() to see a backtrace
To avoid such a problem you can implement simple check if the data frame is with no columns then change to dummy data frame:
library(dplyr)
df1 <- data.frame(a = 1:10, b = letters[1:10])
df2 <- df1[,-c(1, 2)]
df_dummy <- data.frame(a = 1, c = 0)
if(ncol(df2) == 0) df2 <- df_dummy
df1 %>% left_join(df2)
#
# Joining, by = "a"
# a b c
# 1 1 a 0
# 2 2 b NA
# 3 3 c NA
# 4 4 d NA
# 5 5 e NA
# 6 6 f NA
# 7 7 g NA
# 8 8 h NA
# 9 9 i NA
# 10 10 j NA
Related
I would like to randomly add NA values to my dataframe with the proportion set by group.
library(tidyverse)
set.seed(1)
dat <- tibble(group = c(rep("A", 100),
rep("B", 100)),
value = rnorm(200))
pA <- 0.5
pB <- 0.2
# does not work
# was trying to create another column that i could use with
# case_when to set value to NA if missing==1
dat %>%
group_by(group) %>%
mutate(missing = rbinom(n(), 1, c(pA, pB))) %>%
summarise(mean = mean(missing))
I'd create a small tibble to keep track of the expected missingness rates, and join it to the first data frame. Then go through row by row to decide whether to set a value to missing or not.
This is easy to generalize to more than two groups as well.
library("tidyverse")
set.seed(1)
dat <- tibble(
group = c(
rep("A", 100),
rep("B", 100)
),
value = rnorm(200)
)
expected_nans <- tibble(
group = c("A", "B"),
p = c(0.5, 0.2)
)
dat_with_nans <- dat %>%
inner_join(
expected_nans,
by = "group"
) %>%
mutate(
r = runif(n()),
value = if_else(r < p, NA_real_, value)
) %>%
select(
-p, -r
)
dat_with_nans %>%
group_by(
group
) %>%
summarise(
mean(is.na(value))
)
#> # A tibble: 2 × 2
#> group `mean(is.na(value))`
#> <chr> <dbl>
#> 1 A 0.53
#> 2 B 0.17
Created on 2022-03-11 by the reprex package (v2.0.1)
Nesting and unnesting works
library(tidyverse)
dat <- tibble(group = c(rep("A", 1000),
rep("B", 1000)),
value = rnorm(2000))
pA <- .1
pB <- 0.5
set.seed(1)
dat %>%
group_by(group) %>%
nest() %>%
mutate(p = case_when(
group=="A" ~ pA,
TRUE ~ pB
)) %>%
mutate(data = purrr::map(data, ~ mutate(.x, missing = rbinom(n(), 1, p)))) %>%
unnest() %>%
summarise(mean = mean(missing))
# A tibble: 2 × 2
group mean
<chr> <dbl>
1 A 0.11
2 B 0.481
set.seed(1)
dat %>%
group_by(group) %>%
nest() %>%
mutate(p = case_when(
group=="A" ~ pA,
TRUE ~ pB
)) %>%
mutate(data = purrr::map(data, ~ mutate(.x, missing = rbinom(n(), 1, p)))) %>%
unnest() %>%
ungroup() %>%
mutate(value = case_when(
missing == 1 ~ NA_real_,
TRUE ~ value
)) %>%
select(-p, -missing)
I have a data set df that has been split into int1 and int2. In int1andint2, there is two elements for the IDA and three elements for theID` B.
My goal is to create a 2x2 matrix for ID A and 3x3 for ID B, and have it divided from my example list of matrices l1. Currently, my code is creating a 3x3 matrix for ID A and 2x2 matrix for ID B using a combination of the product from g1 and f2 using map2() resulting to lstmat.
Any suggestions on how I can get the desired output of a 2x2 matrix for ID A and 3x3 matrix for ID B?
Example data:
library(lubridate)
library(tidyverse)
date <- rep_len(seq(dmy("26-12-2010"), dmy("20-12-2011"), by = "days"), 500)
ID <- rep(c("A","B"), 5000)
df <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID)
df$jDate <- julian(as.Date(df$date), origin = as.Date('1970-01-01'))
df$Month <- month(df$date)
df$year <- year(df$date)
t1 <- c(100,150)
t2 <- c(200,250)
mat <- cbind(t1,t2)
t1 <- c(150,150,200)
t2 <- c(250,250,350)
t3 <- c(350,350, 400)
mat2 <- cbind(t1,t2, t3)
l1 <- list(mat, mat2)
int1 <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(Month == "3") %>%
group_split()
int2 <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(Month == "2") %>%
group_split()
names(int1) <- sapply(int1, function(x) paste(x$ID[1],
sep = '_'))
names(int2) <- sapply(int2, function(x) paste(x$ID[1],
sep = '_'))
int1 <- int1[-1]
int2 <- int2[-1]
Any suggestions for changes to this code for the desired result? :
g1 <- as.integer(gl(length(int1), 3, length(int1)))
f2 <- function(.int1, .int2) {
t(outer(seq_along(.int1), seq_along(.int2),
FUN = Vectorize(function(i, j) min(.int1[[i]]$jDate) -
min(.int2[[j]]$jDate))))
}
lstMat <- map2(split(int1, g1), split(int2, g1), f2)
map2(l1, lstMat, `/`)
As the 'int1', 'int2' have duplicated names, split on the names instead of creating a grouping index with gl
lstMat <- map2(split(int1, names(int1)), split(int2, names(int2)), f2)
map2(l1, lstMat, `/`)
-output
[[1]]
t1 t2
[1,] 3.571429 5.263158
[2,] 8.333333 8.928571
[[2]]
t1 t2 t3
[1,] 5.357143 6.578947 7.291667
[2,] 8.333333 8.928571 9.210526
[3,] 25.000000 19.444444 14.285714
I try to apply formatting to a data frame created from a printed TableOne object but it won't "stick"
Sample
library(dplyr)
library(tableone)
data(ovarian)
data <- ovarian
data$futime <- data$futime * 100
vars <- c("futime","fustat")
catvars <- c("fustat")
table1 <- CreateTableOne(vars = vars, factorVars = catvars,strata = "rx", data = data)
print(table1, printToggle = F, quote = F) %>%as.data.frame() %>% format(big.mark = ",")
Result:
1 2 p test
n 13 13
futime (mean (SD)) 51730.77 (34688.14) 68176.92 (32467.63) 0.224
fustat = 1 (%) 7 (53.8) 5 (38.5) 0.694
This behaves similar to simply creating my own data frame
c(1213,2,3,1213,2,3) %>% table()%>% as.data.frame() %>% format(big.mark = ",")
Result:
. Freq
1 2 2
2 3 2
3 1213 2
This is unlike when simply using the format option on a numeric variable or even a one column data frame
123321789 %>% format(big.mark = ",")
Result:
[1] "123,321,789"
or
c(1213,2,3,1213,2,3) %>% as.data.frame() %>% format(big.mark = ",")
Result:
1 1,213
2 2
3 3
4 1,213
5 2
6 3
This is linked to the fact that table returns factors.
The examples you provided apply to numeric data.
Try:
library(dplyr)
result <- c(1213,2,3,1213,2,3) %>% table() %>% as.data.frame
class(result$.)
#> [1] "factor"
result %>% mutate_all( ~format(as.numeric(as.character(.x)),big.mark=','))
#> . Freq
#> 1 2 2
#> 2 3 2
#> 3 1,213 2
OK, so this one lead me down a rabbit hole of text formatting.
Ended up writing a function to address the matter, used address some weird rounding issues.
styleTableOne <- function(x){
if(!is.na(as.numeric(x))){return(format(as.numeric(x),big.mark = ","))}
if(x == ""){return(x)}
if(x == "<0.001"){return(x)}
if(x == "0.0"){return(x)}
if(x == " "){return(x)}
if (length(strsplit(x, split = "(", fixed = T)[[1]]) == 2){
set1 <- strsplit(x, split = "(",fixed = T)[[1]][1] %>% as.numeric()
set2 <- strsplit(x, split = "(",fixed = T)[[1]][2] %>% str_remove(fixed(")")) %>% as.numeric()
set1 <- case_when(
set1 > 100 ~ round(set1,0),
set1 > 25 ~ round(set1,1),
T ~ round(set1,2)
)
set2 <- case_when(
set1 > 100 ~ round(set2,0),
set1 > 25 ~ round(set2,1),
T ~ round(set2,2)
)
set1 %<>% format(big.mark = ",")
set2 %<>% format(big.mark = ",")
set <- paste(set1,set2,sep = " (")
set <- paste0(set,")")
return(set)}
x %>% strsplit(split = " ",fixed = T) %>% .[[1]] -> x
x <- subset(x, x != "")
set1 <- strsplit(x, split = " ",fixed = T)[1] %>% as.numeric()
set2 <- strsplit(x, split = " ",fixed = T)[2] %>% str_remove(fixed("[")) %>% str_remove(fixed(",")) %>% as.numeric()
set3 <- strsplit(x, split = " ",fixed = T)[3] %>% str_remove(fixed("]")) %>% as.numeric()
set1 <- case_when(
set1 > 100 ~ round(set1,0),
set1 > 25 ~ round(set1,1),
T ~ round(set1,2)
)
set2 <- case_when(
set1 > 100 ~ round(set2,0),
set1 > 25 ~ round(set2,1),
T ~ round(set2,2)
)
set3 <- case_when(
set1 > 100 ~ round(set3,0),
set1 > 25 ~ round(set3,1),
T ~ round(set3,2)
)
set1 %<>% format(big.mark = ",")
set2 %<>% format(big.mark = ",")
set3 %<>% format(big.mark = ",")
set <- paste0(set1," (",set2,"-",set3,")")
return(set)
}
Then you can do:
library(survival)
library(dplyr)
library(tableone)
data(ovarian)
data <- ovarian
data$futime <- data$futime * 100
vars <- c("futime","fustat")
catvars <- c("fustat")
table1 <- CreateTableOne(vars = vars, factorVars = catvars,strata = "rx", data = data)
print(table1, printToggle = F) %>%
as.data.frame() %>%
sapply(sapply, styleTableOne) %>%
as.data.frame(row.names = row.names(print(table1)))
result:
1 2 p test
n 13 13
futime (mean (SD)) 51,731 (34,688) 68,177 (32,468) 0.224
fustat = 1 (%) 7 (53.8) 5 (38.5) 0.694
I have a dataframe:
genes_1 = c("a","b","c","d","e")
genes_2 = c("f","g","c","e","j")
genes_3 = c("a","b","m","n","o")
df = data.frame(genes_1, genes_2, genes_3)
My desired output:
genes_1 = c("","","","d","")
genes_2 = c("f","g","","","j")
genes_3 = c("","","m","n","o")
df = data.frame(genes_1, genes_2, genes_3)
How can I achieve this?
Thanks
0-dependency base R solution:
data.frame(
genes_1 = c("a","b","c","d","e"),
genes_2 = c("f","g","c","e","j"),
genes_3 = c("a","b","m","n","o"),
stringsAsFactors = FALSE
) -> xdf
dups <- names(which(table(unlist(xdf, use.names = FALSE)) > 1))
xdf[] <- lapply(xdf, function(x) { x[x %in% dups] <- "" ; x })
xdf
unlist() recursively unwinds all the columns into a single character vector.
table() counts all occurrences of each element.
which() narrows down to only the ones which are TRUE
names() grabs the character select vector elements.
We then work by column to replace all occurrences in the vector that match with ""
library(microbenchmark)
library(data.table)
microbenchmark(
base = {
ydf <- xdf
dups <- names(which(table(unlist(ydf, use.names = FALSE)) > 1))
ydf[] <- lapply(ydf, function(x) { x[x %in% dups] <- "" ; x })
},
base.2 = {
ydf <- xdf
tmp <- unlist(ydf)
ydf[arrayInd(which(duplicated(tmp) | duplicated(tmp, fromLast = TRUE)), dim(ydf))] <- ""
},
tidyverse = {
ydf <- xdf
ydf %>%
gather(genes, value) %>%
add_count(value) %>%
mutate(value = ifelse(n > 1, "", value)) %>%
select(-n) %>%
group_by(genes) %>%
mutate(ID = 1:n()) %>%
spread(genes, value) %>%
select(-ID) -> ydf
},
data.table = {
ydt <- data.table(xdf)
ydt[,lapply(.SD, function(x) { x[x %in% dups] <- "" ; x })]
}
) %>%
{ print(.) ; . } %>%
autoplot()
Another base solution:
tmp <- unlist(df)
df[arrayInd(which(duplicated(tmp) | duplicated(tmp,fromLast=TRUE)), dim(df))] <- NA
# genes_1 genes_2 genes_3
#1 <NA> f <NA>
#2 <NA> g <NA>
#3 <NA> <NA> m
#4 d <NA> n
#5 <NA> j o
unlist just creates a long vector for all the values in df
arrayInd then creates a two-column row/column index for subsetting df for the duplicated values.
Here is a tidyverse solution. df2 is the final output.
library(tidyverse)
df2 <- df %>%
gather(genes, value) %>%
add_count(value) %>%
mutate(value = ifelse(n > 1, "", value)) %>%
select(-n) %>%
group_by(genes) %>%
mutate(ID = 1:n()) %>%
spread(genes, value) %>%
select(-ID)
My data is below
grp <- paste('group', sample(1:3, 100, replace = T))
x <- rnorm(100, 100)
y <- rnorm(100, 10)
df <- data.frame(grp = grp, x =x , y =y , stringsAsFactors = F)
lag_size <- c(10, 4, 9)
Now when I try to use
df %>% group_by(grp) %>% mutate_all(lag, n = lag_size) %>% arrange(grp)
it gives an error
Error in mutate_impl(.data, dots) :
Expecting a single value:
whereas this works fine
df %>% group_by(grp) %>% mutate_all(lag, n = 10) %>% arrange(grp)
If we need to do the lag based on the 'grp' i.e. to lag the corresponding 'grp' with the value specified in 'lag_size'
library(tidyverse)
res <- map2(split(df[2:3], df$grp) , lag_size, ~.x %>%
mutate_all(lag, n = .y)) %>%
bind_rows(., .id = 'grp')
We can check the lag in 'grp' by the position of the first non-NA element
res %>%
group_by(grp) %>%
summarise(n = which(!is.na(x))[1]-1)
# A tibble: 3 x 2
# grp n
# <chr> <dbl>
#1 group 1 10
#2 group 2 4
#3 group 3 9