Fixing mismatched matrices - r

I have a data set df that has been split into int1 and int2. In int1andint2, there is two elements for the IDA and three elements for theID` B.
My goal is to create a 2x2 matrix for ID A and 3x3 for ID B, and have it divided from my example list of matrices l1. Currently, my code is creating a 3x3 matrix for ID A and 2x2 matrix for ID B using a combination of the product from g1 and f2 using map2() resulting to lstmat.
Any suggestions on how I can get the desired output of a 2x2 matrix for ID A and 3x3 matrix for ID B?
Example data:
library(lubridate)
library(tidyverse)
date <- rep_len(seq(dmy("26-12-2010"), dmy("20-12-2011"), by = "days"), 500)
ID <- rep(c("A","B"), 5000)
df <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID)
df$jDate <- julian(as.Date(df$date), origin = as.Date('1970-01-01'))
df$Month <- month(df$date)
df$year <- year(df$date)
t1 <- c(100,150)
t2 <- c(200,250)
mat <- cbind(t1,t2)
t1 <- c(150,150,200)
t2 <- c(250,250,350)
t3 <- c(350,350, 400)
mat2 <- cbind(t1,t2, t3)
l1 <- list(mat, mat2)
int1 <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(Month == "3") %>%
group_split()
int2 <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(Month == "2") %>%
group_split()
names(int1) <- sapply(int1, function(x) paste(x$ID[1],
sep = '_'))
names(int2) <- sapply(int2, function(x) paste(x$ID[1],
sep = '_'))
int1 <- int1[-1]
int2 <- int2[-1]
Any suggestions for changes to this code for the desired result? :
g1 <- as.integer(gl(length(int1), 3, length(int1)))
f2 <- function(.int1, .int2) {
t(outer(seq_along(.int1), seq_along(.int2),
FUN = Vectorize(function(i, j) min(.int1[[i]]$jDate) -
min(.int2[[j]]$jDate))))
}
lstMat <- map2(split(int1, g1), split(int2, g1), f2)
map2(l1, lstMat, `/`)

As the 'int1', 'int2' have duplicated names, split on the names instead of creating a grouping index with gl
lstMat <- map2(split(int1, names(int1)), split(int2, names(int2)), f2)
map2(l1, lstMat, `/`)
-output
[[1]]
t1 t2
[1,] 3.571429 5.263158
[2,] 8.333333 8.928571
[[2]]
t1 t2 t3
[1,] 5.357143 6.578947 7.291667
[2,] 8.333333 8.928571 9.210526
[3,] 25.000000 19.444444 14.285714

Related

Match list elements based on attribute component

I have a data set that i split into two list int1 and int2.
library(lubridate)
library(tidyverse)
library(purrr)
date <- rep_len(seq(dmy("01-01-2011"), dmy("01-01-2013"), by = "days"), 300)
ID <- rep(c("A","B", "C"), 300)
df <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID)
df$month <- month(df$date)
df$year <- year(df$date)
# Create first list
int1 <- df %>%
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(month == "1") %>%
group_split()
# Create second list
int2 <- df %>%
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(month == "2") %>%
group_split()
names(int1) <- sapply(int1, function(x) paste(x$ID[1],
x$year[1], sep = '_'))
names(int2) <- sapply(int2, function(x) paste(x$ID[1],
x$year[1], sep = '_'))
I then assign a attribute to each list (match). I created a function check to grab this attribute more easily. I removed some elements from one list for this exmaple.
int1 <- int1[-c(3,6)]
# Convenience function to grab the attributes for you
check <- function(x) {
return(attr(x, "match"))
}
# Add an attribute to hold the attributes of each list element
attr(int1, "match") <- data.frame(id = sapply(int1, function(x) paste(x$ID[1])),
interval_start_date = sapply(int1, function(x) paste(x$new[1]))
)
# Check the attributes
check(int1)
# Add an attribute "tab" to hold the attributes of each list element
attr(int2, "match") <- data.frame(id = sapply(int2, function(x) paste(x$ID[1])),
interval_start_date = sapply(int2, function(x) paste(x$new[1]))
)
# Check the attributes
check(int2)
I would like to remove elements that are not in another based on the attribute that I had added. Specifically I would like to remove any that don't have the same interval_start_date and ID associated with it. For the interval_start_date, only the year and the day have to match, as the month will most likely differ between the two list. In this case, I would like int2 to match int1. Any thoughts on how I could do this? A base r method is preferred, if possible.
# Expected results
expected_int2 <- list(int2[[1]], int2[[2]], int2[[3]], int2[[4]], int2[[5]],
int2[[6]], int2[[7]])
names(expected_int2) <- sapply(int1, function(x) paste(x$ID[1],
x$year[1], sep = "_"))
We may create an index with %in% after pasteing the 'id' and the formatted 'interval_start_date' i.e. after removing the 'month' part
i1 <- with(check(int2), paste(id, format(as.Date(interval_start_date),
"%Y-%d"))) %in% with(check(int1), paste(id,
format(as.Date(interval_start_date), "%Y-%d")))
> which(i1)
[1] 1 2 4 5 7 8 9
out <- int2[i1]

Remove list elements that are not present in another list based on element names

I have two list that I am working with int1 and int2. Both list have similar names for the list elements. I would like to remove specific components in one list, in this case int2 that are not present in another list int1. Is there a good way to do this in base R? I would like my results to look like the expected_int2.
library(lubridate)
library(tidyverse)
library(purrr)
date <- rep_len(seq(dmy("01-01-2011"), dmy("31-07-2011"), by = "days"), 200)
ID <- rep(c("A","B", "C"), 200)
df <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID)
df$Month <- month(df$date)
# Create first list
int1 <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(Month == "1") %>%
group_split()
# Assign names to int1
names(int1) <- sapply(int1, function(x) paste(x$ID[1],
x$new[1], sep = "_"))
#Remove list elements for the example
int1 <- int1[-c(6, 8, 9)]
# Create second list
int2 <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(Month == "2") %>%
group_split()
# Assign names to int2
names(int2) <- sapply(int2, function(x) paste(x$ID[1],
x$new[1], sep = "_"))
# Expected results
expected_int2 <- list(int2[[1]], int2[[2]], int2[[3]], int2[[4]], int2[[5]], int2[[6]])
names(expected_int2) <- sapply(int1, function(x) paste(x$ID[1],
x$new[1], sep = "_"))
We can remove the month part from the names, to check if they are similar to subset
i1 <- sub("(.*)-\\d+-(.*)", "\\1-\\2", names(int2)) %in%
sub("(.*)-\\d+-(.*)", "\\1-\\2", names(int1))
out <- int2[i1]
names(out) <- names(int1)

Remove elements when getting the "missing element" error

I have a data set df that I subset into two list int1 and int2. Each one of the elements in this list represents a 10-day period for single individual (e.g., the first three elements in int1 represent three different 10-day periods for ID "A"). int2 is a bit different because it only has one 10-day period for ID "A" and "B". This is because the data for month 4 only has one 10-day period.
I also have a list l1 that contains two matrices.
Setting up the example data:
library(lubridate)
library(tidyverse)
library(purrr)
date <- rep_len(seq(dmy("01-01-2011"), dmy("10-04-2011"), by = "days"), 100)
ID <- rep(c("A","B"), 100)
df <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID)
df$julian <- yday(df$date)
df$month <- month(df$date)
int1 <- df %>%
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(month == "1") %>%
group_split()
int2 <- df %>%
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(month == "4") %>%
group_split()
m1 <- matrix(1:9, nrow = 3, ncol = 3)
m2 <- matrix(20:28, nrow = 3, ncol = 3)
l1 <- list(m1, m2)
In the following code, I use the objects that I have created above. g1 is created as a sequence that is used in lstMat.
f1 is a function that does a number of calculations between int1 and int2 is also created to be used in lstMat.
g1 <- rep(seq_along(l1), sapply(l1, nrow))
# Function to calculate the number of julian dates between the first date in the
# first interval and the first date of the second interval
f1 <- function(.int1, .int2) {
t(outer(seq_along(.int1), seq_along(.int2),
FUN = Vectorize(function(i, j) {
min(.int1[[i]]$jDate) -
min(.int2[[j]]$jDate)
})
))
}
g1 <- rep(seq_along(l1), sapply(l1, nrow))
# Function to calculate the number of julian dates between the first date in the
# first interval and the first date of the second interval
f1 <- function(.int1, .int2) {
t(outer(seq_along(.int1), seq_along(.int2),
FUN = Vectorize(function(i, j) {
min(.int1[[i]]$jDate) -
min(.int2[[j]]$jDate)
})
))
}
This is the section of my script that I have been getting an error on.
lstMat <- purrr::map2(split(int1[seq_len(length(g1))], g1),
split(int2[seq_len(length(g1))], g1), f1)
Here is the error:
Error in `stop_subscript()`:
! Can't subset elements that don't exist.
x Locations 3, 4, 5, and 6 don't exist.
i There are only 2 elements.
Run `rlang::last_error()` to see where the error occurred.
I think the error is occuring due to the mismatch in length between g1 and int2 when trying to create the lstMat object. I was wondering how I could modify the code to remove those missing elements from g1 when I try to split int2 based on the g1 when running lstMat.

How to sovle self-defining function to count can't work

I write a function to count daily number of people in hospital but it can't work when the number is 0 in some days.
my function:
tsdata2 <- function(df){
t.f <- as.data.frame(table(df$DATE_INHOSPITAL2)) %>% rename(whole =Freq)
## sex
man.d <- df %>% filter(GENDER == 1)
man.f <- as.data.frame(table(man.d$DATE_INHOSPITAL2)) %>% rename(man =Freq)
woman.d <- df %>% filter(GENDER == 2)
woman.f <- as.data.frame(table(woman.d$DATE_INHOSPITAL2)) %>% rename(woman =Freq)
## age 65
agelo65.d <- df %>% filter(age_group65 == 1)
agelo65.f <- as.data.frame(table(agelo65.d$DATE_INHOSPITAL2)) %>% rename(agelo65 =Freq)
ageup65.d <- df %>% filter(age_group65 == 2)
ageup65.f <- as.data.frame(table(ageup65.d$DATE_INHOSPITAL2)) %>% rename(ageupwith65 =Freq)
## age 10
age10.1.d <- df %>% filter(age_group10 == 1)
age10.1.d.f <- as.data.frame(table(age10.1.d$DATE_INHOSPITAL2)) %>% rename(agelo40 =Freq)
age10.2.d <- df %>% filter(age_group10 == 2)
age10.2.d.f <- as.data.frame(table(age10.2.d$DATE_INHOSPITAL2)) %>% rename(age41_50 =Freq)
age10.3.d <- df %>% filter(age_group10 == 3)
age10.3.d.f <- as.data.frame(table(age10.3.d$DATE_INHOSPITAL2)) %>% rename(age51_60 =Freq)
age10.4.d <- df %>% filter(age_group10 == 4)
age10.4.d.f <- as.data.frame(table(age10.4.d$DATE_INHOSPITAL2)) %>% rename(age61_70 =Freq)
age10.5.d <- df %>% filter(age_group10 == 5)
age10.5.d.f <- as.data.frame(table(age10.5.d$DATE_INHOSPITAL2)) %>% rename(age71_80 =Freq)
age10.6.d <- df %>% filter(age_group10 == 6)
age10.6.d.f <- as.data.frame(table(age10.6.d$DATE_INHOSPITAL2)) %>% rename(ageup80 =Freq)
datebreaks<-seq(as.Date("2014-01-01"),as.Date("2018-12-31"),by="1 day")
full <- data.frame(Var1 = as.character(datebreaks) )
result <- full %>%
left_join(t.f) %>%
left_join(man.f) %>%
left_join(woman.f) %>%
left_join(agelo65.f) %>%
left_join(ageup65.f) %>%
left_join(age10.1.d.f) %>%
left_join(age10.2.d.f) %>%
left_join(age10.3.d.f) %>%
left_join(age10.4.d.f) %>%
left_join(age10.5.d.f) %>%
left_join(age10.6.d.f) %>% replace(., is.na(.), 0)
return(result)
}
list <- split(total,total$DISEASE_CODE1_2to3)
test <- map(list,tsdata2)
I think the error was because the number of hospital admissions on a given day was zero.
How can I improve this code that it can work even the number is zero.
test <- map(list,tsdata2)
Joining, by = "Var1"
Joining, by = "Var1"
Joining, by = "Var1"
Joining, by = "Var1"
Joining, by = "Var1"
Joining, by = "Var1"
Error: `by` required, because the data sources have no common variables
The reason of such an error thrown is that you are applying left_join on empty data frames with no columns to join. Along your data frame filtering and contingency table creation, data frames with no columns to join were generated. Please see below the simulation :
library(dplyr)
df1 <- data.frame(a = 1:10, b = letters[1:10])
df2 <- data.frame(a = 1:10, c = letters[11:20])
df2 <- df2[,-c(1, 2)]
str(df2)
# 'data.frame': 10 obs. of 0 variables
df2 %>% left_join(df1)
The code above throws an error:
Error: by required, because the data sources have no common
variables Call rlang::last_error() to see a backtrace
To avoid such a problem you can implement simple check if the data frame is with no columns then change to dummy data frame:
library(dplyr)
df1 <- data.frame(a = 1:10, b = letters[1:10])
df2 <- df1[,-c(1, 2)]
df_dummy <- data.frame(a = 1, c = 0)
if(ncol(df2) == 0) df2 <- df_dummy
df1 %>% left_join(df2)
#
# Joining, by = "a"
# a b c
# 1 1 a 0
# 2 2 b NA
# 3 3 c NA
# 4 4 d NA
# 5 5 e NA
# 6 6 f NA
# 7 7 g NA
# 8 8 h NA
# 9 9 i NA
# 10 10 j NA

Remove all instances of duplicate cells (not entire rows / columns) in a dataframe in R

I have a dataframe:
genes_1 = c("a","b","c","d","e")
genes_2 = c("f","g","c","e","j")
genes_3 = c("a","b","m","n","o")
df = data.frame(genes_1, genes_2, genes_3)
My desired output:
genes_1 = c("","","","d","")
genes_2 = c("f","g","","","j")
genes_3 = c("","","m","n","o")
df = data.frame(genes_1, genes_2, genes_3)
How can I achieve this?
Thanks
0-dependency base R solution:
data.frame(
genes_1 = c("a","b","c","d","e"),
genes_2 = c("f","g","c","e","j"),
genes_3 = c("a","b","m","n","o"),
stringsAsFactors = FALSE
) -> xdf
dups <- names(which(table(unlist(xdf, use.names = FALSE)) > 1))
xdf[] <- lapply(xdf, function(x) { x[x %in% dups] <- "" ; x })
xdf
unlist() recursively unwinds all the columns into a single character vector.
table() counts all occurrences of each element.
which() narrows down to only the ones which are TRUE
names() grabs the character select vector elements.
We then work by column to replace all occurrences in the vector that match with ""
library(microbenchmark)
library(data.table)
microbenchmark(
base = {
ydf <- xdf
dups <- names(which(table(unlist(ydf, use.names = FALSE)) > 1))
ydf[] <- lapply(ydf, function(x) { x[x %in% dups] <- "" ; x })
},
base.2 = {
ydf <- xdf
tmp <- unlist(ydf)
ydf[arrayInd(which(duplicated(tmp) | duplicated(tmp, fromLast = TRUE)), dim(ydf))] <- ""
},
tidyverse = {
ydf <- xdf
ydf %>%
gather(genes, value) %>%
add_count(value) %>%
mutate(value = ifelse(n > 1, "", value)) %>%
select(-n) %>%
group_by(genes) %>%
mutate(ID = 1:n()) %>%
spread(genes, value) %>%
select(-ID) -> ydf
},
data.table = {
ydt <- data.table(xdf)
ydt[,lapply(.SD, function(x) { x[x %in% dups] <- "" ; x })]
}
) %>%
{ print(.) ; . } %>%
autoplot()
Another base solution:
tmp <- unlist(df)
df[arrayInd(which(duplicated(tmp) | duplicated(tmp,fromLast=TRUE)), dim(df))] <- NA
# genes_1 genes_2 genes_3
#1 <NA> f <NA>
#2 <NA> g <NA>
#3 <NA> <NA> m
#4 d <NA> n
#5 <NA> j o
unlist just creates a long vector for all the values in df
arrayInd then creates a two-column row/column index for subsetting df for the duplicated values.
Here is a tidyverse solution. df2 is the final output.
library(tidyverse)
df2 <- df %>%
gather(genes, value) %>%
add_count(value) %>%
mutate(value = ifelse(n > 1, "", value)) %>%
select(-n) %>%
group_by(genes) %>%
mutate(ID = 1:n()) %>%
spread(genes, value) %>%
select(-ID)

Resources