if and for statement for the National inpatient sample - r

I have a dataset, attached. It has 16 columns.
The first column shows whether patients got surgery or not, and the other 15 are Day1-15 of surgery (coded as 1, 0).
I want to create a new column that satisfies a few conditions. I want that column to show the exact day of the procedure.
If column Day3 for example has a value of 1, I want the new value in the new column to be 3 (if and only if the first column crani and PRDAY3 column have a value of 1), and so on to be applied on all of the days' columns (day1-15).
Would really appreciate your help. Please let me know if you have any questions regarding the dataset or the problem I'm trying to solve.
tts <- function(timedc){
for (i in 15) {
if (TBI$PRDAYi == "1"){
timedc = c(timedc, TBI$PRDAYi)
}
return(timedc)
}
for (i in TBI$crani){
if (TBI$crani == "1"){
tts
}
}
}
*When tts is time to surgery.
I'm getting this error message:
Warning in if (TBI$crani == "1") { :
the condition has length > 1 and only the first element will be used
I want to create a column that has the exact day of the procedure from this database, as above.
dataset below.
> dput(TBI[1:10, 1:6])
structure(list(TBI.crani = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), TBI.PRDAY1 = c(NA,
0, NA, NA, 1, NA, 0, 0, 0, 1), TBI.PRDAY2 = c(NA, 2, NA, NA,
11, NA, NA, 0, 16, 2), TBI.PRDAY3 = c(NA, 2, NA, NA, 0, NA, NA,
0, 0, 2), TBI.PRDAY4 = c(NA, NA, NA, NA, 9, NA, NA, NA, 0, 5),
TBI.PRDAY5 = c(NA, NA, NA, NA, 11, NA, NA, NA, 0, 1)), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
enter image description here

I believe the following function does what the question asks for. With the posted data it returns a vector of zeros, since the first column TBI.crani is always zero.
tts <- function(data){
f <- function(x){
if(all(is.na(x)))
NA_integer_
else {
w <- which(x == 1)
if(length(w)) w[1] else 0L
}
}
crani <- grep("crani", names(data))
day_cols <- grep("DAY", names(data))
apply(data, 1, function(x){
if(x[crani] == 1) f(x[day_cols]) else 0L
})
}
tts(TBI)
TBI$NEW <- tts(TBI)

Related

Conditionally replace cells in data frame based on another data frame

In the interest of learning better coding practices, can anyone show me a more efficient way of solving my problem? Maybe one that doesn't require new columns...
Problem: I have two data frames: one is my main data table (t) and the other contains changes I need to replace in the main table (Manual_changes). Example: Sometimes the CaseID is matched with the wrong EmployeeID in the file.
I can't provide the main data table, but the Manual_changes file looks like this:
Manual_changes = structure(list(`Case ID` = c(46605, 25321, 61790, 43047, 12157,
16173, 94764, 38700, 41798, 56198, 79467, 61907, 89057, 34232,
100189), `Employee ID` = c(NA, NA, NA, NA, NA, NA, NA, NA, 906572,
164978, 145724, 874472, 654830, 846333, 256403), `Age in Days` = c(3,
3, 3, 12, 0, 0, 5, 0, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA,
-15L), class = c("tbl_df", "tbl", "data.frame"))
temp = merge(t, Manual_changes, by = "Case ID", all.x = TRUE)
temp$`Employee ID.y` = ifelse(is.na(temp$`Employee ID.y`), temp$`Employee ID.x`, temp$`Employee ID.y`)
temp$`Age in Days.y`= ifelse(is.na(temp$`Age in Days.y`), temp$`Age in Days.x`, temp$`Age in Days.y`)
temp$`Age in Days.x` = NULL
temp$`Employee ID.x` = NULL
colnames(temp) = colnames(t)
t = temp
We could use coalesce
library(dplyr)
left_join(t, Manual_changes, by = "Case ID") %>%
mutate(Employee_ID.y = coalesce(`Employee ID.x`, `Employee ID.y`),
`Age in Days.y` = coalesce(`Age in Days.x`, `Age in Days.y`))
Or with data.table
library(data.table)
setDT(t)[Manual_changes,
c('Employee ID', 'Age in Days') :=
.(fcoalesce(`Employee ID.x`, `Employee ID.y`),
fcoalesce(`Age in Days.x`, `Age in Days.y`)),
on = .(`Case ID`)]

what is the most elegant way to check for patterns of missing data in R?

I have a set of numeric vectors in R each length 16. I would like to select those vectors that have all values present in one of four positions: 1:4, 5:8, 9:12, 13:16
e.g. vector c(NA, 1, NA, 1, 1, 1, 1, 1, NA, NA, 1, NA, NA, 1, NA, 1, NA) would pass the test, since positions 5:8 are all non NA.
What is the most elegant (i.e. using minimum easy-to-read code) way to test this?
With a list of indices, you can iterate over those ranges and look for ones without any NA:
vec <- c(NA, 1, NA, 1, 1, 1, 1, 1, NA, NA, 1, NA, NA, 1, NA, 1, NA)
sapply(list(1:4, 5:8, 9:12, 13:16),
function(ind) !anyNA(vec[ind]))
# [1] FALSE TRUE FALSE FALSE
If you want to return the values within those indices:
inds <- list(1:4, 5:8, 9:12, 13:16)
good <- sapply(inds, function(ind) !anyNA(vec[ind]))
# should check that `any(good)` is true
inds[[ which(good)[1] ]]
# [1] 5 6 7 8
vec[ inds[[ which(good)[1] ]] ]
# [1] 1 1 1 1
Here is an option with rleid to get the run-length-encoding id of the vector, use that as grouping variable to check if any of the sequence have full set of non-NA elements
library(data.table)
any(as.logical(ave(seq_along(v1) * v1, rleid(v1),
FUN = function(x) all(!is.na(x))) ))
#[1] TRUE
Or it could be also
any(with(rle(!is.na(v1)), lengths[values] >=4))
#[1] TRUE
Or another option is table
4 %in% table(v1 * (seq_along(v1) -1) %/% 4)
#[1] TRUE
data
v1 <- c(NA, 1, NA, 1, 1, 1, 1, 1, NA, NA, 1, NA, NA, 1, NA, 1, NA)
The following code will return a single value (TRUE or FALSE). It returns TRUE if the vector passes the test.
vec <- c(NA, 1, NA, 1, 1, 1, 1, 1, NA, NA, 1, NA, NA, 1, NA, 1, NA)
!all(tapply(vec, rep(1:length(vec), each = 4, len = length(vec)), anyNA))
# [1] TRUE

Merge multiple files into one big data table. Column names do not match in the files

I have 50+ csv files in a folder on my computer that I would like merged into 1 giant data table. Below is an example of how 3 out of my 50 tables could look (one, two, and three) and how I would like my final table to look (together).
one <- data.frame("County" = c("Autauga", "Barbour", "Bibb"), "AAAA" = c(1,
1, 1), "BBBB" = c(2, 2, 2))
two <- data.frame("County" = c("Cape May", "Mercer", "Bergen"), "BBBB" =
c(1, 1, 1), "CCCC" = c(2, 2, 2), "DDDD" = c(1, 2 ,3))
three <- data.frame("County" = c("Lincoln", "Jackson", "Pike"), "CCCC" =
c(1, 1, 1))
together <- data.frame("County" = c("Autauga", "Barbour", "Bibb", "Cape
May", "Mercer", "Bergen", "Lincoln", "Jackson", "Pike"), "AAAA" = c(1, 1, 1,
NA, NA, NA, NA, NA, NA), "BBBB" = c(2, 2, 2, 1, 1, 1, NA, NA, NA), "CCCC" =
c(NA, NA, NA, 2, 2, 2, 1, 1, 1), "DDDD" = c(NA, NA, NA, 1, 2, 3, NA, NA,
NA))
If anyone could help me with this, that would be great! Also the blanks do not need to be "NA", they can just be left as blanks.
We can use bind_rows
library(tidyverse)
bind_rows(one, two, three)
If there are many datasets, places it in a list and then use bind_rows/rbindlist from data.table
Instead of creating multiple data.table/data.frame objects in the global env, read it into a list and then use rbindlist
library(data.table)
rbindlist(lapply(files, fread))

find strings in data.frame to fill in new column

I used dplyr on my data to create a subset of data like this:
dd <- data.frame(ID = c(700689L, 712607L, 712946L, 735907L, 735908L, 735910L, 735911L, 735912L, 735913L, 746929L, 747540L),
`1` = c("eg", NA, NA, "eg", "eg", NA, NA, NA, NA, "eg", NA),
`2` = c(NA, NA, NA, "sk", "lk", NA, NA, NA, NA, "eg", NA),
`3` = c(NA, NA, NA, "sk", "lk", NA, NA, NA, NA, NA, NA),
`4` = c(NA, NA, NA, "lk", "lk", NA, NA, NA, NA, NA, NA),
`5` = c(NA, NA, NA, "lk", "lk", NA, NA, NA, NA, NA, NA),
`6` = c(NA, NA, NA, "lk", "lk", NA, NA, NA, NA, NA, NA))
I now want to check every column except ID if it contains certain strings. In this example I want to create 1 column with "1" for every ID that contains a column with "eg" and "0" for the rest. Likewise one more column which tells me if there is either a "sk" or "lk" in the other columns. After that the old columns except ID can be removed from the data.frame
The difficult part for me is doing this with a dynamic number of columns, as my dplyr-subset will return different amounts of columns based on the specific case, but I need to check every one that is created in every case. I wanted to use unite first to put all strings together but I will have the same problem then: How can I unite all columns except the first ID one.
If this can be solved within dplyr it would be perfect but any working solution is appreciated.
The result should look like this:
result <- data.frame(ID = c(700689L, 712607L, 712946L, 735907L, 735908L, 735910L, 735911L, 735912L, 735913L, 746929L, 747540L),
with_eg = c(1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0),
with_sk_or_lk = c(0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0))
From your description, you want one column to check for "eg" and another column to check for both "lk" and "sk". If this is the case, then the following base R method will work.
dfNew <- cbind(id=dd[1],
eg=pmin(rowSums(dd[-1] == "eg", na.rm=TRUE), 1),
other=pmin(rowSums(dd[-1] == "sk" | dd[-1] == "lk", na.rm=TRUE), 1))
Here, the presence of "eg" is checked across the entire data.frame (except the id column) and a logical matrix is returned, rowSums adds the TRUE values across the rows, with na.rm removing the NAs, then pmin takes the minimum of the output of rowSums and 1, so that any elements with 2 are replaced by 1 and any values with 0 are preserved.
This same logic is applied to the construction of the "other" variable, except the presence of either "lk" or "sk" are checked in the initial logical matrix. Finally, data.frame returns a 3 column data.frame with the desired values.
This returns
dfNew
ID eg other
1 700689 1 0
2 712607 0 0
3 712946 0 0
4 735907 1 1
5 735908 1 1
6 735910 0 0
7 735911 0 0
8 735912 0 0
9 735913 0 0
10 746929 1 0
11 747540 0 0
Here is an admittedly hacky dplyr/purrr solution. Given that your IDs don't seem like they'll ever equal 'eg', 'sk', or 'lk', I haven't included anything to not search the ID column.
library(dplyr)
library(purrr)
dd %>%
split(.$ID) %>%
map_df(~ data_frame(
ID = .x$ID,
eg = ifelse(any(.x == 'eg', na.rm = TRUE), 1, 0),
other = ifelse(any(.x == 'lk' | .x == 'sk', na.rm = TRUE), 1, 0)
))

How to properly index list items to return rows, not columns, inside a for loop

I'm trying to write a for loop within another for loop. The first loop grabs the ith vcov matrix from a list of variously sized matrices (vcmats below) and grabs a frame of 24 predictor models of appropriate dimension to multiply with the current vcov matrix from a list of frames (jacobians below) for the different models. The second loop should pull the jth record (row) from the selected predictor frame, correctly format it, then run the calculation with the vcov matrix and output an indicator variable and calculated result needed for post processing to the holding table (holdtab).
When I run the code below I get the following error: Error in jjacob[, 1:4] : incorrect number of dimensions because R is returning the column of 1s (i.e. the intercept column of jacobs), not the complete first record (i.e. jjacob = jacobs[1,]). I've substantially simplified the example but left enough complexity to demonstrate the problem. I would appreciate any help in resolving this issue.
vcmats <- list(structure(c(0.67553, -0.1932, -0.00878, -0.00295, -0.00262,
-0.00637, -0.1932, 0.19988, 0.00331, -0.00159, 0.00149, 2e-05,
-0.00878, 0.00331, 0.00047, -6e-05, 3e-05, 3e-05, -0.00295, -0.00159,
-6e-05, 0.00013, -2e-05, 6e-05, -0.00262, 0.00149, 3e-05, -2e-05,
2e-05, 0, -0.00637, 2e-05, 3e-05, 6e-05, 0, 0.00026), .Dim = c(6L,
6L)), structure(c(0.38399, -0.03572, -0.00543, -0.00453, -0.00634,
-0.03572, 0.10912, 0.00118, -0.00044, 0.00016, -0.00543, 0.00118,
0.00042, -3e-05, 4e-05, -0.00453, -0.00044, -3e-05, 0.00011,
5e-05, -0.00634, 0.00016, 4e-05, 5e-05, 0.00025), .Dim = c(5L,
5L)))
jacobians <- list(structure(list(intcpt = c(1, 1, 1, 1), species = c(1, 1,
0, 0), nage = c(6, 6, 6, 6), T = c(12, 50, 12, 50), hgt = c(90,
90, 90, 90), moon = c(7, 7, 7, 7), hXm = c(0, 0, 0, 0), covr = c(0,
0, 0, 0), het = c(0, 0, 0, 0)), .Names = c("intcpt", "species",
"nage", "T", "hgt", "moon", "hXm", "covr", "het"), row.names = c("1",
"1.4", "1.12", "1.16"), class = "data.frame"), structure(list(
intcpt = c(1, 1, 1, 1), species = c(1, 1, 0, 0), nage = c(6,
6, 6, 6), T = c(12, 50, 12, 50), hgt = c(0, 0, 0, 0), moon = c(7,
7, 7, 7), hXm = c(0, 0, 0, 0), covr = c(0, 0, 0, 0), het = c(0,
0, 0, 0)), .Names = c("intcpt", "species", "nage", "T", "hgt",
"moon", "hXm", "covr", "het"), row.names = c("2", "2.4", "2.12",
"2.16"), class = "data.frame"))
holdtab <- structure(list(model = structure(c(4L, 4L, 4L, 4L, 5L, 5L, 5L,
5L), .Label = c("M.1.BaseCov", "M.2.Height", "M.5.Height.X.LastNewMoon",
"M.6.Height.plus.LastNew", "M.7.LastNewMoon", "M.G.Global"), class = "factor"),
aicc = c(341.317, 341.317, 341.317, 341.317, 342.1412, 342.1412,
342.1412, 342.1412), species = c(NA, NA, NA, NA, NA, NA,
NA, NA), condVar = c(NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("model",
"aicc", "species", "condVar"), row.names = c(1L, 2L, 3L, 4L,
25L, 26L, 27L, 28L), class = "data.frame")
jloop <- 1
for (imat in vcmats) { # Call the outside loop of vcov matrices
jacobs = jacobians[[jloop]] # Set tempvar jacobs as the jth member of the jacobians frame (n/24)
for (jjacob in jacobs) { # Call inside loop of lines in jacob (each individual set of predictor levels)
# I need to reduce the vector length to match my vcov matrix so
pt1 = jjacob[,1:4] # Separate Core columns from variable columns (because I don't want to drop species when ==0)
pt2 = jjacob[,5:9] # Pull out variable columns for next step
pt2 = pt2[,!apply(pt2 == 0, 2, all)] # Drop any variable columns that ==0
jjacob = cbind(pt1, pt2) # Reconstruct the record now of correct dimensions for the relevant vcov matrix
jjacob = as.matrix(jjacob) # Explicitly convert jjmod - I was having trouble with this previously
tj = (t(jjacob)) # Transpose the vector
condvar = jjacob %*% imat %*% tj # run the calculation
condVarTab[record,3] = jjacob[2] # Write species 0 or 1 to the output table
condVarTab[record,4] = condvar # Write the conditional variance to the table
record = record+1 # Iterate the record number for the next output run
}
jloop = jloop+1 # Once all 24 models in a frame are calculated iterate to the next frame of models which will be associated with a new vcv matrix
}

Resources