Using data from one column to assign NAs to others

Goal: Add NAs to columns based on the value of another column. For example, suppose I have a data set with five columns: an ID column (caseID) and four binary variables (var1, var2, var3, nocasedata). How can I evaluate "nocasedata" to determine TRUE (no data) or FALSE (data), then remove that column and, where it was TRUE, assign NA to that case's entire row for the three other variables, doing nothing where it was FALSE? (tidyverse tools preferred, but not necessary.)
Reproducible example:
df <- data.frame(caseID = c(1:5),
                 var1 = c(1, 0, 0, 1, 1),
                 var2 = c(0, 0, 1, 1, 0),
                 var3 = c(0, 0, 0, 1, 1),
                 nocasedata = c(0, 1, 0, 0, 0))
df
desired_df <- data.frame(caseID = c(1:5),
                         var1 = c(1, NA, 0, 1, 1),
                         var2 = c(0, NA, 1, 1, 0),
                         var3 = c(0, NA, 0, 1, 1))
desired_df

Here is a reprex of the solution using tidyverse tools, as requested.
library(tidyverse)
#> -- Attaching packages ---------------------------------------------------- tidyverse 1.2.1 --
#> v ggplot2 2.2.1 v purrr 0.2.4
#> v tibble 1.3.4 v dplyr 0.7.4
#> v tidyr 0.7.2 v stringr 1.2.0
#> v readr 1.1.1 v forcats 0.2.0
#> -- Conflicts ------------------------------------------------------- tidyverse_conflicts() --
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
df <- data.frame(caseID = c(1:5),
                 var1 = c(1, 0, 0, 1, 1),
                 var2 = c(0, 0, 1, 1, 0),
                 var3 = c(0, 0, 0, 1, 1),
                 nocasedata = c(0, 1, 0, 0, 0))
df
#>   caseID var1 var2 var3 nocasedata
#> 1      1    1    0    0          0
#> 2      2    0    0    0          1
#> 3      3    0    1    0          0
#> 4      4    1    1    1          0
#> 5      5    1    0    1          0
desired_df <- df %>%
  mutate_at(.vars = vars(var1:var3),
            .funs = funs(ifelse(nocasedata == 1, NA, .))) %>%
  select(-nocasedata)
desired_df
#>   caseID var1 var2 var3
#> 1      1    1    0    0
#> 2      2   NA   NA   NA
#> 3      3    0    1    0
#> 4      4    1    1    1
#> 5      5    1    0    1
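Note: funs() was deprecated in dplyr 0.8.0 and mutate_at() has since been superseded. On current dplyr, a minimal sketch of the same logic using across() and if_else() (assuming the variables are numeric, hence NA_real_):
library(dplyr)
desired_df <- df %>%
  mutate(across(var1:var3, ~ if_else(nocasedata == 1, NA_real_, .x))) %>%
  select(-nocasedata)
desired_df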

Related

Averaging mass spec peak counts by sample column names

Hopefully this is straightforward and I'm just thinking too hard. I have a matrix of peak counts from mass spec (MS) where peaks are rows and columns are sample names. Each sampling location has several sites, and I would like to sum the counts across sites within each location.
For example, one sample with three replicates is identified as "S19S_0010_Sed_Field_ICR.D_p2", "S19S_0010_Sed_Field_ICR.M_p2", and "S19S_0010_Sed_Field_ICR.U_p2": the same location, but downstream (D), midstream (M), and upstream (U). The first two samples each have one count of a specific peak, so I would like to merge the three samples into a single "S19S_0010_Sed_Field_ICR.all_p2" with two counts of that peak. Example dataset:
> dput(data.sed.ex)
structure(list(S19S_0004_Sed_Field_ICR.M_p15 = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0), S19S_0006_Sed_Field_ICR.D_p2 = c(0, 0, 0,
0, 0, 0, 1, 1, 0, 0), S19S_0006_Sed_Field_ICR.M_p2 = c(0, 0,
0, 0, 0, 0, 1, 0, 0, 0), S19S_0006_Sed_Field_ICR.U_p2 = c(0,
0, 0, 0, 0, 0, 1, 1, 0, 0), S19S_0008_Sed_Field_ICR.M_p15 = c(0,
0, 0, 0, 0, 0, 0, 1, 0, 0), S19S_0009_Sed_Field_ICR.M_p2 = c(0,
0, 1, 0, 0, 0, 1, 0, 0, 0), S19S_0009_Sed_Field_ICR.U_p2 = c(0,
0, 0, 0, 0, 0, 1, 0, 0, 0), S19S_0010_Sed_Field_ICR.D_p15 = c(0,
0, 0, 0, 0, 0, 1, 0, 0, 0), S19S_0010_Sed_Field_ICR.M_p15 = c(0,
0, 0, 0, 0, 0, 1, 0, 0, 0), S19S_0010_Sed_Field_ICR.U_p15 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0)), row.names = c("200.002276", "200.015107",
"200.0564158", "200.0565393", "200.0578394", "200.0677581", "200.092796",
"200.1291723", "200.1292836", "200.9238455"), class = "data.frame")
TIA
Maybe wrangling to a long format can help. In that format you can summarize by groups (e.g. sample, or sample and location) using sum, mean, sd, among others.
Hope this helps.
Convert to long format
## dd is the `data.sed.ex` object above
library(tidyverse)
ddLong <- dd %>%
  rownames_to_column(var = "peak") %>%
  pivot_longer(cols = matches("^S")) %>%
  mutate(sample = gsub("(.*)\\.(.*)", "\\1", name),           ## pull sample info
         location = gsub("(.*)\\.([DMU])_(.*)", "\\2", name), ## pull D, M, U
         p = gsub("(.*)\\.([DMU])_(p.*)", "\\3", name),       ## get p2, p15
         peak = as.numeric(peak))                             ## coerce peak to numeric
ddLong
#> # A tibble: 100 × 6
#>     peak name                          value sample               location p
#>    <dbl> <chr>                         <dbl> <chr>                <chr>    <chr>
#>  1  200. S19S_0004_Sed_Field_ICR.M_p15     0 S19S_0004_Sed_Field… M        p15
#>  2  200. S19S_0006_Sed_Field_ICR.D_p2      0 S19S_0006_Sed_Field… D        p2
#>  3  200. S19S_0006_Sed_Field_ICR.M_p2      0 S19S_0006_Sed_Field… M        p2
#>  4  200. S19S_0006_Sed_Field_ICR.U_p2      0 S19S_0006_Sed_Field… U        p2
#>  5  200. S19S_0008_Sed_Field_ICR.M_p15     0 S19S_0008_Sed_Field… M        p15
#>  6  200. S19S_0009_Sed_Field_ICR.M_p2      0 S19S_0009_Sed_Field… M        p2
#>  7  200. S19S_0009_Sed_Field_ICR.U_p2      0 S19S_0009_Sed_Field… U        p2
#>  8  200. S19S_0010_Sed_Field_ICR.D_p15     0 S19S_0010_Sed_Field… D        p15
#>  9  200. S19S_0010_Sed_Field_ICR.M_p15     0 S19S_0010_Sed_Field… M        p15
#> 10  200. S19S_0010_Sed_Field_ICR.U_p15     0 S19S_0010_Sed_Field… U        p15
#> # … with 90 more rows
Summarize by one or more groups
## summarise using group_by + verbs
ddLong %>%
  group_by(sample, location) %>%
  summarise(n = n(),
            sum.value = sum(value),
            mean.peak = mean(peak))
#> `summarise()` has grouped output by 'sample'. You can override using the
#> `.groups` argument.
#> # A tibble: 10 × 5
#> # Groups:   sample [5]
#>    sample                  location     n sum.value mean.peak
#>    <chr>                   <chr>    <int>     <dbl>     <dbl>
#>  1 S19S_0004_Sed_Field_ICR M           10         0      200.
#>  2 S19S_0006_Sed_Field_ICR D           10         2      200.
#>  3 S19S_0006_Sed_Field_ICR M           10         1      200.
#>  4 S19S_0006_Sed_Field_ICR U           10         2      200.
#>  5 S19S_0008_Sed_Field_ICR M           10         1      200.
#>  6 S19S_0009_Sed_Field_ICR M           10         2      200.
#>  7 S19S_0009_Sed_Field_ICR U           10         1      200.
#>  8 S19S_0010_Sed_Field_ICR D           10         1      200.
#>  9 S19S_0010_Sed_Field_ICR M           10         1      200.
#> 10 S19S_0010_Sed_Field_ICR U           10         0      200.
ddLong %>%
  group_by(sample, p) %>%
  summarise(n = n(),
            sum.value = sum(value),
            mean.peak = mean(peak))
#> `summarise()` has grouped output by 'sample'. You can override using the
#> `.groups` argument.
#> # A tibble: 5 × 5
#> # Groups:   sample [5]
#>   sample                  p         n sum.value mean.peak
#>   <chr>                   <chr> <int>     <dbl>     <dbl>
#> 1 S19S_0004_Sed_Field_ICR p15      10         0      200.
#> 2 S19S_0006_Sed_Field_ICR p2       30         5      200.
#> 3 S19S_0008_Sed_Field_ICR p15      10         1      200.
#> 4 S19S_0009_Sed_Field_ICR p2       20         3      200.
#> 5 S19S_0010_Sed_Field_ICR p15      30         2      200.
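To get all the way to the layout the question asks for (one "<sample>.all_<p>" column per location, peaks as rows), the grouped sums can be pivoted back to wide. A sketch building on ddLong above; the ".all" column naming is the question's requested convention:
## sum D/M/U replicates per peak, then rebuild wide columns
ddAll <- ddLong %>%
  group_by(peak, sample, p) %>%
  summarise(value = sum(value), .groups = "drop") %>%
  mutate(name = paste0(sample, ".all_", p)) %>%  ## e.g. "S19S_0010_Sed_Field_ICR.all_p15"
  select(peak, name, value) %>%
  pivot_wider(names_from = name, values_from = value)
ddAll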

Conditional rolling sum across columns

I have a data frame of values across successive years (columns) for unique individuals (rows). A dummy data example is provided here:
dt = structure(list(ID = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), `2015` = c(0,
0.8219178, 0, 0.1369863, 0, 1.369863, 0.2739726, 0.8219178, 5,
0), `2016` = c(0, 1.369863, 0, 0.2739726, 0, 0.2739726, 0, 3.2876712,
0, 0), `2017` = c(0.6849315, 0, 0, 0.6849315, 0, 0.5479452, 0,
0, 0, 0), `2018` = c(1.0958904, 0.5479452, 1.9178082, 0, 0, 0,
0, 0, 0, 3), `2019` = c(0, 0, 0, 1.0958904, 0, 0.9589041, 0.5479452,
0, 0, 0), `2020` = c(0.4383562, 0, 0, 0, 0.2739726, 0.6849315,
0, 0, 0, 0)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-10L))
I want to create a dataset where the maximum value allowed for each individual in each year is 1. Where a value exceeds 1, I want to carry the excess over into the next year (column), add it to that year's value for that individual, and so on.
The expected result is:
dt_expected = structure(list(ID = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), `2015` = c(0,
0.8219178, 0, 0.1369863, 0, 1, 0.2739726, 0.8219178, 1, 0), `2016` = c(0,
1, 0, 0.2739726, 0, 0.6438356, 0, 1, 1, 0), `2017` = c(0.6849315,
0.369863, 0, 0.6849315, 0, 0.5479452, 0, 1, 1, 0), `2018` = c(1,
0.5479452, 1, 0, 0, 0, 0, 1, 1, 1), `2019` = c(0.0958904, 0,
0.9178082, 1, 0, 0.9589041, 0.5479452, 0.2876712, 1, 1), `2020` = c(0.4383562,
0, 0, 0.0958904, 0.2739726, 0.6849315, 0, 0, 0, 1)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -10L))
I am at a total loss as to where to start with this problem, so any assistance achieving this using data.table would be greatly appreciated. My only thought is to use lapply with an ifelse for the conditional component, but should I then be using rowSums or Reduce to shift the excess values across columns?
A translation of Martin Morgan's answer to data.table:
for (i in 2:(ncol(dt) - 1)) {
  x = dt[[i]]
  set(dt, j = i, value = pmin(x, 1))
  set(dt, j = i + 1, value = dt[[i + 1L]] + pmax(x - 1, 0))
}
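A quick check (a sketch; it assumes dt was first converted with setDT(dt), since set() modifies a data.table by reference):
all.equal(as.data.frame(dt), as.data.frame(dt_expected))
#> [1] TRUE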
Not particularly pretty or efficient, but as a starting point I used pmin() and pmax() to update each year (and the subsequent year) iteratively. The current year is the minimum of the current year and 1 (pmin(x, 1)); the subsequent year is the current subsequent year plus the excess of the previous year (pmax(x - 1, 0)).
update <- function(df) {
  result = df
  for (idx in 2:(ncol(df) - 1)) {
    x = result[[idx]]
    result[[idx]] = pmin(x, 1)
    result[[idx + 1]] = result[[idx + 1]] + pmax(x - 1, 0)
  }
  result
}
We have
> all.equal(update(dt), dt_expected)
[1] TRUE
I don't know how to translate this into efficient data.table syntax, but the function 'works' as is on a data.table, update(as.data.table(dt)).
Not sure if there is a more efficient way with built-in functions, but I simply wrote a recursive function that implements your described algorithm for a single row and then applied it over every row.
f <- function(l, rest = 0, out = list()) {
  if (length(l) == 0) return(unlist(out))
  if (l[[1]] + rest <= 1) {
    f(l[-1], rest = 0, out = append(out, list(l[[1]] + rest)))
  } else {
    f(l[-1], rest = l[[1]] + rest - 1, out = append(out, list(1)))
  }
}
dt[-1] <- apply(dt[-1], 1, f, simplify = FALSE) |>
  do.call(what = rbind)
dt
#> # A tibble: 10 × 7
#>       ID `2015` `2016` `2017` `2018` `2019` `2020`
#>    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
#>  1     1  0      0      0.685  1      0.0959 0.438
#>  2     2  0.822  1      0.370  0.548  0      0
#>  3     3  0      0      0      1      0.918  0
#>  4     4  0.137  0.274  0.685  0      1      0.0959
#>  5     5  0      0      0      0      0      0.274
#>  6     6  1      0.644  0.548  0      0.959  0.685
#>  7     7  0.274  0      0      0      0.548  0
#>  8     8  0.822  1      1      1      0.288  0
#>  9     9  1      1      1      1      1      0
#> 10    10  0      0      0      1      1      1
Created on 2022-03-25 by the reprex package (v2.0.1)
Here is my solution:
dt |>
  pivot_longer(cols = -ID, names_to = "year") |>
  arrange(ID, year) |>
  group_by(ID) |>
  mutate(x = {
    r <- accumulate(value,
                    ~ max(0, .y + .x - 1),
                    .init = 0)
    pmin(1, value + head(r, -1))
  }) |>
  select(x, year, ID) |>
  pivot_wider(names_from = "year", values_from = "x")
##> # A tibble: 10 × 7
##> # Groups:   ID [10]
##>       ID `2015` `2016` `2017` `2018` `2019` `2020`
##>    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
##>  1     1  0      0      0.685  1      0.0959 0.438
##>  2     2  0.822  1      0.370  0.548  0      0
##>  3     3  0      0      0      1      0.918  0
##>  4     4  0.137  0.274  0.685  0      1      0.0959
##>  5     5  0      0      0      0      0      0.274
##>  6     6  1      0.644  0.548  0      0.959  0.685
##>  7     7  0.274  0      0      0      0.548  0
##>  8     8  0.822  1      1      1      0.288  0
##>  9     9  1      1      1      1      1      0
##> 10    10  0      0      0      1      1      1
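To see how the accumulate() step works, trace one row by hand, e.g. ID 9 (values 5, 0, 0, 0, 0, 0). Each element of r is the excess carried into the following year:
library(purrr)
r <- accumulate(c(5, 0, 0, 0, 0, 0), ~ max(0, .y + .x - 1), .init = 0)
r
#> [1] 0 4 3 2 1 0 0
## the carry entering each year is head(r, -1); capping at 1 reproduces the row:
pmin(1, c(5, 0, 0, 0, 0, 0) + head(r, -1))
#> [1] 1 1 1 1 1 0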

Apply a function to each group

I have this dataset:
A <- c(10, 20, 10, 31, 51, 1, 60, 1, 2, 0, 12, 0, 20, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1)
B <- c(1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0)
C <- c(1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1)
SUB <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)
dat <- as.data.frame(cbind(SUB, B, A, C))
I wrote a function calculating the cor among A/B, B/C, C/A.
Z <- function(a, b, c) {
  cor1 = cor(a, b)
  cor2 = cor(b, c)
  cor3 = cor(c, a)
  x = c(cor1, cor2, cor3)
  return(x)
}
If I type
Z(dat$A, dat$B, dat$C)
I get the vector of results:
[1] 0.11294312 0.91417410 0.06457059
I need to condition my function to the SUB variable and get a matrix whose rows are the cor among A/B, B/C, C/A for each SUB.
For instance:
            A/B        B/C        C/A
SUB1 0.11294312 0.91417410 0.06457059
SUB2 0.10335312 0.96744677 0.16356059
Thank you,
Best regards
base R
You can split with by and then recombine.
do.call(rbind, by(dat, dat$SUB, function(x) Z(x$A, x$B, x$C)))
#         [,1]      [,2]        [,3]
# 1 -0.1534126 1.0000000 -0.15341258
# 2  0.1081781 0.8215838  0.04608456
The row names 1 and 2 are the SUB values themselves; if SUB is more "interesting" than counting numbers, it will be more apparent. Column names can be applied trivially.
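For example, applying the pair labels from the question (the names here are illustrative):
res <- do.call(rbind, by(dat, dat$SUB, function(x) Z(x$A, x$B, x$C)))
colnames(res) <- c("A/B", "B/C", "C/A")
res
#          A/B       B/C         C/A
# 1 -0.1534126 1.0000000 -0.15341258
# 2  0.1081781 0.8215838  0.04608456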
dplyr
library(dplyr)
dat %>%
  group_by(SUB) %>%
  summarize(as.data.frame(matrix(Z(A, B, C), nrow = 1)))
# # A tibble: 2 x 4
#     SUB     V1    V2      V3
#   <dbl>  <dbl> <dbl>   <dbl>
# 1     1 -0.153 1.00  -0.153
# 2     2  0.108 0.822  0.0461
Try split in combination with sapply
sapply(split(dat, dat$SUB), function(x) Z(x["A"], x["B"], x["C"]))
              1          2
[1,] -0.1534126 0.10817808
[2,]  1.0000000 0.82158384
[3,] -0.1534126 0.04608456
Actually there's no need for your function if you use the upper.tri of the correlation matrix. With recent R (4.1+), you can do this very concisely with the native pipe and lambda functions:
sapply(unique(dat$SUB), \(i) cor(dat[dat$SUB == i, -1]) |> {\(x) x[upper.tri(x)]}())
#            [,1]       [,2]
# [1,] -0.1534126 0.10817808
# [2,]  1.0000000 0.82158384
# [3,] -0.1534126 0.04608456
R.version.string
# [1] "R version 4.1.2 (2021-11-01)"
Data
dat <- structure(list(SUB = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2), B = c(1, 0, 0, 1, 1, 1, 0, 1,
1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), A = c(10, 20, 10,
31, 51, 1, 60, 1, 2, 0, 12, 0, 20, 1, 0, 0, 0, 0, 1, 0, 1, 1,
1), C = c(1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 1)), class = "data.frame", row.names = c(NA, -23L
))
This is a lengthy answer, but it should be pretty flexible.
library(tidyverse)
cor.by.group.combos <- function(.data, groups, vars) {
  by  <- gsub(x = rlang::quo_get_expr(enquo(groups)), pattern = "\\((.*)?\\)", replacement = "\\1")[-1]
  piv <- gsub(x = rlang::quo_get_expr(enquo(vars)),   pattern = "\\((.*)?\\)", replacement = "\\1")[-1]
  .data %>%
    group_by(!!!groups) %>%
    group_split() %>%
    map(.,
        ~ pivot_longer(., cols = all_of(piv), names_to = "name", values_to = "val") %>%
          nest(data = val) %>%
          full_join(., ., by = by) %>%
          filter(name.x != name.y) %>%
          mutate(test = paste(name.x, "vs", name.y, sep = "."),
                 grp = paste0(by, !!!groups),
                 cor = map2_dbl(data.x, data.y, ~ cor(unlist(.x), unlist(.y)))) %>%
          select(test, grp, cor)
    ) %>%
    bind_rows() %>%
    pivot_wider(names_from = test, values_from = cor)
}
cor.by.group.combos(dat, vars(SUB), vars(A, B, C))
#> # A tibble: 2 x 7
#>   grp   A.vs.B A.vs.C B.vs.A B.vs.C C.vs.A C.vs.B
#>   <chr>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
#> 1 SUB1  -0.153 -0.153 -0.153  1     -0.153  1
#> 2 SUB2   0.108 0.0461  0.108  0.822  0.0461 0.822
In essence, we split the data by group and then apply a correlation test to every combination of the selected variables. The way I set this up will give some duplicate tests (e.g., A.vs.B and B.vs.A). You could fix this by using combn instead of full_join (a sketch of that idea follows below), but I didn't take the time to work out the details. This function should work if you change the input variables, the grouping variables, etc. You can also apply multiple groups with this method.
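For reference, a minimal base R sketch of that combn idea, which computes each pair only once (same dat and variables as above):
pairs <- combn(c("A", "B", "C"), 2)  ## unique pairs: A-B, A-C, B-C
res <- t(sapply(split(dat, dat$SUB), function(d)
  apply(pairs, 2, function(p) cor(d[[p[1]]], d[[p[2]]]))))
colnames(res) <- apply(pairs, 2, paste, collapse = ".vs.")
res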

Generate a weighted matrix from an R data frame

I have a toy example of a dataframe:
df <- data.frame(matrix(, nrow = 5, ncol = 0))
df["A|A"] <- c(0.3, 0, 0, 100, 23)
df["A|B"] <- c(0, 0, 0.3, 10, 0.23)
df["A|C"] <- c(0.3, 0.1, 0, 100, 2)
df["B|B"] <- c(0, 0, 0.3, 0, 0.23)
df["B|C"] <- c(0.3, 0, 0, 21, 3)
df["C|A"] <- c(0.3, 0, 1, 100, 0)
df["C|B"] <- c(0, 0, 0.3, 10, 0.2)
df["C|C"] <- c(0.3, 0, 1, 1, 0.3)
I need to get a matrix with counts of non-zero values between A and A, A and B, ..., C and C.
I started by splitting the column names and assigning the parts to variables, but I don't know how to build a matrix with the right rows and columns in a loop:
counts <- colSums(df != 0)
df <- rbind(df, counts)
for (i in colnames(df)) {
  cluster1 <- strsplit(i, "\\|")[[1]][1]
  cluster2 <- strsplit(i, "\\|")[[1]][2]
}
A base R option
> table(read.table(text = rep(names(df), colSums(df > 0)), sep = "|"))
   V2
V1  A B C
  A 3 3 4
  B 0 2 3
  C 3 3 4
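The one-liner works because rep() repeats each column name once per non-zero entry in its column, read.table() with sep = "|" splits each "X|Y" name into two columns, and table() cross-tabulates the two halves. To illustrate the first step:
rep(names(df), colSums(df > 0))[1:6]
#> [1] "A|A" "A|A" "A|A" "A|B" "A|B" "A|B"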
or a longer version
table(
  data.frame(
    do.call(
      rbind,
      strsplit(
        as.character(subset(stack(df), values > 0)$ind),
        "\\|"
      )
    )
  )
)
gives
   X2
X1  A B C
  A 3 3 4
  B 0 2 3
  C 3 3 4
Reshape the data into 'long' format with pivot_longer, then separate the 'name' column into two, and reshape back to 'wide' with pivot_wider, specifying the values_fn as a lambda function to get the count of non-zero values
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = everything()) %>%
separate(name, into = c('name1', 'name2')) %>%
pivot_wider(names_from = name2, values_from = value,
values_fn = list(value = ~ sum(. > 0)), values_fill = 0)
Output:
# A tibble: 3 x 4
  name1     A     B     C
  <chr> <int> <int> <int>
1 A         3     3     4
2 B         0     2     3
3 C         3     3     4

Error using rms package to predict survival probability in R

I have a dataset of 5 variables: CANT_ANU, RETIRE, SEX, PROF and AGE_INI. Like this:
library(data.table)
library(rms)
dt <- structure(list(CANT_ANU = c(32.1671232876712, 12.1671232876712,
45, 28.2465753424658, 33, 3, 34.2493150684932, 31.3287671232877,
40.413698630137, 31, 19.0849315068493, 31.0846994535519, 20.586301369863,
34, 36.0849315068493, 32.9150684931507, 24.0849315068493, 3.24657534246575,
17.0792349726776, 15.0849315068493, 25.7486338797814, 19, 31.3287671232877,
17.1616438356164, 31.0849315068493), RETIRE = c(1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1),
SEX = c(1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1,
1, 1, 1, 1, 1, 0, 1, 0, 1), PROF = c(0, 1, 0, 0, 0,
0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
0), AGE_INI = c(49.2383561643836, 64.0575342465753, 32.9945205479452,
44.6767123287671, 40.4383561643836, 68.7890410958904, 38.2459016393443,
38.7479452054795, 30.5561643835616, 41.0684931506849, 51.1260273972603,
41.6219178082192, 54.9479452054794, 36.0846994535519, 32.0109589041096,
36.1698630136986, 44.6794520547945, 66.4630136986301, 54.8493150684931,
54.4246575342466, 50.8191780821918, 48.6657534246575, 37.0493150684932,
53.2712328767123, 37.6684931506849)), row.names = c(NA, -25L
), class = c("data.table", "data.frame"))
I want to get survival probabilities for each subject in the entire dataset from an estimated exponential model, so I tried this:
mod.par.exp <- psm(formula = Surv(CANT_ANU, RETIRE) ~ SEX + PROF + AGE_INI, data = dt, dist = "exponential")
probs.exp.rms <- survest(mod.par.exp, dt, times = seq(10, 50, 10), conf.int = .9)
But I get the following error:
Error in x %*% fit$var[-last, last, drop = FALSE] :
  non-conformable arguments
What am I doing wrong? I previously tried the examples given for survest.psm in the package manual and they worked.
The error is raised because x has 4 columns while fit$var[-last, last, drop = FALSE] is a 3x1 column vector, so the matrix product is non-conformable.
Here is the relevant part of the code:
survest.psm
Attached is a reprex that shows why the error occurs.
With conf.int = 0 or conf.int = FALSE it works.
I hope the answer helps you narrow down the problem. It might also be useful to contact the package author or post an issue on the GitHub site.
library(data.table)
library(rms)
#> Loading required package: Hmisc
#> Loading required package: lattice
#> Loading required package: survival
#> Loading required package: Formula
#> Loading required package: ggplot2
#>
#> Attaching package: 'Hmisc'
#> The following objects are masked from 'package:base':
#>
#>     format.pval, units
#> Loading required package: SparseM
#>
#> Attaching package: 'SparseM'
#> The following object is masked from 'package:base':
#>
#>     backsolve
dt <- structure(list(CANT_ANU = c(32.1671232876712, 12.1671232876712,
45, 28.2465753424658, 33, 3, 34.2493150684932, 31.3287671232877,
40.413698630137, 31, 19.0849315068493, 31.0846994535519, 20.586301369863,
34, 36.0849315068493, 32.9150684931507, 24.0849315068493, 3.24657534246575,
17.0792349726776, 15.0849315068493, 25.7486338797814, 19, 31.3287671232877,
17.1616438356164, 31.0849315068493), RETIRE = c(1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1),
SEX = c(1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1,
1, 1, 1, 1, 1, 0, 1, 0, 1), PROF = c(0, 1, 0, 0, 0,
0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
0), AGE_INI = c(49.2383561643836, 64.0575342465753, 32.9945205479452,
44.6767123287671, 40.4383561643836, 68.7890410958904, 38.2459016393443,
38.7479452054795, 30.5561643835616, 41.0684931506849, 51.1260273972603,
41.6219178082192, 54.9479452054794, 36.0846994535519, 32.0109589041096,
36.1698630136986, 44.6794520547945, 66.4630136986301, 54.8493150684931,
54.4246575342466, 50.8191780821918, 48.6657534246575, 37.0493150684932,
53.2712328767123, 37.6684931506849)), row.names = c(NA, -25L
), class = c("data.table", "data.frame"))
mod.par.exp <- psm(formula = Surv(CANT_ANU, RETIRE) ~ SEX + PROF + AGE_INI, data = dt, dist = "exponential")
# Throws the error:
probs.exp.rms <- survest(mod.par.exp, dt, times = seq(10, 50, 10), conf.int = .9)
#> Error in x %*% fit$var[-last, last, drop = FALSE]: non-conformable arguments
# reason for the error:
## setup of nvar, line 38 of the linked GitHub source:
## nvar <- length(fit$coef) - num.intercepts(fit)
## in your case nvar is:
nvar_test <- length(mod.par.exp$coef) - num.intercepts(mod.par.exp); nvar_test
#> [1] 3
## setup of x, since linear.predictors is missing (first if) ... and some more conditions:
## x is defined in line 52 of the linked GitHub source:
## if (missing(x)) x <- cbind(Intercept = 1, predict(fit, newdata, type = "x"))
## in your case x is:
x_test <- cbind(Intercept = 1, predict(mod.par.exp, dt, type = "x")); x_test
#> Intercept SEX PROF AGE_INI
#> 1 1 1 0 49.23836
#> 2 1 1 1 64.05753
#> 3 1 1 0 32.99452
#> 4 1 1 0 44.67671
#> 5 1 0 0 40.43836
#> 6 1 0 0 68.78904
#> 7 1 0 1 38.24590
#> 8 1 0 1 38.74795
#> 9 1 1 0 30.55616
#> 10 1 1 0 41.06849
#> 11 1 0 0 51.12603
#> 12 1 1 0 41.62192
#> 13 1 1 0 54.94795
#> 14 1 0 0 36.08470
#> 15 1 1 0 32.01096
#> 16 1 1 0 36.16986
#> 17 1 1 0 44.67945
#> 18 1 1 0 66.46301
#> 19 1 1 0 54.84932
#> 20 1 1 0 54.42466
#> 21 1 1 1 50.81918
#> 22 1 0 0 48.66575
#> 23 1 1 0 37.04932
#> 24 1 0 1 53.27123
#> 25 1 1 0 37.66849
## next condition, lines 55 to 62:
## g1 in line 56 can be calculated properly
## last is calculated in lines 57-60:
# last <- {
#   nscale <- length(fit$icoef) - 1
#   ncol(fit$var) - (1:nscale) + 1
# }
## in your case last is:
last_test <- {
  nscale <- length(mod.par.exp$icoef) - 1
  ncol(mod.par.exp$var) - (1:nscale) + 1
}
last_test
#> [1] 4
## computation of g2 in line 62 throws the error:
## g2 <- drop(x %*% fit$var[-last, last, drop = FALSE])
column_vec <- mod.par.exp$var[-last_test, last_test, drop = FALSE]; column_vec
#>               [,1]
#> [1,] -0.0222297822
#> [2,] -0.0003043264
#> [3,] -0.0022725899
## error in the matrix multiplication: x_test has 4 columns but column_vec has
## only 3 rows, so the product is non-conformable!
x_test %*% column_vec
#> Error in x_test %*% column_vec: non-conformable arguments
# setting conf.int to FALSE or 0 works:
probs.exp.rms_noconf <- survest(mod.par.exp, dt, times = seq(10, 50, 10), conf.int = FALSE)
probs.exp.rms_noconf
#> 10 20 30 40 50
#> 1 0.6109711 0.37328575 0.228066821 0.1393422481 8.513409e-02
#> 2 0.5169610 0.26724870 0.138157161 0.0714218675 3.692232e-02
#> 3 0.8116670 0.65880336 0.534728967 0.4340218715 3.522812e-01
#> 4 0.6790361 0.46109007 0.313096820 0.2126040542 1.443658e-01
#> 5 0.6610245 0.43695340 0.288836903 0.1909282713 1.262083e-01
#> 6 0.1565442 0.02450608 0.003836283 0.0006005478 9.401225e-05
#> 7 0.7981757 0.63708443 0.508505305 0.4058765720 3.239608e-01
#> 8 0.7933484 0.62940168 0.499334815 0.3961464753 3.142822e-01
#> 9 0.8324214 0.69292545 0.576806000 0.4801456797 3.996836e-01
#> 10 0.7262735 0.52747323 0.383089845 0.2782280123 2.020696e-01
#> 11 0.4825998 0.23290259 0.112398750 0.0542436172 2.617796e-02
#> 12 0.7194063 0.51754539 0.372325405 0.2678532341 1.926953e-01
#> 13 0.5135481 0.26373161 0.135438854 0.0695543604 3.571951e-02
#> 14 0.7197743 0.51807509 0.372897156 0.2684018024 1.931887e-01
#> 15 0.8202983 0.67288936 0.551970025 0.4527800935 3.714148e-01
#> 16 0.7812767 0.61039332 0.476886090 0.3725800018 2.910881e-01
#> 17 0.6789980 0.46103834 0.313044133 0.2125563541 1.443253e-01
#> 18 0.2936587 0.08623542 0.025323782 0.0074365485 2.183807e-03
#> 19 0.5153319 0.26556694 0.136855109 0.0705257999 3.634419e-02
#> 20 0.5229761 0.27350397 0.143036034 0.0748044237 3.912092e-02
#> 21 0.7206720 0.51936820 0.374294139 0.2697433227 1.943965e-01
#> 22 0.5274674 0.27822187 0.146752970 0.0774074094 4.082989e-02
#> 23 0.7721483 0.59621304 0.460364901 0.3554699878 2.744756e-01
#> 24 0.6071013 0.36857201 0.223760552 0.1358453261 8.247188e-02
#> 25 0.7655297 0.58603572 0.448627746 0.3434378625 2.629119e-01
Created on 2020-07-06 by the reprex package (v0.3.0)
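As a cross-check while confidence intervals are unavailable: for an exponential AFT model, S(t) = exp(-t / exp(lp)), where lp is the linear predictor, so the point estimates above can be reproduced by hand (a sketch, not part of the original answer):
lp <- predict(mod.par.exp, dt, type = "lp")
S <- sapply(seq(10, 50, 10), function(t) exp(-t / exp(lp)))
## S should match probs.exp.rms_noconf, e.g. S[1, 1] is about 0.6109711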
