generate multiple data for each row with tidyverse - r

I have a list of variable named theta and want to draw n number of random variable from using each theta.
S = 5
n = 5
test = tibble(
s = 1:S,
theta = rgamma(S, shape = 10*s, rate = 50)
) %>%
mutate(data = rexp(n, theta))
ideally, I want my result be something like this:
S theta data
1 some value [a list with n number]
...
then expand it into a tibble:
S theta d1 d2 d3 .. dn
...
Hope this is clear.
Thanks.

If we need a list then, we can use map to loop through the each element of 'theta' and get the rexp in a list
library(tidyverse)
test1 <- test %>%
pull(theta) %>%
map(~rexp(n, .)) %>%
mutate(test, data = .)
str(test1$data)
#List of 5
# $ : num [1:5] 5.88 7.94 1.64 3.3 11.25
# $ : num [1:5] 4.5942 0.5424 1.7479 0.0469 0.9573
# $ : num [1:5] 1.192 2.447 0.239 1.497 2.359
# $ : num [1:5] 1.2323 0.0996 1.5778 0.1278 0.6982
# $ : num [1:5] 0.15 0.733 0.19 3.548 2.08
The list column can be unnested
test1 %>%
unnest(data)
data
S <- 5
n <- 5
test <- tibble(
s = 1:S,
theta = rgamma(S, shape = 10*s, rate = 50)
)

Related

How can I write a dataframe to a csv after running scale() in R?

I'm scaling one column in a dataset with the intention of fitting a linear model. However, when I try to write the dataframe (with scaled column) to a csv, it doesn't work because the scaled column became complex with center and scale attributes.
Can someone please indicate how to convert the scaled column to something that can write to a csv? (and maybe why scale() needs to do it this way.)
# make a data frame
testDF <- data.frame(x1 = c(1,2,2,3,2,4,4,5,6,15,36,42,11,12,23,24,25,66,77,18,9),
x2 = c(1,4,5,9,4,15,17,25,35,200,1297,1764,120,150,500,500,640,4200,6000,365,78))
# scale the x1 attribute
testDF <- testDF %>%
mutate(x1_scaled = scale(x1, center = TRUE, scale = TRUE))
# write to csv doesn't work
write_csv(as.matrix(testDF), "testDF.csv")
# but plotting and lm do work
ggplot(testDF, aes(x1_scaled)) +
geom_histogram(aes(y = ..density..),binwidth = 1)
Lm_scaled <- lm(x2 ~ x1_scaled, data = testDF)
plot(Lm_scaled)
scale returns a matrix output. We could extract the column or use as.vector to remove the dim attribute
testDF <- testDF %>%
mutate(x1_scaled = as.vector(scale(x1, center = TRUE, scale = TRUE)))
Check the structure of the output without as.vector and with as.vector
> testDF %>%
+ mutate(x1_scaled = scale(x1, center = TRUE, scale = TRUE)) %>% str
'data.frame': 21 obs. of 3 variables:
$ x1 : num 1 2 2 3 2 4 4 5 6 15 ...
$ x2 : num 1 4 5 9 4 15 17 25 35 200 ...
$ x1_scaled: num [1:21, 1] -0.824 -0.776 -0.776 -0.729 -0.776 ...
..- attr(*, "scaled:center")= num 18.4
..- attr(*, "scaled:scale")= num 21.2
> testDF %>%
+ mutate(x1_scaled = as.vector(scale(x1, center = TRUE, scale = TRUE))) %>% str
'data.frame': 21 obs. of 3 variables:
$ x1 : num 1 2 2 3 2 4 4 5 6 15 ...
$ x2 : num 1 4 5 9 4 15 17 25 35 200 ...
$ x1_scaled: num -0.824 -0.776 -0.776 -0.729 -0.776 ...
You can simply convert the scale column to numeric in base R and write out the dataframe:
testDF$x1_scaled <- as.numeric(testDF$x1_scaled)
write_csv(testDF, "testDF.csv")

how to partition the data with specific row size

Say I have the following matrix:
X = matrix(rnorm(4000), nrow=400, ncol=10)
size = c(80,80,79,80,81)
I want to partition the matrix row-wise according to the above size in a concise way. To illustrate,
x1 = X[1:80,]
x2 = X[81:160,]
.
.
x5 = X[320:400,]
I usually have different sizes and different matrices, so I won't be able to do this manually every time. Your help is very much appreciated.
Try the code below
e <- cumsum(size)
b <- c(1, head(e + 1, -1))
out <- Map(function(rb, re) X[rb:re, ], b, e)
and you will see
> str(out)
List of 5
$ : num [1:80, 1:10] 0.253 -0.368 0.804 -0.603 -0.119 ...
$ : num [1:80, 1:10] 0.467 -0.743 -0.401 1.48 0.853 ...
$ : num [1:79, 1:10] -1.123 -0.873 -1.039 -0.247 -0.774 ...
$ : num [1:80, 1:10] -1.409 -0.683 -0.514 0.485 -0.347 ...
$ : num [1:81, 1:10] 0.58 0.529 -0.803 0.49 -0.847 ...
Furthermore, if you want to create variables, try
list2env(setNames(out,paste0("x",seq_along(out))),envir = .GlobalEnv)
You can get start and end points using cumsum and use Map to subset them and create list of matrices.
X = matrix(rnorm(4000), nrow=400, ncol=10)
size = c(80,80,79,80,81)
val <- cumsum(size)
result <- Map(function(x, y) X[x:y, ], c(1, val[-length(val)] + 1), val)
where c(1, val[-length(val)] + 1) creates the starting row numbers and val are the ending ones.
c(1, val[-length(val)] + 1)
#[1] 1 81 161 240 320
val
#[1] 80 160 239 319 400
Using map2
library(purrr)
size1 <- cumsum(size)
map2(size1, c(1, size1[-length(size1)] + 1), ~ X[.x:.y,])

imap_dfr a function to 3 lists with nested maps?

I have this code which works for list [[1]] and list of list [[200]]:
SHAP_Prep_data <- shap.prep(xgb_model = xgb.mod[[1]][[200]],
shap_contrib = shap_values_results[[1]][[200]]$shap_score %>% select(-BIAS0),
X_train = as.matrix(TrainTestData[[1]]$XTrain[[200]])
#top_n = 4
)
I can simply replace out the [[200]] for [[300]], [[400]] etc. and obtain a new data structure (the shap.prep function comes from the shapforxgboost package.
xgb.mod[[1]][[200]] is a single xgboost model
shap_values_results[[1]][[200]]$shap_score %>% select(-BIAS0) is a data frame with the following structure.
> str(shap_values_results[[1]][[200]]$shap_score %>% select(-BIAS0))
'data.frame': 2190 obs. of 29 variables:
$ holiday : num -0.276 -0.347 -0.284 -0.356 -0.197 ...
$ temp : num 0.35 0.25 0.144 0.227 0.16 ...
$ wind : num -0.116 -0.187 -0.25 -0.265 -0.135 ...
$ humidity : num -0.021 0.0125 -0.037 0.016 -0.0196 ...
$ barometer : num -0.0191742 -0.0000462 0.0444956 -0.0148842 -0.0551703 ...
$ weekday : num -0.00421 -0.00937 0.0012 -0.01194 -0.00931 ...
$ weekend : num 0 0 0 0 0 0 0 0 0 0 ...
$ workday_on_holiday : num -0.00949 -0.00949 -0.00885 -0.00949 -0.00885 ...
$ weekend_on_holiday : num 0 0 0 0 0 0 0 0 0 0 ...
$ protocol_active : num 0 0 0 0 0 0 0 0 0 0 ...
$ text_fog : num 0.00714 0.00714 0.00783 0.00783 0.00772 ...
$ text_light_rain : num -0.000364 -0.000364 -0.000364 -0.000364 -0.000364 ...
$ text_mostly_cloudy : num -0.0013 -0.0013 -0.0013 -0.0013 -0.0013 ...
$ text_passing_clouds : num 0.00135 0.00152 0.00363 0.00152 0.00345 ...
$ text_rain : num -0.0000682 -0.0000682 -0.0000682 -0.0000682 -0.0000682 ...
$ text_scattered_clouds: num -0.0941 -0.0832 -0.1497 -0.0813 -0.0965 ...
$ text_sunny : num 0.000635 0.007435 0.009286 0.007435 0.007009 ...
$ month_1 : num 0.045 0.0503 0.062 0.062 0.0484 ...
$ month_2 : num 0.0602 0.0529 0.0526 0.0529 0.1008 ...
$ month_3 : num 0.0467 0.0348 0.0333 0.0348 0.0467 ...
$ month_4 : num -0.03439 -0.03439 -0.00777 -0.03439 -0.00164 ...
$ month_5 : num -0.02191 -0.02191 -0.00836 -0.02026 -0.01533 ...
$ month_6 : num -0.05498 -0.00637 -0.04769 -0.05101 -0.05155 ...
$ month_7 : num -0.1302 -0.1126 -0.0878 -0.0963 -0.1535 ...
$ month_8 : num -0.0418 -0.051 -0.0727 -0.0437 -0.0957 ...
$ month_9 : num 0.164 0.185 0.141 0.193 0.122 ...
$ month_10 : num 0.206 0.251 0.243 0.251 0.211 ...
$ month_11 : num 0.0929 0.0744 0.0302 0.0568 0.0961 ...
$ month_12 : num 0.059 0.0608 0.0806 0.0608 0.0788 ...
Finally as.matrix(TrainTestData[[1]]$XTrain[[200]]) is a dgcMatrix which I convert to a simple matrix using as.matrix() which has structure:
> str(as.matrix(TrainTestData[[1]]$XTrain[[200]]))
num [1:2190, 1:29] 0 0 0 0 0 0 0 0 0 0 ...
- attr(*, "dimnames")=List of 2
..$ : NULL
..$ : chr [1:29] "holiday" "temp" "wind" "humidity" ...
I have 3 pieces of data I would like to apply the shap.prep function to.
The desired output would be to have a list (or list of lists) where the shap.prep function has been applied. The function requires 3 inputs shap.prep(xgb_model = NULL, shap_contrib = NULL, X_train,
top_n = NULL) which is what I am providing.
How can I use imap correctly to pass all three objects to the shap_prep function and obtain lists as my output?
It's difficult for me to give some dput() data since the I am not sure if its possible to dput() a trained XGBoost model.
EDIT:
I am adding the closest thing I can get to a reproducible example.
data(iris)
df <- split(iris, iris$Species) # I just want to create some lists here
library(xgboost)
library(SHAPforxgboost)
dtrainFunction <- function(i){
dt = xgb.DMatrix(data = data.matrix(i[, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")]), label = i$Species)
}
dtrain <- map(df, dtrainFunction) # I just apply the dtrainFunction which just puts each list into an xgb.DMatrix
xgb.mod <- map(dtrain, ~xgboost(data = .x, nround = 20)) # Apply the xgboost model to each list
# could not get this part of the code to work but it's not important. I manually put the results into a list below.
# shap_values_function <- function(j){
# map2(
# .x = xgb.mod[[j]],
# .y = df[[j]][, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")],
# ~shap.values(xgb_model = .x, X_train = as.matrix(.y))
# )
# }
#
# shap_values_results <- lapply(seq(1:3), shap_values_function)
# Here I manually put the results into a list which are lists of shap.values
shap_values_results <- list(
shap.values(xgb_model = xgb.mod[[1]], X_train = as.matrix(df[[1]][, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")])),
shap.values(xgb_model = xgb.mod[[2]], X_train = as.matrix(df[[2]][, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")])),
shap.values(xgb_model = xgb.mod[[3]], X_train = as.matrix(df[[3]][, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")]))
)
# Something is wrong here which is something to do with shap_contrib and BIAS0
SHAP_Prep_data <- shap.prep(xgb_model = xgb.mod[[1]],
shap_contrib = shap_values_results[[1]]$shap_score[, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")],
X_train = as.matrix(df[[1]])
#top_n = 4
)
shap.plot.summary(data_long = SHAP_Prep_data)
My actual code for the SHAP_Prep_data data is slightly different to the one above since I have list of lists.
EDIT 2:
I have tried the following which throws an error:
SHAP_Prep_data <- pmap(
list(
.x = xgb.model[[1]],
.y = shap_values_results[[1]],
.z = TrainTestData[[1]]$XTrain
), ~shap.prep(
xgb_model = .x,
shap_contrib = .y,
X_train = as.matrix(.z))
)
Error in as.matrix(.z) : object '.z' not found
EDIT 3:
When I apply the function on the iris data example:
SHAP_Prep_data <- pmap(
list(
.x = xgb.mod,
.y = shap_values_results,
.z = dtrain
), ~shap.prep(
xgb_model = .x,
shap_contrib = .y,
X_train = as.matrix(.z))
)
Error in as.matrix(.z) : object '.z' not found
EDIT 4:
I want to be able to access the $shap_score data which is created from the shap.values function used earlier (and also remove the column BIAS0 in the data from the following line).
shap_contrib = shap_values_results[[1]][[1300]]$shap_score %>% select(-BIAS0)
So would another map be needed here? or should I extract the shap_score data earlier in the function and remove the BIAS0 column there so that I can just call NEWDATA_shap_score[[1]][[1300]]?
The issue is that
str(as.matrix(df[[1]]))
#chr [1:50, 1:5] "5.1" "4.9" "4.7" "4.6" "5.0" "5.4" "4.6" "5.0" "4.4" "4.9" "5.4" "4.8" "4.8" "4.3" "5.8" "5.7" "5.4" "5.1" "5.7" "5.1" ...
# - attr(*, "dimnames")=List of 2
# ..$ : chr [1:50] "1" "2" "3" "4" ...
# ..$ : chr [1:5] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" ...
returns a character matrix as the last column is a character column. Remove the last column and then do the conversion
out <- shap.prep(xgb_model = xgb.mod[[1]],
shap_contrib = shap_values_results[[1]]$shap_score[,
c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")],
X_train = as.matrix(df[[1]][-5]) ###
#top_n = 4
)
Regarding using this with pmap
out2 <- pmap(list( xgb.model[[1]],
shap_values_results[[1]],
TrainTestData[[1]]$XTrain),
~shap.prep(
xgb_model = ..1,
shap_contrib = ..2$shap_score %>% select(-BIAS0),
X_train = as.matrix(..3)))
If we also want to apply this on the list of lists
pmap(list(xgb.model,
shap_values_results,
TrainTestData), ~
pmap(list(..1, ..2, ..3$xTrain), ~
shap.prep(xgb_model = ..1,
shap_contrib = ..2$shap_score %>% select(-BIAS0),
X_train = as.matrix(..3))))
It's hard to say without a reproducible example, but it sounds like you want pmap rather than imap
a <- list(letters[1:3])
b <- list(letters[4:6])
c <- list(letters[7:9])
purrr::pmap(list(a,b,c), function (x,y,z) paste(x, y, z))
#> [[1]]
#> [1] "a d g" "b e h" "c f i"
Created on 2020-01-08 by the reprex package (v0.3.0)

How to split list at every 10th item in R?

I have a list of 100 items.
I want to split it after each 10th item in Code 1.
Code 2 is about a list of two former lists and splitting it to 20 lists of 10 items each.
Code 1
Expected output: ten lists of 10 items.
A <- 100
a <- rnorm(A) # [1:100]
n <- 10
str(a)
# Not resulting in equal size of chunks with vectors so reject
# http://stackoverflow.com/a/3321659/54964
#d <- split(d, ceiling(seq_along(d)/(length(d)/n)))
# Works for vectors but not with lists
# http://stackoverflow.com/a/16275428/54964
#d <- function(d,n) split(d, cut(seq_along(d), n, labels = FALSE))
str(d)
Test code 2
Input: a list of two lists
aa <- list(a, rnorm(a))
Expected output: 20 lists of 10 item size
Testing Loki's answer
segmentLists <- function(A, segmentSize) {
res <- lapply(A, function(x) split(unlist(x), cut(seq_along(unlist(x)), segmentSize, labels = F)))
#print(res)
res <- unlist(res, recursive = F)
}
segmentLists(aa, 10)
Output: loop going on, never stopping
OS: Debian 8.5
R: 3.3.1
you can use lapply.
aa <- list(a, rnorm(a))
aa
n <- 10
x <- lapply(aa, function(x) split(unlist(x), cut(seq_along(unlist(x)), n, labels = F)))
y <- unlist(x, recursive = F)
str(y)
# List of 20
# $ 1 : num [1:10] 1.0895 -0.0477 0.225 -0.6308 -0.1558 ...
# $ 2 : num [1:10] -0.469 -0.381 0.709 -0.798 1.183 ...
# $ 3 : num [1:10] 0.757 -1.128 -1.394 -0.712 0.494 ...
# $ 4 : num [1:10] 1.135 0.324 0.75 -0.83 0.794 ...
# $ 5 : num [1:10] -0.786 -0.068 -0.179 0.354 -0.597 ...
# $ 6 : num [1:10] -0.115 0.164 -0.365 -1.827 -2.036 ...
...
length(y)
# [1] 20
to remove the names of the list elements in y ($ 1, $ 2 etc.) you can use unname()
str(unname(y))
# List of 20
# $ : num [1:10] 1.0895 -0.0477 0.225 -0.6308 -0.1558 ...
# $ : num [1:10] -0.469 -0.381 0.709 -0.798 1.183 ...
# $ : num [1:10] 0.757 -1.128 -1.394 -0.712 0.494 ...
# $ : num [1:10] 1.135 0.324 0.75 -0.83 0.794 ...
# $ : num [1:10] -0.786 -0.068 -0.179 0.354 -0.597 ...
...
Using a function, you have to return res at the end of the function.
segmentLists <- function(A, segmentSize)
{
res <- lapply(A, function(x) split(unlist(x), cut(seq_along(unlist(x)), segmentSize, labels = F)))
#print(res)
res <- unlist(res, recursive = F)
res <- unname(res)
res
}

How can I assign every 5th sample in my dataset as a Testing dataset and the rest of the samples as the training dataset?

I have a dataset with 133 samples(133 rows and 2 coloumns(attributes)). I need to assign every 5th sample starting from 1(1st, 6th, 11th, 16th.... etc) as testing dataset and the rest as training dataset. How can I do it?
Using the colon-operator in Maltab:
testset = dataset(1:5:end,:)
trainingset = setdiff(dataset, testset, 'rows')
You can also do this without setdiff:
testset = dataset(1:5:end,:)
trainingset = dataset;
trainingset(1:5:end, :) = [];
In r you can just use seq to generate the sequence of rows you want to keep. Then, you can either take a basic [ based subset, or you can use %in% to split the dataset into two parts.
Here's an example of the latter, with some sample data:
set.seed(1)
mydf <- data.frame(matrix(rnorm(266), ncol = 2))
mySamples <- setNames(split(mydf, rownames(mydf) %in% seq(1, 133, by = 5)),
c("training", "testing"))
str(mySamples)
# List of 2
# $ training:'data.frame': 106 obs. of 2 variables:
# ..$ X1: num [1:106] 0.184 -0.836 1.595 0.33 0.487 ...
# ..$ X2: num [1:106] 0.3066 -1.5364 -0.301 -0.5283 -0.0569 ...
# $ testing :'data.frame': 27 obs. of 2 variables:
# ..$ X1: num [1:27] -0.6265 -0.8205 1.5118 -0.0449 0.919 ...
# ..$ X2: num [1:27] -1.518 -0.652 -0.464 -1.286 -0.929 ...
Here's the alternative slightly more manual approach using [:
testingRows <- seq(1, 133, by = 5)
trainingRows <- setdiff(sequence(nrow(mydf)), testingRows)
testing <- mydf[testingRows, ]
training <- mydf[trainingRows, ]

Resources