Calculating the mean of 3 columns in data frame

Calculating the mean of 3 columns in data frame - r

I have 3 data frames and they are just replicates. So I want to bind them and calculate the mean of each fraction.
Three data frames:
Nr.1
> dput(head(tbl_gel1))
structure(list(Name = c("yal003w", "yal005c", "yal012w", "yal016w",
"yal035w", "yal038w"), `1_1` = c(1.08346521189121, NA, NA, NA,
NA, NA), `1_10` = c(0.267721905361376, 1.43303883148383, 1.61684304894131,
NA, NA, NA), `1_11` = c(0.189487668138674, 0.75522363065885,
1, NA, NA, NA), `1_12` = c(NA, 1.01340492119247, NA, NA, NA,
NA), `1_13` = c(0.374782308020683, 0.945489433731933, NA, NA,
NA, 0.0317297633029047), `1_14` = c(0.437488212634424, 1.18763709680314,
NA, NA, NA, 0.0278039649538794), `1_15` = c(1, 0.963283876302253,
NA, NA, NA, 0.101985769564935), `1_16` = c(0.933864874212228,
0.534233379286527, NA, NA, NA, 0.216767470594226), `1_17` = c(1,
0.665519263271478, NA, NA, 1, 1), `1_18` = c(0.666036574750145,
0.570465125348879, NA, NA, NA, 1.42894349812116), `1_19` = c(0.514337131747938,
0.23204076838128, NA, NA, 1, 1.2521214021452), `1_2` = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), `1_20` = c(NA,
NA, NA, NA, NA, 1.40803677399372), `1_21` = c(1.09990599806138,
NA, NA, NA, NA, 1.04631699593704), `1_22` = c(1.26442418472118,
NA, NA, NA, NA, 0.928872017485782), `1_23` = c(1.11596921281805,
NA, NA, NA, 1, 0.34698227364696), `1_24` = c(0.754496014447251,
NA, NA, NA, 1, 0.222234793614252), `1_3` = c(6.29254185223621,
NA, NA, 0.693642968439352, NA, NA), `1_4` = c(1.36347593974479,
NA, NA, 1, NA, NA), `1_5` = c(0.765885344543765, NA, NA, 1, NA,
NA), `1_6` = c(0.238118001668604, 0.679584207611477, NA, NA,
NA, NA), `1_7` = c(0.847897771442355, 0.277348019879946, NA,
NA, NA, NA), `1_8` = c(0.356154192700505, 1, 0.409523853881517,
NA, NA, NA), `1_9` = c(0.180109142324181, 1, 0.578310191227172,
NA, NA, 0.093113736249161)), .Names = c("Name", "1_1", "1_10",
"1_11", "1_12", "1_13", "1_14", "1_15", "1_16", "1_17", "1_18",
"1_19", "1_2", "1_20", "1_21", "1_22", "1_23", "1_24", "1_3",
"1_4", "1_5", "1_6", "1_7", "1_8", "1_9"), row.names = c(NA,
6L), class = "data.frame")
Nr. 2
> dput(head(tbl_gel2))
structure(list(Name = c("yal003w", "yal005c", "yal012w", "yal016w",
"yal035w", "yal038w"), `2_1` = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), `2_2` = c(1.0548947840373, NA,
NA, NA, NA, NA), `2_3` = c(1.61794716486303, 0.346821796129205,
NA, NA, NA, NA), `2_4` = c(1, NA, NA, 0.378254379051086, NA,
NA), `2_5` = c(0.670710809411423, NA, NA, 1, NA, NA), `2_6` = c(0.313872585645673,
NA, NA, NA, NA, NA), `2_7` = c(0.299293639466945, 0.13920907824675,
NA, NA, NA, NA), `2_8` = c(0.311431376422469, 0.511742245543671,
0.342807141055383, NA, NA, NA), `2_9` = c(0.243672215177189,
1, 0.689138745271004, NA, NA, 0.0540861571772987), `2_10` = c(0.154732102234279,
1.08973258347909, 1, NA, NA, NA), `2_11` = c(0.149365726324845,
1.1210733533474, 1.0427649268992, NA, NA, 0.0955468461925663),
`2_12` = c(0.153741630869067, 2.96276072446013, 1, NA, NA,
NA), `2_13` = c(0.629371115599316, 0.952868912207058, 0.0771105403237483,
NA, NA, 0.0885212695236819), `2_14` = c(0.907644486740723,
1.43000783337778, NA, NA, NA, 0.138102409899801), `2_15` = c(1.09683345304359,
0.423641943213571, NA, NA, NA, 0.255699738225622), `2_16` = c(0.913095779338154,
0.510977400533081, NA, NA, 0.520556617688936, 0.284898552722227
), `2_17` = c(0.935941553863477, 0.388225948821767, NA, NA,
1.14984991998928, 1), `2_18` = c(2.21746156904543, 0.642743615867438,
NA, NA, NA, 2.22716071647178), `2_19` = c(0.500618035526774,
0.282924681750454, NA, NA, NA, 1), `2_20` = c(0.701627311828743,
0.254001731153973, NA, NA, 1, 1.15996914621286), `2_21` = c(1.97359874904275,
NA, NA, NA, 1.67526802494991, 1.38709456754353), `2_22` = c(2.09198896289293,
NA, NA, NA, NA, 0.921672834103247), `2_23` = c(1.18791465369551,
NA, NA, NA, NA, 0.576309066193914), `2_24` = c(0.473199477125101,
0.176144702328764, NA, NA, 1, 0.130236848112641)), .Names = c("Name",
"2_1", "2_2", "2_3", "2_4", "2_5", "2_6", "2_7", "2_8", "2_9",
"2_10", "2_11", "2_12", "2_13", "2_14", "2_15", "2_16", "2_17",
"2_18", "2_19", "2_20", "2_21", "2_22", "2_23", "2_24"), row.names = c(NA,
6L), class = "data.frame")
Nr.3
> dput(head(tbl_gel3))
structure(list(Name = c("yal003w", "yal005c", "yal012w", "yal016w",
"yal035w", "yal038w"), `3_1` = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), `3_2` = c(1, 1.4605309655311,
NA, NA, NA, NA), `3_3` = c(1.74480713727388, 0.42825619952525,
NA, NA, NA, NA), `3_4` = c(1, 0.431712121875013, NA, 0.395182020245312,
NA, NA), `3_5` = c(2.26247329056518, 0.644462177666441, NA, 1,
NA, NA), `3_6` = c(0.619783374266709, 0.472094874244026, NA,
NA, NA, NA), `3_7` = c(0.45731912574756, 0.176354321796083, NA,
NA, NA, NA), `3_8` = c(0.271829278733367, 0.517232771669986,
0.153774052052871, NA, NA, NA), `3_9` = c(0.141017619508583,
1.41279969394534, 0.651948154271122, NA, NA, NA), `3_10` = c(NA,
1.64435171100405, 0.998807430240956, NA, NA, NA), `3_11` = c(0.110046035477971,
1.33684444261939, 1.25595310581771, NA, NA, 0.0236163735479745
), `3_12` = c(NA, 0.982250906830292, 0.39283619985401, NA, NA,
0.0688303458902568), `3_13` = c(0.136798076436642, 0.55729642483448,
0.176525038283566, NA, NA, 0.0251189412372225), `3_14` = c(0.316623893146817,
1, NA, NA, NA, 0.0727823461722849), `3_15` = c(NA, 0.607991038574375,
NA, NA, NA, 0.133968257432001), `3_16` = c(0.362994392402489,
0.547183167896534, NA, NA, NA, 0.0777347708647245), `3_17` = c(1,
0.116561118715651, NA, NA, 0.710972173471528, 1), `3_18` = c(NA,
3.63330458071475, NA, NA, NA, 3.24019081192985), `3_19` = c(NA,
NA, NA, NA, NA, 2.46635222132474), `3_20` = c(0.452303676849426,
0.0896715384025126, NA, NA, 1, 1), `3_21` = c(1.50169299468485,
0.513442106966708, NA, NA, 1.45124841710635, 1.02529618467026
), `3_22` = c(0.565232592993276, 0.748536315065533, NA, NA, 2.9089322117881,
0.782555457293307), `3_23` = c(1.62622280168665, 0.704926586534075,
NA, NA, NA, 0.584486806995139), `3_24` = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_)), .Names = c("Name",
"3_1", "3_2", "3_3", "3_4", "3_5", "3_6", "3_7", "3_8", "3_9",
"3_10", "3_11", "3_12", "3_13", "3_14", "3_15", "3_16", "3_17",
"3_18", "3_19", "3_20", "3_21", "3_22", "3_23", "3_24"), row.names = c(NA,
6L), class = "data.frame")
I used the function below to bind them. Each data frame has a different number of rows, and in some cases different names, so the final table should have more rows than any single one of them.
# Full outer join of all replicates on "Name": every Name that occurs in at
# least one replicate is kept (all = TRUE), and rows are not re-sorted by the
# key (sort = FALSE).
mylist <- list(tbl_gel1, tbl_gel2, tbl_gel3)
tbl_all <- Reduce(
  function(x, y) merge(x, y, all = TRUE, by = "Name", sort = FALSE),
  mylist
)
Everything goes fine until this moment.
Now I want to calculate the mean of each fraction (there are 24 fractions in total).
## Calculating the mean
# Drop the first column (the merge key "Name") so only fraction columns remain.
tbl_all1 <- tbl_all[-1]
# Position of the first column of each replicate within tbl_all1
# (24 columns per replicate).
ind <- c(1, 25, 49)
# For every offset i, take the (i+1)-th column of each replicate and average
# the three values row by row.
# NOTE(review): this pairs columns purely by position. In the dput() output
# above, tbl_gel1's columns are ordered "1_1", "1_10", "1_11", ..., whereas
# tbl_gel2 and tbl_gel3 are ordered "2_1", "2_2", "2_3", ... - so the same
# offset does not pick the same fraction in all three replicates. Matching
# columns on the suffix after "_" would avoid this - confirm against the data.
# NOTE(review): rowMeans() defaults to na.rm = FALSE, so a single NA among the
# three values makes the mean for that row NA.
tbl_mean <- cbind(tbl_all[1], sapply(0:23, function(i) rowMeans(tbl_all1[ind+i])))
There is something wrong with that function because the sum of many rows gives 0. That's definitely wrong, because tbl_gel1 and the others only contain rows with at least one number in some fraction.
If I look at tbl_mean, I see that the rows with a sum of 0 are at the bottom.

Related

Replacing values by cell index across all columns of dataframe R

looking for answers similar to these posts; R: Replace multiple values in multiple columns of dataframes with NA ; Multiple replacement in R
My dataframe my.df contains NAs.
dput(my.df)
structure(list(`AICAR (GDSC1:1001)_GDSC1` = c(10.1253052794007,
NA, NA, NA, NA, NA, 9.3362273693641, NA, NA, NA), `vinblastine (GDSC1:1004)_GDSC1` = c(-5.56689193211021,
NA, NA, NA, NA, NA, -3.49808657768651, NA, NA, -5.7323006155361
), `cisplatin (GDSC1:1005)_GDSC1` = c(3.20680858158152, NA, NA,
NA, NA, NA, NA, NA, NA, NA), `cytarabine (GDSC1:1006)_GDSC1` = c(-1.29089026889862,
NA, NA, NA, NA, NA, NA, NA, NA, NA), `docetaxel (GDSC1:1007)_GDSC1` = c(-9.21190331946225,
NA, NA, NA, NA, NA, NA, NA, NA, -6.51430196744496), `methotrexate (GDSC1:1008)_GDSC1` = c(NA,
NA, NA, NA, NA, NA, -4.96153980941858, NA, NA, NA), `gefitinib (GDSC1:1010)_GDSC1` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, -4.65609368323825), `navitoclax (GDSC1:1011)_GDSC1` = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), `vorinostat (GDSC1:1012)_GDSC1` = c(-0.1834250603902,
1.80666265545084, 0.503152683902549, 1.78569632218743, NA, 1.01934567070847,
0.321867836558935, NA, 2.18003424956055, 0.143794452798708)), row.names = c(NA,
10L), class = "data.frame")
I get the cell location of each NA using idx <- my.df %>% lapply(., function(x) which(is.na(x)))
Convert these NAs to 0 by my.df %>% mutate_if(.,is.numeric, ~replace(., is.na(.), 0)) before I calculate correlations.
Now how can I put the NAs back into their original cells based on the idx?
I reckon loops, tidy, purrr or something similar can do this fast? It would be great if the column names of my.df could be matched against the names of idx as a quality control in the code.
Thanks!

Collapsing Dataframe Rows along several variables

I have a dataframe that looks something like this, in which I have several rows for each user, and many NAs in the columns.
user
Effect T1
Effect T2
Effect T3
Benchmark T1
Benchmark T2
Benchmark T3
Tom
01
NA
NA
02
NA
NA
Tom
NA
07
NA
NA
08
NA
Tom
NA
NA
13
NA
NA
14
Larry
03
NA
NA
04
NA
NA
Larry
NA
09
NA
NA
10
NA
Larry
NA
NA
15
NA
NA
16
Dave
05
NA
NA
06
NA
NA
Dave
NA
11
NA
NA
12
NA
Dave
NA
NA
17
NA
NA
18
I want to collapse the rows by user, filling in the values from each row, like this.
user
Effect T1
Effect T2
Effect T3
Benchmark T1
Benchmark T2
Benchmark T3
Tom
01
07
13
02
08
14
Larry
03
09
15
04
10
16
Dave
05
11
17
06
12
18
How might I accomplish this?
Thank you in advance for your help. Update: I've added the dput of a subset of the actual data below.
structure(list(name = c("Abraham_Ralph", "Abraham_Ralph", "Abraham_Ralph",
"Ackerman_Gary", "Adams_Alma", "Adams_Alma", "Adams_Alma", "Adams_Alma",
"Adams_Sandy", "Aderholt_Robert", "Aderholt_Robert", "Aderholt_Robert",
"Aderholt_Robert", "Aderholt_Robert", "Aguilar_Pete", "Aguilar_Pete",
"Aguilar_Pete"), state = c("LA", "LA", "LA", "NY", "NC", "NC",
"NC", "NC", "FL", "AL", "AL", "AL", "AL", "AL", "CA", "CA", "CA"
), seniority = c(1, 2, 3, 15, 1, 2, 3, 4, 1, 8, 9, 10, 11, 12,
1, 2, 3), legeffect_112 = c(NA, NA, NA, 0.202061712741852, NA,
NA, NA, NA, 1.30758035182953, 3.73544979095459, NA, NA, NA, NA,
NA, NA, NA), legeffect_113 = c(NA, NA, NA, NA, 0, NA, NA, NA,
NA, NA, 0.908495426177979, NA, NA, NA, NA, NA, NA), legeffect_114 = c(2.07501077651978,
NA, NA, NA, NA, 0.84164834022522, NA, NA, NA, NA, NA, 0.340001106262207,
NA, NA, 0.10985741019249, NA, NA), legeffect_115 = c(NA, 0.493490308523178,
NA, NA, NA, NA, 0.587624311447144, NA, NA, NA, NA, NA, 0.159877583384514,
NA, NA, 0.730929613113403, NA), legeffect_116 = c(NA, NA, 0.0397605448961258,
NA, NA, NA, NA, 1.78378939628601, NA, NA, NA, NA, NA, 0.0198802724480629,
NA, NA, 0.0497006773948669), benchmark_112 = c(NA, NA, NA, 0.738679468631744,
NA, NA, NA, NA, 0.82908970117569, 1.39835929870605, NA, NA, NA,
NA, NA, NA, NA), benchmark_113 = c(NA, NA, NA, NA, 0.391001850366592,
NA, NA, NA, NA, NA, 1.58223271369934, NA, NA, NA, NA, NA, NA),
benchmark_114 = c(1.40446054935455, NA, NA, NA, NA, 0.576326191425323,
NA, NA, NA, NA, NA, 1.42212760448456, NA, NA, 0.574363172054291,
NA, NA), benchmark_115 = c(NA, 1.3291300535202, NA, NA, NA,
NA, 0.537361204624176, NA, NA, NA, NA, NA, 1.45703768730164,
NA, NA, 0.523149251937866, NA), benchmark_116 = c(NA, NA,
0.483340591192245, NA, NA, NA, NA, 1.31058621406555, NA,
NA, NA, NA, NA, 0.751261711120605, NA, NA, 1.05683290958405
)), row.names = c(NA, -17L), class = c("tbl_df", "tbl", "data.frame"
))

A data.table solution:
# Melt to long format, drop the missing values and the non-measure columns,
# then recast to one row per name.
# NOTE: melt() stacks "state" (character) into the same value column as the
# numeric measures, so every value in the result is character.
long <- melt(data.table(d), id.vars = "name")
# "state" and "seniority" must be removed before dcast(): they repeat within
# a name, and with duplicated name/variable pairs dcast() falls back to
# counting rows (length) instead of returning the values.
long <- long[!is.na(value) & !variable %in% c("seniority", "state")]
dt <- dcast(long, name ~ variable)
dt
name legeffect_112 legeffect_113 legeffect_114 legeffect_115 legeffect_116 benchmark_112 benchmark_113 benchmark_114 benchmark_115 benchmark_116
1: Abraham_Ralph <NA> <NA> 2.07501077651978 0.493490308523178 0.0397605448961258 <NA> <NA> 1.40446054935455 1.3291300535202 0.483340591192245
2: Ackerman_Gary 0.202061712741852 <NA> <NA> <NA> <NA> 0.738679468631744 <NA> <NA> <NA> <NA>
3: Adams_Alma <NA> 0 0.84164834022522 0.587624311447144 1.78378939628601 <NA> 0.391001850366592 0.576326191425323 0.537361204624176 1.31058621406555
4: Adams_Sandy 1.30758035182953 <NA> <NA> <NA> <NA> 0.82908970117569 <NA> <NA> <NA> <NA>
5: Aderholt_Robert 3.73544979095459 0.908495426177979 0.340001106262207 0.159877583384514 0.0198802724480629 1.39835929870605 1.58223271369934 1.42212760448456 1.45703768730164 0.751261711120605
6: Aguilar_Pete <NA> <NA> 0.10985741019249 0.730929613113403 0.0497006773948669 <NA> <NA> 0.574363172054291 0.523149251937866 1.05683290958405
Data/Setup
# Load data.table
# install.packages("data.table")
library(data.table)
# Read example data
d <- structure(list(name = c("Abraham_Ralph", "Abraham_Ralph", "Abraham_Ralph",
"Ackerman_Gary", "Adams_Alma", "Adams_Alma", "Adams_Alma", "Adams_Alma",
"Adams_Sandy", "Aderholt_Robert", "Aderholt_Robert", "Aderholt_Robert",
"Aderholt_Robert", "Aderholt_Robert", "Aguilar_Pete", "Aguilar_Pete",
"Aguilar_Pete"), state = c("LA", "LA", "LA", "NY", "NC", "NC",
"NC", "NC", "FL", "AL", "AL", "AL", "AL", "AL", "CA", "CA", "CA"
), seniority = c(1, 2, 3, 15, 1, 2, 3, 4, 1, 8, 9, 10, 11, 12,
1, 2, 3), legeffect_112 = c(NA, NA, NA, 0.202061712741852, NA,
NA, NA, NA, 1.30758035182953, 3.73544979095459, NA, NA, NA, NA,
NA, NA, NA), legeffect_113 = c(NA, NA, NA, NA, 0, NA, NA, NA,
NA, NA, 0.908495426177979, NA, NA, NA, NA, NA, NA), legeffect_114 = c(2.07501077651978,
NA, NA, NA, NA, 0.84164834022522, NA, NA, NA, NA, NA, 0.340001106262207,
NA, NA, 0.10985741019249, NA, NA), legeffect_115 = c(NA, 0.493490308523178,
NA, NA, NA, NA, 0.587624311447144, NA, NA, NA, NA, NA, 0.159877583384514,
NA, NA, 0.730929613113403, NA), legeffect_116 = c(NA, NA, 0.0397605448961258,
NA, NA, NA, NA, 1.78378939628601, NA, NA, NA, NA, NA, 0.0198802724480629,
NA, NA, 0.0497006773948669), benchmark_112 = c(NA, NA, NA, 0.738679468631744,
NA, NA, NA, NA, 0.82908970117569, 1.39835929870605, NA, NA, NA,
NA, NA, NA, NA), benchmark_113 = c(NA, NA, NA, NA, 0.391001850366592,
NA, NA, NA, NA, NA, 1.58223271369934, NA, NA, NA, NA, NA, NA),
benchmark_114 = c(1.40446054935455, NA, NA, NA, NA, 0.576326191425323,
NA, NA, NA, NA, NA, 1.42212760448456, NA, NA, 0.574363172054291,
NA, NA), benchmark_115 = c(NA, 1.3291300535202, NA, NA, NA,
NA, 0.537361204624176, NA, NA, NA, NA, NA, 1.45703768730164,
NA, NA, 0.523149251937866, NA), benchmark_116 = c(NA, NA,
0.483340591192245, NA, NA, NA, NA, 1.31058621406555, NA,
NA, NA, NA, NA, 0.751261711120605, NA, NA, 1.05683290958405
)), row.names = c(NA, -17L), class = c("tbl_df", "tbl", "data.frame"
))

This solution is using only the base functions (no extra packages), but the one-liner may cause eyes to cross, so I'll split it into several functions.
The plan is the following:
Split the original data.frame by the values in name column, using the function by;
For each partition of the data.frame, collapse the columns;
A collapsed column returns the max value of the column, or NA if all its values are NA;
The collapsed data.frame partitions are stacked together.
So, this is a function that does that:
dfr_collapse <- function(dfr, col0) {
  # Collapse the rows of the data.frame "dfr" that share the same value in
  # the column "col0": within each group every column is reduced to its
  # maximum non-missing value, or to NA when the group has no value at all.
  #
  # Args:
  #   dfr:  a data.frame.
  #   col0: name (or index) of the single column that defines the groups.
  #
  # Returns:
  #   The per-group results stacked with rbind(): one row per distinct value
  #   of "col0" and one column per column of "dfr". Because each group result
  #   is a list, the stacked object is a matrix of mode list.
  stopifnot(is.data.frame(dfr), length(col0) == 1L)

  groups <- dfr[[col0]]
  if (is.null(groups)) {
    # Fail early with a readable message instead of an obscure error in by().
    stop("column '", col0, "' not found in 'dfr'", call. = FALSE)
  }

  # Max/NA function
  namax <- function(x) {
    if (all(is.na(x))) {
      x[NA_integer_] # !!! typed NA: indexing with NA keeps the type of x
    } else {
      max(x, na.rm = TRUE)
    }
  }

  # Column collapse function: reduce every column of one partition
  byfun <- function(x) {
    lapply(x, namax)
  }

  # Stack the partitioning results
  do.call(what = rbind, args = by(dfr, groups, byfun))
}
It may not look as slick as a one-liner, but it does the job. It can be turned into a one-liner, but you don't want that.
Assuming that df0 is the data.frame from your dput, you can test this function with
# "col0" has no default, so the grouping column must be passed explicitly.
dfr_collapse(df0, "name")
Nota bene: for the sake of simplicity, I return an NA of type logical (see the comment # !!! above). The correct code should convert that NA to the mode of the x vector. Also, the function should check the type of its inputs, etc.

Filled values are not shown

I am new to R.
I had some values with NAs and I filled them like this
# NOTE(review): the result of fill() is not assigned here, so it is only
# printed; katsastus_3_20211227_115940 itself is left as it was.
katsastus_3_20211227_115940%>% fill(c("registration_year","reg"), .direction = "down")
When I run the code, I get what I want in the console, like this https://i.stack.imgur.com/2EkjL.png
but when I try view(katsastus_3_20211227_115940)
I get this https://i.stack.imgur.com/zcBfK.png which is how the data was when I received it.

You can reassign your data.frame (as @Peace Wang suggested in their first comment) using fill, e.g.:
f <- structure(list(reg = c("2017", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), modela = c("Alfa Romeo - Models in total", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), registration_year = c("Years in total", NA, NA, NA, NA, NA, NA, NA, NA, "2002", NA, NA), object_of_inspection = c("A", "B","C", "D", "E","F", "G","H", "I","A", "B","C")), row.names = c(NA,-12L), class = c("tbl_df", "tbl", "data.frame"))
# Store the filled result back in `f`; without the assignment the filled data
# frame would only be printed.
f <- fill(f, c("registration_year","reg"), .direction = "down")

How to loop variable creation and str_replace dynamically in R

I am trying to parse multiple columns into each of their components. However the number of components varies across the columns. Specifically, suppose the following df:
id X1.startAll X2.startAll
1 ["1555726884484","1555727530298","1555727532509"]
2 ["1555735159384","1555735161545"]
3 ["1555730029709"]
4 ["1555735159384","1555735161545"]
5
6 ["1555735159384","1555735161545"]
now I have 40 of these columns (and another 120 very similar ones, to which I aim to generalize the process) and many more rows. I can do the first column quite simply using the following:
# The pattern matches a bracketed list of one to three quoted numbers and has
# one capture group per number (the second and third may match the empty
# string). Each line replaces the whole match with a different group
# (\\1, \\2, \\3) and stores the result in its own new column.
df1$X1.startAll1 <- str_replace(df1$X1.startAll, "\\[\"([0-9]+)\",*\"*([0-9]*)\"*,*\"*([0-9]*)\"*\\]", "\\1")
df1$X1.startAll2 <- str_replace(df1$X1.startAll, "\\[\"([0-9]+)\",*\"*([0-9]*)\"*,*\"*([0-9]*)\"*\\]", "\\2")
df1$X1.startAll3 <- str_replace(df1$X1.startAll, "\\[\"([0-9]+)\",*\"*([0-9]*)\"*,*\"*([0-9]*)\"*\\]", "\\3")
which yields my desired result:
id X1.startAll X1.startAll1 X1.startAll2 X1.startAll3
1 ["1555726884484","1555727530298","1555727532509"] 1555726884484 1555727530298 1555727532509
2
3 ["1555730029709"] 1555730029709
4 ["1555735159384","1555735161545"] 1555735159384 1555735161545
5
6
However, I have to do this for many columns and for many different 'array' lengths within each of these.
I have tried automating this using a for loop, however, I (1) can't figure out how to read the right number of iterations (i.e. the max the number of components in the startAll column), (2) dynamically create the variables, (3) nor how to update the string extraction dynamically ("\\i").
Any and all help on looping this process would help a lot!
Edit 2: below is a copy-pasteable sample of the data:
structure(list(X1.startAll = list(NA, NA, NA, NA, c(1555726884484,
1555727530298, 1555727532509), NA, NA, c(1555735159384, 1555735161545
), NA, NA, NA, 1555730029709, NA, NA, NA, c(1555728423843, 1555728561054,
1555728586917), c(1555725657389, 1555725657827), c(1555703810672,
1555703823206, 1555703848659), NA, NA), X2.startAll = list(NA,
NA, NA, NA, c(1555727541885, 1555727786959, 1555727897893
), NA, NA, 1555735262052, c(1555737694350, 1555737696711),
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), X3.startAll = list(
NA, NA, NA, NA, c(1555727920770, 1555728230065, 1555728843391
), NA, NA, c(1555735331144, 1555735452321, 1555735457305),
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), X4.startAll = list(
NA, NA, NA, NA, 1555728854666, NA, NA, 1555735589629, 1555738374484,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), X5.startAll = list(
NA, NA, NA, NA, c(1555728949327, 1555728988444), NA, NA,
c(1555735646258, 1555735912372, 1555735914267, 1555736071856,
1555736074184, 1555736093411, 1555736124826, 1555736238538,
1555736248889, 1555736576754, 1555736620915, 1555736874386,
1555737698921, 1555737777400, 1555737966562, 1555738152090,
1555738354075, 1555738700232, 1555738703134, 1555738716736
), 1555738415269, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), X6.startAll = list(NA, NA, NA, NA, 1555729661240, NA,
NA, NA, 1555738960285, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), X7.startAll = list(NA, NA, NA, NA, c(1555730266934,
1555730356654, 1555730533798, 1555730535289), NA, c(1555732523945,
1555733415340, 1555733477452, 1555733748200, 1555734007271, 1555734286685,
1555734288597), NA, c(1555739871726, 1555740315324, 1555740328252,
1555740329835, 1555740538272, 1555741140561, 1555741143555, 1555741152932
), c(1555743562826, 1555743566386, 1555743593201), NA, NA, NA,
c(1555727969354, 1555727985539, 1555728064237, 1555738166838,
1555826735910), NA, NA, NA, NA, NA, NA), X8.startAll = list(
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA)), row.names = c(NA, -20L), class = "data.frame")

Filter partial dependencies by effect size

I fitted a model and want to take a look (and plot) at the partial dependencies.
For this task, I use the mlr package. However, since I have 80 features I only want to look at those with the highest effect on the target variable. Is there a way to calculate or show the partial dependence only for the features with the highest influence?
Here is an example: I just fitted 4 values. Let's assume I only want to look at or calculate the partial dependence for the 2 most influential features.
library(mlr)
# Partial dependence data from `mod` and `train_task` for the four listed
# features; the second statement prints the result.
pd <- generatePartialDependenceData(
  mod, train_task,
  c("diveyTrue", "dinnerTrue", "BikeParkingTrue", "latenightTrue")
)
pd
PartialDependenceData
Task: dat
Features: diveyTrue, dinnerTrue, BikeParkingTrue, latenightTrue
Target: diveyTrue, dinnerTrue, BikeParkingTrue, latenightTrue
Derivative: FALSE
Interaction: FALSE
Individual: FALSE
review_count diveyTrue dinnerTrue BikeParkingTrue latenightTrue
1: 73.92993 0.0000000 NA NA NA
2: 73.68386 0.1111111 NA NA NA
3: 73.68386 0.2222222 NA NA NA
4: 73.68386 0.3333333 NA NA NA
5: 73.68386 0.4444444 NA NA NA
6: 63.56335 0.5555556 NA NA NA
... (#rows: 40, #cols: 5)
The task is a regression and the first column is the target variable. All other variables are dummies. Therefore, the target variable stays constant until the value of "diveyTrue" is greater than 0.5.
Here is a small dput():
structure(list(data = structure(list(review_count = c(73.9299260484918,
73.6838552698629, 73.6838552698629, 73.6838552698629, 73.6838552698629,
63.5633491608329, 63.5633491608329, 63.5633491608329, 63.5633491608329,
63.5633491608329, 44.123492893074, 44.0855985404284, 44.0855985404284,
44.0855985404284, 44.0855985404284, 67.9185575263356, 67.9185575263356,
67.9185575263356, 67.9185575263356, 67.9185575263356, 64.1248331786005,
64.1243679505065, 64.1243679505065, 64.1243679505065, 64.1243679505065,
64.9177431842816, 64.9177431842816, 64.9177431842816, 64.9177431842816,
64.9177431842816, 58.2709529252224, 58.2709529252224, 58.2709529252224,
58.2709529252224, 58.2709529252224, 89.8281204749236, 89.8281204749236,
89.8281204749236, 89.8281204749236, 89.8281204749236), diveyTrue = c(0,
0.111111111111111, 0.222222222222222, 0.333333333333333, 0.444444444444444,
0.555555555555556, 0.666666666666667, 0.777777777777778, 0.888888888888889,
1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
dinnerTrue = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0,
0.111111111111111, 0.222222222222222, 0.333333333333333,
0.444444444444444, 0.555555555555556, 0.666666666666667,
0.777777777777778, 0.888888888888889, 1, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), BikeParkingTrue = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, 0.111111111111111,
0.222222222222222, 0.333333333333333, 0.444444444444444,
0.555555555555556, 0.666666666666667, 0.777777777777778,
0.888888888888889, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), latenightTrue = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 0, 0.111111111111111, 0.222222222222222,
0.333333333333333, 0.444444444444444, 0.555555555555556,
0.666666666666667, 0.777777777777778, 0.888888888888889,
1)), row.names = c(NA, -40L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x0000000002521ef0>), task.desc = structure(list(
id = "dat", type = "regr", target = "review_count", size = 9943L,
n.feat = c(numerics = 79L, factors = 0L, ordered = 0L, functionals = 0L
), has.missings = TRUE, has.weights = FALSE, has.blocking = FALSE,
has.coordinates = FALSE), class = c("RegrTaskDesc", "SupervisedTaskDesc",
"TaskDesc")), target = c("diveyTrue", "dinnerTrue", "BikeParkingTrue",
"latenightTrue"), features = c("diveyTrue", "dinnerTrue", "BikeParkingTrue",
"latenightTrue"), derivative = FALSE, interaction = FALSE, individual = FALSE), class = "PartialDependenceData")

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Calculating the mean of 3 columns in data frame - r

Related

Replacing values by cell index across all columns of dataframe R

Collapsing Dataframe Rows along several variables

Filled values are not shown

How to loop variable creation and str_replace dynamically in R

Filter partial dependencies by effect size

Categories

Resources