getting error while calculating feature importances - R - r

I have the below data:
> paste(data_s)
[1] "c(0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
[2] "c(34, 34, 35, 35, 35, 34, 6, 34, 34, 6, 34, 34, 34, 6, 6, 6, 34, 34, 35, 6, 34, 34, 34, 34, 34, 34, 34, 34, 6, 34, 35, 35, 34, 34, 6, 34, 34, 34, 34, 6, 6, 35, 34, 34, 34, 35, 6, 35, 34, 34, 34, 34, 34, 34, 6, 34, 34, 6, 34, 34, 34, 6, 34, 34, 34, 34, 6, 34, 34, 34, 35, 6, 35, 34, 34, 35, 34, 6, 6, 35, 34, 34, 6, 34, 6, 6, 34, 34, 6, 34, 6, 35, 34, 6, 34, 35, 34, 6, 34, 34)"
[3] "c(1, 1, 4, 0, 3, 4, 5, 2, 4, 1, 2, 1, 4, 9, 9, 1, 1, 5, 1, 4, 4, 2, 3, 2, 3, 2, 1, 2, 5, 6, 5, 5, 5, 1, 5, 5, 2, 1, 1, 3, 4, 2, 9, 1, 4, 3, 2, 5, 2, 2, 3, 4, 4, 5, 5, 4, 1, 2, 0, 3, 4, 2, 2, 5, 0, 2, 5, 3, 3, 1, 0, 1, 4, 2, 5, 1, 1, 4, 2, 3, 5, 1, 5, 0, 2, 4, 1, 5, 4, 2, 2, 4, 5, 1, 2, 2, 0, 3, 7, 3)"
> str(data_s)
tibble [100 × 3] (S3: tbl_df/tbl/data.frame)
$ y : num [1:100] 0 0 0 0 0 0 0 0 1 0 ...
$ x1: num [1:100] 34 34 35 35 35 34 6 34 34 6 ...
$ x2: num [1:100] 1 1 4 0 3 4 5 2 4 1 ...
- attr(*, "na.action")= 'omit' Named int [1:197659] 4 5 6 7 9 14 19 20 24 27 ...
..- attr(*, "names")= chr [1:197659] "4" "5" "6" "7" ...
I am using the vivi() function from the vivid package to explore the feature importance of the variables.
I write the below code:
library("vivid")
library("dplyr")
library("xgboost")
# Split the tibble into a one-column response and the two predictor columns.
y=data_s["y"]
x=data_s[,c("x1","x2")]
# Fit the xgboost model on the predictors, passed as a matrix.
gbst <- xgboost(data = as.matrix(x),
label = as.matrix(y),
nrounds = 600)
# Prediction wrapper handed to vivi() through `predictFun`.
# NOTE(review): the body ignores its `data` argument and always predicts on
# the global `x` (100 rows), so it returns 100 values whatever rows it is
# asked to score. The error quoted below (existing data has 5000 rows,
# assigned data has 100 rows) is consistent with vivi() calling this wrapper
# on a larger data set of its own. Presumably the body should be
# predict(fit, as.matrix(data[, c("x1", "x2")])) -- TODO confirm against the
# vivid documentation.
pFun <- function(fit, data, ...) predict(fit, as.matrix(x))
# Compute the importance/interaction results for the fitted model.
viviGBst <- vivi(fit = gbst,
data = data_s,
response = "y",
reorder = FALSE,
normalized = FALSE,
predictFun = pFun)
But I get the below error:
Error:
! Assigned data `predict(x, data = X[, cols, drop = FALSE])` must be compatible with existing data.
✖ Existing data has 5000 rows.
✖ Assigned data has 100 rows.
ℹ Only vectors of size 1 are recycled.
Run `rlang::last_error()` to see where the error occurred.
Why do I get this error and how can I fix it?
I will be very glad for any help.
Thanks.

Related

Return new dataframe with columns created inside a function in R with user-given names

Please see my code below:
# functions to get percentile threshold, and assign new values to outliers
# Lower outlier threshold: the 1st percentile of `var_name`.
get_low_perc <- function(var_name) {
  quantile(var_name, probs = 0.01)
}
# Upper outlier threshold: the 99th percentile of `var_name`.
get_hi_perc <- function(var_name) {
  quantile(var_name, probs = 0.99)
}
# Substitute `floor` wherever `flag_var` equals 1, keep `target_var`
# elsewhere, and return the result coerced to integer.
round_up <- function(target_var, flag_var, floor) {
  replaced <- ifelse(flag_var == 1, floor, target_var)
  as.integer(as.numeric(replaced))
}
# Substitute `ceiling` wherever `flag_var` equals 1, keep `target_var`
# elsewhere, and return the result coerced to integer.
round_down <- function(target_var, flag_var, ceiling) {
  replaced <- ifelse(flag_var == 1, ceiling, target_var)
  as.integer(as.numeric(replaced))
}
# try putting it all together
# Adds two outlier-flag columns to `df` and returns it.
#   df            data frame to extend
#   df_col_name   the values to inspect, passed as a vector (e.g. df2$col1)
#   df_col_flagH, df_col_flagL
#                 only used as the flag arguments of round_up()/round_down()
no_way <- function(df, df_col_name, df_col_flagH, df_col_flagL) {
# 1st and 99th percentile of the supplied values.
lo_perc <- get_low_perc(df_col_name)
hi_perc <- get_hi_perc(df_col_name)
# `$` takes the name literally, so these two columns are always called
# "df_col_flagH" and "df_col_flagL", whatever the caller passed in.
# df_col_flagH marks values below lo_perc, df_col_flagL values above hi_perc.
df$df_col_flagH <- as.factor(ifelse(df_col_name < lo_perc, 1, 0))
df$df_col_flagL <- as.factor(ifelse(df_col_name > hi_perc, 1, 0))
# These two lines only reassign the local `df_col_name`; the result is never
# written back into `df`. The flags used here are the function arguments,
# not the df$... columns created above.
df_col_name <- round_up(df_col_name, df_col_flagL, lo_perc)
df_col_name <- round_down(df_col_name, df_col_flagH, hi_perc)
# names(df)[names(df)=='df_col_flagH'] <-
# boxplot(df_col_name)
return(df)
}
I have created 5 custom functions. The first two get the 1st percentile and the 99th percentile, respectively, of a given variable. The next two round the values in these variables up or down depending on how far away they are from the 1st percentile and the 99th percentile values. The last function tries to put all these functions together to output a new dataframe containing the same columns as the original df, the updated column, and two new columns indicating values that were flagged as below the 1st percentile and above the 99th percentile. I have produced a mock dataframe below, since I can't seem to pass some of my data here.
df2 = data.frame(col = c(1, 3, 4, 5, 8, 7, 67, 744, 876, 8, 8, 54, 9),
col1 = c(9, 6, 8, 3, 4, 5, 8, 7, 67, 744, 87, 33, 77),
col2 = c(8, 2, 8, 4, 87, 66, 54, 99, 77, 77, 88, 67, 102))
Ideally, after I call the function using the command "no_way(df2, df2$col1, df2$new_col1, df2$new_col2)", I want an output dataframe looking like:
df2 = data.frame(col = c(1, 3, 4, 5, 8, 7, 67, 744, 876, 8, 8, 54, 9),
col1 = c(9, 6, 8, 3, 4, 5, 8, 7, 67, 744, 87, 33, 77), # updated with appropriate values
col2 = c(8, 2, 8, 4, 87, 66, 54, 99, 77, 77, 88, 67, 102),
new_col1 = c(0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0),
new_col2 = c(0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0))
^ Where new_col1 and new_col2 are column names given by the user when calling the function. I am currently getting the dataframe as expected, but the new columns created have kept the function parameters' names, as in:
df2 = data.frame(col = c(1, 3, 4, 5, 8, 7, 67, 744, 876, 8, 8, 54, 9),
col1 = c(9, 6, 8, 3, 4, 5, 8, 7, 67, 744, 87, 33, 77), # updated with appropriate values
col2 = c(8, 2, 8, 4, 87, 66, 54, 99, 77, 77, 88, 67, 102),
df_col_flagH = c(0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0),
df_col_flagL = c(0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0))
I would not mind changing the names of the columns afterwards, but I will be using this function on 17 columns, so that wouldn't be optimal. Please help.
You should pass new column names as string.
Also ifelse(condition, 1, 0) can be simplified to as.integer(condition).
#' Flag and clamp the extreme values of one column of a data frame.
#'
#' @param df A data frame.
#' @param df_col_name Name (string) of the numeric column to process.
#' @param df_col_flagH Name (string) of the factor column to create that marks
#'   values below the 1st percentile (1 = flagged, 0 = not flagged).
#' @param df_col_flagL Name (string) of the factor column to create that marks
#'   values above the 99th percentile (1 = flagged, 0 = not flagged).
#' @return `df` with both flag columns added and `df_col_name` replaced by the
#'   clamped values; round_up()/round_down() coerce that column to integer.
no_way <- function(df, df_col_name, df_col_flagH, df_col_flagL) {
  values <- df[[df_col_name]]
  lo_perc <- get_low_perc(values)
  hi_perc <- get_hi_perc(values)

  df[[df_col_flagH]] <- as.factor(as.integer(values < lo_perc))
  df[[df_col_flagL]] <- as.factor(as.integer(values > hi_perc))

  # Hand over the flag *columns*, not their names: comparing a name such as
  # "new_col1" with 1 is a single FALSE, which makes ifelse() return one value
  # and overwrites the whole column with its first element.
  # Values flagged as too low are raised to the 1st percentile, values flagged
  # as too high are lowered to the 99th percentile.
  values <- round_up(values, df[[df_col_flagH]], lo_perc)
  values <- round_down(values, df[[df_col_flagL]], hi_perc)
  df[[df_col_name]] <- values
  df
}
df2 <- no_way(df2, "col1", "new_col1", "new_col2")
df2
#    col col1 col2 new_col1 new_col2
#1     1    9    8        0        0
#2     3    6    2        0        0
#3     4    8    8        0        0
#4     5    3    4        1        0
#5     8    4   87        0        0
#6     7    5   66        0        0
#7    67    8   54        0        0
#8   744    7   99        0        0
#9   876   67   77        0        0
#10    8  665   77        0        1
#11    8   87   88        0        0
#12   54   33   67        0        0
#13    9   77  102        0        0

How can I represent one column's values using multiple columns in R where one new column is conditional?

Looking at similar questions, I could not find one that matched my need.
If one does contain a solution, please share its link.
I have this dput-produced data:
structure(list(Player = c("Seth Lugo", "Jacob deGrom", "Rick Porcello",
"David Peterson", "Michael Wacha", "Seth Lugo", "Jacob deGrom",
"Rick Porcello", "David Peterson", "Steven Matz", "Seth Lugo",
"Jacob deGrom", "Rick Porcello", "David Peterson", "Seth Lugo",
"Jacob deGrom", "Rick Porcello", "Michael Wacha", "David Peterson",
"Jacob deGrom", "Seth Lugo", "Rick Porcello", "Robert Gsellman",
"Michael Wacha", "Ariel Jurado", "Jacob deGrom", "Rick Porcello",
"Seth Lugo", "Robert Gsellman", "David Peterson"), Date = structure(c(1601164800,
1601078400, 1601078400, 1600905600, 1600819200, 1600732800, 1600646400,
1600560000, 1600473600, 1600387200, 1600300800, 1600214400, 1600128000,
1599955200, 1599868800, 1599782400, 1599609600, 1599523200, 1599436800,
1599350400, 1599264000, 1599177600, 1599091200, 1599004800, 1598918400,
1598832000, 1598745600, 1598745600, 1598659200, 1598572800), tzone = "UTC", class = c("POSIXct",
"POSIXt")), DblHdr = c(0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 2), DateStr = c("09/27/2020",
"09/26/2020", "09/26/2020", "09/24/2020", "09/23/2020", "09/22/2020",
"09/21/2020", "09/20/2020", "09/19/2020", "09/18/2020", "09/17/2020",
"09/16/2020", "09/15/2020", "09/13/2020", "09/12/2020", "09/11/2020",
"09/09/2020", "09/08/2020", "09/07/2020", "09/06/2020", "09/05/2020",
"09/04/2020", "09/03/2020", "09/02/2020", "09/01/2020", "08/31/2020",
"08/30/2020", "08/30/2020", "08/29/2020", "08/28/2020"), Month = c("09",
"09", "09", "09", "09", "09", "09", "09", "09", "09", "09", "09",
"09", "09", "09", "09", "09", "09", "09", "09", "09", "09", "09",
"09", "09", "08", "08", "08", "08", "08"), Tm = c("NYM", "NYM",
"NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM",
"NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM",
"NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM",
"NYM"), Opp = c("WSN", "WSN", "WSN", "WSN", "TBR", "TBR", "TBR",
"ATL", "ATL", "ATL", "PHI", "PHI", "PHI", "TOR", "TOR", "TOR",
"BAL", "BAL", "PHI", "PHI", "PHI", "PHI", "NYY", "BAL", "BAL",
"MIA", "NYY", "NYY", "NYY", "NYY"), Rslt = c("L 5-15", "L 3-4",
"L 3-5", "W 3-2", "L 5-8", "W 5-2", "L 1-2", "L 0-7", "W 7-2",
"L 2-15", "W 10-6", "W 5-4", "L 1-4", "L 3-7", "L 2-3", "W 18-1",
"W 7-6", "L 2-11", "L 8-9", "W 14-1", "W 5-1", "L 3-5", "W 9-7",
"W 9-4", "L 5-9", "L 3-5", "L 7-8", "L 2-5", "L 1-2", "W 4-3"
), W_L = c("L", "L", "L", "W", "L", "W", "L", "L", "W", "L",
"W", "W", "L", "L", "L", "W", "W", "L", "L", "W", "W", "L", "W",
"W", "L", "L", "L", "L", "L", "W"), temp = c("L 5", "L 3", "L 3",
"W 3", "L 5", "W 5", "L 1", "L 0", "W 7", "L 2", "W 10", "W 5",
"L 1", "L 3", "L 2", "W 18", "W 7", "L 2", "L 8", "W 14", "W 5",
"L 3", "W 9", "W 9", "L 5", "L 3", "L 7", "L 2", "L 1", "W 4"
), RS = c(5, 3, 3, 3, 5, 5, 1, 0, 7, 2, 10, 5, 1, 3, 2, 18, 7,
2, 8, 14, 5, 3, 9, 9, 5, 3, 7, 2, 1, 4), RA = c(15, 4, 5, 2,
8, 2, 2, 7, 2, 15, 6, 4, 4, 7, 3, 1, 6, 11, 9, 1, 1, 5, 7, 4,
9, 5, 8, 5, 2, 3), Rdiff = c(-10, -1, -2, 1, -3, 3, -1, -7, 5,
-13, 4, 1, -3, -4, -1, 17, 1, -9, -1, 13, 4, -2, 2, 5, -4, -2,
-1, -3, -1, 1), absV = c(10, 1, 2, 1, 3, 3, 1, 7, 5, 13, 4, 1,
3, 4, 1, 17, 1, 9, 1, 13, 4, 2, 2, 5, 4, 2, 1, 3, 1, 1), App_Dec = c("GS-2, L",
"GS-5", "GS-3, L", "GS-7, W", "GS-6, L", "GS-7, W", "GS-7, L",
"GS-7, L", "GS-6, W", "GS-3, L", "GS-2", "GS-2", "GS-6, L", "GS-5, L",
"GS-6, L", "GS-6, W", "GS-4", "GS-4, L", "GS-2", "GS-7, W", "GS-5, W",
"GS-6", "GS-2", "GS-3", "GS-4", "GS-6, L", "GS-5", "GS-4", "GS-4",
"GS-4"), IP = c(1.1, 5, 3, 7, 6, 6.1, 7, 7, 6, 2.2, 1.2, 2, 6,
5, 5.1, 6, 4, 4, 2, 7, 5, 6, 1.2, 3, 4, 6, 5, 3.2, 4, 4), H = c(5,
5, 8, 4, 6, 4, 4, 3, 3, 8, 8, 4, 6, 3, 7, 3, 10, 7, 3, 3, 4,
3, 4, 4, 9, 6, 4, 4, 4, 4), R = c(6, 3, 5, 1, 4, 2, 2, 1, 1,
6, 6, 3, 4, 2, 3, 1, 5, 5, 5, 1, 1, 2, 4, 2, 5, 4, 2, 1, 1, 3
), ER = c(6, 3, 3, 1, 4, 1, 2, 1, 1, 6, 6, 3, 4, 2, 3, 1, 5,
4, 5, 1, 1, 2, 4, 2, 5, 1, 2, 1, 1, 3), BB = c(2, 2, 1, 1, 0,
1, 2, 2, 4, 3, 0, 1, 2, 2, 1, 2, 0, 0, 4, 2, 2, 2, 4, 1, 0, 2,
2, 2, 0, 3), SO = c(1, 10, 3, 4, 4, 7, 14, 10, 10, 5, 3, 1, 5,
2, 5, 9, 3, 3, 3, 12, 8, 6, 0, 2, 2, 9, 2, 7, 4, 3), HR = c(0,
2, 1, 0, 2, 1, 1, 1, 1, 2, 4, 0, 1, 1, 0, 0, 0, 2, 1, 1, 1, 0,
0, 0, 1, 1, 0, 1, 1, 0), UER = c(0, 0, 2, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0),
Pit = c(38, 113, 67, 107, 66, 95, 112, 100, 102, 76, 52,
40, 94, 81, 91, 102, 66, 71, 70, 108, 81, 100, 52, 69, 84,
103, 86, 60, 57, 70), Str = c(24, 78, 42, 68, 45, 66, 70,
70, 62, 45, 30, 25, 66, 52, 60, 68, 45, 49, 37, 74, 50, 65,
22, 41, 53, 72, 55, 39, 33, 37), GSc = c(19, 53, 29, 68,
48, 65, 73, 75, 68, 20, 18, 36, 47, 53, 46, 69, 25, 33, 29,
77, 61, 62, 27, 44, 26, 57, 51, 54, 54, 42), BF = c(12, 22,
19, 26, 23, 24, 26, 26, 24, 18, 14, 11, 26, 20, 24, 23, 21,
20, 14, 26, 21, 23, 13, 15, 21, 27, 20, 16, 15, 18), AB = c(8,
20, 18, 24, 23, 23, 23, 23, 20, 15, 13, 9, 24, 18, 22, 21,
21, 20, 9, 24, 19, 21, 8, 13, 20, 25, 18, 14, 15, 15), H2B = c(2,
0, 1, 1, 1, 0, 2, 0, 2, 2, 1, 2, 1, 0, 2, 1, 1, 1, 1, 1,
0, 0, 1, 0, 2, 2, 2, 0, 1, 0), H3B = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 1, 0), IBB = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0),
HBP = c(1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), SH = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 0, 0), SF = c(1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0), GDP = c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1), SB = c(0, 1,
1, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 2, 0,
1, 0, 0, 0, 3, 0, 0, 0, 0), CS = c(0, 0, 0, 0, 1, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0), PO = c(0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), BK = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), WP = c(0, 1, 1, 1, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
0, 1, 0, 0), ERA = c("40.5", "5.4", "9", "1.29", "6", "1.42",
"2.57", "1.29", "1.5", "20.25", "32.4", "13.5", "6", "3.6",
"5.0599999999999996", "1.5", "11.25", "9", "22.5", "1.29",
"1.8", "3", "21.6", "6", "11.25", "1.5", "3.6", "2.4500000000000002",
"2.25", "6.75"), WPA = c(-0.471, -0.087, -0.256, 0.34, -0.22,
0.18, 0.107, 0.219, 0.229, -0.358, -0.487, -0.186, -0.156,
0.036, -0.047, 0.049, -0.329, -0.321, -0.34, 0.193, 0.156,
0.07, -0.312, -0.042, -0.278, -0.271, 0.029, 0.02, 0.092,
-0.174), RE24 = c(-5.122, -0.193, -3.316, 2.931, -1.08, 1.509,
1.406, 2.406, 1.92, -4.641, -5.444, -1.919, -0.758, 0.679,
0.245, 2.215, -3.054, -3.054, -4.027, 2.406, 1.433, 0.92,
-3.788, -0.359, -2.812, -1.08, 0.707, 0.364, 1.166, -0.834
), aLI = c(1.45, 1.244, 0.974, 1.271, 0.965, 0.921, 0.955,
0.888, 1.066, 0.962, 0.767, 1.073, 0.941, 0.852, 1.353, 0.392,
0.857, 0.805, 0.904, 0.75, 1.037, 0.861, 1.232, 1.355, 0.914,
1.239, 1.213, 1.28, 0.748, 1.407)), row.names = c(NA, -30L
), class = c("tbl_df", "tbl", "data.frame"))
Desired output:
The numbers starting in the second column are the total absV values for each player for each column. The last column contains the sum of all the absV values for each player where absV > 5. Only a sample of the first 3 rows are shown, and the absV values are just filler numbers.
| Player | 1 | 2 | 3 | 4 | 5 | >5 |
| deGrom | 2 | 3 | 5 | 0 | 1 | 3 |
| Matz | 2 | 3 | 5 | 0 | 1 | 3 |
Code tried (I need help getting beyond the point shown). I would prefer if the code uses dplyr:
# Count the games per player and absV value; only the grouping columns and
# the count survive summarize(), then sort by player and absV.
starter %>%
  group_by(Player, absV) %>%
  summarize(numG = n()) %>%
  arrange(Player, absV)
To do this, you need to split each player's rows into those with row number <= 5 and those with row number > 5, rbind the two parts together, and then use pivot_wider. Follow this code:
library(dplyr)
library(tidyr)
# Number each player's games in their existing order, keeping only the
# columns needed for the reshape; the result stays grouped by Player.
df <- starter %>%
  select(Player, absV) %>%
  group_by(Player) %>%
  mutate(row = row_number()) %>%
  arrange(Player)

# Games 1-5 keep a column of their own; all later games are summed per player
# into a single ">5" column before widening.
df %>%
  filter(row <= 5) %>%
  mutate(row = as.character(row)) %>%
  rbind(
    df %>%
      filter(row > 5) %>%
      summarise(absV = sum(absV)) %>%
      mutate(row = ">5")
  ) %>%
  pivot_wider(id_cols = Player, names_from = row, values_from = absV)
# A tibble: 8 x 7
# Groups: Player [8]
Player `1` `2` `3` `4` `5` `>5`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Ariel Jurado 4 NA NA NA NA NA
2 David Peterson 1 5 4 1 1 NA
3 Jacob deGrom 1 1 1 17 13 2
4 Michael Wacha 3 9 5 NA NA NA
5 Rick Porcello 2 7 3 1 2 1
6 Robert Gsellman 2 1 NA NA NA NA
7 Seth Lugo 10 3 4 1 4 3
8 Steven Matz 13 NA NA NA NA NA
Note. Loading tidyverse package, at once, directly is advised.
Note 2: If you want to sort by absV before reshaping the data, add absV to the arrange() call before numbering the rows and combining the two parts:
# Same preparation as before, but each player's games are sorted by absV
# before they are numbered.
df <- starter %>%
  select(Player, absV) %>%
  arrange(Player, absV) %>%
  group_by(Player) %>%
  mutate(row = row_number())

# The five smallest values get their own column; the rest are summed per
# player into ">5".
df %>%
  filter(row <= 5) %>%
  mutate(row = as.character(row)) %>%
  rbind(
    df %>%
      filter(row > 5) %>%
      summarise(absV = sum(absV)) %>%
      mutate(row = ">5")
  ) %>%
  pivot_wider(id_cols = Player, names_from = row, values_from = absV)
#this will give the following diff output
# A tibble: 8 x 7
# Groups: Player [8]
Player `1` `2` `3` `4` `5` `>5`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Ariel Jurado 4 NA NA NA NA NA
2 David Peterson 1 1 1 4 5 NA
3 Jacob deGrom 1 1 1 2 13 17
4 Michael Wacha 3 5 9 NA NA NA
5 Rick Porcello 1 1 2 2 3 7
6 Robert Gsellman 1 2 NA NA NA NA
7 Seth Lugo 1 3 3 4 4 10
8 Steven Matz 13 NA NA NA NA NA
Additional Question in comments below
Follow this code to work out frequency of each absV
# Attach to every row how often its absV value occurs for that player.
df %>%
  add_count(Player, absV, name = "freq") %>%
  ungroup()
#check it
df %>%
  add_count(Player, absV, name = "freq") %>%
  ungroup() %>%
  select(Player, absV, freq)
Player absV freq
<chr> <dbl> <int>
1 Seth Lugo 10 1
2 Jacob deGrom 1 3
3 Rick Porcello 2 2
4 David Peterson 1 3
5 Michael Wacha 3 1
6 Seth Lugo 3 2
7 Jacob deGrom 1 3
8 Rick Porcello 7 1
9 David Peterson 5 1
10 Steven Matz 13 1
# ... with 20 more rows
Using data.table
library(data.table)
# One row per player, one column per game number within that player.
dcast(
  data = setDT(starter),
  formula = Player ~ rowid(Player),
  value.var = "absV"
)

Concentrate data frame information in r

I have two data frames:
> df1
2013-04-1 2013-04-2 2013-04-3 2013-04-4 2013-04-5 2013-04-6 2013-04-7 2013-04-8 2013-04-9 2013-04-10 2013-04-11
bin_1 32 489 32 32 364 19 312 0 0 0 346
bin_2 8 346 8 0 98 8 12 12 46 364 346
bin_3 9 98 346 46 9 312 6 1912 0 489 0
bin_4 4 12 9 12 0 12 0 987 9 19 12
bin_5 0 0 8 8 0 0 312 6 312 12 4
df1 contains 5 rows (bins) and 23 columns (date)
> df2
orange apple pear banana watermelon lemon
2013-04-1 1 1 1 1 0 1
2013-04-2 1 1 0 1 0 0
2013-04-3 1 1 1 1 0 1
2013-04-4 0 1 0 1 1 1
2013-04-5 1 0 0 0 1 1
df2 contains 23 rows(date) and 6 columns (types of fruits)
So now, I want to concentrate these 2 dfs into 1 big data frame that contains all the information, like:
> df3
orange apple pear banana watermelon lemon
bin_1 ? ? ? ? ? ?
bin_2 ? ? ? ? ? ?
bin_3 ? ? ? ? ? ?
bin_4 ? ? ? ? ? ?
bin_5 ? ? ? ? ? ?
But how can i concentrate the data? So for example,
on 2013-04-1,
bin_1 contains 32 fruits, bin_2 contains 8 fruits, ..., bin_5 contains 0 fruits (based on df1)
only orange, apple, pear, banana, and lemon are available (based on df2)
Q. I want my df3 to contain concentrate information, like bin_1 on average contain x amount of oranges, ...etc .How can I model this?
Code:
> dput(df1)
structure(list(`2013-04-1` = c(32, 8, 9, 4, 0), `2013-04-2` = c(489,
346, 98, 12, 0), `2013-04-3` = c(32, 8, 346, 9, 8), `2013-04-4` = c(32,
0, 46, 12, 8), `2013-04-5` = c(364, 98, 9, 0, 0), `2013-04-6` = c(19,
8, 312, 12, 0), `2013-04-7` = c(312, 12, 6, 0, 312), `2013-04-8` = c(0,
12, 1912, 987, 6), `2013-04-9` = c(0, 46, 0, 9, 312), `2013-04-10` = c(0,
364, 489, 19, 12), `2013-04-11` = c(346, 346, 0, 12, 4), `2013-04-12` = c(0,
9, 12, 46, 489), `2013-04-13` = c(32, 8, 19, 46, 0), `2013-04-14` = c(0,
987, 12, 0, 6), `2013-04-15` = c(0, 346, 4, 346, 0), `2013-04-16` = c(0,
1912, 1912, 12, 364), `2013-04-17` = c(12, 98, 32, 32, 1912),
`2013-04-18` = c(12, 12, 12, 0, 346), `2013-04-19` = c(9,
46, 98, 312, 4), `2013-04-20` = c(32, 987, 46, 9, 312), `2013-04-21` = c(4,
98, 12, 32, 12), `2013-04-22` = c(19, 0, 4, 346, 0), `2013-04-23` = c(1912,
364, 0, 0, 489)), row.names = c("bin_1", "bin_2", "bin_3",
"bin_4", "bin_5"), class = "data.frame")
> dput(df2)
structure(list(orange = c(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0), apple = c(1, 1, 1, 1, 0, 1,
0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0), pear = c(1,
0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1,
0), banana = c(1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1,
0, 0, 1, 1, 0, 1, 0), watermelon = c(0, 0, 0, 1, 1, 0, 1, 1,
1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0), lemon = c(1, 0,
1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0
)), row.names = c("2013-04-1", "2013-04-2", "2013-04-3", "2013-04-4",
"2013-04-5", "2013-04-6", "2013-04-7", "2013-04-8", "2013-04-9",
"2013-04-10", "2013-04-11", "2013-04-12", "2013-04-13", "2013-04-14",
"2013-04-15", "2013-04-16", "2013-04-17", "2013-04-18", "2013-04-19",
"2013-04-20", "2013-04-21", "2013-04-22", "2013-04-23"), class = "data.frame")

Calculate row similarity percentage pair wise and add it as a new column

I have a data frame like this sample. I would like to find similar rows (not duplicates) and calculate their similarity pairwise. I found this solution, but I would like to keep all my columns and add the similarity percentage as a new variable. My aim is to find the records with the highest similarity percentage. How could I do it?
sample data set
df <- tibble::tribble(
~date, ~user_id, ~Station_id, ~location_id, ~ind_id, ~start_hour, ~start_minute, ~start_second, ~end_hour, ~end_minute, ~end_second, ~duration_min,
20191015, 19900234, 242, 2, "ac", 7, 25, 0, 7, 30, 59, 6,
20191015, 19900234, 242, 2, "ac", 7, 31, 0, 7, 32, 59, 2,
20191015, 19900234, 242, 2, "ac", 7, 33, 0, 7, 38, 59, 6,
20191015, 19900234, 242, 2, "ac", 7, 39, 0, 7, 40, 59, 2,
20191015, 19900234, 242, 2, "ac", 7, 41, 0, 7, 43, 59, 3,
20191015, 19900234, 242, 2, "ac", 7, 44, 0, 7, 45, 59, 2,
20191015, 19900234, 242, 2, "ac", 7, 47, 0, 7, 59, 59, 13,
20191015, 19900234, 242, 2, "ad", 7, 47, 0, 7, 59, 59, 13,
20191015, 19900234, 242, 2, "ac", 8, 5, 0, 8, 6, 59, 2,
20191015, 19900234, 242, 2, "ad", 8, 5, 0, 8, 6, 59, 2,
20191015, 19900234, 242, 2, "ac", 8, 7, 0, 8, 8, 59, 2,
20191015, 19900234, 242, 2, "ad", 8, 7, 0, 8, 8, 59, 2,
20191015, 19900234, 242, 2, "ac", 16, 26, 0, 16, 55, 59, 30,
20191015, 19900234, 242, 2, "ad", 16, 26, 0, 16, 55, 59, 30,
20191015, 19900234, 242, 2, "ad", 17, 5, 0, 17, 6, 59, 2,
20191015, 19900234, 242, 2, "ac", 17, 5, 0, 17, 23, 59, 19,
20191015, 19900234, 242, 2, "ad", 17, 7, 0, 17, 15, 59, 9,
20191015, 19900234, 242, 2, "ad", 17, 16, 0, 17, 22, 59, 7,
20191015, 19900234, 264, 2, "ac", 17, 24, 0, 17, 35, 59, 12,
20191015, 19900234, 264, 2, "ad", 17, 25, 0, 17, 35, 59, 11,
20191016, 19900234, 242, 1, "ac", 7, 12, 0, 7, 14, 59, 3,
20191016, 19900234, 242, 1, "ad", 7, 13, 0, 7, 13, 59, 1,
20191016, 19900234, 242, 1, "ac", 17, 45, 0, 17, 49, 59, 5,
20191016, 19900234, 242, 1, "ad", 17, 46, 0, 17, 48, 59, 3,
20191016, 19900234, 242, 2, "ad", 7, 14, 0, 8, 0, 59, 47,
20191016, 19900234, 242, 2, "ac", 7, 15, 0, 8, 0, 59, 47
)
Function for comparing rows
# Share of columns in which rows `x` and `y` of `df` hold equal values:
# the number of matching cells divided by the number of columns.
row_cf <- function(x, y, df) {
  same_cell <- df[x, ] == df[y, ]
  n_matching <- sum(same_cell)
  n_matching / ncol(df)
}
Function output
# 1) Create all possible row combinations
# 2) Rename
# 3) Run through each row
# 4) Calculate similarity
# Pair every row index with every row index, then score each pair.
# seq_len() is used instead of 1:nrow(df) so that an empty data frame gives
# an empty grid rather than the indices c(1, 0).
expand.grid(seq_len(nrow(df)), seq_len(nrow(df))) %>%
  rename(row_1 = Var1, row_2 = Var2) %>%
  rowwise() %>%
  mutate(similarity = row_cf(row_1, row_2, df))
# A tibble: 676 x 3
row_1 row_2 similarity
<int> <int> <dbl>
1 1 1 1
2 2 1 0.75
3 3 1 0.833
4 4 1 0.75
5 5 1 0.75
6 6 1 0.75
7 7 1 0.75
8 8 1 0.667
9 9 1 0.583
10 10 1 0.5
Edit:
I would like to find similar rows in the data like here
Using your "function output", call it sim. Eliminate the self-comparisons and then keep the max similarity row grouped by row_1:
# Drop the comparison of each row with itself, then keep, for every row_1,
# the single pairing with the highest similarity.
sim <- sim %>%
  filter(row_1 != row_2) %>%
  group_by(row_1) %>%
  slice(which.max(similarity))
Then you can add these to your original data:
# Give every original row its row number and attach its best match.
# The join key is spelled out so the join cannot silently pick up another
# shared column name and does not emit the "Joining, by = ..." message.
df %>%
  mutate(row_1 = row_number()) %>%
  left_join(sim, by = "row_1")
The row_2 column gives the row number of the most similar row, and similarity gives its similarity score. (You may want to improve these column names.)

FMI function for MI datasets is acting up?

I used the fmi function from the semTools package just a few weeks ago, and it worked great! Here is the code that I saved and that worked fine:
# Run mice() on dat1 with m = 37, method "pmm" and a fixed seed.
dat.imp2 <- mice(data = dat1, m = 37, method = "pmm", seed = 444)
# NOTE(review): `$imputations` looks like the element name of Amelia output;
# a mice() result presumably has no element of that name, in which case NULL
# is passed to fmi() -- that would be consistent with the
# "dims [product 0]" error quoted below. If a list of completed data sets is
# intended, something like mice::complete(dat.imp2, action = "all") may be
# needed -- TODO confirm against the mice and semTools documentation.
out <- fmi(dat.imp2$imputations)
out
I have used it to compare the loss of efficiency in using 4 source variables vs 1 composite, so I re-ran it twice - first with the 4 source variables and then with 1 composite, and it was much better for composite. Also, the output showed fmi for means and variances separately.
I came back to this code a few weeks later, and it doesn't work! The error message reads:
Error in dim(robj) <- c(dX, dY) :
dims [product 0] do not match the length of object [1]
So, I modified the code as follows:
# Same mice() call as before, stored under a different name.
imp2 <- mice(dat1, m = 37, method = "pmm", seed = 444)
# NOTE(review): `imp2$data` is presumably the original incomplete data set
# kept inside the mice() result, not the imputed data sets -- so fmi() would
# be analysing the raw data rather than the 37 imputations. TODO confirm
# against the mice documentation.
out <- fmi(imp2$data)
out
This works but only with the composite variable in the dataset, and only gives me fmi for means but not variances. If I substitute this composite variable with the four source variables it gives me the following error:
Warning message:
In lavaan(slotOptions = object@Options, slotParTable = object@ParTable, :
lavaan WARNING: model has NOT converged!
I don't understand how the code that worked a couple weeks ago does not work now? Did anyone come across this problem? I wasn't able to find much online.
Thank you!
Here is the dataset with one composite variable instead (mommh)
> dput(dat2)
structure(list(mompa = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0,
0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1,
0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
0, 1, 0, 0), format.spss = "F8.2", display_width = 10L), momabhx = structure(c(1,
0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0,
0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,
0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1), format.spss = "F8.2", display_width = 10L),
mommh = c(63, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 35.75, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 43.25, NA, NA, 63, 41.5,
34.25, 38.5, 39, 38.5, NA, 49.75, 57.5, 59.25, 50, 42.75,
45, 49, 32.75, NA, 35.75, 64.75, 50.5, 46.5, 39.75, 51.75,
34.75, 61.25, 46, 43, 56.25, 47, 42.25, 36.5, 34.5, 47, 50,
35, 48.25, 46.5, 58.5, 35.5, 55.25, 43.5, 42.75, 35.75, 38,
35.5, 50, 38.25, 57, 45.75, 38.5, 44.25, 51.75, NA, 38.25,
39.75, 34, 57.25, 39.25, 42.25, 37.25, NA, 32.75, 52.75,
NA, NA, 55.75, 62.25, 59.75, 43.75, 59.75, 35.75, NA, 34.25,
59.25, 39, 34.75, 32.75, NA, 53.5, NA, 40.5, 50, 33.5, 45.25,
41, 50, NA, 38.5, 61.5, 36.25, 46.25, 46, 44.75, 44.75, 62.5,
38.25, 49.5, 33.75, NA, 50.25, 43, 43.75, 42.25, 60.5, NA,
50.25, 54.75, 42.75, 45.75, 61, 58.25, 44.5, 46.5, 34.25,
56.75, 40.5, 47, 42.25, 48, 44, 36.75, 39.75, 48.75, 38.25,
49.25, 49.25, NA, NA, 34.25, 44.5, NA, 51, 44, 50.75, 56.25,
35, 55, 58.75, 56.5, 68.75, 54, 53, 41.5, 50.75, NA, 32.75,
46.75, 32.75, 43, 57, 55.25, NA, NA, 43.75, 55.5, NA, NA,
32.75, NA, NA, NA, NA, 60.5, 32.75, NA, 68.25, 50.5, 32.75,
66.5, 33, 38.5, 43, 43.75, 62.75, 47, 36.5, 39.5, 39.5),
risk6 = structure(c(0, 0, 0, 0, 3, 1, 1, 1, 1, 0, 1, 1, 0,
0, 0, 2, 1, 1, 0, 1, 0, 1, 0, 1, 2, 1, 1, 0, 0, 2, 1, 1,
2, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 2, 1, 3, 2, 2, 0, 0, 0,
2, 0, 2, 2, 1, 2, 2, 1, 3, 2, 3, 1, 1, 0, 1, 3, 1, 2, 2,
0, 1, 0, 0, 1, 3, 1, 0, 1, 0, 0, 1, 3, 0, 1, 1, 0, 0, 2,
3, 3, 1, 2, 3, 2, 0, 0, 4, 1, 2, 1, 3, 2, 1, 2, 0, 1, 1,
2, 1, 1, 0, 0, 0, 0, 0, 1, 1, 2, 2, 1, 2, 1, 1, 2, 1, 1,
NA, 1, 1, 1, 2, 0, NA, 3, 0, 2, 2, 3, 4, 4, 0, 1, 0, 2, 3,
2, 2, 2, 2, 1, 3, 2, 2, 3, 1, 1, 1, 0, 0, 1, 1, 0, 2, 0,
1, 2, 3, 1, 1, 1, 2, 1, 2, 0, 0, 2, 2, 0, 1, 2, 0, 0, 2,
1, 1, 1, 1, 1, 3, 1, 0, 3, 0, 1, 0, 1, 1, 1, 2, 2, 0, 2,
3, 3, 0, 0, 0, 1, 2, 1, 1, 1, 0, 1, 1, 3, 1, 1, 0, 0, 3,
2, 0, 0, 3, 2, 1, 3, 1, 0, 3, 0, 1, 1, 2, 3, 3, 1, 4, 2,
3, 2, 2), format.spss = "F8.2", display_width = 10L), eadiff = structure(c(-1.26734803867686,
-0.355541076313792, 0.518653050779668, 1.50568568368194,
0.0940935989894723, 2.07356799670629, 1.01843817310907, -1.26734803867686,
-0.317928241044189, 0.531190662536203, 0.0940935989894723,
-1.47335895869369, -0.586627219843691, -1.26734803867686,
0.325179742519372, 0.556265886049271, 1.4179224013862, 1.2244490931259,
-0.586627219843691, 0.081555987232938, -0.149530156296961,
-0.380616299826861, -0.805175751617057, -0.368078688070326,
0.0940935989894723, -0.124454932783893, 0.955750114326398,
-0.805175751617057, 0.531190662536203, -0.830250975130125,
0.968287726082933, 0.749739194309568, -0.368078688070326,
-1.03626189514696, 3.19138587908619, -0.574089608087157,
1.67408376842917, -0.586627219843691, -0.343003464557258,
-0.162067768053496, 0.325179742519372, -1.24227281516379,
-1.03626189514696, 0.749739194309568, 0.325179742519372,
0.556265886049271, 0.762276806066102, -0.817713363373591,
-0.805175751617057, 0.119168822502541, -0.805175751617057,
-0.149530156296961, 0.0940935989894723, -1.48589657045022,
1.01843817310907, 0.312642130762837, 1.21191148136937, -0.355541076313792,
-1.04879950690349, -0.368078688070326, -0.124454932783893,
0.312642130762837, -1.25481042692032, -0.136992544540427,
1.01843817310907, -0.124454932783893, -0.368078688070326,
-0.805175751617057, 0.081555987232938, -0.805175751617057,
0.325179742519372, 2.97283734731282, 0.337717354275906, 0.0690183754764037,
-0.136992544540427, -0.830250975130125, 3.03552540609549,
0.0940935989894723, 0.0690183754764037, -0.124454932783893,
-0.817713363373591, -0.355541076313792, 0.312642130762837,
0.980825337839467, -0.343003464557258, 0.993362949596001,
-0.586627219843691, -0.574089608087157, -1.02372428339042,
-0.561551996330623, -0.111917321027358, -0.136992544540427,
-0.149530156296961, -0.830250975130125, 0.568803497805805,
0.0690183754764037, -0.805175751617057, -0.830250975130125,
0.556265886049271, 0.968287726082933, 0.531190662536203,
0.312642130762837, 0.337717354275906, 0.774814417822636,
0.337717354275906, 0.337717354275906, -0.586627219843691,
0.106631210746007, -1.02372428339042, -0.574089608087157,
-0.355541076313792, 0.737201582553033, 0.325179742519372,
0.312642130762837, 0.556265886049271, 0.0940935989894723,
0.300104519006303, -0.330465852800723, 0.0940935989894723,
-0.355541076313792, -0.599164831600226, 0.312642130762837,
0.531190662536203, -1.25481042692032, 0.531190662536203,
1.89263230020253, -0.817713363373591, -1.02372428339042,
0.980825337839467, -0.149530156296961, -0.586627219843691,
1.23698670488244, 0.556265886049271, 0.325179742519372, -0.817713363373591,
1.01843817310907, -1.02372428339042, -0.805175751617057,
-0.355541076313792, 1.67408376842917, 0.0690183754764037,
-0.368078688070326, -0.124454932783893, 0.980825337839467,
-1.03626189514696, 0.119168822502541, -1.03626189514696,
-1.03626189514696, 1.4555352366558, -0.136992544540427, -1.04879950690349,
0.749739194309568, -0.792638139860522, 0.312642130762837,
-0.0993797092708241, -0.17460537981003, -0.343003464557258,
-0.586627219843691, 0.300104519006303, -0.355541076313792,
-0.805175751617057, 0.518653050779668, -1.26734803867686,
-1.25481042692032, -0.368078688070326, -0.805175751617057,
-0.343003464557258, -0.343003464557258, -0.599164831600226,
-0.124454932783893, 1.66154615667263, -0.586627219843691,
-0.586627219843691, -0.124454932783893, 0.955750114326398,
-0.355541076313792, -0.343003464557258, 0.0940935989894723,
-0.792638139860522, -0.599164831600226, NA, -0.586627219843691,
-1.26734803867686, 0.762276806066102, 1.2244490931259, 0.081555987232938,
-0.574089608087157, -1.01118667163389, 0.312642130762837,
0.081555987232938, -0.368078688070326, -1.26734803867686,
1.63647093315956, -0.368078688070326, 0.531190662536203,
0.081555987232938, 0.543728274292737, 0.0564807637198694,
0.955750114326398, -1.25481042692032, 1.44299762489927, -1.04879950690349,
0.106631210746007, -0.586627219843691, 0.0940935989894723,
-0.162067768053496, 0.0940935989894723, -0.111917321027358,
0.968287726082933, 0.0940935989894723, 0.312642130762837,
-0.586627219843691, 0.543728274292737, -0.124454932783893,
0.543728274292737, -0.817713363373591, -0.586627219843691,
-0.368078688070326, 0.0940935989894723, -0.599164831600226,
-1.03626189514696, 0.774814417822636, 0.106631210746007,
-0.111917321027358, -0.817713363373591, -0.330465852800723,
0.993362949596001, -0.368078688070326, 1.19937386961283,
0.531190662536203, 0.749739194309568, 1.6490085449161, 0.0690183754764037,
-0.574089608087157, -0.368078688070326, 1.00590056135254,
1.4555352366558, -0.574089608087157, -0.586627219843691,
-0.817713363373591, -0.817713363373591, 0.0940935989894723,
-0.792638139860522, 0.0690183754764037), format.spss = "F8.2", display_width = 10L)), .Names = c("mompa",
"momabhx", "mommh", "risk6", "eadiff"), row.names = c(NA, -244L
), class = "data.frame")
And here is the same dataset with 4 source variables (depr, anxt, host, bpsipdr1)
> dput(dat3)
structure(list(mompa = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0,
0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1,
0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
0, 1, 0, 0), format.spss = "F8.2", display_width = 10L), momabhx = structure(c(1,
0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0,
0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,
0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1), format.spss = "F8.2", display_width = 10L),
risk6 = structure(c(0, 0, 0, 0, 3, 1, 1, 1, 1, 0, 1, 1, 0,
0, 0, 2, 1, 1, 0, 1, 0, 1, 0, 1, 2, 1, 1, 0, 0, 2, 1, 1,
2, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 2, 1, 3, 2, 2, 0, 0, 0,
2, 0, 2, 2, 1, 2, 2, 1, 3, 2, 3, 1, 1, 0, 1, 3, 1, 2, 2,
0, 1, 0, 0, 1, 3, 1, 0, 1, 0, 0, 1, 3, 0, 1, 1, 0, 0, 2,
3, 3, 1, 2, 3, 2, 0, 0, 4, 1, 2, 1, 3, 2, 1, 2, 0, 1, 1,
2, 1, 1, 0, 0, 0, 0, 0, 1, 1, 2, 2, 1, 2, 1, 1, 2, 1, 1,
NA, 1, 1, 1, 2, 0, NA, 3, 0, 2, 2, 3, 4, 4, 0, 1, 0, 2, 3,
2, 2, 2, 2, 1, 3, 2, 2, 3, 1, 1, 1, 0, 0, 1, 1, 0, 2, 0,
1, 2, 3, 1, 1, 1, 2, 1, 2, 0, 0, 2, 2, 0, 1, 2, 0, 0, 2,
1, 1, 1, 1, 1, 3, 1, 0, 3, 0, 1, 0, 1, 1, 1, 2, 2, 0, 2,
3, 3, 0, 0, 0, 1, 2, 1, 1, 1, 0, 1, 1, 3, 1, 1, 0, 0, 3,
2, 0, 0, 3, 2, 1, 3, 1, 0, 3, 0, 1, 1, 2, 3, 3, 1, 4, 2,
3, 2, 2), format.spss = "F8.2", display_width = 10L), eadiff = structure(c(-1.26734803867686,
-0.355541076313792, 0.518653050779668, 1.50568568368194,
0.0940935989894723, 2.07356799670629, 1.01843817310907, -1.26734803867686,
-0.317928241044189, 0.531190662536203, 0.0940935989894723,
-1.47335895869369, -0.586627219843691, -1.26734803867686,
0.325179742519372, 0.556265886049271, 1.4179224013862, 1.2244490931259,
-0.586627219843691, 0.081555987232938, -0.149530156296961,
-0.380616299826861, -0.805175751617057, -0.368078688070326,
0.0940935989894723, -0.124454932783893, 0.955750114326398,
-0.805175751617057, 0.531190662536203, -0.830250975130125,
0.968287726082933, 0.749739194309568, -0.368078688070326,
-1.03626189514696, 3.19138587908619, -0.574089608087157,
1.67408376842917, -0.586627219843691, -0.343003464557258,
-0.162067768053496, 0.325179742519372, -1.24227281516379,
-1.03626189514696, 0.749739194309568, 0.325179742519372,
0.556265886049271, 0.762276806066102, -0.817713363373591,
-0.805175751617057, 0.119168822502541, -0.805175751617057,
-0.149530156296961, 0.0940935989894723, -1.48589657045022,
1.01843817310907, 0.312642130762837, 1.21191148136937, -0.355541076313792,
-1.04879950690349, -0.368078688070326, -0.124454932783893,
0.312642130762837, -1.25481042692032, -0.136992544540427,
1.01843817310907, -0.124454932783893, -0.368078688070326,
-0.805175751617057, 0.081555987232938, -0.805175751617057,
0.325179742519372, 2.97283734731282, 0.337717354275906, 0.0690183754764037,
-0.136992544540427, -0.830250975130125, 3.03552540609549,
0.0940935989894723, 0.0690183754764037, -0.124454932783893,
-0.817713363373591, -0.355541076313792, 0.312642130762837,
0.980825337839467, -0.343003464557258, 0.993362949596001,
-0.586627219843691, -0.574089608087157, -1.02372428339042,
-0.561551996330623, -0.111917321027358, -0.136992544540427,
-0.149530156296961, -0.830250975130125, 0.568803497805805,
0.0690183754764037, -0.805175751617057, -0.830250975130125,
0.556265886049271, 0.968287726082933, 0.531190662536203,
0.312642130762837, 0.337717354275906, 0.774814417822636,
0.337717354275906, 0.337717354275906, -0.586627219843691,
0.106631210746007, -1.02372428339042, -0.574089608087157,
-0.355541076313792, 0.737201582553033, 0.325179742519372,
0.312642130762837, 0.556265886049271, 0.0940935989894723,
0.300104519006303, -0.330465852800723, 0.0940935989894723,
-0.355541076313792, -0.599164831600226, 0.312642130762837,
0.531190662536203, -1.25481042692032, 0.531190662536203,
1.89263230020253, -0.817713363373591, -1.02372428339042,
0.980825337839467, -0.149530156296961, -0.586627219843691,
1.23698670488244, 0.556265886049271, 0.325179742519372, -0.817713363373591,
1.01843817310907, -1.02372428339042, -0.805175751617057,
-0.355541076313792, 1.67408376842917, 0.0690183754764037,
-0.368078688070326, -0.124454932783893, 0.980825337839467,
-1.03626189514696, 0.119168822502541, -1.03626189514696,
-1.03626189514696, 1.4555352366558, -0.136992544540427, -1.04879950690349,
0.749739194309568, -0.792638139860522, 0.312642130762837,
-0.0993797092708241, -0.17460537981003, -0.343003464557258,
-0.586627219843691, 0.300104519006303, -0.355541076313792,
-0.805175751617057, 0.518653050779668, -1.26734803867686,
-1.25481042692032, -0.368078688070326, -0.805175751617057,
-0.343003464557258, -0.343003464557258, -0.599164831600226,
-0.124454932783893, 1.66154615667263, -0.586627219843691,
-0.586627219843691, -0.124454932783893, 0.955750114326398,
-0.355541076313792, -0.343003464557258, 0.0940935989894723,
-0.792638139860522, -0.599164831600226, NA, -0.586627219843691,
-1.26734803867686, 0.762276806066102, 1.2244490931259, 0.081555987232938,
-0.574089608087157, -1.01118667163389, 0.312642130762837,
0.081555987232938, -0.368078688070326, -1.26734803867686,
1.63647093315956, -0.368078688070326, 0.531190662536203,
0.081555987232938, 0.543728274292737, 0.0564807637198694,
0.955750114326398, -1.25481042692032, 1.44299762489927, -1.04879950690349,
0.106631210746007, -0.586627219843691, 0.0940935989894723,
-0.162067768053496, 0.0940935989894723, -0.111917321027358,
0.968287726082933, 0.0940935989894723, 0.312642130762837,
-0.586627219843691, 0.543728274292737, -0.124454932783893,
0.543728274292737, -0.817713363373591, -0.586627219843691,
-0.368078688070326, 0.0940935989894723, -0.599164831600226,
-1.03626189514696, 0.774814417822636, 0.106631210746007,
-0.111917321027358, -0.817713363373591, -0.330465852800723,
0.993362949596001, -0.368078688070326, 1.19937386961283,
0.531190662536203, 0.749739194309568, 1.6490085449161, 0.0690183754764037,
-0.574089608087157, -0.368078688070326, 1.00590056135254,
1.4555352366558, -0.574089608087157, -0.586627219843691,
-0.817713363373591, -0.817713363373591, 0.0940935989894723,
-0.792638139860522, 0.0690183754764037), format.spss = "F8.2", display_width = 10L),
host = structure(c(68, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 38, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 41, 49, 41, 78, 41, 41,
49, 41, 45, 45, 51, 71, 73, 62, 51, 51, 65, 38, NA, 38, 70,
58, 45, 38, 64, 38, 72, 55, 45, 60, 58, 38, 38, 38, 38, 45,
38, 38, 51, 60, 38, 68, 51, 60, 38, 45, 38, 38, 38, 68, 45,
38, 51, 51, NA, 45, 38, 38, 66, 38, 45, 38, 65, 38, 51, NA,
NA, 60, 71, 70, 45, 71, 38, NA, 38, 55, 38, 38, 38, NA, 62,
58, 38, 58, 38, 51, 38, 72, 64, 45, 71, 45, 45, 51, 45, 45,
75, 38, 51, 38, 58, 55, 55, 38, 38, 70, 55, 65, 64, 55, 55,
69, 68, 55, 38, 38, 55, 45, 58, 38, 64, 38, 51, 45, 45, 38,
45, 62, 66, NA, 38, 45, 58, 58, 51, 65, 64, 38, 60, 60, 70,
75, 65, 62, 51, 62, NA, 38, 58, 38, 45, 38, 65, NA, 64, 38,
51, NA, NA, 38, NA, NA, NA, NA, 70, 38, NA, 75, 55, 38, 71,
38, 38, 55, 55, 58, 58, 45, 45, 45), format.spss = "F2.0", display_width = 11L),
anxt = structure(c(73, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 39, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 51, 51, 51, 66, 51, 40,
40, 51, 55, 62, 55, 55, 67, 51, 55, 55, 65, 39, NA, 39, 62,
59, 59, 39, 67, 39, 62, 51, 51, 65, 51, 51, 51, 39, 51, 59,
39, 59, 55, 71, 39, 53, 51, 51, 51, 51, 39, 55, 39, 65, 59,
51, 39, 65, NA, 39, 51, 39, 65, 51, 51, 39, 59, 39, 67, NA,
NA, 59, 70, 67, 39, 65, 39, NA, 39, 65, 51, 39, 39, NA, 62,
65, 55, 39, 39, 59, 39, 70, NA, 55, 67, 39, 51, 55, 51, 55,
70, 55, 56, 39, 70, 55, 51, 51, 51, 62, NA, 59, 62, 55, 59,
62, 59, 51, 51, 39, 65, 39, 55, 62, 51, 55, 39, 39, 62, 51,
55, 62, NA, NA, 39, 51, NA, 62, 39, 62, 59, 39, 59, 71, 51,
74, 59, 51, 51, 62, NA, 39, 51, 39, 51, 72, 62, NA, 62, 55,
62, NA, NA, 39, NA, NA, NA, NA, 70, 39, NA, 70, 65, 39, 73,
39, 51, 51, 55, 74, 62, 39, 51, 51), format.spss = "F2.0", display_width = 11L),
depr = structure(c(71, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 42, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 53, 68, NA, 71, 44, 44,
44, 44, 42, 42, 61, 64, 57, 70, 42, 50, 54, 42, NA, 50, 78,
57, 50, 42, 54, 42, 68, 42, 54, 64, 54, 54, 42, 42, 68, 64,
42, 65, 50, 68, 42, 70, 50, 42, 42, 42, 42, 50, 42, 60, 54,
42, 56, 64, NA, 42, 42, 42, 61, 42, 42, 42, 57, 42, 64, 57,
NA, 68, 73, 72, 57, 68, 42, 57, 42, 65, 50, 42, 42, NA, 62,
62, 42, 61, 42, 54, 54, 42, 71, 42, 64, 42, 54, 54, 57, 42,
70, 42, 54, 42, 57, 57, 54, 60, 54, 71, 60, 54, 60, 42, 42,
68, 70, 50, 60, 42, 69, 54, 42, 42, 57, 50, 42, 42, 54, 42,
65, 57, 68, NA, 42, 60, 64, 50, 60, 50, 70, 42, 65, 64, 65,
71, 64, 62, 42, 62, NA, 42, 42, 42, 57, 70, 61, NA, 57, 42,
68, NA, NA, 42, NA, NA, NA, NA, 65, 42, NA, 75, 61, 42, 75,
42, 42, 54, 50, 72, 42, 50, 42, 42), format.spss = "F2.0", display_width = 11L),
bpsipdr1 = structure(c(40, 26, 34, 29, 23, 41, 37, 21, 38,
NA, 33, 28, 25, NA, NA, 15, 18, 30, NA, NA, 28, 34, NA, 51,
24, 28, 23, 12, 39, 55, 28, NA, 26, 18, 33, NA, 27, 32, 27,
23, 28, 41, NA, 22, 21, 26, 26, 36, 16, 24, 24, 25, 23, 24,
26, 35, 32, 27, 38, 25, 26, 32, 27, 41, 28, NA, 27, 37, 30,
12, 21, 20, 12, NA, 32, 40, 40, 17, 23, 24, 12, 12, NA, 16,
49, 28, 32, 40, 22, 20, 43, 36, 22, 36, 25, 26, 15, 19, 31,
32, 21, 31, 30, 35, 23, 30, 22, 18, 12, 14, 23, 57, 34, 35,
25, 23, 31, 27, 33, 27, 28, 17, 37, 26, 31, 30, NA, 12, 29,
NA, 19, 36, 35, 30, 34, 35, 24, 34, 18, 52, 17, 20, 12, 25,
28, NA, 27, 42, 15, 17, 33, 16, 30, 12, 44, 19, 35, 24, 26,
37, 35, 18, 37, 16, NA, 34, 12, 26, 26, 39, 34, 23, 33, 19,
27, 45, 36, 22, 37, 18, 38, 24, 33, 27, 20, 33, 15, 33, 34,
22, 32, 16, 30, 24, 18, 22, 42, 34, 26, 26, 32, 21, 36, 40,
40, 55, 28, 37, 22, 17, 41, 12, 36, 12, 19, 48, 33, 26, NA,
40, 41, 24, 27, 12, 36, 24, 38, NA, 37, 12, 20, 53, 21, 12,
47, 13, 23, 12, 15, 47, 26, 12, 20, 20), format.spss = "F3.0", display_width = 11L)), .Names = c("mompa",
"momabhx", "risk6", "eadiff", "host", "anxt", "depr", "bpsipdr1"
), row.names = c(NA, -244L), class = "data.frame")

Resources