Related
Here is my data:
How do I make it so that the column names appear on the x axis? I will probably use the facet function so that the number values aren't next to the duration values, so one graph will have these on the x axis: "Number Looks", "Number Gesture", "Number Reach", "Number Other" for group A, and another graph will have these on the x axis: "Duration Looks", "Duration Gesture", "Duration Reach", "Duration Other" for group A, with the data below the column titles as the y-axis values. I will also have to generate the data for group B in the same way
Here is how we could achieve your task:
Bring your data in the correct format with pivot_longer
Use filter for each number and Duration
Now you have to separate dataframes
plot them individually with ggplot2 using facet_wrap for group A and B
The output arranged with plot_grid from cowplot package!
library(cowplot)
library(tidyverse)
df_number <- df %>%
pivot_longer(
cols = 3:12,
names_to = "names",
values_to = "values"
) %>%
filter(grepl('Number', names))
df_Duration <- df %>%
pivot_longer(
cols = 3:12,
names_to = "names",
values_to = "values"
) %>%
filter(grepl('Duration', names))
plot_number <- ggplot(df_number, aes(x=factor(names), y=values)) +
geom_bar(stat = "identity") +
xlab("Number") +
ylab("Value") +
facet_wrap(~Group) +
theme_bw()
plot_Duration <- ggplot(df_Duration, aes(x=factor(names), y=values)) +
geom_bar(stat = "identity") +
xlab("Duration") +
ylab("Value") +
facet_wrap(~Group) +
theme_bw()
plot_grid(plot_number, plot_Duration, labels = "AUTO")
data:
df <- structure(list(Participant = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17), Group = c("A", "A", "A", "A", "A",
"A", "A", "A", "B", "B", "B", "B", "B", "B", "B", "B", "B"),
Number_Looks = c(47, 94, 23, 64, 99, 38, 85, 38, 20, 10,
34, 54, 87, 78, 45, 63, 32), Duration_Look = c(247, 294,
223, 264, 299, 238, 285, 238, 220, 210, 234, 254, 287, 278,
245, 263, 232), Number_Gesture = c(39, 86, 15, 56, 91, 30,
77, 30, 12, 20, 26, 46, 79, 70, 37, 55, 24), Duration_Gesture = c(29,
76, 5, 46, 81, 20, 67, 20, 20, 10, 16, 36, 69, 60, 27, 45,
14), Number_Reach = c(40, 87, 16, 57, 92, 31, 78, 31, 13,
21, 27, 47, 80, 71, 38, 56, 25), Duration_Reach = c(89, 136,
65, 106, 141, 80, 127, 80, 80, 70, 76, 96, 129, 120, 87,
105, 74), Number_Other = c(52, 99, 28, 69, 104, 43, 90, 43,
25, 33, 39, 59, 92, 83, 50, 68, 37), Duration_Other = c(339,
386, 315, 356, 391, 330, 377, 330, 330, 320, 326, 346, 379,
370, 337, 355, 324), Number_Sound = c(152, 199, 128, 169,
204, 143, 190, 143, 125, 133, 139, 159, 192, 183, 150, 168,
137), Duration_Sound = c(319, 366, 295, 336, 371, 310, 357,
310, 310, 300, 306, 326, 359, 350, 317, 335, 304)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -17L))
I have some missing data that I am trying to impute to the mean of each column. My code,
apply(train_new, 2, function(x)
mutate(
ifelse(is.na(x) | x < 0, mean(x), x)
)
)
is meant to impute all 17 columns to the mean of each column in one fell swoop, but this returns Error during wrapup: no applicable method for 'mutate_' applied to an object of class "c('double', 'numeric')", and leads me to a debug screen. I'm sure this is just a syntactical issue, but I'm at a loss as to where it is.
Sample data:
structure(list(INDEX = c(1, 2, 3, 4, 5, 6), TARGET_WINS = c(39,
70, 86, 70, 82, 75), TEAM_BATTING_H = c(1445, 1339, 1377, 1387,
1297, 1279), TEAM_BATTING_2B = c(194, 219, 232, 209, 186, 200
), TEAM_BATTING_3B = c(39, 22, 35, 38, 27, 36), TEAM_BATTING_HR = c(13,
190, 137, 96, 102, 92), TEAM_BATTING_BB = c(457.7607, 685, 602,
451, 472, 443), TEAM_BATTING_SO = c(842, 1075, 917, 922, 920,
973), TEAM_BASERUN_SB = c(97.288, 37, 46, 43, 49, 107), TEAM_BASERUN_CS = c(NA,
28, 27, 30, 39, 59), TEAM_PITCHING_H = c(NA, 1347, 1377, 1396,
1297, 1279), TEAM_PITCHING_HR = c(84, 191, 137, 97, 102, 92),
TEAM_PITCHING_BB = c(530.9595, 689, 602, 454, 472, 443),
TEAM_PITCHING_SO = c(737.105, 1082, 917, 928, 920, 973),
TEAM_FIELDING_E = c(NA, 193, 175, 164, 138, 123), TEAM_FIELDING_DP = c(146.234708045,
155, 153, 156, 168, 149), TEAM_BATTING_1B = c(1199, 908,
973, 1044, 982, 951)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
You could try:
library(dplyr)
train_new %>%
mutate_all(funs(ifelse(is.na(.) | . < 0, mean(., na.rm = T), .)))
Here is one option with na.aggregate (from zoo)
library(zoo)
na.aggregate(replace(train_new, train_new < 0, NA))
df <- structure(list(V = structure(c(4L, 5L, 3L, 7L, 6L, 2L, 1L), .Label = c("132 B26,172 B27,107 B57,104 B59,137 B60,133 B61,103 B62,134 B63,177 B100,123 B133,184 B168,109 B197,103 B198,173 B202,157 B203,143 B266,62 B342,62 B354,92 B355,195 B368,164 B370,52 B468,74 B469,71 B484,98 B494,66 B502,63 B601,133 B622",
"135A,510A,511A,60 B23,67 B24,70 B25,95 B26,122 B27,123 B27,109 B60",
"25A,28 B55,31 B56,45 B57,43 B58,5 B59,47 B59,6 B60,69 B60,66 B61",
"267 B361,786 B363,543 B392", "563 B202,983 B360", "8 B1,12 B35,10 B71,9 B154,51 B179",
"91 B26,117 B27,117 B28,102 B29,47 B31,96 B63,78 B64,133 B65,117 B66,121 B66,112 B67,127 B100"
), class = "factor")), .Names = "V", class = "data.frame", row.names = c(NA,
-7L))
I want to have an output like this
V
361, 363, 392
202,360
55,56,57,58,59,59,60,60,61
26,27,28,29,31,63,64,65,66,66,67,100
1,35,71,154,179
23,24,25,26,27,27,60
26,27,57,59,60,61,62,63,100,133,168,197,198,202,203,266,342,354,355,368,370,468,469,484,494,502,601,622
I have tried for one string which works
s = "267 B361"
s1 = unlist(strsplit(s, split='B', fixed=TRUE))[2]
but I don't know how to apply it on all strings which are separated by a comma in each row
We can use str_extract_all to get the numbers that follow a non-numeric character. The output will be a list, so loop over the list with sapply and paste the elements in the list together (toString is a wrapper for paste(., collapse=', ')).
library(stringr)
sapply(str_extract_all(df$V, "(?<=[A-Z])\\d+"), toString)
#[1] "361, 363, 392"
#[2] "202, 360"
#[3] "55, 56, 57, 58, 59, 59, 60, 60, 61"
#[4] "26, 27, 28, 29, 31, 63, 64, 65, 66, 66, 67, 100"
#[5] "1, 35, 71, 154, 179"
#[6] "23, 24, 25, 26, 27, 27, 60"
#[7] "26, 27, 57, 59, 60, 61, 62, 63, 100, 133, 168, 197, 198, 202, 203, 266, 342, 354, 355, 368, 370, 468, 469, 484, 494, 502, 601, 622"
I have the following 3 tables:
AggData <- structure(list(Path = c("NonBrand", "Brand", "NonBrand,NonBrand",
"Brand,Brand", "NonBrand,NonBrand,NonBrand", "Brand,Brand,Brand",
"Brand,NonBrand", "NonBrand,Brand", "NonBrand,NonBrand,NonBrand,NonBrand",
"Brand,Brand,Brand,Brand", "NonBrand,NonBrand,NonBrand,NonBrand,NonBrand",
"Brand,Brand,Brand,Brand,Brand", "Brand,Brand,NonBrand", "NonBrand,Brand,Brand",
"Brand,NonBrand,NonBrand", "NonBrand,NonBrand,NonBrand,NonBrand,NonBrand,NonBrand",
"NonBrand,NonBrand,Brand", "Brand,NonBrand,Brand", "NonBrand,Brand,NonBrand",
"NonBrand,NonBrand,NonBrand,NonBrand,NonBrand,NonBrand,NonBrand",
"Brand,Brand,Brand,Brand,Brand,Brand", "NonBrand,NonBrand,NonBrand,NonBrand,NonBrand,NonBrand,NonBrand,NonBrand",
"NonBrand,Brand,Brand,Brand", "NonBrand,NonBrand,NonBrand,Brand",
"Brand,Brand,Brand,NonBrand", "Brand,Brand,Brand,Brand,Brand,Brand,Brand",
"Brand,NonBrand,NonBrand,NonBrand", "NonBrand,NonBrand,Brand,Brand",
"Brand,Brand,NonBrand,NonBrand", "Brand,NonBrand,Brand,Brand",
"NonBrand,NonBrand,NonBrand,NonBrand,NonBrand,NonBrand,NonBrand,NonBrand,NonBrand",
"Brand,Brand,NonBrand,Brand", "NonBrand,Brand,NonBrand,NonBrand",
"Brand,Brand,Brand,Brand,Brand,Brand,Brand,Brand", "NonBrand,NonBrand,NonBrand,NonBrand,NonBrand,NonBrand,NonBrand,NonBrand,NonBrand,NonBrand",
"NonBrand,NonBrand,Brand,NonBrand", "Brand,NonBrand,NonBrand,Brand",
"NonBrand,Brand,Brand,Brand,Brand", "NonBrand,NonBrand,NonBrand,NonBrand,Brand",
"Brand,NonBrand,Brand,NonBrand", "NonBrand,Brand,Brand,NonBrand",
"Brand,Brand,Brand,Brand,NonBrand", "Brand,NonBrand,NonBrand,NonBrand,NonBrand",
"Brand,Brand,Brand,Brand,Brand,Brand,Brand,Brand,Brand", "NonBrand,NonBrand,NonBrand,NonBrand,NonBrand,NonBrand,NonBrand,NonBrand,NonBrand,NonBrand,NonBrand",
"Brand,NonBrand,Brand,Brand,Brand", "NonBrand,Brand,NonBrand,Brand",
"Brand,Brand,Brand,NonBrand,Brand", "NonBrand,NonBrand,Brand,Brand,Brand",
"NonBrand,NonBrand,NonBrand,Brand,Brand", "Brand,Brand,NonBrand,Brand,Brand",
"Brand,Brand,Brand,NonBrand,NonBrand", "Brand,Brand,Brand,Brand,Brand,Brand,Brand,Brand,Brand,Brand",
"NonBrand,NonBrand,NonBrand,Brand,NonBrand", "Brand,Brand,NonBrand,NonBrand,NonBrand",
"NonBrand,Brand,Brand,Brand,Brand,Brand", "NonBrand,Brand,NonBrand,NonBrand,NonBrand",
"NonBrand,NonBrand,Brand,NonBrand,NonBrand", "NonBrand,NonBrand,NonBrand,NonBrand,NonBrand,Brand",
"Brand,NonBrand,NonBrand,NonBrand,NonBrand,NonBrand", "Brand,Brand,Brand,Brand,Brand,NonBrand",
"NonBrand,Brand,Brand,NonBrand,NonBrand", "Brand,NonBrand,NonBrand,Brand,Brand",
"NonBrand,NonBrand,NonBrand,NonBrand,Brand,Brand", "NonBrand,NonBrand,Brand,Brand,Brand,Brand",
"NonBrand,NonBrand,NonBrand,NonBrand,Brand,NonBrand", "NonBrand,NonBrand,Brand,NonBrand,Brand",
"Brand,NonBrand,NonBrand,Brand,NonBrand", "NonBrand,NonBrand,NonBrand,Brand,Brand,Brand",
"NonBrand,Brand,Brand,NonBrand,Brand", "Brand,NonBrand,NonBrand,NonBrand,NonBrand,Brand",
"Brand,Brand,NonBrand,NonBrand,NonBrand,NonBrand,NonBrand", "Brand,Brand,Brand,Brand,NonBrand,NonBrand,NonBrand"
), click_count = c(1799265, 874478, 198657, 128159, 45728, 30172,
20520, 17815, 16718, 9479, 6554, 3722, 3561, 3408, 3391, 3366,
3256, 2526, 1846, 1708, 1682, 1013, 951, 899, 881, 782, 780,
703, 642, 625, 615, 601, 453, 442, 414, 407, 362, 343, 313, 284,
281, 281, 271, 269, 268, 229, 223, 218, 215, 212, 204, 162, 161,
158, 155, 145, 132, 130, 115, 103, 102, 86, 77, 77, 72, 68, 68,
67, 58, 52, 32, 18, 18), conv_count = c(30938, 19652, 7401, 3803,
2014, 1072, 1084, 981, 652, 379, 230, 166, 205, 246, 254, 93,
239, 104, 112, 51, 76, 23, 66, 81, 55, 29, 62, 57, 50, 37, 17,
33, 38, 17, 8, 41, 33, 30, 24, 16, 26, 18, 16, 17, 7, 21, 10,
8, 27, 23, 11, 13, 6, 15, 14, 16, 8, 10, 6, 6, 11, 11, 8, 9,
8, 8, 9, 7, 7, 6, 6, 6, 7), CR = c(0.0171947989873643, 0.0224728352228415,
0.0372551684561833, 0.0296740767328085, 0.0440430370888733, 0.0355296301206417,
0.0528265107212476, 0.0550659556553466, 0.0389998803684651, 0.0399831205823399,
0.0350930729325603, 0.0445996775926921, 0.057568098848638, 0.0721830985915493,
0.0749041580654674, 0.0276292335115865, 0.0734029484029484, 0.0411718131433096,
0.0606717226435536, 0.0298594847775176, 0.0451843043995244, 0.0227048371174729,
0.0694006309148265, 0.0901001112347052, 0.0624290578887628, 0.0370843989769821,
0.0794871794871795, 0.0810810810810811, 0.0778816199376947, 0.0592,
0.0276422764227642, 0.0549084858569052, 0.0838852097130243, 0.0384615384615385,
0.0193236714975845, 0.100737100737101, 0.0911602209944751, 0.0874635568513119,
0.0766773162939297, 0.0563380281690141, 0.0925266903914591, 0.0640569395017794,
0.0590405904059041, 0.0631970260223048, 0.0261194029850746, 0.091703056768559,
0.0448430493273543, 0.036697247706422, 0.125581395348837, 0.108490566037736,
0.053921568627451, 0.0802469135802469, 0.0372670807453416, 0.0949367088607595,
0.0903225806451613, 0.110344827586207, 0.0606060606060606, 0.0769230769230769,
0.0521739130434783, 0.058252427184466, 0.107843137254902, 0.127906976744186,
0.103896103896104, 0.116883116883117, 0.111111111111111, 0.117647058823529,
0.132352941176471, 0.104477611940299, 0.120689655172414, 0.115384615384615,
0.1875, 0.333333333333333, 0.388888888888889)), .Names = c("Path",
"click_count", "conv_count", "CR"), row.names = c(NA, -73L), class = "data.frame")
another one here:
breakVector <- structure(list(breakVector = structure(c(1L, 1L), .Label = "NonBrand", class = "factor"),
CR = c(0.461541302855402, 0.538458697144598)), .Names = c("breakVector",
"CR"), row.names = c(NA, -2L), class = "data.frame")
and:
FinalTable <- structure(list(autribution_category = structure(c(2L, 1L), .Label = c("Brand",
"NonBrand"), class = "factor"), attributed_result = c(0, 0)), .Names = c("autribution_category",
"attributed_result"), row.names = 1:2, class = "data.frame")
when I run the following command:
if (FinalTable [2,1] == breakVector[1,1]) {
FinalTable$attributed_result[2] <- FinalTable$attributed_result[2] +
breakVector[1,2] * AggData$conv_count[3];
break}
I get the following error:
Error in Ops.factor(FinalTable[2, 1], breakVector[1, 1]) :
level sets of factors are different
This is pretty weird, since both values that im comparing are factors, I don't see any reason why R cant compare the two levels?
FinalTable[2,1] and breakVector[1,1] do not have the same levels:
> FinalTable[2,1]
[1] Brand
Levels: Brand NonBrand
> breakVector[1,1]
[1] NonBrand
Levels: NonBrand
This is easily fixed by using
breakVector[,1] <- factor(breakVector[,1], levels=c("Brand", "NonBrand"))
or, more generally
breakVector[,1] <- factor(breakVector[,1], levels=levels(FinalTable[,1]))
Perhaps, it will better compare both variables like a string:
if (as.character(FinalTable [2,1]) == as.character(breakVector[1,1])) {
FinalTable$attributed_result[2] <- FinalTable$attributed_result[2] +
breakVector[1,2] * AggData$conv_count[3];
break}
I have a dataset of environmental variables I would like to use for a GLMM. I am using the corvif function from the AED package (http://www.highstat.com/Book2/AED_1.0.zip) to identify and remove variables with high inflation factors.
Instead of removing one variable at a time manually from my dataset with a GVIF values > 3 (highest value removed first), I would like to know how to write a loop to accomplish this task automatically with the result being a new dataset with only the remaining variables (i.e. those with GVIF values < 3).
Any suggestions for how to approach this problem for a new R user?
Here is my sample data:
WW_Covs <- structure(list(Latitude = c(62.4419, 67.833333, 65.95, 63.72935,
60.966667, 60.266667, 55.660455, 62.216667, 61.3, 61.4, 62.084139,
55.662566, 64.48508, 63.208354, 62.87591, 62.70856, 62.64009,
63.79488, 59.55, 62.84206), BIO_02 = c(87, 82, 75, 70, 77, 70,
59, 84, 84, 79, 85, 60, 91, 87, 74, 74, 76, 70, 76, 74), BIO_03 = c(26,
23, 25, 26, 25, 24, 25, 25, 26, 25, 26, 26, 24, 25, 24, 25, 25,
25, 26, 24), BIO_04 = c(8443, 9219, 7594, 6939, 7928, 7593, 6160,
8317, 8167, 7972, 8323, 6170, 9489, 8578, 7814, 7680, 7904, 7149,
7445, 7803), BIO_05 = c(201, 169, 151, 166, 194, 210, 202, 205,
204, 186, 205, 200, 200, 195, 170, 154, 180, 166, 219, 170),
BIO_06 = c(-131, -183, -144, -102, -107, -75, -26, -119,
-113, -120, -120, -28, -169, -143, -131, -142, -124, -111,
-72, -129), BIO_08 = c(128, 109, 85, 78, 122, 145, 153, 134,
130, 126, 132, 152, 120, 119, 115, 98, 124, 104, 147, 115
), BIO_09 = c(-31, -81, -16, 13, -60, -6, 25, -25, -25, -70,
-25, 23, -56, -39, -47, -60, -39, 8, 0, -46), BIO_12 = c(667,
481, 760, 970, 645, 557, 645, 666, 652, 674, 670, 670, 568,
598, 650, 734, 620, 868, 571, 658), BIO_13 = c(78, 77, 96,
109, 85, 70, 67, 77, 84, 93, 78, 68, 72, 78, 93, 99, 90,
96, 72, 93), BIO_15 = c(23, 40, 25, 21, 36, 30, 21, 24, 28,
34, 24, 22, 28, 29, 34, 32, 36, 22, 30, 34), BIO_19 = c(147,
85, 180, 236, 108, 119, 154, 149, 135, 118, 148, 162, 117,
119, 120, 141, 111, 204, 111, 122)), .Names = c("Latitude",
"BIO_02", "BIO_03", "BIO_04", "BIO_05", "BIO_06", "BIO_08", "BIO_09",
"BIO_12", "BIO_13", "BIO_15", "BIO_19"), row.names = c(1:20), class = "data.frame")
Sample code:
library(AED)
WW_Final <- corvif(WW_Covs)
test <- corvif(WW_Covs])
test[order(-test$GVIF), ]
if(test$GVIF[1,] > 3, # this is where I get stuck...
Here is an algorithm for doing this. I illustrate with the built-in dataset longley, and I also use function vif in package car, rather than using package AED:
It's not pretty, and should be wrapped inside a function, but I leave that as an exercise for the interested reader.
The code:
library(car)
dat <- longley
cutoff <- 2
flag <- TRUE
while(flag){
fit <- lm(Employed ~ ., data=dat)
vfit <- vif(fit)
if(max(vfit) > cutoff){
dat <- dat[, -which.max(vfit)]
} else {
flag <- FALSE
}
}
print(fit)
print(vfit)
The output:
Call:
lm(formula = Employed ~ ., data = dat)
Coefficients:
(Intercept) Unemployed Armed.Forces
50.66281 0.02265 0.02847
Unemployed Armed.Forces
1.032501 1.032501