Using the data.frame below (Source: http://eric.univ-lyon2.fr/~ricco/tanagra/fichiers/en_Tanagra_PLSR_Software_Comparison.pdf)
Data
df <- read.table(text = c("
diesel twodoors sportsstyle wheelbase length width height curbweight enginesize horsepower horse_per_weight conscity price symboling
0 1 0 97 172 66 56 2209 109 85 0.0385 8.7 7975 2
0 0 0 100 177 66 54 2337 109 102 0.0436 9.8 13950 2
0 0 0 116 203 72 57 3740 234 155 0.0414 14.7 34184 -1
0 1 1 103 184 68 52 3016 171 161 0.0534 12.4 15998 3
0 0 0 101 177 65 54 2765 164 121 0.0438 11.2 21105 0
0 1 0 90 169 65 52 2756 194 207 0.0751 13.8 34028 3
1 0 0 105 175 66 54 2700 134 72 0.0267 7.6 18344 0
0 0 0 108 187 68 57 3020 120 97 0.0321 12.4 11900 0
0 0 1 94 157 64 51 1967 90 68 0.0346 7.6 6229 1
0 1 0 95 169 64 53 2265 98 112 0.0494 9.0 9298 1
1 0 0 96 166 64 53 2275 110 56 0.0246 6.9 7898 0
0 1 0 100 177 66 53 2507 136 110 0.0439 12.4 15250 2
0 1 1 94 157 64 51 1876 90 68 0.0362 6.4 5572 1
0 0 0 95 170 64 54 2024 97 69 0.0341 7.6 7349 1
0 1 1 95 171 66 52 2823 152 154 0.0546 12.4 16500 1
0 0 0 103 175 65 60 2535 122 88 0.0347 9.8 8921 -1
0 0 0 113 200 70 53 4066 258 176 0.0433 15.7 32250 0
0 0 0 95 165 64 55 1938 97 69 0.0356 7.6 6849 1
1 0 0 97 172 66 56 2319 97 68 0.0293 6.4 9495 2
0 0 0 97 172 66 56 2275 109 85 0.0374 8.7 8495 2"), header = T)
and this
Code
library(pls)
Y <- as.matrix(df[,14])
X <- as.matrix(df[,1:11])
df.pls <- mvr(Y ~ X, ncomp = 3, method = "oscorespls", scale = T)
plot(df.pls, "biplot")
I got this
Biplot
Any help to plot the pls biplot using ggplot2 will be appreciated?
#Read data
df <- read.table(text = c("
diesel twodoors sportsstyle wheelbase length width height curbweight enginesize horsepower horse_per_weight conscity price symboling
0 1 0 97 172 66 56 2209 109 85 0.0385 8.7 7975 2
0 0 0 100 177 66 54 2337 109 102 0.0436 9.8 13950 2
0 0 0 116 203 72 57 3740 234 155 0.0414 14.7 34184 -1
0 1 1 103 184 68 52 3016 171 161 0.0534 12.4 15998 3
0 0 0 101 177 65 54 2765 164 121 0.0438 11.2 21105 0
0 1 0 90 169 65 52 2756 194 207 0.0751 13.8 34028 3
1 0 0 105 175 66 54 2700 134 72 0.0267 7.6 18344 0
0 0 0 108 187 68 57 3020 120 97 0.0321 12.4 11900 0
0 0 1 94 157 64 51 1967 90 68 0.0346 7.6 6229 1
0 1 0 95 169 64 53 2265 98 112 0.0494 9.0 9298 1
1 0 0 96 166 64 53 2275 110 56 0.0246 6.9 7898 0
0 1 0 100 177 66 53 2507 136 110 0.0439 12.4 15250 2
0 1 1 94 157 64 51 1876 90 68 0.0362 6.4 5572 1
0 0 0 95 170 64 54 2024 97 69 0.0341 7.6 7349 1
0 1 1 95 171 66 52 2823 152 154 0.0546 12.4 16500 1
0 0 0 103 175 65 60 2535 122 88 0.0347 9.8 8921 -1
0 0 0 113 200 70 53 4066 258 176 0.0433 15.7 32250 0
0 0 0 95 165 64 55 1938 97 69 0.0356 7.6 6849 1
1 0 0 97 172 66 56 2319 97 68 0.0293 6.4 9495 2
0 0 0 97 172 66 56 2275 109 85 0.0374 8.7 8495 2"), header = T)
#Run OP's code
library(pls)
library(ggplot2)
Y <- as.matrix(df[,14])
X <- as.matrix(df[,1:11])
df.pls <- mvr(Y ~ X, ncomp = 3, method = "oscorespls", scale = T)
#Extract information from mvr object
df2<-df.pls$scores
comp1a<-df2[,1]
comp2a<-df2[,2]
df2<-as.data.frame(cbind(comp1a, comp2a))
df1<-df.pls$loadings
comp1<-df1[,1]
comp2<-df1[,2]
names<-df1[,0]
df1<-as.data.frame(cbind(names, comp1, comp2))
#Generate two plots and overlay
#Plot 1
p1<-ggplot(data=df1, aes(comp1,comp2))+
ylab("")+xlab("")+ggtitle("X scores and X Loadings")+
theme_bw() + theme(panel.border = element_rect(colour = "black", fill=NA, size=1),panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"))+
geom_text(aes(label=rownames(df1)), color="red")+
scale_x_continuous(breaks = c(-0.6,-0.4,-0.2,0,0.2,0.4,0.6))+
scale_y_continuous(breaks = c(-0.6,-0.4,-0.2,0,0.2,0.4,0.6))+
coord_fixed(ylim=c(-0.6, 0.6),xlim=c(-0.6, 0.6))+
theme(axis.ticks = element_line(colour = "red")) +
theme(axis.text.y=element_text(angle = 90, hjust = 0.65)) +
theme(axis.text.y = element_text(margin=margin(10,10,10,5,"pt")))
#Plot 2
p2<-ggplot(data=df2, aes(comp1a,comp2a))+
ylab("Comp 2")+xlab("Comp 1")+ggtitle("X scores and X Loadings")+
theme_bw() + theme(panel.border = element_rect(colour = "black", fill=NA, size=1),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"))+
geom_text(aes(label=rownames(df2)))+
xlim(-4,4)+ylim(-4,4)+
scale_y_continuous(breaks = c(-4,-2,0,2))+
coord_cartesian(ylim=c(-4, 4))+
scale_x_continuous(breaks = c(-4,-2,0,2)) +
theme(plot.title = element_text(face="bold"))+
theme(axis.text.y=element_text(angle = 90, hjust = 0.65))
#Function to overlay plots in order to get two graphs with different axes on same plot
library(grid)
library(gtable)
ggplot_dual_axis = function(plot1, plot2, which.axis = "x") {
# Update plot with transparent panel
plot2 = plot2 + theme(panel.background = element_rect(fill = NA))
grid.newpage()
# Increase right margin if which.axis == "y"
if(which.axis == "y") plot1 = plot1 + theme(plot.margin = unit(c(0.7, 1.5, 0.4, 0.4), "cm"))
# Extract gtable
g1 = ggplot_gtable(ggplot_build(plot1))
g2 = ggplot_gtable(ggplot_build(plot2))
# Overlap the panel of the second plot on that of the first
pp = c(subset(g1$layout, name == "panel", se = t:r))
g = gtable_add_grob(g1, g2$grobs[[which(g2$layout$name=="panel")]], pp$t, pp$l, pp$b, pp$l)
# Steal axis from second plot and modify
axis.lab = ifelse(which.axis == "x", "axis-b", "axis-l")
ia = which(g2$layout$name == axis.lab)
ga = g2$grobs[[ia]]
ax = ga$children[[2]]
# Switch position of ticks and labels
if(which.axis == "x") ax$heights = rev(ax$heights) else ax$widths = rev(ax$widths)
ax$grobs = rev(ax$grobs)
if(which.axis == "x")
ax$grobs[[2]]$y = ax$grobs[[2]]$y - unit(1, "npc") + unit(0.15, "cm") else
ax$grobs[[1]]$x = ax$grobs[[1]]$x - unit(1, "npc") + unit(0.15, "cm")
# Modify existing row to be tall enough for axis
if(which.axis == "x") g$heights[[2]] = g$heights[g2$layout[ia,]$t]
# Add new row or column for axis label
if(which.axis == "x") {
g = gtable_add_grob(g, ax, 2, 4, 2, 4)
g = gtable_add_rows(g, g2$heights[1], 1)
g = gtable_add_grob(g, g2$grob[[6]], 2, 4, 2, 4)
} else {
g = gtable_add_cols(g, g2$widths[g2$layout[ia, ]$l], length(g$widths) - 1)
g = gtable_add_grob(g, ax, pp$t, length(g$widths) - 1, pp$b)
g = gtable_add_grob(g, g2$grob[[7]], pp$t, length(g$widths), pp$b - 1)
}
# Draw it
grid.draw(g)
}
#Run function on individual plots
ggplot_dual_axis(p2, p1, "y")
Related
Trying to use R cv.glmnet() for cross validation on loans data.
I have a data set on loan data (Kaggle) and have already split into train, test.
Separated the y response from the predictive variables in select(1) and select(-1).
Created matrix so as to avoid the "Error in storage.mode(y) <- "double" : 'list' object cannot be coerced to type 'double' " problem earlier.
Now seeking to run cv.glmnet() for cross validation, but this error stops me now.
"Error in y - predmat : non-numeric argument to binary operator"
Error in non-numeric argument, yet all my data is numeric, save for one factor for response y.
As a side question, what is the predmat in "y - predmat" refer to?
x_vars <- as.matrix(data.sample.train.split %>% select(-1))
y_resp <- as.matrix(data.sample.train.split %>% select(1))
cv_output <- cv.glmnet(x_vars, y_resp, type.measure = "deviance", nfolds = 5)
cv_output <- cv.glmnet(x_vars, y_resp,
type.measure = "deviance",
lambda = NULL,
nfolds = 5)
I am also considering to try this function:
ddd.lasso <- cv.glmnet(x_vars, y_resp, alpha = 1, family = "binomial")
ddd.model <- glmnet(x_vars, y_resp, alpha = 1, family = "binomial", lambda = ddd.lasso$lambda.min)
Data sample is as follows, just some of the columns:
c("loan_amnt", "funded_amnt",
"funded_amnt_inv", "grade", "emp_length", "annual_inc", "dti",
"mths_since_last_delinq", "mths_since_last_record", "open_acc",
"pub_rec", "revol_bal", "revol_util", "total_acc", "out_prncp",
"out_prncp_inv", "total_pymnt", "total_pymnt_inv", "total_rec_prncp",
"total_rec_int", "total_rec_late_fee", "recoveries", "collection_recovery_fee",
"last_pymnt_amnt", "collections_12_mths_ex_med", "acc_now_delinq"
)))
loan_amnt funded_amnt funded_amnt_inv grade emp_length annual_inc dti
3 10000 10000 10000.000 60 10 49200.00 20.00
10 10000 10000 10000.000 60 4 42000.00 18.60
14 20250 20250 19142.161 60 3 43370.00 26.53
17 15000 15000 15000.000 80 2 92000.00 29.44
18 4000 4000 4000.000 80 10 106000.00 5.63
31 4400 4400 4400.000 40 10 55000.00 20.01
35 10000 10000 10000.000 100 10 60000.00 12.74
37 25600 25600 25350.000 80 9 110000.00 15.71
41 10000 10000 10000.000 80 1 39000.00 18.58
64 9200 9200 9200.000 80 2 60000.00 19.96
72 7000 7000 7000.000 80 4 39120.00 21.01
74 3500 3500 3500.000 100 10 83000.00 2.31
77 9500 9500 9500.000 100 7 50000.00 8.18
89 10000 10000 10000.000 100 1 43000.00 25.26
98 7000 7000 7000.000 80 1 30000.00 15.80
112 21600 21600 20498.266 20 8 60000.00 16.74
117 7200 7200 7200.000 80 5 48000.00 17.43
118 12000 12000 11975.000 60 1 57000.00 10.86
125 10000 10000 10000.000 100 5 70000.00 16.78
126 8000 8000 8000.000 60 3 28000.00 12.60
128 6000 6000 6000.000 60 10 94800.00 24.53
138 35000 35000 35000.000 80 2 168000.00 3.17
144 14000 14000 14000.000 100 10 66000.00 11.15
149 3000 3000 3000.000 60 5 71000.00 21.84
152 12000 12000 11975.000 80 2 60000.00 15.50
153 6000 6000 6000.000 100 3 34000.00 14.51
155 7000 7000 7000.000 80 7 82000.00 12.00
166 24250 18100 18075.000 -1 7 120000.00 12.96
170 2500 2500 2500.000 80 7 29000.00 18.70
172 4225 4225 4225.000 80 5 55200.00 17.61
180 6000 6000 6000.000 60 5 50000.00 15.58
192 5000 5000 5000.000 80 5 38004.00 23.78
193 8000 8000 8000.000 80 3 31000.00 16.22
199 12000 12000 12000.000 80 4 40000.00 22.20
203 3200 3200 3200.000 80 9 61200.00 2.16
209 5000 5000 5000.000 80 2 70000.00 20.06
220 13250 13250 13250.000 40 10 52000.00 23.70
224 12000 12000 12000.000 100 10 68000.00 7.08
mths_since_last_delinq mths_since_last_record open_acc pub_rec revol_bal revol_util
3 35 59 10 0 5598 21.0
10 61 114 14 0 24043 70.2
14 18 107 8 0 17813 85.6
17 54 79 8 0 13707 93.9
18 18 97 12 0 6110 37.7
31 68 119 7 0 25237 99.0
35 37 93 11 0 14019 19.5
37 11 118 11 0 26088 62.0
41 58 17 5 0 12874 72.7
64 39 95 8 0 23299 78.7
72 26 33 8 1 9414 52.4
74 35 59 6 0 3092 13.4
77 46 118 8 0 13422 60.5
89 59 105 8 0 8215 37.2
98 68 101 7 0 15455 47.6
112 23 26 6 0 13354 78.1
117 24 19 7 0 16450 80.2
118 47 87 7 0 9273 81.5
125 32 92 9 0 10770 69.0
126 66 112 8 0 6187 54.3
128 10 101 13 0 71890 95.9
138 22 97 16 0 1099 1.4
144 26 102 7 0 12095 35.4
149 59 103 4 0 15072 88.7
152 46 94 7 0 12168 85.7
153 70 81 9 0 13683 64.8
155 79 83 6 0 25334 71.6
166 66 118 7 0 31992 99.0
170 63 99 5 0 2668 66.7
172 69 104 6 0 4055 73.7
180 49 94 8 0 7361 83.6
192 5 85 12 0 10023 57.3
193 28 77 13 0 2751 34.4
199 78 109 9 0 16273 55.5
203 79 113 5 1 2795 33.3
209 27 62 14 0 13543 54.2
220 70 86 8 0 15002 91.5
224 21 70 7 0 15433 55.6
total_acc out_prncp out_prncp_inv total_pymnt total_pymnt_inv total_rec_prncp
3 37 0 0 12226.302 12226.30 10000.00
10 28 0 0 12519.260 12519.26 10000.00
14 22 0 0 27663.043 25417.68 20250.00
17 31 0 0 15823.480 15823.48 15000.00
18 44 0 0 4484.790 4484.79 4000.00
31 11 0 0 5626.893 5626.89 4400.00
35 18 0 0 10282.670 10282.67 10000.00
37 27 0 0 29695.623 29405.63 25600.00
41 10 0 0 11474.760 11474.76 10000.00
64 19 0 0 10480.840 10480.84 9200.00
72 26 0 0 7932.300 7932.30 7000.00
74 28 0 0 3834.661 3834.66 3500.00
77 13 0 0 10493.710 10493.71 9500.00
89 16 0 0 11264.010 11264.01 10000.00
98 11 0 0 8452.257 8452.26 7000.00
112 21 0 0 27580.750 24853.63 21600.00
117 10 0 0 8677.156 8677.16 7200.00
118 11 0 0 14396.580 14366.62 12000.00
125 18 0 0 10902.910 10902.91 10000.00
126 11 0 0 8636.820 8636.82 8000.00
128 30 0 0 7215.050 7215.05 6000.00
138 22 0 0 38059.760 38059.76 35000.00
144 46 0 0 15450.084 15450.08 14000.00
149 14 0 0 3723.936 3723.94 3000.00
152 21 0 0 13919.414 13890.44 12000.00
153 16 0 0 6857.261 6857.26 6000.00
155 31 0 0 8290.730 8290.73 7000.00
166 20 0 0 22188.250 22157.63 18100.00
170 13 0 0 2894.740 2894.74 2500.00
172 12 0 0 5081.023 5081.02 4225.00
180 14 0 0 7325.299 7325.30 6000.00
192 17 0 0 6534.430 6534.43 5000.00
193 29 0 0 8306.470 8306.47 8000.00
199 23 0 0 14006.680 14006.68 12000.00
203 17 0 0 3709.193 3709.19 3200.00
209 26 0 0 5501.160 5501.16 5000.00
220 18 0 0 15650.390 15650.39 13250.00
224 34 0 0 12554.010 12554.01 12000.00
total_rec_int total_rec_late_fee recoveries collection_recovery_fee last_pymnt_amnt
3 2209.33 16.97000 0 0 357.48
10 2519.26 0.00000 0 0 370.46
14 7413.04 0.00000 0 0 6024.09
17 823.48 0.00000 0 0 2447.05
18 484.79 0.00000 0 0 2638.77
31 1226.89 0.00000 0 0 162.44
35 282.67 0.00000 0 0 8762.05
37 4095.62 0.00000 0 0 838.27
41 1474.76 0.00000 0 0 5803.94
64 1280.84 0.00000 0 0 365.48
72 932.30 0.00000 0 0 4235.03
74 334.66 0.00000 0 0 107.86
77 993.71 0.00000 0 0 5378.43
89 1264.01 0.00000 0 0 4.84
98 1452.26 0.00000 0 0 238.06
112 5980.75 0.00000 0 0 17416.49
117 1462.16 15.00000 0 0 19.26
118 2396.58 0.00000 0 0 5359.38
125 902.91 0.00000 0 0 4152.52
126 636.82 0.00000 0 0 6983.56
128 1215.05 0.00000 0 0 1960.88
138 3059.76 0.00000 0 0 272.59
144 1450.08 0.00000 0 0 2133.17
149 723.94 0.00000 0 0 107.29
152 1919.41 0.00000 0 0 395.05
153 857.26 0.00000 0 0 198.16
155 1290.73 0.00000 0 0 2454.29
166 4088.25 0.00000 0 0 16499.75
170 394.74 0.00000 0 0 1168.50
172 856.02 0.00000 0 0 146.48
180 1325.30 0.00000 0 0 215.51
192 1534.43 0.00000 0 0 1561.93
193 306.47 0.00000 0 0 7778.22
199 2006.68 0.00000 0 0 5971.51
203 509.19 0.00000 0 0 317.41
209 501.16 0.00000 0 0 3833.62
220 2400.39 0.00000 0 0 9026.78
224 554.01 0.00000 0 0 473.95
collections_12_mths_ex_med acc_now_delinq
3 0 0
10 0 0
14 0 0
17 0 0
18 0 0
31 0 0
35 0 0
37 0 0
41 0 0
64 0 0
72 0 0
74 0 0
77 0 0
89 0 0
98 0 0
112 0 0
117 0 0
118 0 0
125 0 0
126 0 0
128 0 0
138 0 0
144 0 0
149 0 0
152 0 0
153 0 0
155 0 0
166 0 0
170 0 0
172 0 0
180 0 0
192 0 0
193 0 0
199 0 0
203 0 0
209 0 0
220 0 0
224 0 0
Looks like a incorrect glmnet family, I accidently chose the default 'deviance' for cv.glmnet, when in fact my data was binomial. My next solution is to figure out "Convergence for 1th lambda value not reached after maxit=100000 iterations; solutions for larger lambdas returned"
Code that improved the solution:
cv.lasso <- cv.glmnet(x_vars, y_resp, alpha = 1, family = "binomial", nfolds = 5)
cv.model <- glmnet(x_vars, y_resp, alpha = 1, relax=TRUE, family="binomial", lambda=cv.lasso$lambda.min)
I tried to attach a decile value for each observation using the code below.However, it seems that the values are not correct. What can be the reason for that?
df<-read.table(text="pregnant glucose blood skin INSULIN MASS DIAB AGE CLASS predict_probability
1 106 70 28 135 34.2 0.142 22 0 0.15316285
1 91 54 25 100 25.2 0.234 23 0 0.05613959
4 136 70 0 0 31.2 1.182 22 1 0.54034794
9 164 78 0 0 32.8 0.148 45 1 0.64361578
3 173 78 39 185 33.8 0.970 31 1 0.79185196
11 136 84 35 130 28.3 0.260 42 1 0.31927737
0 141 84 26 0 32.4 0.433 22 0 0.41609308
3 106 72 0 0 25.8 0.207 27 0 0.10460090
9 145 80 46 130 37.9 0.637 40 1 0.67061324
10 111 70 27 0 27.5 0.141 40 1 0.16152296
",header=T)
deciles <- cut(df$predict_probability, breaks=c(quantile(df$predict_probability, probs = seq(0, 1, by = 0.10))),labels = 1:10, include.lowest=TRUE)
df1 <- cbind(df,deciles)
head(df1,10)
pregnant glucose blood skin INSULIN MASS DIAB AGE CLASS predict_probability deciles
1 1 106 70 28 135 34.2 0.142 22 0 0.15316285 3
2 1 91 54 25 100 25.2 0.234 23 0 0.05613959 1
3 4 136 70 0 0 31.2 1.182 22 1 0.54034794 7
4 9 164 78 0 0 32.8 0.148 45 1 0.64361578 8
5 3 173 78 39 185 33.8 0.970 31 1 0.79185196 10
6 11 136 84 35 130 28.3 0.260 42 1 0.31927737 5
7 0 141 84 26 0 32.4 0.433 22 0 0.41609308 6
8 3 106 72 0 0 25.8 0.207 27 0 0.10460090 2
9 9 145 80 46 130 37.9 0.637 40 1 0.67061324 9
10 10 111 70 27 0 27.5 0.141 40 1 0.16152296 4
Per Dason's proposal, here is the full answer to the question.
The quantile function should be taken out from the code so seq(0,1,by=0.1) should be passed directly to the cut function.
deciles <- cut(df$predict_probability, seq(0,1,by=0.1) ,labels = 1:10, include.lowest=TRUE)
df1 <- cbind(df,deciles)
head(df1,10)
pregnant glucose blood skin INSULIN MASS DIAB AGE CLASS predict_probability deciles
1 1 106 70 28 135 34.2 0.142 22 0 0.15316285 2
2 1 91 54 25 100 25.2 0.234 23 0 0.05613959 1
3 4 136 70 0 0 31.2 1.182 22 1 0.54034794 6
4 9 164 78 0 0 32.8 0.148 45 1 0.64361578 7
5 3 173 78 39 185 33.8 0.970 31 1 0.79185196 8
6 11 136 84 35 130 28.3 0.260 42 1 0.31927737 4
7 0 141 84 26 0 32.4 0.433 22 0 0.41609308 5
8 3 106 72 0 0 25.8 0.207 27 0 0.10460090 2
9 9 145 80 46 130 37.9 0.637 40 1 0.67061324 7
10 10 111 70 27 0 27.5 0.141 40 1 0.16152296 2
Using the data.frame below (Source: http://eric.univ-lyon2.fr/~ricco/tanagra/fichiers/en_Tanagra_PLSR_Software_Comparison.pdf)
Data
df <- read.table(text = c("
diesel twodoors sportsstyle wheelbase length width height curbweight enginesize horsepower horse_per_weight conscity price symboling
0 1 0 97 172 66 56 2209 109 85 0.0385 8.7 7975 2
0 0 0 100 177 66 54 2337 109 102 0.0436 9.8 13950 2
0 0 0 116 203 72 57 3740 234 155 0.0414 14.7 34184 -1
0 1 1 103 184 68 52 3016 171 161 0.0534 12.4 15998 3
0 0 0 101 177 65 54 2765 164 121 0.0438 11.2 21105 0
0 1 0 90 169 65 52 2756 194 207 0.0751 13.8 34028 3
1 0 0 105 175 66 54 2700 134 72 0.0267 7.6 18344 0
0 0 0 108 187 68 57 3020 120 97 0.0321 12.4 11900 0
0 0 1 94 157 64 51 1967 90 68 0.0346 7.6 6229 1
0 1 0 95 169 64 53 2265 98 112 0.0494 9.0 9298 1
1 0 0 96 166 64 53 2275 110 56 0.0246 6.9 7898 0
0 1 0 100 177 66 53 2507 136 110 0.0439 12.4 15250 2
0 1 1 94 157 64 51 1876 90 68 0.0362 6.4 5572 1
0 0 0 95 170 64 54 2024 97 69 0.0341 7.6 7349 1
0 1 1 95 171 66 52 2823 152 154 0.0546 12.4 16500 1
0 0 0 103 175 65 60 2535 122 88 0.0347 9.8 8921 -1
0 0 0 113 200 70 53 4066 258 176 0.0433 15.7 32250 0
0 0 0 95 165 64 55 1938 97 69 0.0356 7.6 6849 1
1 0 0 97 172 66 56 2319 97 68 0.0293 6.4 9495 2
0 0 0 97 172 66 56 2275 109 85 0.0374 8.7 8495 2"), header = T)
and this
Code
library(plsdepot)
df.plsdepot = plsreg1(df[, 1:11], df[, 14, drop = FALSE], comps = 3)
plot(df.plsdepot, comps = c(1, 2))
I got this
Result
The dependent (y) variable here is symboling, like price, is function of all other independent variables for the cars (diesel, twodoors, sportsstyle, wheelbase, length, width, height, curbweight, enginesize,horsepower, horse_per_weight)
Question
Any help to create the plot above using ggplot2 but with arrows instead of lines similar to this plot will be highly appreciated?
df <- read.table(text = c("
diesel twodoors sportsstyle wheelbase length width height curbweight enginesize horsepower horse_per_weight conscity price symboling
0 1 0 97 172 66 56 2209 109 85 0.0385 8.7 7975 2
0 0 0 100 177 66 54 2337 109 102 0.0436 9.8 13950 2
0 0 0 116 203 72 57 3740 234 155 0.0414 14.7 34184 -1
0 1 1 103 184 68 52 3016 171 161 0.0534 12.4 15998 3
0 0 0 101 177 65 54 2765 164 121 0.0438 11.2 21105 0
0 1 0 90 169 65 52 2756 194 207 0.0751 13.8 34028 3
1 0 0 105 175 66 54 2700 134 72 0.0267 7.6 18344 0
0 0 0 108 187 68 57 3020 120 97 0.0321 12.4 11900 0
0 0 1 94 157 64 51 1967 90 68 0.0346 7.6 6229 1
0 1 0 95 169 64 53 2265 98 112 0.0494 9.0 9298 1
1 0 0 96 166 64 53 2275 110 56 0.0246 6.9 7898 0
0 1 0 100 177 66 53 2507 136 110 0.0439 12.4 15250 2
0 1 1 94 157 64 51 1876 90 68 0.0362 6.4 5572 1
0 0 0 95 170 64 54 2024 97 69 0.0341 7.6 7349 1
0 1 1 95 171 66 52 2823 152 154 0.0546 12.4 16500 1
0 0 0 103 175 65 60 2535 122 88 0.0347 9.8 8921 -1
0 0 0 113 200 70 53 4066 258 176 0.0433 15.7 32250 0
0 0 0 95 165 64 55 1938 97 69 0.0356 7.6 6849 1
1 0 0 97 172 66 56 2319 97 68 0.0293 6.4 9495 2
0 0 0 97 172 66 56 2275 109 85 0.0374 8.7 8495 2"), header = T)
library(plsdepot)
library(ggplot2)
df.plsdepot = plsreg1(df[, 1:11], df[, 14, drop = FALSE], comps = 3)
data<-df.plsdepot$cor.xyt
data<-as.data.frame(data)
#Function to draw circle
circleFun <- function(center = c(0,0),diameter = 1, npoints = 100){
r = diameter / 2
tt <- seq(0,2*pi,length.out = npoints)
xx <- center[1] + r * cos(tt)
yy <- center[2] + r * sin(tt)
return(data.frame(x = xx, y = yy))
}
dat <- circleFun(c(0,0),2,npoints = 100)
ggplot(data=data, aes(t1,t2))+
ylab("")+xlab("")+ggtitle("Circle of Correlations ")+
theme_bw() +geom_text(aes(label=rownames(data),
colour=ifelse(rownames(data)!='symboling', 'orange','blue')))+
scale_color_manual(values=c("orange","#6baed6"))+
scale_x_continuous(breaks = c(-1,-0.5,0,0.5,1))+
scale_y_continuous(breaks = c(-1,-0.5,0,0.5,1))+
coord_fixed(ylim=c(-1, 1),xlim=c(-1, 1))+xlab("axis 1")+
ylab("axis 2")+ theme(axis.line.x = element_line(color="darkgrey"),
axis.line.y = element_line(color="darkgrey"))+
geom_path(data=dat,aes(x,y), colour = "darkgrey")+
theme(legend.title=element_blank())+
theme(axis.ticks = element_line(colour = "grey"))+
theme(axis.title = element_text(colour = "darkgrey"))+
theme(axis.text = element_text(color="darkgrey"))+
theme(legend.position='none')+
theme(plot.title = element_text(color="#737373")) +
theme(panel.grid.minor = element_blank()) +
annotate("segment",x=0, y=0, xend= 0.60, yend= 0.20, color="orange",
arrow=arrow(length=unit(0.3,"cm")))+
annotate("segment",x=0, y=0, xend= -0.25, yend= -0.35, color="#6baed6",
alpha=0.3,arrow=arrow(length=unit(0.3,"cm")))+
annotate("segment",x=0, y=0, xend= 0.45, yend= 0.75, color="#6baed6",
alpha=0.3,arrow=arrow(length=unit(0.3,"cm")))+
annotate("segment",x=0, y=0, xend= 0.37 , yend=-0.02, color="#6baed6",
alpha=0.3,arrow=arrow(length=unit(0.3,"cm")))+
annotate("segment",x=0, y=0, xend= -0.80, yend= 0.30, color="#6baed6",
alpha=0.3,arrow=arrow(length=unit(0.3,"cm")))+
annotate("segment",x=0, y=0, xend= -0.75, yend= 0.60, color="#6baed6",
alpha=0.3,arrow=arrow(length=unit(0.3,"cm")))+
annotate("segment",x=0, y=0, xend= -0.67, yend= 0.60, color="#6baed6",
alpha=0.3,arrow=arrow(length=unit(0.3,"cm")))+
annotate("segment",x=0, y=0, xend= -0.59, yend= -0.13, color="#6baed6",
alpha=0.3,arrow=arrow(length=unit(0.3,"cm")))+
annotate("segment",x=0, y=0, xend= -0.59, yend= 0.70, color="#6baed6",
alpha=0.3,arrow=arrow(length=unit(0.3,"cm")))+
annotate("segment",x=0, y=0, xend= -0.39, yend= 0.80, color="#6baed6",
alpha=0.3,arrow=arrow(length=unit(0.3,"cm")))+
annotate("segment",x=0, y=0, xend= 0.04, yend= 0.93, color="#6baed6",
alpha=0.3,arrow=arrow(length=unit(0.3,"cm")))+
annotate("segment",x=0, y=0, xend= 0.70, yend= 0.40, color="#6baed6",
alpha=0.3,arrow=arrow(length=unit(0.3,"cm")))
> head(m)
X id1 q_following topic_followed topic_answered nfollowers nfollowing
1 1 1 80 80 100 180 180
2 2 1 76 76 95 171 171
3 3 1 72 72 90 162 162
4 4 1 68 68 85 153 153
5 5 1 64 64 80 144 144
6 6 1 60 60 75 135 135
> head(d)
X id1 q_following topic_followed topic_answered nfollowers nfollowing
1 1 1 63 735 665 949 146
2 2 1 89 737 666 587 185
3 3 1 121 742 670 428 264
4 4 1 277 750 706 622 265
5 5 1 339 765 734 108 294
6 6 1 363 767 766 291 427
matcher <- function(x,y){ return(na.omit(m[which(d[,y]==x),y])) }
max_matcher <- function(x) { return(sum(matcher(x,3:13))) }
result <- foreach(1:1000, function(x) {
if(max(max_matcher(1:1000)) == max_matcher(x)) return(x)
})
I want to compute result across each group, grouped by id1 of dataframe m.
m %>% group_by(id1) %>% summarise(result) #doesn't work
by(m, m[,"id1"], result) #doesn't work
How should I proceed?
I would like to add legends for two geom_line and a geom_point at the same time, but the legends were not align to each other. So how to align the two legends and adjust legend positions? Thank you in advance!
My data:
df1:
x1 y1
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0
10 0
11 0
12 0
13 0
14 0
15 0
16 0
17 0
18 0
19 0
20 0
21 0
22 0
23 9.2
24 18.5
25 27.6
26 36.8
27 46.1
28 54.2
29 63.4
30 72.6
31 81.7
32 88.9
33 93
34 99.1
35 105.4
36 110
37 118.3
38 128.2
39 138
40 146.9
41 155.1
42 162.5
43 165.7
44 169.2
45 174.2
46 176.3
47 183.8
48 187.8
49 194.2
50 200.7
51 203.4
52 204.7
53 209.5
54 214.5
55 219.6
56 224.1
57 228.5
58 232.8
59 237
60 239.5
61 242.7
62 243.1
63 244.6
64 245
65 246.6
66 248.6
67 251
68 253
69 255
70 256.7
71 256.7
df2:
x2 y2
24 0.006525
32 0.072525
39 0.120025
46 0.1601418
53 0.1972939
60 0.2226233
68 0.2312895
df3:
x3 y3
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0
10 0
11 0
12 0
13 0
14 0
15 0
16 0
17 0
18 0
19 0
20 0
21 0
22 0
23 10.9
24 14.8
25 19.6
26 25.6
27 31.4
28 38.5
29 47.1
30 56.9
31 64.7
32 71
33 77
34 84.7
35 92.5
36 98.8
37 108.2
38 118.8
39 126.9
40 134.3
41 141.1
42 147.2
43 149.9
44 152.8
45 157
46 158.7
47 164.9
48 168.3
49 173.6
50 179
51 181.3
52 182.3
53 186.3
54 190.4
55 194.7
56 198.5
57 202.1
58 205.7
59 209.2
60 211.3
61 213.9
62 214.3
63 215.6
64 215.9
65 217.2
66 218.9
67 220.9
68 222.5
69 224.2
70 225.7
71 225.7
My code:
library("ggplot2")
library("reshape2")
library("gridExtra")
p <- ggplot() +
geom_line(data=df1, aes(x= x1, y= y1, linetype= "aa"))+
geom_point(data=df2, aes(x= x2, y= y2, shape="bbbbbbb"))+
geom_line(data=df3, aes(x= x3, y= y3, linetype= "cc"))+
scale_shape_manual(name="",
labels=c("bbbbbbb"),
values = c(21) )+
scale_linetype_manual(name="",
labels=c("aa","cc"),
values=c("solid", "dashed")) +
ylab("y")+
xlab("x")+
theme_bw()+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.justification = c(0, 1),
legend.position=c(0, 1))
My plot:
Thank you guys for attention. I have find a solution to the problem, I adopted the idea from this post.
library("ggplot2")
library("reshape2")
library("gridExtra")
library("gtable")
p <- ggplot() +
geom_line(data=df1, aes(x= x1, y= y1, linetype= "aa"))+
geom_point(data=df2, aes(x= x2, y= y2, shape="bbbbbbb"))+
geom_line(data=df3, aes(x= x3, y= y3, linetype= "cc"))+
# discard errorbar here.
scale_shape_manual(name=NULL,
labels=c("bbbbbbb"),
values = c(21) )+
scale_linetype_manual(name=NULL,
labels=c("aa","cc"),
values=c("solid", "dashed")) +
ylab("y")+
xlab("x")+
theme_bw()+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.position=c(0, 1),
legend.justification=c(0,1),
legend.margin=unit(0,"cm"),
legend.box="vertical",
legend.box.just = "left",
legend.key.size=unit(1,"lines"),
legend.text.align=0,
legend.key = element_blank(),
legend.title = element_blank(),
legend.background=element_blank())
data <- ggplot_build(p)
gtable <- ggplot_gtable(data)
lbox <- which(sapply(gtable$grobs, paste) == "gtable[guide-box]")
guide <- gtable$grobs[[lbox]]
gtable$grobs[[lbox]]$heights <- unit.c(guide$heights[1:2],
unit(-.8,"cm"),
guide$heights[2:3])
# Plotting
g<-grid.draw(gtable)