How to fill out white/missing parts of the map in R? - r

The code below creates a map of UK postcode areas using ggplot2; however, it leaves some parts of the map white/missing. Could you please advise how to make sure that the whole map is filled and that the postcode areas have a border? Thanks.
MAP OF UK from the below code
rm(list=ls())
library(tidyverse)
library(maptools)
library(raster)
library(plotrix)
library(ggrepel)
df2016 <- read.table(stringsAsFactors=FALSE, header=TRUE, text="
name value amount
LD 1 3
ZE 1 2
WS 0.79 19
ML 0.75 12
HS 0.75 4
TQ 0.74 38
WN 0.73 15
CA 0.71 28
HU 0.7 33
FY 0.69 16
HG 0.69 16
IV 0.68 19
DL 0.68 25
CB 0.68 115
TS 0.67 46
IP 0.67 87
AB 0.67 66
NP 0.67 45
FK 0.67 18
IM 0.67 9
SM 0.66 50
HD 0.66 32
EN 0.66 61
CO 0.65 52
ME 0.65 54
PE 0.64 266
EX 0.64 81
WV 0.63 49
JE 0.63 24
NE 0.62 148
YO 0.62 47
DE 0.62 78
LN 0.61 36
SN 0.61 109
IG 0.6 63
NR 0.6 90
SP 0.59 37
BA 0.59 93
UB 0.59 127
TN 0.59 95
BT 0.59 180
BD 0.59 51
HP 0.59 126
TA 0.59 46
PO 0.58 113
DH 0.58 55
WD 0.58 102
BH 0.57 96
DG 0.57 14
CV 0.57 225
RG 0.57 255
BN 0.56 158
DY 0.56 48
HA 0.56 148
W 0.56 359
WA 0.56 77
DA 0.55 38
CT 0.55 62
GU 0.55 231
RH 0.55 132
BL 0.55 33
HX 0.55 11
BS 0.54 184
SS 0.54 46
EH 0.54 185
DT 0.54 37
G 0.54 137
B 0.54 283
LU 0.54 41
NG 0.54 97
OX 0.53 208
S 0.53 179
CM 0.53 100
DD 0.53 17
GL 0.53 87
AL 0.53 89
HR 0.53 38
LS 0.52 122
TF 0.52 21
RM 0.52 44
SL 0.52 155
MK 0.52 136
SY 0.52 46
DN 0.52 81
N 0.52 191
M 0.52 226
SR 0.52 29
SK 0.52 64
BB 0.51 140
KY 0.51 41
WF 0.51 51
PR 0.51 63
L 0.51 81
KT 0.5 185
CF 0.5 118
ST 0.5 84
TR 0.5 46
CW 0.5 44
TD 0.5 12
P 0.5 2
SW 0.5 317
LL 0.49 49
CH 0.49 43
E 0.49 275
EC 0.48 364
PA 0.48 27
SO 0.48 157
CR 0.48 84
PL 0.48 61
SG 0.47 59
KA 0.47 15
LA 0.47 43
SA 0.46 78
LE 0.46 194
TW 0.45 125
OL 0.44 41
SE 0.44 297
NN 0.43 143
NW 0.42 236
WC 0.41 138
WR 0.38 73
BR 0.37 62
GY 0.26 35
PH 0.23 13
")
#df2016$amount <- NULL
df2016$name <- as.character(df2016$name)
# Download a shapefile of postal codes into your working directory
download.file(
"http://www.opendoorlogistics.com/wp-content/uploads/Data/UK-postcode-boundaries-Jan-2015.zip",
"postal_shapefile"
)
# Unzip the shapefile
unzip("postal_shapefile")
# Read the shapefile
postal <- readShapeSpatial("./Distribution/Areas")
# Convert the polygons to a plain data frame for ggplot; the columns used
# further down are long, lat, group and id.
postal.df <- fortify(postal, region = "name")
# Join your data to the shapefile
# The key column produced by fortify() is called "id"; rename it to "name"
# so it matches the key column of df2016.
colnames(postal.df)[colnames(postal.df) == "id"] <- "name"
# NOTE(review): both arguments are plain data frames here, so this presumably
# behaves like base merge() with its defaults, i.e. an inner join whose result
# is re-sorted by the key. Areas absent from df2016 would then be dropped and
# the vertex order of the remaining polygons is not guaranteed - confirm.
postal.df <- raster::merge(postal.df, df2016, by = "name")
# Give areas without a value the neutral value 0.50.
# NOTE(review): after an inner join there may be no NA rows left to fill.
postal.df$value[is.na(postal.df$value)] <- 0.50
# Get centroids of spatialPolygonDataFrame and convert to dataframe
# for use in plotting area names.
postal.centroids.df <- data.frame(long = coordinates(postal)[, 1],
lat = coordinates(postal)[, 2],
id=postal$name)
# Choropleth: polygons filled by `value` cut into 5 intervals, with the area
# names drawn at the centroids and all axis/grid/border decoration removed.
# NOTE(review): check_overlap is an argument of geom_text(); verify that
# geom_text_repel() accepts it.
p <- ggplot(postal.df, aes(x = long, y = lat, group = group)) + geom_polygon(aes(fill = cut(value,5))) +
geom_text_repel(data = postal.centroids.df, aes(label = id, x = long, y = lat, group = id), size = 3, check_overlap = T) +
labs(x=" ", y=" ") +
theme_bw() + scale_fill_brewer('Success Rate 2016', palette = 15) +
coord_map() +
theme(panel.grid.minor=element_blank(), panel.grid.major=element_blank()) +
theme(axis.ticks = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank()) +
theme(panel.border = element_blank())
p

Try arranging the postal code by name or number just before plotting
postal.centroids.df %>%
arrange(id)
My county maps of the US did the same thing when they weren't in order. If that doesn't work try by lat or long as well.

Solution was to use left_join from dplyr instead of merge:
rm(list=ls())
library(tidyverse)
library(maptools)
library(raster)
library(plotrix)
library(ggrepel)
df2016 <- read.table(stringsAsFactors=FALSE, header=TRUE, text="
name value amount
LD 1 3
ZE 1 2
WS 0.79 19
ML 0.75 12
HS 0.75 4
TQ 0.74 38
WN 0.73 15
CA 0.71 28
HU 0.7 33
FY 0.69 16
HG 0.69 16
IV 0.68 19
DL 0.68 25
CB 0.68 115
TS 0.67 46
IP 0.67 87
AB 0.67 66
NP 0.67 45
FK 0.67 18
IM 0.67 9
SM 0.66 50
HD 0.66 32
EN 0.66 61
CO 0.65 52
ME 0.65 54
PE 0.64 266
EX 0.64 81
WV 0.63 49
JE 0.63 24
NE 0.62 148
YO 0.62 47
DE 0.62 78
LN 0.61 36
SN 0.61 109
IG 0.6 63
NR 0.6 90
SP 0.59 37
BA 0.59 93
UB 0.59 127
TN 0.59 95
BT 0.59 180
BD 0.59 51
HP 0.59 126
TA 0.59 46
PO 0.58 113
DH 0.58 55
WD 0.58 102
BH 0.57 96
DG 0.57 14
CV 0.57 225
RG 0.57 255
BN 0.56 158
DY 0.56 48
HA 0.56 148
W 0.56 359
WA 0.56 77
DA 0.55 38
CT 0.55 62
GU 0.55 231
RH 0.55 132
BL 0.55 33
HX 0.55 11
BS 0.54 184
SS 0.54 46
EH 0.54 185
DT 0.54 37
G 0.54 137
B 0.54 283
LU 0.54 41
NG 0.54 97
OX 0.53 208
S 0.53 179
CM 0.53 100
DD 0.53 17
GL 0.53 87
AL 0.53 89
HR 0.53 38
LS 0.52 122
TF 0.52 21
RM 0.52 44
SL 0.52 155
MK 0.52 136
SY 0.52 46
DN 0.52 81
N 0.52 191
M 0.52 226
SR 0.52 29
SK 0.52 64
BB 0.51 140
KY 0.51 41
WF 0.51 51
PR 0.51 63
L 0.51 81
KT 0.5 185
CF 0.5 118
ST 0.5 84
TR 0.5 46
CW 0.5 44
TD 0.5 12
P 0.5 2
SW 0.5 317
LL 0.49 49
CH 0.49 43
E 0.49 275
EC 0.48 364
PA 0.48 27
SO 0.48 157
CR 0.48 84
PL 0.48 61
SG 0.47 59
KA 0.47 15
LA 0.47 43
SA 0.46 78
LE 0.46 194
TW 0.45 125
OL 0.44 41
SE 0.44 297
NN 0.43 143
NW 0.42 236
WC 0.41 138
WR 0.38 73
BR 0.37 62
GY 0.26 35
PH 0.23 13
")
# Download a shapefile of postal codes into your working directory.
# mode = "wb" keeps the zip archive binary-safe regardless of platform.
download.file(
  "http://www.opendoorlogistics.com/wp-content/uploads/Data/UK-postcode-boundaries-Jan-2015.zip",
  "postal_shapefile",
  mode = "wb"
)
# Unzip the shapefile
unzip("postal_shapefile")
# Read the shapefile
postal <- readShapeSpatial("./Distribution/Areas")
# Convert the polygons to a plain data frame for ggplot; the columns used
# below are long, lat, group and id.
postal.df <- fortify(postal, region = "name")
# Join your data to the shapefile
# fortify() names the key column "id"; rename it to match df2016.
colnames(postal.df)[colnames(postal.df) == "id"] <- "name"
library(dplyr)
# left_join() keeps every row of postal.df in its original order, so areas
# that are missing from df2016 stay on the map (with NA value) and the polygon
# vertices are not reshuffled. This is what removes the white gaps.
test <- left_join(postal.df, df2016, by = "name", copy = FALSE)
#postal.df <- raster::merge(postal.df, df2016, by = "name")
# Areas without data get the neutral value 0.50.
test$value[is.na(test$value)] <- 0.50
# Get centroids of the SpatialPolygonsDataFrame and convert to a data frame
# for use in plotting area names.
postal.centroids.df <- data.frame(
  long = coordinates(postal)[, 1],
  lat = coordinates(postal)[, 2],
  id = postal$name
)
# Choropleth of `value` cut into 5 intervals.
# - colour = "black" draws the requested border around each postcode area.
# - check_overlap was dropped: it belongs to geom_text(), not
#   geom_text_repel(), which already keeps labels from overlapping.
p <- ggplot(test, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = cut(value, 5)), colour = "black") +
  geom_text_repel(
    data = postal.centroids.df,
    aes(label = id, x = long, y = lat, group = id),
    size = 3
  ) +
  labs(x = " ", y = " ") +
  theme_bw() +
  scale_fill_brewer('Success Rate 2016', palette = 15) +
  coord_map() +
  theme(panel.grid.minor = element_blank(), panel.grid.major = element_blank()) +
  theme(
    axis.ticks = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank()
  ) +
  theme(panel.border = element_blank())
p

Related

Problems with two legends in an LDA plot using ggplot2

I am having trouble with a legend in a lda analysis. Here is toy data:
>d_e_a.train
Lymphoprol. CORT Testo FDL RBC.Ab. ifn.g il.4 Profile
52 0.00 0.58 1.94 2.54 6 98 40 Med
81 22.23 0.58 0.05 1.56 4 203 45 Med
66 5.31 1.75 0.30 2.73 3 49 74 High
62 35.00 0.81 0.95 4.30 6 322 60 Low
9 6.52 2.36 0.03 0.92 4 51 75 High
70 13.27 0.47 1.67 2.57 3 278 75 Med
56 18.23 0.46 1.89 2.99 4 54 60 High
72 31.25 0.31 1.52 3.37 5 305 57 Low
90 22.09 0.40 0.06 1.62 5 254 58 Med
37 4.32 1.34 0.05 0.71 3 41 73 High
3 15.65 0.50 0.07 0.97 5 280 67 Med
17 39.32 1.71 0.30 2.06 2 93 53 High
57 19.25 1.15 0.05 1.75 5 95 73 Med
24 17.03 0.14 1.28 3.22 4 79 77 Med
85 13.73 0.52 1.59 2.20 3 62 75 Med
41 23.16 0.89 0.09 1.48 2 99 57 Med
65 29.25 0.28 0.04 2.56 5 298 55 Low
75 0.00 0.86 0.11 1.65 3 110 47 Med
22 14.25 1.09 1.46 1.96 5 76 69 Med
20 35.14 0.26 1.12 5.16 6 282 47 Low
83 36.94 0.55 1.62 2.15 4 298 60 Low
45 28.58 1.50 0.21 1.41 5 201 65 Med
2 13.91 0.65 1.34 2.27 6 195 58 Med
73 0.00 0.99 0.09 0.92 3 133 77 Med
29 35.80 0.12 0.01 1.80 7 307 65 Low`
and this is the model: model_a <- lda(Profile ~., data = d_e_a.train)
When I try to plot it using the following code, I get two legends, as can be seen in the plot:
library(ggplot2)
library(ggfortify)
library(devtools)
install_github('fawda123/ggord')
library(ggord)
plota<-ggord(model_a, d_e_a.train$Profile)+
theme_classic()+
scale_fill_manual(name = "Profile",
labels = c("Fischer - like", "Lewis - like", "Medium"))+
theme(text = element_text(size = 20 ),
axis.line.x = element_line(color="black", size = 1),
axis.line.y = element_line(color="black", size = 1),
axis.text.x=element_text(colour="black",angle = 360,vjust = 0.6),
axis.text.y=element_text(colour="black"))
plota
I would like to get only the legend that is seen in the top.
Regards
You need to have both a fill scale and a color scale with the same labels. You also need to remove the shape guide that this function seems to add, even though the shape of the points appears constant.
# The fill and colour scales must carry the same name and labels; otherwise
# ggplot2 draws a separate legend for each of them.
profile_labels <- c("Fischer - like", "Lewis - like", "Medium")
# Both axis lines share one appearance.
black_axis_line <- element_line(color = "black", size = 1)
ggord(model_a, d_e_a.train$Profile) +
  theme_classic() +
  scale_fill_discrete(name = "Profile", labels = profile_labels) +
  scale_color_discrete(name = "Profile", labels = profile_labels) +
  theme(
    text = element_text(size = 20),
    axis.line.x = black_axis_line,
    axis.line.y = black_axis_line,
    axis.text.x = element_text(colour = "black", angle = 360, vjust = 0.6),
    axis.text.y = element_text(colour = "black")
  ) +
  # Suppress the extra shape legend that ggord() adds.
  guides(shape = guide_none())

How to convert a list into a data.frame in R?

I've created a frequency table in R with the fdth package using this code
fdt(x, breaks = "Sturges")
The specific result was:
Class limits f rf rf(%) cf cf(%)
[-15.907,-11.817) 12 0.00 0.10 12 0.10
[-11.817,-7.7265) 8 0.00 0.07 20 0.16
[-7.7265,-3.636) 6 0.00 0.05 26 0.21
[-3.636,0.4545) 70 0.01 0.58 96 0.79
[0.4545,4.545) 58 0.00 0.48 154 1.27
[4.545,8.6355) 91 0.01 0.75 245 2.01
[8.6355,12.726) 311 0.03 2.55 556 4.57
[12.726,16.817) 648 0.05 5.32 1204 9.89
[16.817,20.907) 857 0.07 7.04 2061 16.93
[20.907,24.998) 1136 0.09 9.33 3197 26.26
[24.998,29.088) 1295 0.11 10.64 4492 36.90
[29.088,33.179) 1661 0.14 13.64 6153 50.55
[33.179,37.269) 2146 0.18 17.63 8299 68.18
[37.269,41.36) 2525 0.21 20.74 10824 88.92
[41.36,45.45) 1349 0.11 11.08 12173 100.00
It was given as a list:
> class(x)
[1] "fdt.multiple" "fdt" "list"
I need to convert it into a data frame object, so I can have a table. How can I do it?
I'm a beginner at using R :(
Since you did not provide a reproducible example of your data, I have used an example from the help page of ?fdt, which is close to what you have.
library(fdth)
mdf <- data.frame(c1=sample(LETTERS[1:3], 1e2, TRUE),
c2=as.factor(sample(1:10, 1e2, TRUE)),
n1=c(NA, NA, rnorm(96, 10, 1), NA, NA),
n2=rnorm(100, 60, 4),
n3=rnorm(100, 50, 4),
stringsAsFactors=TRUE)
fdt <- fdt(mdf,breaks='FD',by='c1')
class(fdt)
#[1] "fdt.multiple" "fdt" "list"
You can extract the table part from each list and bind them together.
result <- purrr::map_df(fdt, `[[`, 'table')
#In base R
#result <- do.call(rbind, lapply(fdt, `[[`, 'table'))
result
# Class limits f rf rf(%) cf cf(%)
#1 [8.1781,9.1041) 5 0.20833333 20.833333 5 20.833333
#2 [9.1041,10.03) 6 0.25000000 25.000000 11 45.833333
#3 [10.03,10.956) 10 0.41666667 41.666667 21 87.500000
#4 [10.956,11.882) 3 0.12500000 12.500000 24 100.000000
#5 [53.135,56.121) 4 0.16000000 16.000000 4 16.000000
#6 [56.121,59.107) 8 0.32000000 32.000000 12 48.000000
#7 [59.107,62.092) 8 0.32000000 32.000000 20 80.000000
#....

How to build a nonlinear approximation?

I need to fit an approximation to my data using the formula
y = a * (exp(x/b) - 1) (the code is below).
library("ggplot2")
# Two-column table with a header row (x, y).
# NOTE(review): the sample listing shows one x value written as `0,29`; a
# decimal comma would make read.table() parse x as text - verify the file.
df <- read.table(file = "vah_p_1", header = TRUE)
# Scatter plot plus a nonlinear least-squares fit of y = a * (exp(x / b) - 1).
# - In a model formula, `a(...)` is a function call and `exp^` is not
#   exponentiation, so the multiplication and exp() call are written out.
# - b = 0 is not a usable start value (division by zero in x / b). The starts
#   below are read off the sample data: y grows roughly e-fold per 0.05 units
#   of x at the upper end, which gives a of about 0.01 there.
# - se = FALSE is required because nls fits provide no standard errors here.
p <- ggplot(df, aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(
    method = "nls",
    formula = y ~ a * (exp(x / b) - 1),
    method.args = list(start = list(a = 0.01, b = 0.05)),
    se = FALSE,
    size = 0.4,
    color = "cyan2"
  )
p
Unfortunately, the approximation line is not drawn. I think the problem is in method.args = list(start = c(a = 1.0, b = 0.0)). How do I find a and b?
The file vah_p_1 contains:
x y
0 4
0.25 5
0.27 6
0,29 7
0.31 8
0.33 10
0.34 13
0.36 16
0.37 20
0.38 23
0.39 28
0.4 37
0.41 43
0.42 55
0.43 67
0.44 81
0.45 94
0.46 118
0.47 143
0.48 187
0.49 225

post code areas plotting in R - how to add legend

I am new to R and need some help.
Could you please help me with the code below? I would like to add a gradient legend next to the plot, running from 0 to 1 and showing how the colour changes with the value, but this is the best I was able to get. Also, could you give me some tips on how to add the postcode as text inside the map? Thanks.
rm(list=ls())
library(tidyverse)
library(maptools)
library(raster)
library(plotrix)
# collection data set load + post codes lo la - 2016
df2016 <- read.csv('C:/Users/thomas/desktop/coll2016WORKINGFILE.csv')
# Use short column names; only `name` and `value` are kept below.
colnames(df2016) <- c('name','value','amount')
df2016$amount <- NULL
df2016$name <- as.character(df2016$name)
# OPTIONAL: Depending on your data, you may need to rescale it for the color ramp to work
# NOTE(review): presumably plotrix::rescale (it takes `newrange`) - confirm
# that no other attached package masks it.
df2016$value <- rescale(df2016$value, newrange = c(0, 1))
# Download a shapefile of postal codes into your working directory
download.file(
"http://www.opendoorlogistics.com/wp-content/uploads/Data/UK-postcode-boundaries-Jan-2015.zip",
"postal_shapefile"
)
# Unzip the shapefile
unzip("postal_shapefile")
# Read the shapefile
postal <- readShapeSpatial("./Distribution/Areas")
# NOTE(review): postal.df is not used again in this snippet.
postal.df <- fortify(postal, region = "name")
# Join your data to the shapefile
postal <- raster::merge(postal, df2016, by = "name")
# Areas without a value get the neutral value 0.50.
postal$value[is.na(postal$value)] <- 0.50
# Get centroids of spatialPolygonDataFrame and convert to dataframe
# for use in plotting area names.
postal.centroids.df <- data.frame(long = coordinates(postal)[, 1],
lat = coordinates(postal)[, 2],
id=postal$name,
ratio = postal$value)
# Base-graphics map: each area is shaded with the grey level of its value.
plot(postal, col = gray(postal$value))
title("UK Success Rate")
# This passes every area's value as its own legend entry, so the result is a
# long list of numbers rather than a continuous gradient key.
legend("right",NULL,legend = postal$value,col = gray(postal$value))
Original dataset from csv has below 3 columns:
Row Labels Success/Failed ratio N of coll
LD 1 3
ZE 1 2
WS 0.79 19
ML 0.75 12
HS 0.75 4
TQ 0.74 38
WN 0.73 15
CA 0.71 28
HU 0.7 33
FY 0.69 16
HG 0.69 16
IV 0.68 19
DL 0.68 25
CB 0.68 115
TS 0.67 46
IP 0.67 87
AB 0.67 66
NP 0.67 45
FK 0.67 18
IM 0.67 9
SM 0.66 50
HD 0.66 32
EN 0.66 61
CO 0.65 52
ME 0.65 54
PE 0.64 266
EX 0.64 81
WV 0.63 49
JE 0.63 24
NE 0.62 148
YO 0.62 47
DE 0.62 78
LN 0.61 36
SN 0.61 109
IG 0.6 63
NR 0.6 90
SP 0.59 37
BA 0.59 93
UB 0.59 127
TN 0.59 95
BT 0.59 180
BD 0.59 51
HP 0.59 126
TA 0.59 46
PO 0.58 113
DH 0.58 55
WD 0.58 102
BH 0.57 96
DG 0.57 14
CV 0.57 225
RG 0.57 255
BN 0.56 158
DY 0.56 48
HA 0.56 148
W 0.56 359
WA 0.56 77
DA 0.55 38
CT 0.55 62
GU 0.55 231
RH 0.55 132
BL 0.55 33
HX 0.55 11
BS 0.54 184
SS 0.54 46
EH 0.54 185
DT 0.54 37
G 0.54 137
B 0.54 283
LU 0.54 41
NG 0.54 97
OX 0.53 208
S 0.53 179
CM 0.53 100
DD 0.53 17
GL 0.53 87
AL 0.53 89
HR 0.53 38
LS 0.52 122
TF 0.52 21
RM 0.52 44
SL 0.52 155
MK 0.52 136
SY 0.52 46
DN 0.52 81
N 0.52 191
M 0.52 226
SR 0.52 29
SK 0.52 64
BB 0.51 140
KY 0.51 41
WF 0.51 51
PR 0.51 63
L 0.51 81
KT 0.5 185
CF 0.5 118
ST 0.5 84
TR 0.5 46
CW 0.5 44
TD 0.5 12
P 0.5 2
SW 0.5 317
LL 0.49 49
CH 0.49 43
E 0.49 275
EC 0.48 364
PA 0.48 27
SO 0.48 157
CR 0.48 84
PL 0.48 61
SG 0.47 59
KA 0.47 15
LA 0.47 43
SA 0.46 78
LE 0.46 194
TW 0.45 125
OL 0.44 41
SE 0.44 297
NN 0.43 143
NW 0.42 236
WC 0.41 138
WR 0.38 73
BR 0.37 62
GY 0.26 35
PH 0.23 13
Here you go. Use sf with new ggplot or stuff from my misc package for base graphs.
# collection data set load + post codes lo la - 2016
df2016 <- read.table(stringsAsFactors=FALSE, header=TRUE, text="
name value amount
LD 1 3
ZE 1 2
WS 0.79 19
# YOUR OTHER VALUES FROM ABOVE
PH 0.23 13")
if(FALSE){ # don't run when sourcing file
# Download a shapefile of postal codes into your working directory
download.file(
"http://www.opendoorlogistics.com/wp-content/uploads/Data/UK-postcode-boundaries-Jan-2015.zip",
"postal_shapefile.zip"
)
# Unzip the shapefile (it is read further down, outside this guard)
unzip("postal_shapefile.zip")
}
# install.packages("sf")
# Read the area polygons as an sf object.
postal <- sf::st_read("Distribution/Areas.shp")
# Join your data to the shapefile
# NOTE(review): merge() is called with its defaults here; if that is an inner
# join, areas missing from df2016 are dropped from the map - confirm whether
# all.x = TRUE is wanted.
postal2 <- merge(postal, df2016, by="name")
#devtools::install_github("tidyverse/ggplot2") # need newer ggplot2 version for geom_sf
library(ggplot2)
# ggplot route: a continuous `value` fill yields a gradient legend.
ggplot(postal2) + geom_sf(aes(fill = value))
# Want to remain in base graphs?
#install.packages("berryFunctions")
library(berryFunctions)
# 100-colour palette and, for every area, the index of the colour class its
# value falls into.
cols <- seqPal(n=100)
cls <- classify(postal2$value, breaks=100)$index
plot(postal2[,c("value","geometry")], col=cols[cls], graticule=TRUE, axes=TRUE) # ?sf::plot_sf
# Vertical colour legend built from the same palette.
colPointsLegend(postal2$value, colors=cols, horizontal=FALSE, title="UK value")

Area under the curve

I have my data in long-format like this with 20 different variables (but they all have the same Time points):
Time variable value
1 0 P1 0.07
2 1 P1 0.02
3 2 P1 0.12
4 3 P1 0.17
5 4 P1 0.10
6 5 P1 0.17
66 0 P12 0.02
67 1 P12 0.11
68 2 P12 0.20
69 3 P12 0.19
70 4 P12 0.07
71 5 P12 0.20
72 6 P12 0.19
73 7 P12 0.19
74 8 P12 0.12
75 10 P12 0.13
76 12 P12 0.08
77 14 P12 NA
78 24 P12 0.07
79 0 P13 0.14
80 1 P13 0.17
81 2 P13 0.24
82 3 P13 0.24
83 4 P13 0.26
84 5 P13 0.25
85 6 P13 0.21
86 7 P13 0.21
87 8 P13 NA
88 10 P13 0.19
89 12 P13 0.14
90 14 P13 NA
91 24 P13 0.12
I would like to calculate the area under the curve for each variable between time=0 and time=24. Ideally I would also like to calculate area under the curve where y>0.1.
I have tried the pracma package but it just comes out with NA.
trapz(x=P2ROKIlong$Time, y=P2ROKIlong$value)
Do I have to split my data into lots of different vectors and then do it manually or is there a way of getting it out of the long-format data?
The following code runs fine for me:
# library() stops with an error if pracma is missing; require() would only
# return FALSE and let the script fail later at trapz().
library(pracma)
# One series: six time points and their values.
df <- data.frame(
  Time = c(0, 1, 2, 3, 4, 5),
  value = c(0.07, 0.02, 0.12, 0.17, 0.10, 0.17)
)
# Area under the curve by the trapezoidal rule.
AUC <- trapz(df$Time, df$value)
Is there anything strange (NAs?) in the rest of your data frame?
EDIT: New code based on comments
May not be the most efficient, but the size of your data seems limited. This returns a vector AUC_result with the AUC per variable. Does this solve your issue?
# library() stops with an error if pracma is missing; require() would only
# return FALSE and let the script fail later at trapz().
library(pracma)
# Long-format example: one row per (Time, variable) observation.
df <- data.frame(
  Time = c(0, 1, 2, 3, 4, 5),
  value = c(0.07, 0.02, 0.12, 0.17, NA, 0.17),
  variable = c("P1", "P1", "P1", "P2", "P2", "P2")
)
# Drop missing observations first; a series containing NA was reported to
# integrate to NA.
df <- df[!is.na(df$value), ]
# Groups in order of first appearance.
unique_groups <- as.character(unique(df$variable))
# Named numeric vector with one trapezoidal AUC per variable. vapply() copes
# with zero groups (where 1:length(x) would iterate over c(1, 0)), avoids
# growing the result inside a loop, and names the result after the groups.
AUC_result <- vapply(
  unique_groups,
  function(group) {
    df_subset <- df[df$variable %in% group, ]
    # trapz() integrates in the order given, so make sure Time is ascending.
    df_subset <- df_subset[order(df_subset$Time), ]
    trapz(df_subset$Time, df_subset$value)
  },
  numeric(1)
)

Resources