I did a logistic Regression and trying to plot it now. However, I do not find a way to plot it nicely. I'm not very familiar with R so I'm first trying to do it without interaction effects and just one variable.
Here is my data:
> dput(head(Wahl2013))
structure(list(Wahlbeteiligung = structure(c(1L, 1L, 1L, 1L,
1L, 1L), .Label = c("ja, habe gewaehlt", "nein, habe nicht gewaehlt"
), class = "factor"), Geschlecht = structure(c(2L, 2L, 1L, 1L,
1L, 1L), .Label = c("0", "1"), class = "factor"), Gebj = c(4L,
1L, 1L, 1L, 2L, 2L), Zweitstimme = c(1, 0, 0, 0, 0, 0), eig.Pst.Klima = c(4,
6, 7, 5, 5, 4), Salienz.Klima = c(1, 4, 4, 3, 2, 3), Bildung = c(3L,
3L, 1L, 3L, 2L, 1L), Atomenergie = c(3, 3, 3, 5, 3, 1)), na.action = structure(c(`2` = 2L,
`11` = 11L, `22` = 22L, `29` = 29L, `50` = 50L, `58` = 58L, `72` = 72L,
`76` = 76L, `77` = 77L, `85` = 85L, `96` = 96L, `108` = 108L,
`112` = 112L, `119` = 119L, `120` = 120L, `124` = 124L, `125` = 125L,
`130` = 130L, `142` = 142L, `143` = 143L, `151` = 151L, `160` = 160L,
`175` = 175L, `183` = 183L, `190` = 190L, `196` = 196L, `219` = 219L,
`229` = 229L, `234` = 234L, `238` = 238L, `243` = 243L, `248` = 248L,
`261` = 261L, `269` = 269L, `277` = 277L, `279` = 279L, `285` = 285L,
`286` = 286L, `287` = 287L, `311` = 311L, `313` = 313L, `319` = 319L,
`324` = 324L, `331` = 331L, `334` = 334L, `347` = 347L, `348` = 348L,
`351` = 351L, `352` = 352L, `359` = 359L, `368` = 368L, `373` = 373L,
`374` = 374L, `380` = 380L, `385` = 385L, `391` = 391L, `398` = 398L,
`410` = 410L, `412` = 412L, `422` = 422L, `423` = 423L, `434` = 434L,
`435` = 435L, `442` = 442L, `449` = 449L, `453` = 453L, `462` = 462L,
`463` = 463L, `466` = 466L, `473` = 473L, `483` = 483L, `484` = 484L,
`534` = 534L, `546` = 546L, `547` = 547L, `554` = 554L, `561` = 561L,
`568` = 568L, `573` = 573L, `583` = 583L, `596` = 596L, `612` = 612L,
`618` = 618L, `619` = 619L, `625` = 625L, `638` = 638L, `645` = 645L,
`677` = 677L, `692` = 692L, `726` = 726L, `734` = 734L, `738` = 738L,
`741` = 741L, `751` = 751L, `759` = 759L, `767` = 767L, `768` = 768L,
`770` = 770L, `774` = 774L, `784` = 784L, `792` = 792L, `793` = 793L,
`800` = 800L, `805` = 805L, `821` = 821L, `834` = 834L, `857` = 857L,
`867` = 867L, `869` = 869L, `877` = 877L, `895` = 895L, `896` = 896L,
`898` = 898L, `912` = 912L, `918` = 918L, `925` = 925L, `928` = 928L,
`931` = 931L, `939` = 939L, `946` = 946L, `949` = 949L, `956` = 956L,
`1001` = 1001L, `1009` = 1009L, `1016` = 1016L, `1018` = 1018L,
`1019` = 1019L, `1031` = 1031L, `1032` = 1032L, `1054` = 1054L,
`1058` = 1058L, `1062` = 1062L, `1063` = 1063L, `1068` = 1068L,
`1089` = 1089L, `1090` = 1090L, `1101` = 1101L, `1102` = 1102L,
`1121` = 1121L, `1154` = 1154L, `1156` = 1156L, `1162` = 1162L,
`1170` = 1170L, `1174` = 1174L, `1181` = 1181L, `1182` = 1182L,
`1183` = 1183L, `1191` = 1191L, `1196` = 1196L, `1201` = 1201L,
`1215` = 1215L, `1233` = 1233L, `1267` = 1267L, `1270` = 1270L,
`1294` = 1294L, `1297` = 1297L, `1305` = 1305L, `1315` = 1315L,
`1330` = 1330L, `1335` = 1335L, `1338` = 1338L, `1340` = 1340L,
`1345` = 1345L, `1352` = 1352L, `1370` = 1370L, `1373` = 1373L,
`1379` = 1379L, `1380` = 1380L, `1400` = 1400L, `1410` = 1410L,
`1427` = 1427L, `1438` = 1438L, `1439` = 1439L, `1440` = 1440L,
`1444` = 1444L, `1447` = 1447L, `1468` = 1468L, `1469` = 1469L,
`1473` = 1473L, `1485` = 1485L, `1486` = 1486L, `1490` = 1490L,
`1498` = 1498L, `1499` = 1499L, `1500` = 1500L, `1503` = 1503L,
`1506` = 1506L, `1516` = 1516L, `1520` = 1520L, `1522` = 1522L,
`1536` = 1536L, `1545` = 1545L, `1557` = 1557L, `1565` = 1565L,
`1569` = 1569L, `1576` = 1576L, `1577` = 1577L, `1579` = 1579L,
`1586` = 1586L, `1596` = 1596L), class = "omit"), row.names = c(1L,
3L, 4L, 5L, 6L, 7L), class = "data.frame")
model3 <- glm(Zweitstimme ~ Atomenergie, data = Wahl2013)
> summary(model3)
Call:
glm(formula = Zweitstimme ~ Atomenergie, data = Wahl2013)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.17189 -0.12128 -0.07067 -0.02007 1.03054
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.081152 0.022555 -3.598 0.000332 ***
Atomenergie 0.050609 0.006139 8.244 3.81e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 0.08119716)
Null deviance: 118.71 on 1395 degrees of freedom
Residual deviance: 113.19 on 1394 degrees of freedom
AIC: 460.49
Number of Fisher Scoring iterations: 2
> plot(jitter(Wahl2013$Atomenergie, .2), jitter(Wahl2013$Zweitstimme, .2))
> newdat <- data.frame(Atomenergie=seq(min(Wahl2013$Atomenergie), max(Wahl2013$Atomenergie),len=100))
> newdat$Zweitstimme = predict(fit, newdata=newdat, type="response")
> lines(Zweitstimme ~ Atomenergie, newdat, col="green4", lwd=2)
This Looks like the following:
I do not find a way to make it look better. Thanks for help!
Unfortunately, your sample data only contained 5 rows which was not sufficient to recreate your model. I have therefore attempted to recreate your data like this:
set.seed(69)
Wahl2013 <- data.frame(Zweitstimme = as.vector(sapply(1:5, function(i) {
sample(0:1, 200, TRUE, c(60/i^1.8, 1))})),
Atomenergie = rep(1:5, each = 200))
We can see by using this data on your plotting code that it is quite similar:
model3 <- glm(Zweitstimme ~ Atomenergie, family = binomial, data = Wahl2013)
plot(jitter(Wahl2013$Atomenergie, .2),
jitter(Wahl2013$Zweitstimme, .2))
newdat <- data.frame(Atomenergie=seq(min(Wahl2013$Atomenergie),
max(Wahl2013$Atomenergie),len = 100))
newdat$Zweitstimme = predict(model3, newdata = newdat, type="response")
lines(Zweitstimme ~ Atomenergie, newdat, col = "green4", lwd = 2)
However, plotting a binary response this way, even with jitter, is not very appealing.
The important thing to show is the proportion of Zwetstimme at each of the 5 levels of atomenergie. One way to do this is using ggplot, which will add in a 95% confidence range:
library(ggplot2)
ggplot(Wahl2013, aes(Atomenergie, Zweitstimme)) +
geom_smooth(method = "glm", data = Wahl2013,
method.args = list(family = binomial),
fill = "dodgerblue", alpha = 0.15, linetype = 2,
colour = "royalblue") +
coord_cartesian(ylim = c(0, 0.4)) +
scale_y_continuous(labels = scales::percent_format(accuracy = 2)) +
labs(y = "Zweistimme (Prozent)") +
theme_bw() +
theme(panel.border = element_rect(colour = NA))
Or you could show the size of the groups at each of the 10 points with bubbles:
df <- data.frame(s = as.vector(with(Wahl2013, table(Atomenergie, Zweitstimme))),
Atomenergie = rep(1:5, 2),
Zweitstimme = rep(0:1, each = 5))
ggplot(Wahl2013, aes(Atomenergie, Zweitstimme)) +
geom_smooth(method = "glm", data = Wahl2013,
method.args = list(family = binomial),
fill = "dodgerblue", alpha = 0.15, linetype = 2,
colour = "royalblue") +
geom_point(data = df, aes(size = s, fill = s), shape = 21) +
scale_y_continuous(labels = scales::percent_format(accuracy = 2)) +
scale_fill_gradientn(colours = c("yellow", "red"), name = "Personen") +
guides(size = guide_none()) +
labs(y = "Zweistimme (Prozent)") +
coord_cartesian(clip = "off") +
theme_bw() +
theme(panel.border = element_rect(colour = NA)) +
annotation_custom(grid::textGrob("Ja", unit(1.0, "npc"), unit(0.95, "npc"),
just = "left", gp = grid::gpar(cex = 2))) +
annotation_custom(grid::textGrob("Nein", unit(1.0, "npc"), unit(0.05, "npc"),
just = "left", gp = grid::gpar(cex = 2)))
Related
I have a data frame loaded in R and I need to sum one row. The problem is that I've tried to use rowSums() function, but 2 columns are not numeric ones (one is character "Nazwa" and one is boolean "X" at the end of data frame). Is there any option to sum this row without those two columns? So I'd like to start from row 1, column 3 and don't include last column.
My data:
structure(list(Kod = c(0L, 200000L, 400000L, 600000L, 800000L,
1000000L), Nazwa = c("POLSKA", "DOLNOŚLĄSKIE", "KUJAWSKO-POMORSKIE",
"LUBELSKIE", "LUBUSKIE", "ŁÓDZKIE"), gospodarstwa.ogółem.gospodarstwa.2006.... = c(9187L,
481L, 173L, 1072L, 256L, 218L), gospodarstwa.ogółem.gospodarstwa.2007.... = c(11870L,
652L, 217L, 1402L, 361L, 261L), gospodarstwa.ogółem.gospodarstwa.2008.... = c(14896L,
879L, 258L, 1566L, 480L, 314L), gospodarstwa.ogółem.gospodarstwa.2009.... = c(17091L,
1021L, 279L, 1710L, 579L, 366L), gospodarstwa.ogółem.gospodarstwa.2010.... = c(20582L,
1227L, 327L, 1962L, 833L, 420L), gospodarstwa.ogółem.gospodarstwa.2011.... = c(23449L,
1322L, 371L, 2065L, 1081L, 478L), gospodarstwa.ogółem.gospodarstwa.2012.... = c(25944L,
1312L, 390L, 2174L, 1356L, 518L), gospodarstwa.ogółem.gospodarstwa.2013.... = c(26598L,
1189L, 415L, 2129L, 1422L, 528L), gospodarstwa.ogółem.gospodarstwa.2014.... = c(24829L,
1046L, 401L, 1975L, 1370L, 508L), gospodarstwa.ogółem.gospodarstwa.2015.... = c(22277L,
849L, 363L, 1825L, 1202L, 478L), gospodarstwa.ogółem.gospodarstwa.2016.... = c(22435L,
813L, 470L, 1980L, 1148L, 497L), gospodarstwa.ogółem.gospodarstwa.2017.... = c(20257L,
741L, 419L, 1904L, 948L, 477L), gospodarstwa.ogółem.gospodarstwa.2018.... = c(19207L,
713L, 395L, 1948L, 877L, 491L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2006..ha. = c(228038L,
19332L, 4846L, 19957L, 12094L, 3378L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2007..ha. = c(287529L,
21988L, 5884L, 23934L, 18201L, 3561L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2008..ha. = c(314848L,
28467L, 5943L, 26892L, 18207L, 4829L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2009..ha. = c(367062L,
26427L, 6826L, 30113L, 22929L, 5270L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2010..ha. = c(519069L,
39703L, 7688L, 34855L, 35797L, 7671L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2011..ha. = c(605520L,
45547L, 8376L, 34837L, 44259L, 8746L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2012..ha. = c(661688L,
44304L, 8813L, 37466L, 52581L, 9908L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2013..ha. = c(669970L,
37455L, 11152L, 40819L, 54692L, 10342L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2014..ha. = c(657902L,
37005L, 11573L, 38467L, 53300L, 11229L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2015..ha. = c(580730L,
31261L, 10645L, 34052L, 46343L, 10158L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2016..ha. = c(536579L,
29200L, 9263L, 31343L, 43235L, 9986L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2017..ha. = c(494978L,
27542L, 8331L, 29001L, 37923L, 9260L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2018..ha. = c(484677L,
27357L, 7655L, 28428L, 37174L, 8905L), X = c(NA, NA, NA, NA,
NA, NA)), row.names = c(NA, 6L), class = "data.frame")
My attempt:
rowSums(dane_csv[, 3:length(dane_csv$Nazwa=='POLSKA')])
Using base R
rowSums(dane_csv[sapply(dane_csv, is.numeric)])
-output
1 2 3 4 5 6
6667212 627833 511473 1033876 1288648 1108797
Or with dplyr
library(dplyr)
dane_csv %>%
transmute(out = rowSums(across(where(is.numeric))))
in base R use Filter function, to select the numeric columns then do a rowSums on them
rowSums(Filter(is.numeric, df))
1 2 3 4 5 6
6667212 627833 511473 1033876 1288648 1108797
You can select only the numeric columns:
library(dplyr)
df %>%
select(where(is.numeric)) %>%
rowSums() %>%
first()
Result:
1
6667212
I'm analysing real-estate sales for some N. American cities and am using k-means clustering on the data. I have seven clusters and for each observation in the cluster I have the latitude, longitude, zipcode, and cluster_id. I'd like to plot this on a map to better visualize the clusters - I'm not sure what such a plot is called - Choropleth? Polygon?
Most of the examples are using geoJSON files but I only have a data.frame object from my k-means clustering.
Actual data:
https://www.kaggle.com/threnjen/portland-housing-prices-sales-jul-2020-jul-2021
Sample data:
> dput(dt[runif(n = 10,min = 1,max = 25000)])
structure(list(id = c(23126L, 15434L, 5035L, 19573L, NA, 24486L,
NA, 14507L, 3533L, 20192L), zipcode = c(97224L, 97211L, 97221L,
97027L, NA, 97078L, NA, 97215L, 97124L, 97045L), latitude = c(45.40525436,
45.55965805, 45.4983139, 45.39398956, NA, 45.47454071, NA, 45.50736618,
45.52812958, 45.34381485), longitude = c(-122.7599182, -122.6500015,
-122.7288742, -122.591217, NA, -122.8898392, NA, -122.6084061,
-122.91745, -122.5948334), lastSoldPrice = c(469900L, 599000L,
2280000L, 555000L, NA, 370000L, NA, 605000L, 474900L, 300000L
), lotSize = c(5227L, 4791L, 64904L, 9147L, NA, 2178L, NA, 4356L,
2613L, 6969L), livingArea = c(1832L, 2935L, 5785L, 2812L, NA,
1667L, NA, 2862L, 1844L, 742L), cluster_id = c(7, 7, 2, 7, NA,
4, NA, 7, 7, 4)), row.names = c(NA, -10L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x7faa8000fee0>)
I've followed the example on https://gist.github.com/josecarlosgonz/8565908 to try and create a geoJSON file to be able to plot this data but without success.
I'm not using markers because I have ~25,000 observations - it would be difficult to plot them all and the file would take forever to load.
EDIT:
observations by zipcode:
> dput(dat[, .N, by = .(`address/zipcode`)][(order(`address/zipcode`))])
structure(list(`address/zipcode` = c(7123L, 97003L, 97004L, 97005L,
97006L, 97007L, 97008L, 97009L, 97015L, 97019L, 97023L, 97024L,
97027L, 97030L, 97034L, 97035L, 97038L, 97045L, 97056L, 97060L,
97062L, 97068L, 97070L, 97078L, 97080L, 97086L, 97089L, 97113L,
97123L, 97124L, 97132L, 97140L, 97201L, 97202L, 97203L, 97204L,
97205L, 97206L, 97209L, 97210L, 97211L, 97212L, 97213L, 97214L,
97215L, 97216L, 97217L, 97218L, 97219L, 97220L, 97221L, 97222L,
97223L, 97224L, 97225L, 97227L, 97229L, 97230L, 97231L, 97232L,
97233L, 97236L, 97239L, 97266L, 97267L), N = c(1L, 352L, 9L,
252L, 421L, 1077L, 357L, 1L, 31L, 2L, 4L, 159L, 239L, 525L, 640L,
548L, 1L, 1064L, 5L, 353L, 471L, 736L, 6L, 403L, 866L, 913L,
8L, 5L, 1113L, 776L, 3L, 543L, 219L, 684L, 463L, 1L, 57L, 809L,
189L, 216L, 688L, 510L, 504L, 330L, 318L, 177L, 734L, 195L, 832L,
305L, 276L, 589L, 688L, 716L, 286L, 83L, 1307L, 475L, 77L, 150L,
382L, 444L, 290L, 423L, 430L)), row.names = c(NA, -65L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x7f904781a6e0>)
I used the kaggle data on a simple laptop (i3 8th gen) to generate a ggplot2 object, with cluster IDs randomly sampled and transform this via the ggplotly() function ... the resulting plotly object seems OK to work with for analysis but I do not know your performance requirements:
library(dplyr)
library(ggplot2)
library(plotly)
library(rnaturalearth) # here we get the basic map data from
# read in data from zip, select minimal number of columns and sample cluster_id
df <- readr::read_csv(unzip("path_to_zip/portland_housing.csv.zip"))%>%
dplyr::select(az = `address/zipcode`, latitude, longitude) %>%
dplyr::mutate(cluster_id = sample(1:7, n(), replace = TRUE))
# get the map data
world <- rnaturalearth::ne_countries(scale = "medium", returnclass = "sf")
# build the ggplot2 object (note that I use rings as shapes and alpha parameter to reduce the over plotting
plt <- ggplot2::ggplot(data = world) +
ggplot2::geom_sf() +
ggplot2::geom_point(data = df, aes(x = longitude, y = latitude, color = factor(cluster_id)), size = 1, shape = 21, alpha = .7) +
ggplot2::coord_sf(xlim = c(-124.5, -122), ylim = c(45, 46), expand = FALSE)
# plot it:
plt
# plotly auto transform from ggplot2 object
plotly::ggplotly(plt)
EDIT
To include a map you can use for example the ggmap package instead of the map data from rnaturalearth... I will only display the plotly result:
library(ggmap)
# https://stackoverflow.com/questions/23130604/plot-coordinates-on-map
sbbox <- ggmap::make_bbox(lon = c(-124.5, -122), lat = c(45, 46), f = .1)
myarea <- ggmap::get_map(location=sbbox, zoom=10, maptype="terrain")
myarea <- ggmap::ggmap(myarea)
plt2 <- myarea +
ggplot2::geom_point(data = df, mapping = aes(x = longitude, y = latitude, color = factor(cluster_id)), shape = 21, alpha = .7)
plotly::ggplotly(plt2)
There are many other approaches concerning the map data, like using the mapbox-api
Sorry if this is a duplicate question but I cannot seem to find the answer to my question anywhere. I have two plots and I would like to overlay plot two on plot one so that they form one plot. Is this possible? I will attach how both plots look separately. They are both facetted by the same variable which is by location and are on the same x and y-axis scale so theoretically should be possible.
Thank you.
## Plot one
Proxy<-read.csv("ALLRSL.csv",header=T)
p1<-ggplot()+
geom_ribbon(data=Proxy,aes(x=YEAR,ymin=LOWER,ymax=UPPER,fill=SITE),alpha=.5)+
geom_line(data=Proxy,aes(x=YEAR,y=RSL,col=SITE))+
facet_wrap(~ SITE,ncol= 1)+
scale_fill_manual(values=c("#4E193D","#342955","#4E617E","#97B4CB"))+
scale_color_manual(values=c("#4E193D","#342955","#4E617E","#97B4CB"))+
theme_classic()+
xlim(1900, 2020)+
theme(panel.grid.major.x = element_blank())+
theme(panel.grid.minor.x = element_blank())+
theme(panel.grid.minor.y = element_blank())+
theme(panel.grid.major.y = element_blank())+
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())+
theme(
strip.background = element_blank(),
strip.text.x = element_blank()
)+
theme(legend.position="none")
p1
plot two
tgsm<-read.csv("tgsm.csv",header=T)
tgsm<-na.omit(tgsm)
tglonger<-pivot_longer(tgsm, cols=c(-Year),names_to="Site", values_to = "value")
p2<-ggplot()+
geom_point(data=tglonger,aes(x=Year,y=value,col=Site),alpha=.7,size=1)+
facet_wrap(~Site,ncol=1)+
theme_classic()+
xlim(1900,2020)+
scale_color_manual(values=c("#4E193D","#342955","#4E617E","#97B4CB"))+
theme(panel.grid.major.x = element_blank())+
theme(panel.grid.minor.x = element_blank())+
theme(panel.grid.minor.y = element_blank())+
theme(panel.grid.major.y = element_blank())+
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())+
theme(
strip.background = element_blank(),
strip.text.x = element_blank()
)+
theme(legend.position="none")
p2
Data
Proxy <- structure(list(RSL = c(-0.305251214, -0.306414006, -0.307194187,
-0.308202139, -0.309150572, -0.309679123), UPPER = c(-0.182716456,
-0.186724068, -0.189331305, -0.193118273, -0.197069799, -0.20118809
), LOWER = c(-0.416725663, -0.413606073, -0.411131729, -0.408930899,
-0.406531588, -0.404478981), YEAR = 1820:1825, SITE = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("Little Swanport", "Lutregala",
"Tarra", "Wapengo"), class = "factor")), row.names = c(NA, 6L
), class = "data.frame")
tgsm <- structure(list(Year = 1993:1998, Lg2002 = c(-0.001164223, -0.002229453,
-0.002734792, -0.002977787, -0.002786098, -0.002026994), Wap2002 = c(-0.002531348,
-0.002051993, -0.001468704, -0.001182162, -0.001027132, -0.00020881
), Tar2002 = c(-0.029020612, -0.024330561, -0.019927593, -0.015682528,
-0.012907219, -0.009784772), LSP2002 = c(-0.034514531, -0.030171621,
-0.026095138, -0.021952898, -0.018480702, -0.014531318)), na.action = structure(c(`1` = 1L,
`2` = 2L, `3` = 3L, `4` = 4L, `5` = 5L, `6` = 6L, `7` = 7L, `8` = 8L,
`9` = 9L, `10` = 10L, `11` = 11L, `12` = 12L, `13` = 13L, `14` = 14L,
`15` = 15L, `16` = 16L, `17` = 17L, `18` = 18L, `19` = 19L, `20` = 20L,
`21` = 21L, `22` = 22L, `23` = 23L, `24` = 24L, `25` = 25L, `26` = 26L,
`27` = 27L, `28` = 28L, `29` = 29L, `30` = 30L, `31` = 31L, `32` = 32L,
`33` = 33L, `34` = 34L, `35` = 35L, `36` = 36L, `37` = 37L, `38` = 38L,
`39` = 39L, `40` = 40L, `41` = 41L, `42` = 42L, `43` = 43L, `44` = 44L,
`45` = 45L, `46` = 46L, `47` = 47L, `48` = 48L, `49` = 49L, `50` = 50L,
`51` = 51L, `52` = 52L, `53` = 53L, `54` = 54L, `55` = 55L, `56` = 56L,
`57` = 57L, `58` = 58L, `59` = 59L, `60` = 60L, `61` = 61L, `62` = 62L,
`63` = 63L, `64` = 64L, `65` = 65L, `66` = 66L, `67` = 67L, `68` = 68L,
`69` = 69L, `70` = 70L, `71` = 71L, `72` = 72L, `73` = 73L, `74` = 74L,
`75` = 75L, `76` = 76L, `77` = 77L, `78` = 78L, `79` = 79L, `80` = 80L,
`81` = 81L, `82` = 82L, `83` = 83L, `84` = 84L, `85` = 85L, `86` = 86L,
`87` = 87L, `88` = 88L, `89` = 89L, `90` = 90L, `91` = 91L, `92` = 92L,
`93` = 93L, `94` = 94L, `95` = 95L, `96` = 96L, `97` = 97L, `98` = 98L,
`99` = 99L, `100` = 100L, `101` = 101L, `102` = 102L, `103` = 103L,
`104` = 104L, `105` = 105L, `106` = 106L, `107` = 107L, `108` = 108L,
`109` = 109L, `110` = 110L, `111` = 111L, `112` = 112L, `113` = 113L,
`114` = 114L, `115` = 115L, `116` = 116L, `117` = 117L, `118` = 118L,
`119` = 119L, `120` = 120L, `121` = 121L, `122` = 122L, `123` = 123L,
`124` = 124L, `125` = 125L, `126` = 126L, `127` = 127L, `128` = 128L,
`129` = 129L, `130` = 130L, `131` = 131L, `132` = 132L, `133` = 133L,
`134` = 134L, `135` = 135L, `136` = 136L, `137` = 137L, `138` = 138L,
`139` = 139L, `140` = 140L, `141` = 141L, `142` = 142L, `143` = 143L,
`144` = 144L, `145` = 145L, `146` = 146L, `147` = 147L, `148` = 148L,
`149` = 149L, `150` = 150L, `151` = 151L, `152` = 152L, `153` = 153L,
`154` = 154L, `155` = 155L, `156` = 156L, `157` = 157L, `183` = 183L
), class = "omit"), row.names = 158:163, class = "data.frame")
See plot one how you can do that with patchwork.
However. Conceptually, I am guessing you want to add a sort of prediction to some historic values or so. I personally would put everything in one data frame and plot this. If there is a too large gap between the two time points, you can facet by timepoints (as in my suggestion).
The plots look a bit different than your plot because you only provided data for one Site in Proxy (so I filtered the other for what I thought is the equivalent, it will work nonetheless, because the faceting remains) - and I removed all those theme elements that are not relevant to the problem.
Plot one - combining plots.
library(tidyverse)
library(patchwork)
tgsm<-na.omit(tgsm)
tglonger <-
pivot_longer(tgsm, cols=c(-Year), names_to="SITE", values_to = "RSL") %>%
filter(SITE == "LSP2002") %>%
rename(YEAR = Year)
p1 <- ggplot() +
geom_ribbon(data = Proxy, aes(x = YEAR, ymin = LOWER, ymax = UPPER, fill = SITE), alpha = .5) +
geom_line(data = Proxy, aes(x = YEAR, y = RSL, col = SITE)) +
facet_wrap(~SITE) +
coord_cartesian(xlim = c(1800, 1830), ylim = c(-1, 0)) +
theme_classic() +
theme(
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.position = "none"
)
p2 <- ggplot() +
geom_point(data = tglonger, aes(x = YEAR, y = RSL, col = SITE), alpha = .7, size = 1) +
facet_wrap(~SITE) +
coord_cartesian(xlim = c(1990, 2000), ylim = c(-1, 0)) +
theme_classic() +
## only one call to theme!!
theme(
## this is where the theme call is different to above
axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
axis.line.y = element_blank(),
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.position = "none",
)
p1 + p2
Suggestion for an alternative visualisation
df_new <-
bind_rows(time1 = Proxy, time2 = tglonger, .id = "timevar") %>%
mutate(SITE = "LSP2002")
ggplot(df_new)+
geom_point(aes(x=YEAR,y=RSL))+
facet_grid(SITE~timevar, scales = "free_x")+
theme(legend.position="none") +
theme(panel.spacing = unit(.5, "lines"))
You can also use this data frame in order to create a list of plots, and then stitch it together with patchwork. This approach doesn't allow to change individual plots though.
ls_p <-
df_new %>%
split(., .$timevar) %>%
map(~{ggplot(.x)+
geom_point(aes(x=YEAR,y=RSL))+
coord_cartesian(ylim = c(-0.4,0))+
facet_grid(~SITE, scales = "free_x")+
theme(legend.position="none") +
theme(panel.spacing = unit(.5, "lines"))})
library(patchwork)
wrap_plots(ls_p)
I am working on doing linear regressions on data that I split into train, validation, and test. I have had an issue when I run a lm that I receive the error
Error in lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) : NA/NaN/Inf in 'x'.
I looked at other posts and tried a few things, but it is still not working.
Here is the model.
test_model <- lm(CVD ~ Exp_RFEI_2 + Low_Access + Gini_Index + Median_Income, data=test, NaRV.omit(Exp_RFEI_2))
I removed the missing values and double checked with
missing <- FINAL_df[!complete.cases(FINAL_df),]
I also added NaRV.omit() to handle any Na/NaN/Inf, but I still receive the error. I've noticed that if I change the degree of split (i.e. 80-10-10 vs. 75-15-15) or the set.seed that it changes where the error is thrown. Meaning depending on the condition the error may be in the valid list or the test list. So I know it's something with the data not with the code. I feel like I am missing something really simple!
Here is a sample of data.
structure(list(FIPS = "01001", State = "AL", County = "Autauga",
LACCESS_LOWI15 = 6543.67682386602, PCT_LACCESS_LOWI15 = 11.9911250002126,
FMRKT16 = 1, GROC14 = 4, SUPERC14 = 1, CONVS14 = 30, SPECS14 = 2,
FFR14 = 36, Value = 517.4, MILK_SODA_PRICE10 = 0.9232891,
PCT_SNAP16 = 16.9764357535007, `Estimate!!Gini Index` = 0.4227,
`Margin of Error!!Gini Index` = 0.0175, PCT_NHWHITE10 = 77.2461563834271,
PCT_NHBLACK10 = 17.5825988162211, PCT_HISP10 = 2.40054241263675,
PCT_NHASIAN10 = 0.855765882978138, PCT_NHNA10 = 0.397647101940591,
PCT_NHPI10 = 0.0403144527313042, PCT_65OLDER10 = 11.9953821626871,
PCT_18YOUNGER10 = 26.7779589892067, MEDHHINC15 = 56580, RFEI = 16.5,
Exp_RFEI_1 = 9.57142857142857, Exp_RFEI_2 = 8.25), row.names = c(NA,
-1L), class = c("tbl_df", "tbl", "data.frame"), na.action = structure(c(`68` = 68L,
`69` = 69L, `70` = 70L, `71` = 71L, `72` = 72L, `73` = 73L, `74` = 74L,
`75` = 75L, `76` = 76L, `77` = 77L, `78` = 78L, `79` = 79L, `80` = 80L,
`81` = 81L, `82` = 82L, `83` = 83L, `84` = 84L, `85` = 85L, `86` = 86L,
`87` = 87L, `88` = 88L, `89` = 89L, `90` = 90L, `91` = 91L, `92` = 92L,
`93` = 93L, `94` = 94L, `95` = 95L, `96` = 96L, `541` = 541L,
`547` = 547L, `548` = 548L, `549` = 549L, `550` = 550L, `551` = 551L,
`613` = 613L, `755` = 755L, `778` = 778L, `1032` = 1032L, `1060` = 1060L,
`1101` = 1101L, `1241` = 1241L, `1417` = 1417L, `1429` = 1429L,
`1650` = 1650L, `1658` = 1658L, `1659` = 1659L, `1713` = 1713L,
`1930` = 1930L, `1951` = 1951L, `2418` = 2418L, `2513` = 2513L,
`2654` = 2654L, `2658` = 2658L, `2674` = 2674L, `2713` = 2713L,
`2834` = 2834L, `2846` = 2846L, `2869` = 2869L, `2916` = 2916L,
`2917` = 2917L, `2927` = 2927L, `3095` = 3095L, `3144` = 3144L,
`3145` = 3145L, `3146` = 3146L, `3147` = 3147L, `3148` = 3148L,
`3149` = 3149L, `3150` = 3150L, `3151` = 3151L, `3152` = 3152L,
`3153` = 3153L, `3154` = 3154L, `3155` = 3155L, `3156` = 3156L,
`3157` = 3157L, `3158` = 3158L, `3159` = 3159L, `3160` = 3160L,
`3161` = 3161L, `3162` = 3162L, `3163` = 3163L, `3164` = 3164L,
`3165` = 3165L, `3166` = 3166L, `3167` = 3167L, `3168` = 3168L,
`3169` = 3169L, `3170` = 3170L, `3171` = 3171L, `3172` = 3172L,
`3173` = 3173L, `3174` = 3174L, `3175` = 3175L, `3176` = 3176L,
`3177` = 3177L, `3178` = 3178L, `3179` = 3179L, `3180` = 3180L,
`3181` = 3181L, `3182` = 3182L, `3183` = 3183L, `3184` = 3184L,
`3185` = 3185L, `3186` = 3186L, `3187` = 3187L, `3188` = 3188L,
`3189` = 3189L, `3190` = 3190L, `3191` = 3191L, `3192` = 3192L,
`3193` = 3193L, `3194` = 3194L, `3195` = 3195L, `3196` = 3196L,
`3197` = 3197L, `3198` = 3198L, `3199` = 3199L, `3200` = 3200L,
`3201` = 3201L, `3202` = 3202L, `3203` = 3203L, `3204` = 3204L,
`3205` = 3205L, `3206` = 3206L, `3207` = 3207L, `3208` = 3208L,
`3209` = 3209L, `3210` = 3210L, `3211` = 3211L, `3212` = 3212L,
`3213` = 3213L, `3214` = 3214L, `3215` = 3215L, `3216` = 3216L,
`3217` = 3217L, `3218` = 3218L, `3219` = 3219L, `3220` = 3220L,
`3221` = 3221L, `3222` = 3222L, `3223` = 3223L, `3224` = 3224L,
`3225` = 3225L, `3226` = 3226L, `3227` = 3227L, `3228` = 3228L,
`3229` = 3229L), class = "omit"))
I have a dataframe with the position of a player on a pitch.
The bounding box for the area is 0 - 1000 and 0 - 750.
The starting ball position is 375-500 and the starting player position is 637-692.
I was trying using geom_tile but I can't get a heat map. How can I link the variables to make a heatmap?
frames <- structure(list(half = c("1T", "1T", "1T", "1T", "1T", "1T", "1T",
"1T", "1T", "1T", "1T", "1T", "1T", "1T", "1T", "1T", "1T", "1T",
"1T", "1T", "2T", "2T", "2T", "2T"), `ball-X` = c(375L, 375L,
375L, 375L, 375L, 372L, 365L, 358L, 351L, 344L, 338L, 332L, 326L,
320L, 315L, 310L, 305L, 301L, 300L, 309L, 631L, 631L, 631L, 631L
), `ball-Y` = c(500L, 500L, 500L, 500L, 500L, 490L, 470L, 450L,
432L, 414L, 397L, 381L, 365L, 350L, 336L, 322L, 309L, 297L, 302L,
304L, 577L, 582L, 589L, 596L), `L-2-X` = c(637L, 637L, 636L,
636L, 639L, 639L, 641L, 643L, 645L, 648L, 652L, 656L, 660L, 665L,
669L, 672L, 673L, 674L, 673L, 672L, 227L, 230L, 233L, 235L),
`L-2-Y` = c(692L, 692L, 691L, 688L, 685L, 684L, 681L, 678L,
674L, 669L, 663L, 657L, 649L, 641L, 633L, 624L, 615L, 606L,
596L, 587L, 438L, 445L, 452L, 460L)), class = "data.frame", row.names = c(NA,
-24L))
ggplot(frames, aes(x = `L-2-X`, y = `L-2-Y`)) +
scale_x_continuous(limits = c(0,750))+
scale_y_continuous(limits = c(0,1000))+
geom_tile(aes(fill = `L-2-X`)) +
scale_fill_viridis_c(option = "B", direction = -1) +
theme_light()+
facet_grid(~ half)
Not sure about the final result you are trying to achieve. As far as I get it your code works fine. However, your tiles are simply to small for being visible. Only when I zoomed the plot some tiny tiles appeared.
Therefore I would recommend to bin the data to get a nice heatmap. As an example my code below bins the data in squares of size 25 to 25 (cm??). For the fill I simply count the number of obs per square. Another approach would be to use e.g. geom_hex which uses hexagons for the binning.
library(ggplot2)
library(dplyr)
# Bin data
frames_bin <- frames %>%
# Bin data
mutate(l_2_x = cut(`L-2-X`, breaks = seq(0, 750, 25), labels = seq(0, 725, 25), include.lowest = TRUE),
l_2_y = cut(`L-2-Y`, breaks = seq(0, 1000, 25), labels = seq(0, 975, 25), include.lowest = TRUE)) %>%
# Count number of obs per bin
count(half, l_2_x, l_2_y) %>%
# Convert factors to numeric
mutate_at(vars(l_2_x, l_2_y), ~ as.numeric(as.character(.x)))
ggplot(frames_bin) +
scale_x_continuous(limits = c(0, 750)) +
scale_y_continuous(limits = c(0, 1000)) +
geom_tile(aes(x = l_2_x, y = l_2_y, fill = n)) +
scale_fill_viridis_c(option = "B", direction = -1) +
theme_light()+
facet_grid(~ half)
# Out of the box: use geom_hex
ggplot(frames) +
scale_x_continuous(limits = c(0, 750)) +
scale_y_continuous(limits = c(0, 1000)) +
geom_hex(aes(x = `L-2-X`, y = `L-2-Y`, fill = ..ncount..)) +
scale_fill_viridis_c(option = "B", direction = -1) +
theme_light()+
facet_grid(~ half)