I have a data frame with the following structure (dput output below):
I want to create two bar plots, with two facets (Sin and T), with time on the X-axis and the A, B, C, D, and E columns on the Y-axis (the columns can be stacked or not).
How can I do that?
Thanks in advance.
Something like this?
library(tidyverse)

df %>%
  pivot_longer(-c(COND, Time)) %>%
  ggplot(aes(x = factor(Time), y = value, fill = name)) +
  geom_col(position = position_dodge()) +
  facet_wrap(. ~ COND) +
  xlab("Time")
data:
df <- structure(list(COND = c("Sin", "Sin", "Sin", "Sin", "T", "T",
"T", "T"), Time = c(0L, 1L, 6L, 8L, 0L, 1L, 6L, 8L), A = c(54L,
202L, 155L, 202L, 244L, 321L, 149L, 155L), B = c(1536L, 732L,
2577L, 1321L, 1744L, 1952L, 3857L, 1780L), C = c(34018L, 80476L,
4173L, 119L, 33851L, 56320L, 2494L, 696L), D = c(10458L, 33655L,
357L, 452L, 10869L, 30667L, 1839L, 3315L), E = c(3500L, 1904L,
0L, 0L, 3035L, 2839L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-8L))
I'm analysing real-estate sales for some North American cities and am using k-means clustering on the data. I have seven clusters, and for each observation in a cluster I have the latitude, longitude, zipcode, and cluster_id. I'd like to plot this on a map to better visualize the clusters; I'm not sure what such a plot is called (choropleth? polygon?).
Most of the examples I've found use geoJSON files, but I only have a data.frame object from my k-means clustering.
Actual data:
https://www.kaggle.com/threnjen/portland-housing-prices-sales-jul-2020-jul-2021
Sample data:
> dput(dt[runif(n = 10,min = 1,max = 25000)])
structure(list(id = c(23126L, 15434L, 5035L, 19573L, NA, 24486L,
NA, 14507L, 3533L, 20192L), zipcode = c(97224L, 97211L, 97221L,
97027L, NA, 97078L, NA, 97215L, 97124L, 97045L), latitude = c(45.40525436,
45.55965805, 45.4983139, 45.39398956, NA, 45.47454071, NA, 45.50736618,
45.52812958, 45.34381485), longitude = c(-122.7599182, -122.6500015,
-122.7288742, -122.591217, NA, -122.8898392, NA, -122.6084061,
-122.91745, -122.5948334), lastSoldPrice = c(469900L, 599000L,
2280000L, 555000L, NA, 370000L, NA, 605000L, 474900L, 300000L
), lotSize = c(5227L, 4791L, 64904L, 9147L, NA, 2178L, NA, 4356L,
2613L, 6969L), livingArea = c(1832L, 2935L, 5785L, 2812L, NA,
1667L, NA, 2862L, 1844L, 742L), cluster_id = c(7, 7, 2, 7, NA,
4, NA, 7, 7, 4)), row.names = c(NA, -10L), class = c("data.table",
"data.frame"))
I've followed the example at https://gist.github.com/josecarlosgonz/8565908 to try to create a geoJSON file for plotting this data, but without success.
I'm not using markers because I have ~25,000 observations; it would be difficult to plot them all, and the file would take forever to load.
EDIT:
observations by zipcode:
> dput(dat[, .N, by = .(`address/zipcode`)][(order(`address/zipcode`))])
structure(list(`address/zipcode` = c(7123L, 97003L, 97004L, 97005L,
97006L, 97007L, 97008L, 97009L, 97015L, 97019L, 97023L, 97024L,
97027L, 97030L, 97034L, 97035L, 97038L, 97045L, 97056L, 97060L,
97062L, 97068L, 97070L, 97078L, 97080L, 97086L, 97089L, 97113L,
97123L, 97124L, 97132L, 97140L, 97201L, 97202L, 97203L, 97204L,
97205L, 97206L, 97209L, 97210L, 97211L, 97212L, 97213L, 97214L,
97215L, 97216L, 97217L, 97218L, 97219L, 97220L, 97221L, 97222L,
97223L, 97224L, 97225L, 97227L, 97229L, 97230L, 97231L, 97232L,
97233L, 97236L, 97239L, 97266L, 97267L), N = c(1L, 352L, 9L,
252L, 421L, 1077L, 357L, 1L, 31L, 2L, 4L, 159L, 239L, 525L, 640L,
548L, 1L, 1064L, 5L, 353L, 471L, 736L, 6L, 403L, 866L, 913L,
8L, 5L, 1113L, 776L, 3L, 543L, 219L, 684L, 463L, 1L, 57L, 809L,
189L, 216L, 688L, 510L, 504L, 330L, 318L, 177L, 734L, 195L, 832L,
305L, 276L, 589L, 688L, 716L, 286L, 83L, 1307L, 475L, 77L, 150L,
382L, 444L, 290L, 423L, 430L)), row.names = c(NA, -65L), class = c("data.table",
"data.frame"))
I used the Kaggle data on a simple laptop (i3 8th gen) to generate a ggplot2 object, with cluster IDs randomly sampled, and transformed it via the ggplotly() function. The resulting plotly object seems OK to work with for analysis, but I do not know your performance requirements:
library(dplyr)
library(ggplot2)
library(plotly)
library(rnaturalearth) # source of the basic map data

# read in data from zip, select a minimal number of columns, and sample cluster_id
df <- readr::read_csv(unzip("path_to_zip/portland_housing.csv.zip")) %>%
  dplyr::select(az = `address/zipcode`, latitude, longitude) %>%
  dplyr::mutate(cluster_id = sample(1:7, n(), replace = TRUE))

# get the map data
world <- rnaturalearth::ne_countries(scale = "medium", returnclass = "sf")

# build the ggplot2 object (rings as shapes and an alpha parameter reduce the overplotting)
plt <- ggplot2::ggplot(data = world) +
  ggplot2::geom_sf() +
  ggplot2::geom_point(data = df,
                      aes(x = longitude, y = latitude, color = factor(cluster_id)),
                      size = 1, shape = 21, alpha = .7) +
  ggplot2::coord_sf(xlim = c(-124.5, -122), ylim = c(45, 46), expand = FALSE)

# plot it:
plt

# plotly auto transform from the ggplot2 object
plotly::ggplotly(plt)
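If rendering ~25,000 points feels sluggish, one option (not benchmarked on this data) is to let plotly switch the scatter traces to WebGL:

# render the points via WebGL instead of SVG for better performance on large scatters
plotly::toWebGL(plotly::ggplotly(plt))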
EDIT
To include a map you can use, for example, the ggmap package instead of the map data from rnaturalearth... I will only display the plotly result:
library(ggmap)

# https://stackoverflow.com/questions/23130604/plot-coordinates-on-map
sbbox <- ggmap::make_bbox(lon = c(-124.5, -122), lat = c(45, 46), f = .1)
myarea <- ggmap::get_map(location = sbbox, zoom = 10, maptype = "terrain")
myarea <- ggmap::ggmap(myarea)

plt2 <- myarea +
  ggplot2::geom_point(data = df,
                      mapping = aes(x = longitude, y = latitude, color = factor(cluster_id)),
                      shape = 21, alpha = .7)

plotly::ggplotly(plt2)
There are many other approaches to the map data, for example using the Mapbox API.
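For completeness, a minimal sketch of a tile-based alternative with plotly itself (assuming the same df as above; the "open-street-map" style needs no Mapbox token):

# plot the clusters directly on a map tile layer via a scattermapbox trace
plotly::plot_ly(
  data = df,
  lat = ~latitude, lon = ~longitude,
  color = ~factor(cluster_id),
  type = "scattermapbox", mode = "markers",
  marker = list(size = 4, opacity = 0.7)
) %>%
  plotly::layout(mapbox = list(style = "open-street-map",
                               zoom = 8,
                               center = list(lon = -122.7, lat = 45.5)))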
Say I have these datasets:
df1=
structure(list(date = c("17.02.2021", "04.11.2020", "14.11.2020",
"24.11.2020", "29.11.2020", "04.12.2020", "09.12.2020"), x1 = c(0L,
0L, 7L, 0L, 0L, 0L, 0L), x2 = c(674L, 632L, 1036L, 656L, 736L,
762L, 698L), x3 = c(698L, 712L, 1140L, 704L, 784L, 786L, 722L
), x4 = c(522L, 472L, 988L, 464L, 608L, 578L, 514L), x5 = c(2408L,
3256L, 2840L, 2840L, 2888L, 2632L, 2648L), x6 = c(1952L, 2336L,
2480L, 2208L, 2208L, 2144L, 2016L), x7 = c(1056L, 1120L, 1504L,
1056L, 1184L, 1184L, 1120L), x8 = c(1984L, 2464L, 2400L, 2144L,
2208L, 2144L, 2080L), x9 = c(2336L, 2976L, 2784L, 2464L, 2784L,
2528L, 2400L), x10 = c(2528L, 3232L, 3104L, 2848L, 2912L, 2592L,
2656L), x11 = c(1248L, 1312L, 1504L, 1312L, 1312L, 1312L, 1248L
)), class = "data.frame", row.names = c(NA, -7L))
Each row is a date; the values x1–x11 in a row form that day's data profile (one profile for the first day, another for the second day, and so on).
Here is the reference dataset:
df2=structure(list(date = c("06.11.2019", "01.12.2019", "25.01.2020",
"04.02.2020", "09.02.2020", "14.02.2020"), x1 = c(12L, 0L, 1L,
6L, 23L, 1L), x2 = c(1272L, 1046L, 688L, 572L, 592L, 328L), x3 = c(1032L,
974L, 736L, 780L, 800L, 568L), x4 = c(792L, 862L, 496L, 476L,
592L, 296L), x5 = c(2232L, 1496L, 1784L, 2792L, 3064L, 3544L),
x6 = c(2976L, 1904L, 1632L, 1760L, 1376L, 1440L), x7 = c(1568L,
1248L, 1008L, 1120L, 992L, 800L), x8 = c(1888L, 1376L, 1632L,
2400L, 2464L, 2720L), x9 = c(2080L, 1504L, 1760L, 2848L,
2912L, 3296L), x10 = c(2400L, 1552L, 1824L, 2848L, 2928L,
3360L), x11 = c(2400L, 1504L, 1120L, 1040L, 784L, 736L)), class = "data.frame", row.names = c(NA,
-6L))
Is there a way or method to compare the profile of each row in df1 with the reference dataset df2, giving 1 if the profile is similar and 0 otherwise?
The dates in the two datasets can differ; the main problem is to detect whether the profiles are similar or not.
EDIT: that 0/1 flag is my desired output. Peter's code is good, but is it possible to also calculate the difference between profiles by variable, for example?
This code allows you to visually compare the reference and df1 profiles. As you can see, none of the profiles match exactly. Some profiles are similar, but without a definition of "similar", as pointed out by @user2974951, it's difficult to move this closer to an answer.
library(dplyr)
library(tidyr)
library(ggplot2)

# restructure the data to allow comparison between the datasets
df <-
  expand.grid("date_ref" = df2$date, "date_df1" = df1$date) %>%
  left_join(df2, by = c("date_ref" = "date")) %>%
  left_join(df1, by = c("date_df1" = "date")) %>%
  pivot_longer(starts_with("x"), names_to = c("var", "df"), names_sep = "\\.") %>%
  mutate(df = if_else(df == "x", "ref", "df1"),
         var = factor(var, paste0("x", 1:11)))

# now you can plot the data to compare profiles; had to add some formatting to make the graph readable
ggplot(df, aes(var, value, group = df, colour = df)) +
  geom_line() +
  facet_grid(date_ref ~ date_df1) +
  labs(colour = "Dataset") +
  theme_classic() +
  theme(legend.position = "bottom",
        axis.text.x = element_text(size = 6, angle = 90),
        axis.text.y = element_text(size = 6),
        strip.text = element_text(size = 6))
Created on 2021-04-07 by the reprex package (v1.0.0)
What you need to define first is what criterion of similarity you want to use and what your threshold level of similarity is (how similar the datasets need to be to be considered equivalent). Another important factor is the nature of your data, for example whether you consider x1..x11 to be independent variables or just different samples of the same set.
Depending on the answers, it can be anything from comparing each df1[i, 2:12] to df2[i, 2:12] exactly (to see whether they are duplicates or not) to comparing both of them to NA and checking whether they are both NA or both a known value. Something in between would be checking that the difference in each parameter for each line of the datasets is not greater than, say, 0.05 of the minimal value, and marking the line equivalent if all parameters pass. Another option is something like Pearson's correlation coefficient (the cor(x, y) function computes it by default) for each line, comparing its value to, say, 0.5 (both 0.05 and 0.5 are just arbitrary numbers, of course, and probably need to be adjusted somewhat). Or maybe the number of matching points (compared exactly as integers, or just similar to some degree) is a better indication for you.
There are also known standard tests for sample group dissimilarity, time-series dissimilarity, and other statistical hypotheses. Many of them are available in R from the bundled packages, and if you fancy something else, it is most likely already available in one of the extra packages you can easily download and install.
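For instance, a minimal sketch of the correlation route (the 0.5 cutoff is arbitrary, as noted above):

# flag each df1 row with 1 if it correlates strongly with at least one df2 profile
vars <- paste0("x", 1:11)
cors <- sapply(seq_len(nrow(df2)), function(j)
  sapply(seq_len(nrow(df1)), function(i)
    cor(unlist(df1[i, vars]), unlist(df2[j, vars]))))
df1$similar <- as.integer(apply(cors, 1, max) > 0.5)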
I am trying to add R2 values to scatter plots for several datasets, while also using facet_grid. So I want to add several text labels (the R2 values, one for each dataset) to each plot. I have been looking at similar examples, but I couldn't get the right approach because I don't know how to set the x and y positions for the text.
This is a very short sample of my original data:
dput(test)
structure(list(code = c("AT0ENK1", "AT0ENK1", "AT0ENK1", "AT0ENK1",
"AT0ENK1", "AT0ENK1", "AT0ENK1", "AT0ENK1", "AT0ILL1", "AT0ILL1",
"AT0ILL1", "AT0ILL1", "AT0ILL1", "AT0ILL1", "AT0ILL1", "AT0ILL1"
), model = structure(c(2L, 2L, 2L, 2L, 6L, 6L, 6L, 6L, 2L, 2L,
2L, 2L, 6L, 6L, 6L, 6L), .Label = c("Obs", "EMEP", "LOTO", "MATCH",
"MINNI", "WRFF", "WRFM"), class = "factor"), O3 = c(118.037246704102,
105.963432312012, 102.795967102051, 107.245376586914,
101.879364013672,
124.914794921875, 129.386352539062, 115.475601196289,
96.2464294433594,
113.553771972656, 108.113143920898, 95.6128845214844,
104.497161865234,
111.243560791016, 121.166435241699, 118.756866455078), O3obs =
c(144.424,
151.726, 151.866, 139.439, 144.424, 151.726, 151.866, 139.439,
164.202, 171.715, 158.06, 137.473, 164.202, 171.715, 158.06,
137.473), r2 = c(0.485277006453918, 0.485277006453918,
0.485277006453918,
0.485277006453918, 0.277829662775301, 0.277829662775301,
0.277829662775301,
0.277829662775301, 0.0429530296631768, 0.0429530296631768,
0.0429530296631768,
0.0429530296631768, 0.0332266668960316, 0.0332266668960316,
0.0332266668960316,
0.0332266668960316)), .Names = c("code", "model", "O3", "O3obs",
"r2"), class = "data.frame", row.names = c(1L, 2L, 3L, 4L, 125L,
126L, 127L, 128L, 187L, 188L, 189L, 190L, 311L, 312L, 313L, 314L
))
And I tried it with:
ggplot(test, aes(O3obs, O3, group = model)) +
  geom_point(aes(color = model), size = 1) + xlim(0, 200) + ylim(0, 200) +
  geom_abline(intercept = 0, slope = 1) + facet_wrap(~code) +
  geom_text(data = test,
            aes(color = model, label = paste("R2: ", round(r2, 2), sep = "")),
            x = 180, y = Inf, show.legend = F)
But the R2 values overlap.
Any suggestions? How can I add the R2 values for each dataset in each plot?
When you specify x and y in geom_text, you are assigning the same coordinates to all the text labels, so it makes sense that they overlap. I usually get around this by creating a data frame that has x and y coordinates for each group. For your data this could look like:
require(dplyr)
require(ggplot2)

# one row per code/model combination, with manually chosen label positions
new_data = test %>% group_by(code, model) %>% summarise(r2 = max(r2))
new_data$xposition = 40
new_data$yposition = c(200, 170, 200, 170)

ggplot(test, aes(O3obs, O3, group = model)) +
  geom_point(aes(color = model), size = 1) + xlim(0, 200) + ylim(0, 200) +
  geom_abline(intercept = 0, slope = 1) + facet_wrap(~code) +
  geom_text(data = new_data,
            aes(x = xposition, y = yposition, color = model,
                label = paste("R2: ", round(r2, 2), sep = "")),
            show.legend = F)
I have a lookup table in R that I am trying to figure out how to implement. The challenge is that it involves continuous values, i.e. ranges of data; if a value falls in between the bounds of a range, I'd like the lookup to pick the right row.
I want to use the two continuous variables, 'GRADE' and 'SAT', plus the categorical 'TYPE' value, to assign a 'GROUP' value. This big block of code looks intimidating, but these are tiny tables.
Any advice is appreciated!
#lookup table code for recreating dataframe
structure(list(Type = structure(c(1L, 2L, 1L, 1L), .Label = c("A",
"B"), class = "factor"), min_grade = c(93L, 85L, 93L, 80L), max_grade = c(100L,
93L, 100L, 92L), min_sat = c(600L, 700L, 400L, 600L), max_sat = c(800L,
800L, 599L, 800L), Group = structure(c(1L, 1L, 2L, 3L), .Label = c("A",
"B", "C"), class = "factor")), .Names = c("Type", "min_grade",
"max_grade", "min_sat", "max_sat", "Group"), class = "data.frame", row.names = c(NA,
-4L))
#example ----- the desired value is in the 'GROUP' column, so this would be NULL before I used the lookup table
structure(list(Name = structure(c(3L, 1L, 2L, 4L), .Label = c("Jack",
"James", "John", "Jordan"), class = "factor"), Grade = c(95L,
95L, 92L, 93L), Sat = c(701L, 500L, 800L, 800L), Type = structure(c(1L,
1L, 1L, 2L), .Label = c("A", "B"), class = "factor"), Group = structure(c(1L,
2L, 3L, 1L), .Label = c("A", "B", "C"), class = "factor")), .Names = c("Name",
"Grade", "Sat", "Type", "Group"), class = "data.frame", row.names = c(NA,
-4L))
How about this?
ltab <- structure(list(Type = structure(c(1L, 2L, 1L, 1L), .Label = c("A",
"B"), class = "factor"), min_grade = c(93L, 85L, 93L, 80L), max_grade = c(100L,
93L, 100L, 92L), min_sat = c(600L, 700L, 400L, 600L), max_sat = c(800L,
800L, 599L, 800L), Group = structure(c(1L, 1L, 2L, 3L), .Label = c("A",
"B", "C"), class = "factor")), .Names = c("Type", "min_grade",
"max_grade", "min_sat", "max_sat", "Group"), class = "data.frame", row.names = c(NA,
-4L))
dat <- structure(list(Name = structure(c(3L, 1L, 2L, 4L), .Label = c("Jack",
"James", "John", "Jordan"), class = "factor"), Grade = c(95L,
95L, 92L, 93L), Sat = c(701L, 500L, 800L, 800L), Type = structure(c(1L,
1L, 1L, 2L), .Label = c("A", "B"), class = "factor")), .Names = c("Name",
"Grade", "Sat", "Type"), class = "data.frame", row.names = c(NA,
-4L))
library(plyr)

# merge every lookup row onto each observation by Type, then flag the rows
# whose Grade and Sat fall inside the lookup bounds
mdat <- adply(merge(dat, ltab, by = "Type", all = T), 1, function(x) {
  c(FallsIn = x$Grade > x$min_grade & x$Grade <= x$max_grade &
      x$Sat > x$min_sat & x$Sat <= x$max_sat)
})
mdat[mdat$FallsIn, ]
Thinking about generalizing: are there going to be more continuous variables that you need to check?
EDIT: I could not edit the OP's post, so taking the OP's comment into account, here is how I would tackle an example of "categorizing multidimensional continuous random variables" (so that these keywords will flag up in future searches):
breaks <- list(Var1 = c(0, 0.25, 1),
               Var2 = c(0, 0.5, 1),
               Var3 = c(0, 0.25, 0.75, 1))

# generate the interval labels on the fly
genIntv <- function(x) {
  ret <- paste0("(", x[1:(length(x)-1)], ", ", x[2:length(x)], "]")
  names(ret) <- 1:(length(x)-1)
  ret
}

lookupTbl <- data.frame(expand.grid(lapply(breaks, genIntv), stringsAsFactors = F),
                        Group = LETTERS[1:12])
lookupTbl2 <- data.frame(expand.grid(lapply(breaks, function(x) 1:(length(x)-1)), stringsAsFactors = F),
                         Group = LETTERS[1:12])

# data set
dat <- data.frame(Var1 = c(0.1, 0.76), Var2 = c(0.5, 0.75), Var3 = c(0.25, 0.9))

# bin each variable, then look up the group for the resulting bin combination
binDat <- do.call(cbind, setNames(lapply(1:ncol(dat), function(k)
  .bincode(dat[, k], breaks[[k]], T, T)), colnames(dat)))
merge(binDat, lookupTbl2, all.x = T, all.y = F)
It would be good to learn if someone else has better approaches.
If you have small data, a full join should be fine.
library(dplyr)

result =
  example %>%
  select(-Type) %>%
  full_join(look_up, by = character()) %>% # no common columns left, so this is a cross join
  filter(min_grade < Grade & Grade <= max_grade &
           min_sat < Sat & Sat <= max_sat)
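For larger data, a non-equi join avoids materializing the full cross product. A sketch with data.table, using the dat and ltab names from the earlier answer:

library(data.table)
setDT(dat)
setDT(ltab)

# update join: assign Group from the lookup row whose Type matches and whose
# grade and SAT ranges contain the observation
dat[ltab, Group := i.Group,
    on = .(Type, Grade > min_grade, Grade <= max_grade,
           Sat > min_sat, Sat <= max_sat)]
dat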