Graph to visualize mean group wise and pareto chart in R language - r

I have a dataset containing the regions of a country, the states in each region, and the sales in each state. I want to visualize the mean sales region-wise, and also draw a Pareto chart to see which states contribute most to the overall regional sales. How can I do this in R? Please help, as I'm new to R.
# Dataset as posted in the question: three parallel vectors (region, state, sales).
# NOTE(review): as written this does not parse - the State vector is missing
# the closing quote after TELANGANA and after KARNATAKA, and the sales vector
# calls C() instead of c().
Region <- c('South','South','South','South','South','Central','Central','Central','North','North','North','North','East','East','East','East','West','West','West','West')
State <- c('TAMIL NADU', 'TELANGANA,'ANDHRA PRADESH','KARNATAKA,'KERALA','MADHYA PRADESH','ORISSA','CHATTISGARH','DELHI','UTTARAKHAND','HARYANA','PUNJAB','ASSAM','MIZORAM','WB','BIHAR','GUJARAT','RAJASTHAN','MAHARASHTRA','GOA')
sales <- C(89,109,92,56,43,103,26,41,126,56,64,98,26,16,61,40,61,101,191,38)
The dataset somewhat looks like this
Region
State
Gdp
South
Tamil Nadu
89
South
Telangana
109
South
Karnataka
92
South
Andhra Pradesh
56
South
Kerala
43
Central
Madhya Pradesh
103
Central
Chattisgarh
26
Central
Orissa
41
North
Delhi
126
North
Punjab
56
North
Haryana
64
North
Uttarakhand
98
East
Assam
26
East
Mizoram
16
East
West Bengal
61
East
Bihar
40
West
Gujarat
61
West
Rajasthan
101
West
Maharashtra
191
West
Goa
38

You did not provide a desired output, so here is my guess at it..
library(data.table)
library(ggplot2)
# setDT(DT) #not needed if your data is already in data.table format
# Sort rows from largest to smallest Gdp (done in place, by reference)
setorderv(DT, cols = "Gdp", order = -1L)
# Per-region helper columns: the regional mean and the running total of Gdp
DT[, meanGdp_region := mean(Gdp), by = Region]
DT[, cumGdp := cumsum(Gdp), by = Region]
# Freeze the sorted row order as factor levels so the x axis keeps it
DT[, State_f := factor(State, levels = State)]
# One panel per region: bars = Gdp per state, red line = cumulative Gdp,
# blue horizontal line = regional mean
ggplot(DT, aes(x = State_f)) +
  geom_col(aes(y = Gdp)) +
  geom_line(aes(y = cumGdp, group = 1), colour = "red") +
  geom_hline(aes(yintercept = meanGdp_region), colour = "blue") +
  facet_wrap(~Region, nrow = 1, scales = "free_x") +
  xlab("") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
sample data used
# Sample data.
# Several state names contain a space (e.g. "Tamil Nadu"), so the fields are
# comma-separated and `sep` is given explicitly; with whitespace-separated
# text those names would be split across columns.
DT <- fread("Region,State,Gdp
South,Tamil Nadu,89
South,Telangana,109
South,Karnataka,92
South,Andhra Pradesh,56
South,Kerala,43
Central,Madhya Pradesh,103
Central,Chattisgarh,26
Central,Orissa,41
North,Delhi,126
North,Punjab,56
North,Haryana,64
North,Uttarakhand,98
East,Assam,26
East,Mizoram,16
East,West Bengal,61
East,Bihar,40
West,Gujarat,61
West,Rajasthan,101
West,Maharashtra,191
West,Goa,38", sep = ",")

Another output guess:
# Pareto chart over all states: bars sorted by sales, plus cumulative sales.
library(dplyr)    # %>%, arrange(), mutate()
library(forcats)  # fct_inorder()
library(ggplot2)

Region <- c('South','South','South','South','South','Central','Central','Central','North','North','North','North','East','East','East','East','West','West','West','West')
State <- c('TAMIL NADU', 'TELANGANA','ANDHRA PRADESH','KARNATAKA','KERALA','MADHYA PRADESH','ORISSA','CHATTISGARH','DELHI','UTTARAKHAND','HARYANA','PUNJAB','ASSAM','MIZORAM','WB','BIHAR','GUJARAT','RAJASTHAN','MAHARASHTRA','GOA')
sales <- c(89,109,92,56,43,103,26,41,126,56,64,98,26,16,61,40,61,101,191,38)
df <- data.frame(Region, State, sales)

df2 <- df %>%
  arrange(desc(sales)) %>%
  mutate(
    cumulative = cumsum(sales),
    # Fix the factor levels in the sorted row order. This must use the piped
    # `State` column: `df$State` is still in the original, unsorted order and
    # would attach the wrong state label to each sales value.
    State = fct_inorder(State)
  )

ggplot(df2, aes(x = State)) +
  geom_bar(aes(y = sales), fill = 'blue', stat = "identity") +
  geom_point(aes(y = cumulative), color = rgb(0, 1, 0), pch = 16, size = 1) +
  geom_path(aes(y = cumulative, group = 1), colour = "slateblue1", lty = 3, size = 0.9) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.6)) +
  # The y axis shows sales values, not counts
  labs(title = "Pareto Plot", x = 'State', y = 'Sales')

It's great that you want to explore R. I found a few mistakes: these vectors will not work as posted, because you forgot to put ' in a few places and you should use c instead of C. (In the code below I coloured the states by region, which is different from the previous answer - I hope you can choose what works for you.)
library(ggplot2)
# Pareto chart over all states with bars coloured by region.
Region <- c('South','South','South','South','South','Central','Central','Central','North','North','North','North','East','East','East','East','West','West','West','West')
State <- c('TAMIL NADU', 'TELANGANA','ANDHRA PRADESH','KARNATAKA','KERALA','MADHYA PRADESH','ORISSA','CHATTISGARH','DELHI','UTTARAKHAND','HARYANA','PUNJAB','ASSAM','MIZORAM','WB','BIHAR','GUJARAT','RAJASTHAN','MAHARASHTRA','GOA')
sales <- c(89,109,92,56,43,103,26,41,126,56,64,98,26,16,61,40,61,101,191,38)
myDf <- data.frame(Region, State, sales, stringsAsFactors = FALSE)
str(myDf)
# Sort rows by sales, largest first
myDf <- myDf[order(myDf$sales, decreasing = TRUE), ]
# Use the sorted order as factor levels so the bars are drawn in that order
myDf$State <- factor(myDf$State, levels = myDf$State)
myDf$cumulative <- cumsum(myDf$sales)
ggplot(myDf, aes(x = State)) +
  geom_bar(aes(y = sales, fill = Region), stat = "identity") +
  geom_point(aes(y = cumulative), color = rgb(0, 1, 0), pch = 16, size = 1) +
  geom_path(aes(y = cumulative, group = 1), colour = "slateblue1", lty = 3, size = 0.9) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.6)) +
  labs(title = "Pareto Plot", x = 'States', y = 'Sales')

Related

in R, ggplot geom_point() with colors based on specific, discrete values - part 2

My question is similar to this one, except that my data are different. In my case, I was not able to use the solution given. I would expect points to show up on my map coloured according to the cut() values. Could someone point me in the right direction?
> test
# A tibble: 10 × 5
TC1 TC2 Lat Long Country
<dbl> <dbl> <dbl> <dbl> <fctr>
1 2.9 2678.0 50.62980 -95.60953 Canada
2 1775.7 5639.9 -31.81889 123.19389 Australia
3 4.4 5685.6 -10.10449 38.54364 Tanzania
4 7.9 NA 54.81822 -99.91685 Canada
5 11.2 2443.0 7.71667 -7.91667 Cote d'Ivoire
6 112.1 4233.4 -17.35093 128.02609 Australia
7 4.4 114.6 45.21361 -67.31583 Canada
8 8303.5 4499.9 46.63626 -81.39866 Canada
9 100334.8 2404.5 46.67291 -93.11937 USA
10 NA 1422.9 -17.32921 31.28224 Zimbabwe
# Question's attempt: draw the rows of `test` as points over a world map,
# sized by TC2 and coloured by TC1 binned with cut().
ggplot(data = test, aes(x= Long, y= Lat)) +
borders("world", fill="gray75", colour="gray75", ylim = c(-60, 60)) +
geom_point(aes(size=TC2, col=cut(TC1, c(-Inf, 1000, 5000, 50000, Inf)))) +
# scale_colour_gradient(limits=c(100, 1000000), low="yellow", high="red") +
# NOTE: the names used in `values` below are written with plain numbers,
# while the level labels cut() produces for these breaks use scientific
# notation (e.g. (-Inf,1e+03]), so the names do not match the levels.
scale_color_manual(name = "TC1",
values = c("(-Inf,1000]" = "green",
"(1000,5000]" = "yellow",
"(5000,50000]" = "orange",
"(50000, Inf]" = "red"),
labels = c("up to 1", "1 to 5", "5 to 50", "greater than 50")) +
theme(legend.position = "right") +
coord_quickmap()
Warning message:
Removed 10 rows containing missing values (geom_point).
You were almost there! It's just the names of the 'cut' factors that are incorrect. If you try:
cut(test$TC1, c(-Inf, 1000, 5000, 50000, Inf))
# [1] (-Inf,1e+03] (1e+03,5e+03] (-Inf,1e+03] (-Inf,1e+03] (-Inf,1e+03]
# [6] (-Inf,1e+03] (-Inf,1e+03] (5e+03,5e+04] (5e+04, Inf] <NA>
# Levels: (-Inf,1e+03] (1e+03,5e+03] (5e+03,5e+04] (5e+04, Inf]
As you see the names of the levels are a bit different from what you are typing.
library(ggplot2)
# World map with one point per row of `test`: point size encodes TC2 and
# point colour encodes the TC1 bin produced by cut().
ggplot(test, aes(Long, Lat)) +
  borders("world", colour = "gray75", fill = "gray75", ylim = c(-60, 60)) +
  geom_point(
    aes(size = TC2, colour = cut(TC1, breaks = c(-Inf, 1000, 5000, 50000, Inf)))
  ) +
  # The names in `values` must equal the level labels generated by cut()
  scale_colour_manual(
    name = "TC1",
    values = c(
      "(-Inf,1e+03]" = "green",
      "(1e+03,5e+03]" = "yellow",
      "(5e+03,5e+04]" = "orange",
      "(5e+04, Inf]" = "red"
    ),
    labels = c("up to 1", "1 to 5", "5 to 50", "greater than 50")
  ) +
  coord_quickmap() +
  theme(legend.position = "right")
#> Warning: Removed 2 rows containing missing values (geom_point).
Data:
# Example data. The header has one field fewer than the data rows, so
# read.table() uses the first column (1..10) as row names.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
test <- read.table(text = 'TC1 TC2 Lat Long Country
1 2.9 2678.0 50.62980 -95.60953 Canada
2 1775.7 5639.9 -31.81889 123.19389 Australia
3 4.4 5685.6 -10.10449 38.54364 Tanzania
4 7.9 NA 54.81822 -99.91685 Canada
5 11.2 2443.0 7.71667 -7.91667 "Cote d\'Ivoire"
6 112.1 4233.4 -17.35093 128.02609 Australia
7 4.4 114.6 45.21361 -67.31583 Canada
8 8303.5 4499.9 46.63626 -81.39866 Canada
9 100334.8 2404.5 46.67291 -93.11937 USA
10 NA 1422.9 -17.32921 31.28224 Zimbabwe', header = TRUE)

Ordering a 2 bar plot in R

I have a data set as below and I have created a graph with below code as suggested in a previous question. What I want to do is order the bars by rankings rather than team names. Is that possible to do in ggplot?
Team Names PLRankingsReverse Grreserve
Liverpool 20 20
Chelsea 19 19
Manchester City 15 18
Arsenal 16 17
Tottenham 18 16
Manchester United 8 15
Everton 10 14
Watford 13 13
Burnley 17 12
Southampton 9 11
WBA 11 10
Stoke 4 9
Bournemouth 12 8
Leicester 7 7
Middlesbrough 14 6
C. Palace 6 5
West Ham 1 4
Hull 3 3
Swansea 5 2
Sunderland 2 1
And here is the code:
# Question's code: read the league table, reshape it to long format and draw
# dodged bars (one bar per ranking column for each team).
alldata <- read.csv("premierleague.csv")
library(ggplot2)
library(reshape2)
# melt() is called without explicit id variables; the plot below relies on
# the resulting `variable` and `value` columns.
alldata <- melt(alldata)
# NOTE(review): `xlab` is passed to ggplot() here rather than added as a
# label layer - presumably it has no effect on the axis title; confirm.
ggplot(alldata, aes(x = Team.Names, y= value, fill = variable), xlab="Team Names") +
geom_bar(stat="identity", width=.5, position = "dodge")
Thanks for the help!
In this case you need to sort your data frame prior to melting and capture the order. You can then use this to set the limit order on scale_x_discrete, or you can factor Team Name in your aes string.
Using factor:
# Team names ordered from highest to lowest PLRankingsReverse. This is
# captured before melting, while there is still exactly one row per team.
# The column is referred to as Team.Names, as in the question's own code.
ordr <- as.character(
  alldata$Team.Names[order(alldata$PLRankingsReverse, decreasing = TRUE)]
)
alldata <- melt(alldata, id.vars = "Team.Names")
# Passing the ordered names as factor levels makes ggplot draw the teams in
# ranking order instead of alphabetically.
ggplot(alldata, aes(x = factor(Team.Names, levels = ordr), y = value, fill = variable)) +
  labs(x = "Team Name") +
  geom_bar(stat = "identity", width = 0.5, position = "dodge")
Using scale_x_discrete:
# Team names ordered from highest to lowest PLRankingsReverse. This is
# captured before melting, while there is still exactly one row per team.
# The column is referred to as Team.Names, as in the question's own code.
ordr <- as.character(
  alldata$Team.Names[order(alldata$PLRankingsReverse, decreasing = TRUE)]
)
alldata <- melt(alldata, id.vars = "Team.Names")
ggplot(alldata, aes(x = Team.Names, y = value, fill = variable)) +
  labs(x = "Team Name") +
  geom_bar(stat = "identity", width = 0.5, position = "dodge") +
  # `limits` fixes the left-to-right order of the discrete x axis
  scale_x_discrete(limits = ordr)

How to plot data points at particular location in a map in R

I have a dataset that looks like this:
LOCALITY numbers
1 Airoli 72
2 Andheri East 286
3 Andheri west 208
4 Arya Nagar 5
5 Asalfa 7
6 Bandra East 36
7 Bandra West 72
I want to plot bubbles (bigger the number bigger would be the bubble) inside the map of mumbai for each location in dataset.
I loaded the map of mumbai using 'maps' library but now I am not sure on how to plot these in the map. Is it possible to do in R ?
I used this to load the map:
library(ggmap)
library(mapproj)
# Fetch a base map for the location 'Mumbai' at zoom level 12 and draw it.
maps <- get_map(location = 'Mumbai', zoom = 12)
ggmap(maps)
This should get you headed in the right direction, but be sure to check out the examples pointed out by @Jaap in the comments.
library(ggmap)
# Base map for the location "Mumbai" at zoom level 12.
map <- get_map(location = "Mumbai", zoom = 12)
# Localities and the value to show for each (typed in from the question).
df <- data.frame(location = c("Airoli",
"Andheri East",
"Andheri West",
"Arya Nagar",
"Asalfa",
"Bandra East",
"Bandra West"),
values = c(72, 286, 208, 5, 7, 36, 72),
stringsAsFactors = FALSE)
# Look up coordinates from the bare locality names and bind the resulting
# lon/lat columns onto the data frame.
locs_geo <- geocode(df$location)
df <- cbind(df, locs_geo)
df
# location values lon lat
# 1 Airoli 72 72.99348 19.15793
# 2 Andheri East 286 72.87270 19.11549
# 3 Andheri West 208 72.82766 19.13632
# 4 Arya Nagar 5 80.32170 26.48341
# 5 Asalfa 7 72.89514 19.10023
# 6 Bandra East 36 72.84935 19.06053
# 7 Bandra West 72 72.83625 19.06069
# NOTE(review): in the output above "Arya Nagar" comes back at lon 80.3 /
# lat 26.5, far from the other points (lon ~72.8-73.0, lat ~19.1), so it
# presumably matched a different place and will not appear on this map -
# consider making the query more specific (e.g. adding the city).
# Draw the localities on the base map; point size is mapped to `values`.
ggmap(map) +
geom_point(data = df, aes(x = lon, y = lat, size = values))

R map plot without longitude and latitude

I would like to plot the R map without the longitude and latitude values. Most of the map functions use longitude and latitude values. The only information I have is the name of the state and frequency. Please let me know how to plot the R map.
state freq
1 california 14717
2 texas 6842
3 new york 6729
4 florida 6720
5 illinois 5921
6 NA 5897
7 georgia 5008
8 ohio 4197
9 michigan 3593
10 virginia 3278
11 new jersey 3097
12 north carolina 3084
13 washington 3048
14 pennsylvania 2972
15 maryland 2821
16 missouri 2615
17 minnesota 2318
18 massachusetts 2242
19 colorado 2210
20 indiana 2078
21 arizona 1901
22 wisconsin 1842
23 oregon 1817
24 tennessee 1737
25 alabama 1679
26 connecticut 1627
27 south carolina 1122
28 nevada 1090
29 kansas 1062
30 kentucky 983
31 oklahoma 971
32 louisiana 954
33 utah 877
34 arkansas 855
35 mississippi 787
36 nebraska 674
37 idaho 599
38 new hampshire 551
39 new mexico 472
40 rhode island 435
41 hawaii 409
42 west virginia 391
43 montana 330
44 delaware 300
45 vermont 207
46 alaska 200
47 south dakota 189
48 iowa 186
49 wyoming 150
50 maine 101
51 north dakota 52
Lacking a reproducible example, I manually typed just 4 states as an illustration:
library(dplyr)
library(ggplot2)
# Four hand-typed states as an illustration of the question's data.
df <- data.frame(
  state = c("california", "texas", "nevada", "north dakota"),
  freq = c(14717, 6842, 1090, 52),
  stringsAsFactors = FALSE
)
# Attach coordinates from the built-in `state.center` data by matching on
# the lower-cased state name; only states present in `df` are kept.
state_level_df <- data.frame(
  state = tolower(state.name),
  long = state.center$x,
  lat = state.center$y,
  stringsAsFactors = FALSE
) %>%
  inner_join(df, by = "state")
# One point per state on a US outline; colour and size both encode `freq`.
ggplot(state_level_df, aes(long, lat)) +
  borders("state") +
  # `show.legend` replaces the deprecated `show_guide` argument
  geom_point(aes(color = freq, size = freq), show.legend = FALSE) +
  scale_size(range = c(2, 20)) +
  scale_color_continuous(low = "red", high = "green") +
  # The complete theme goes first: adding theme_bw() after theme() would
  # discard the text-size setting.
  theme_bw() +
  theme(text = element_text(size = 18))
which gives me this:
Your full data frame df should work as well.
Here is a partial choropleth, using the partial data frame contributed by @akhmed.
# Partial choropleth: fill the four example states by `freq`.
library(ggplot2)   # ggplot(), map_data(), geom_polygon(), coord_map()
library(maps)
library(ggthemes)  # theme_tufte()

shown_states <- c("california", "texas", "nevada", "north dakota")
df <- data.frame(
  state = c(shown_states, rep("NA", 47)),
  freq = c(14717, 6842, 1090, 52, rep(0, 47)),
  stringsAsFactors = FALSE
)
# Polygon outlines for the four states only
states_map <- map_data("state", region = shown_states)
new_map <- merge(states_map, df, by.x = "region", by.y = "state")
# Sort the polygon vertices back into drawing order after the merge
# (base R ordering, so no extra package is needed for this step)
new_map <- new_map[order(new_map$group, new_map$order), ]
ggplot(new_map, aes(x = long, y = lat, group = group, fill = freq)) +
  geom_polygon(color = "black") +
  coord_map("polyconic") +
  theme_tufte() +
  labs(x = "", y = "")
You can revise the color scheme with scale_fill_gradient2, for example.
This is the code that Deepayan Sarkar offered in his book "Lattice: Multivariate Data Visualization with R" to plot a pseudo-3D barplot with the continental US states as the x-y locations for the bars. You should be able to substitute the 'density' values with the values in your dataset. You will probably need to remove the exclusion of AK and HI.
# Per-state table built from the built-in state data: name, centre
# coordinates, area and population (the Population column multiplied by 1000).
# NOTE(review): cloud(), panel.lines(), ltransform3dto3d() and
# panel.3dscatter() presumably come from the lattice package, which is not
# attached in this snippet - confirm and add library(lattice) if needed.
state.info <- data.frame(name = state.name, long = state.center$x, lat = state.center$y,
area = state.x77[, "Area"],
population = 1000 * state.x77[, "Population"])
# Density = population / area; this is the value drawn as bar height below.
state.info$density <- with(state.info, population / area)
library("maps")
# State outline data, obtained without plotting; its x, y and range
# components are used below.
state.map <- map("state", plot=FALSE, fill = FALSE)
# Panel function that draws the state outlines inside the 3-D panel.
# scaled.val() linearly maps a value from the `original` limits onto the
# `scaled` limits. The outline x/y are rescaled that way, placed at
# zlim.scaled[1], transformed with ltransform3dto3d() using rot.mat and
# distance, and drawn as grey lines.
panel.3dmap <- function(..., rot.mat, distance, xlim, ylim, zlim, xlim.scaled,
ylim.scaled, zlim.scaled) { scaled.val <- function(x, original, scaled) {
scaled[1] + (x - original[1]) * diff(scaled) / diff(original) }
m <- ltransform3dto3d(rbind(scaled.val(state.map$x, xlim, xlim.scaled),
scaled.val(state.map$y, ylim, ylim.scaled), zlim.scaled[1]), rot.mat, distance)
panel.lines(m[1,], m[2,], col = "grey76") }
# 3-D plot of density against long/lat with Alaska and Hawaii excluded.
# The custom panel.3d.cloud first draws the map outline, then the data
# (type = "h"); axes and the 3-D box are made transparent via par.settings.
cloud(density ~ long + lat, state.info, subset = !(name %in% c("Alaska", "Hawaii")),
panel.3d.cloud = function(...) { panel.3dmap(...)
panel.3dscatter(...) },
type = "h", scales = list(draw = FALSE), zoom = 1.1, xlim = state.map$range[1:2],
ylim = state.map$range[3:4], xlab = NULL, ylab = NULL, zlab = NULL,
aspect = c(diff(state.map$range[3:4]) / diff(state.map$range[1:2]), 0.3),
panel.aspect = 0.75, lwd = 2, screen = list(z = 30, x = -60),
par.settings = list(axis.line = list(col = "transparent"),
box.3d = list(col = "transparent", alpha = 0)))
Here is a plotly alternative using some of the techniques from previous respondents:
library(plotly)
library(dplyr)  # inner_join() is used below

# create df but taking a subset of original poster's data
df <- data.frame(
  state = c("california", "texas", "nevada", "north dakota", rep("NA", 47)),
  freq = c(14717, 6842, 1090, 52, rep(0, 47)),
  stringsAsFactors = FALSE
)
# generate location information for all states (using built-in data);
# the join key is named explicitly and abbreviations are kept as character
state.info <- inner_join(
  data.frame(
    state = tolower(state.name),
    long = state.center$x,
    lat = state.center$y,
    stringsAsFactors = FALSE
  ),
  data.frame(
    state = tolower(datasets::state.name),
    abbrev = datasets::state.abb,
    stringsAsFactors = FALSE
  ),
  by = "state"
)
# join the test data to the states location info
map.df <- inner_join(state.info, df, by = "state")
# set up plotly to zoom in to US only
g <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white')
)
# plot on the US map
plot_ly(map.df, type = 'choropleth', locationmode = 'USA-states',
        locations = map.df$abbrev, z = map.df$freq, text = map.df$state) %>%
  layout(geo = g, title = 'Frequency by State')
This produces:

Drawing colored US State map with cut_number() in R

I have a dataframe called "drawdata":
GeoName Ranking
1 Alabama 15
2 Alaska 2
3 Arizona 28
4 Arkansas 12
5 California 19
6 Colorado 7
7 Connecticut 42
8 Delaware 37
9 District of Columbia 9
10 Florida 38
11 Georgia 11
12 Hawaii 48
13 Idaho 10
14 Illinois 16
15 Indiana 26
16 Iowa 34
17 Kansas 27
18 Kentucky 20
19 Louisiana 4
20 Maine 51
21 Maryland 30
22 Massachusetts 39
23 Michigan 14
24 Minnesota 23
25 Mississippi 41
26 Missouri 32
27 Montana 25
28 Nebraska 21
29 Nevada 45
30 New Hampshire 47
31 New Jersey 33
32 New Mexico 5
33 New York 44
34 North Carolina 13
35 North Dakota 31
36 Ohio 35
37 Oklahoma 6
38 Oregon 18
39 Pennsylvania 40
40 Rhode Island 49
41 South Carolina 29
42 South Dakota 46
43 Tennessee 43
44 Texas 3
45 Utah 17
46 Vermont 50
47 Virginia 8
48 Washington 24
49 West Virginia 22
50 Wisconsin 36
51 Wyoming 1
And I want to draw a US State map with different colors for each ranking. The code I have is:
# Question's code: colour each state polygon by its ranking, binned into
# `num` groups with cut_number().
names(drawdata) = c('region','value')
# Lower-case the state names before merging with the map data on "region".
drawdata[,1] = tolower(drawdata[,1])
states = data.frame(state.center, state.abb)
states_map = map_data("state")
df = merge(drawdata, states_map, by = "region")
# Number of bins, stored as a constant column and read back as num[1] below.
df$num = 49
p1 = ggplot(data = df, aes(x = long, y = lat, group = group))
p1 = p1 + geom_polygon(aes(fill = cut_number(value, num[1])))
# NOTE(review): `linestyle` is presumably meant to be `linetype` - confirm.
p1 = p1 + geom_path(colour = 'gray', linestyle = 2)
p1 = p1 + scale_fill_brewer('', palette = 'PuRd')
p1 = p1 + coord_map()
p1 = p1 + scale_x_continuous(breaks=NULL) + scale_y_continuous(breaks=NULL)
p1 = p1 + theme(legend.position="none")
# State abbreviations placed at the state.center coordinates.
p1 = p1 + geom_text(data = states, aes(x = x, y = y, label = state.abb, group = NULL), size = 2)
p1
This perfectly works if 'num', or the number of colors to fill, is small. However, when I set 'num=49', then it produces an error:
Error in cut.default(x, breaks(x, "n", n), include.lowest = TRUE, ...) :
'breaks' are not unique
When I alter the code from
p1 = p1 + geom_polygon(aes(fill = cut_number(value, num[1])))
to
p1 = p1 + geom_polygon(aes(fill = cut_number(unique(value), num[1])))
then it gives me a different error:
Error: Aesthetics must either be length one, or the same length as the data
Problems:cut_number(unique(value), num[1])
I want a map where each of the 49 states has a different color, reflecting its 'Ranking'. Any help is much appreciated!
Brewer palettes deliberately have small maximums (generally < 12) since it's pretty much impossible for humans to map the subtle differences to the discrete values you have. You can achieve what you're looking for by "faking" it with scale_fill_gradient2 (NOTE: I deliberately left the legend in as you should too):
library(ggplot2)
# Choropleth of state rankings on a continuous three-colour gradient.
names(drawdata) <- c('region','value')
drawdata[,1] <- tolower(drawdata[,1])
states <- data.frame(state.center, state.abb)
states <- states[!(states$state.abb %in% c("AK", "HI")),] # they aren't part of states_map
states_map <- map_data("state")
p1 <- ggplot()
# borders
p1 <- p1 + geom_map(data=states_map, map=states_map,
                    aes(x=long, y=lat, map_id=region),
                    color="white", size=0.15)
# fills
p1 <- p1 + geom_map(data=drawdata, map=states_map,
                    aes(fill=value, map_id=region),
                    color="white", size=0.15)
# labels
p1 <- p1 + geom_text(data=states,
                     aes(x=x, y=y, label=state.abb, group=NULL), size=2)
# decent projection
p1 <- p1 + coord_map("albers", lat0=39, lat1=45)
# Centre the diverging scale on the middle of the observed values. With the
# default midpoint of 0 and strictly positive rankings, only the mid-to-high
# half of the gradient would be used and the `low` colour would never appear.
p1 <- p1 + scale_fill_gradient2(low="#f7f4f9", mid="#df65b0", high="#67001f",
                                midpoint=mean(range(drawdata$value)))
# better theme
p1 <- p1 + labs(x=NULL, y=NULL)
p1 <- p1 + theme_bw()
p1 <- p1 + theme(panel.grid=element_blank(),
                 panel.border=element_blank(),
                 axis.ticks=element_blank(),
                 axis.text=element_blank())
p1
You can get an even better result with scale_fill_distiller, which does a lot behind the scenes to let you use a Color Brewer palette with continuous data (I'd argue you do not have continuous data, though):
p1 <- p1 + scale_fill_distiller(palette="PuRd")
I'd strongly suggest continuing to use cut like you had originally and having a max of 9 breaks to fit into the Color Brewer palette you're trying to work with. In reality, folks are still going to need a table to really grok the rankings (never assume Americans know either state shapes, locations or even the two-letter abbreviations for them), so I'd also pretty much just suggest using an actual table with full names at least with this choropleth if not in place of it.
Note also that the way you're trying to build the map deliberately excluded Alaska, Hawaii and the District of Columbia. You'll need to use a real shapefile and something like I cover here to get them to show up nicely.
If you want different colors for each state, using a gradient, you can work with scale_fill_gradient. Here is one version, using green and red at the ends of the gradient, so that each state is on that scale.
# `df` is the result of merging the rankings with the map polygons; merge()
# can reorder rows, so restore the vertex drawing order before plotting.
df <- df[order(df$group, df$order), ]
# Fill every state on a continuous green-to-red gradient of its ranking.
ggplot(data = df, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = value)) +
  # `linetype` is the parameter for dashed borders (`linestyle` is not one)
  geom_path(colour = 'gray', linetype = 2) +
  scale_fill_gradient(low = "green", high = "red") +
  coord_map() +
  scale_x_continuous(breaks = NULL) + scale_y_continuous(breaks = NULL) +
  theme(legend.position = "none") +
  geom_text(data = states, aes(x = x, y = y, label = state.abb, group = NULL), size = 2)

Resources