R - WordCloud2 does not always render the most frequent words

R - WordCloud2 does not always render the most frequent words - r

I use the wordcloud2 package to render word clouds. It seems that wordcloud2 does not always display the most frequent words.
I said "not always" because the problem is not permanent. It seems that the results are mostly random.
Code :
library(wordcloud2)
library(htmlwidgets)
# Count how often each answer occurs in the first column (table() skips NA).
DataCloud <- as.character(DataTextAnalysis[, 1])
DataCloud <- as.data.frame(table(DataCloud))
# Most frequent answers first.
DataCloud <- DataCloud[order(DataCloud$Freq, decreasing = TRUE), ]
# Keep at most the ten most frequent answers. head() returns fewer rows when
# fewer than ten distinct answers exist, whereas DataCloud[1:10, ] would pad
# the result with all-NA rows.
DataCloud <- head(DataCloud, 10)
wordcloud2(data = DataCloud)
Data :
structure(list(`Theme 1` = structure(c(12L, NA, 2L, 4L, 6L, 7L,
NA, 14L, 6L, 6L, 2L, 7L, 5L, 2L, 2L, 2L, 11L, 12L, 2L, 2L, 10L,
NA, 12L, NA, 2L, 13L, 15L, NA, NA, 10L, NA, 1L, 2L, 16L, 6L,
1L, 7L, 9L, 15L, 3L, 1L, 2L, 2L, 2L, 17L, 2L, 17L, 7L, 3L, 2L,
2L, 8L, 6L), .Label = c("Ambiance", "Autonomie", "Changement régulier de hiérarchie",
"Côté familial", "Défi", "Diversité des tâches", "Faire du bon travail",
"Gérer l humain", "Gestion de projets", "Horaires", "Réglage du finisseur",
"Relation client", "Rencontrer de nouvelles équipes", "Responsabilité",
"Technicité", "Travailler avec la hiérachie", "Travailler en binôme"
), class = "factor"), `Theme 2` = structure(c(NA, NA, 13L, 1L,
14L, NA, NA, 4L, 15L, 14L, 10L, 8L, 8L, 5L, 15L, 4L, 13L, 8L,
6L, NA, 3L, NA, 3L, NA, 11L, 5L, 5L, NA, NA, 9L, NA, 16L, 1L,
7L, 8L, 5L, 19L, 2L, 8L, 11L, 5L, 13L, 11L, 11L, 19L, 5L, 19L,
12L, 11L, 8L, 18L, 17L, 4L), .Label = c("Ambiance", "Amélioration",
"Autonomie", "Confiance", "Diversité des tâches", "Être écouté",
"Evolution continue de l entreprise", "Faire du bon travail",
"Hiver", "Liberté", "Matériel performant", "Partager mon savoir-faire",
"Relation client", "Rencontrer de nouvelles équipes", "Responsabilité",
"Solidarité", "Stimulation", "Tranquille", "Travailler dans ma région"
), class = "factor")), .Names = c("Theme 1", "Theme 2"), row.names = c(NA,
-53L), class = "data.frame")

Reduce the font size so that all words fit the available page space:
wordcloud2(DataCloud, size = .5)

Related

How to change range of heatmap using gheatmap in R

I'm trying to add a heatmap to my phylogenetic tree. The range of the heatmap should be from 0 to 100; instead, it only covers the min and max of the values. Can I reset the range of the heatmap?
thanks.
library(ggtree)
library(ggplot2)
library(ggstance)
df1 <- structure(
list(id = structure(
c(5L, 15L, 29L, 18L, 24L, 21L,
13L, 11L, 8L, 25L, 23L, 9L, 16L, 3L, 6L, 2L, 20L, 27L, 30L, 17L,
14L, 4L, 1L, 7L, 22L, 28L, 10L, 12L, 26L, 19L),
.Label = c("t1",
"t10", "t11", "t12", "t13", "t14", "t15", "t16", "t17", "t18",
"t19", "t2", "t20", "t21", "t22", "t23", "t24", "t25", "t26",
"t27", "t28", "t29", "t3", "t30", "t4", "t5", "t6", "t7", "t8",
"t9"), class = "factor"),
location = structure(c(1L, 3L, 2L,
1L, 2L, 3L, 3L, 2L, 3L, 2L, 3L, 3L, 2L, 2L, 1L, 1L, 3L, 2L, 1L,
1L, 3L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 1L, 1L),
.Label = c("CZ", "GZ", "HK"), class = "factor"),
Value = c(22L, 10L, 33L, 12L, NA,
NA, NA, NA, NA, NA, NA, NA, 45L, 89L, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 80L, NA, NA, NA, NA)),
class = "data.frame", row.names = c(NA,
-30L))
# Random tree with 30 tips, used as a stand-in for the real phylogeny.
tr <- rtree(30)
p <- ggtree(tr)
#df1<- your_example_data
# Attach df1 to the tree and colour the tip points by location; the colour
# legend itself is suppressed.
p1 <- p %<+% df1 + geom_tippoint(aes(color=location))+ guides(color = "none")
# One simulated value per tip (normal, mean 50, sd 20), keyed by tip label so
# it can be matched to the tree. These draws are not bounded to 0-100.
d2 <- data.frame( val=rnorm(30, mean= 50, sd=20))
rownames(d2)<- tr$tip.label
library(ggnewscale)
# Start a fresh fill scale so the heatmap gets a fill scale of its own.
p1 <- p1 + new_scale_fill()
# Draw the heatmap next to the tree. Note that no limits are passed to the
# fill scale below, so the colour bar is not pinned to any fixed range.
p2<- gheatmap(p1, d2 ,offset=0.015, width=0.05,
colnames_angle=45, colnames_offset_y = 0.25,colnames_offset_x =0.001, colnames=TRUE,
colnames_position='top',font.size = 3)+
scale_fill_viridis_c(option="A", name="query\ncoverage\npercentage")
p2

Try using scale_fill_gradientn. I don't have ggtree in my library collection, but it should work with it too. data$Z are the values used in the legend.
min(data$Z)
[1] 10.43507
# using geom_tile instead
ggplot(data, aes(X, Y, fill = Z)) +
  geom_tile() +
  scale_fill_gradientn(
    # limits pins the colour bar to start at 0 instead of at min(data$Z);
    # replace max(data$Z) with 100 for a fixed 0-100 scale.
    limits = c(0, max(data$Z)),
    # viridis() is not exported by ggplot2: load viridis / viridisLite first.
    # The argument is spelled out rather than relying on partial matching.
    colours = viridis(10, option = "A"),
    breaks = c(0, max(data$Z)),
    labels = c(0, max(data$Z))
  )
Data
data <- structure(list(X = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L), .Label = c("A", "B", "C", "D", "E", "F", "G", "H",
"I", "J"), class = "factor"), Y = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L), .Label = c("var1", "var2", "var3", "var4", "var5",
"var6", "var7", "var8", "var9", "var10"), class = "factor"),
Z = c(33.991562910378, 35.5263787321746, 24.5632742531598,
18.0254957079887, 45.778294513002, 38.0070783570409, 38.8778781332076,
13.9182714093477, 13.2864724285901, 12.3245238792151, 45.4634746629745,
43.5207717958838, 14.6174691990018, 14.6395265311003, 16.3748204801232,
37.5898649636656, 46.154183940962, 21.7670671269298, 45.9928634669632,
15.2300526481122, 42.3459290526807, 36.1509132292122, 13.004608694464,
17.2632187511772, 24.1008642502129, 21.0504860430956, 47.8412099648267,
22.8905160259455, 26.2689692527056, 42.2642367053777, 49.7228981740773,
18.5286565497518, 19.9640860501677, 19.8192273359746, 46.2587429210544,
45.3112288471311, 14.0251182205975, 46.5721819829196, 19.2603973485529,
11.8241156637669, 43.5814412590116, 12.3338401783258, 34.6708638872951,
16.535308547318, 12.5870429351926, 17.7716215513647, 38.3571200724691,
40.5572446156293, 38.3018106594682, 36.1261784471571, 23.6329158209264,
38.2715854980052, 31.8956978339702, 19.8036628682166, 41.236245688051,
42.5284101255238, 47.3572976142168, 10.9305525757372, 41.5727174282074,
39.237065333873, 41.6476187948138, 43.6902561411262, 39.2061061505228,
18.3187866955996, 42.8791201952845, 33.8544269837439, 17.3525733780116,
14.5423825085163, 46.209614733234, 24.5643785689026, 35.3784507885575,
44.3101883865893, 45.7905176281929, 36.0531417001039, 44.190902383998,
32.4274326208979, 33.8546730671078, 43.7150628026575, 44.4308217708021,
27.6862936094403, 39.8551124054939, 10.4350713547319, 35.6894047465175,
28.6168400477618, 18.5768875014037, 17.1367645263672, 30.369380293414,
17.7864238992333, 36.1986118741333, 43.2466325163841, 49.581032032147,
49.736803509295, 40.3205085452646, 27.0655540842563, 42.9749015253037,
30.9310132544488, 23.7332978192717, 35.1737863756716, 40.4224442131817,
15.6103290617466)), out.attrs = list(dim = c(X = 10L, Y = 10L
), dimnames = list(X = c("X=A", "X=B", "X=C", "X=D", "X=E", "X=F",
"X=G", "X=H", "X=I", "X=J"), Y = c("Y=var1", "Y=var2", "Y=var3",
"Y=var4", "Y=var5", "Y=var6", "Y=var7", "Y=var8", "Y=var9", "Y=var10"
))), row.names = c(NA, -100L), class = "data.frame")

Facets: organising their order and organising the levels within facets

I would like to please organise the following plots so that facets are printed out from most to least busy (i.e. Hemiptera, Coleoptera, Hymenoptera, Siphonaptera, Lepidoptera, etc.)
I would also like to order the levels within each facet like in Coleoptera. I realise that the X-labels will change order too so I need each facet to print out its own X-label according the level order.
I have already read many threads and that's how I was able to organise Coleoptera. But now I want it to be more tidy.
This is the data (let me know if this format is ok, if not I can try another way):
structure(list(Order = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L), .Label = c("Coleoptera",
"Dermaptera", "Dictyoptera", "Diptera", "Hemiptera", "Hymenoptera",
"Lepidoptera", "Phthiraptera", "Psocoptera", "Siphonaptera",
"Thysanoptera"), class = "factor"), Nrange = structure(c(1L,
3L, 4L, 5L, 6L, 7L, 8L, 10L, 11L, 12L, 14L, 14L, 1L, 10L, 1L,
3L, 4L, 6L, 7L, 10L, 11L, 12L, 14L, NA, 1L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 14L, NA, 1L, 4L, 5L, 6L, 7L, 8L, 10L, 11L,
12L, 14L, 15L, NA, 1L, 2L, 4L, 5L, 6L, 7L, 8L, 10L, 11L, 12L,
13L, 14L, 4L, 10L, 11L, 12L, 14L, 1L, 4L, 10L, 11L, 12L, 13L,
14L, 1L, 5L, 10L, 1L, 4L, 6L, 7L, 10L, 11L, 12L, 14L), .Label = c("Africa",
"Africa, Asia", "Americas", "Asia", "Asia-Temp", "Asia-Trop",
"Australasia", "C&S America", "Cosmopolitan", "Cryptogenic",
"N America", "S America", "Trop", "Trop, SubTrop", "Unknown"), class = "factor"),
Records = c(16L, 1L, 9L, 7L, 11L, 17L, 1L, 15L, 8L, 8L, 5L,
1L, 2L, 1L, 5L, 1L, 1L, 1L, 1L, 9L, 9L, 2L, 1L, 4L, 11L,
10L, 30L, 15L, 9L, 2L, 2L, 2L, 34L, 11L, 21L, 1L, 21L, 16L,
8L, 1L, 14L, 3L, 5L, 25L, 4L, 2L, 1L, 1L, 8L, 1L, 10L, 1L,
2L, 1L, 1L, 8L, 5L, 2L, 1L, 2L, 2L, 9L, 1L, 2L, 1L, 3L, 1L,
12L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 3L,
3L, 2L)), .Names = c("Order", "Nrange", "Records"), row.names = c(NA,
-83L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), vars = "Order", drop = TRUE)
This is the reordering that I guess is affecting only Coleoptera.
xy<-x%>%
mutate(Nrange=reorder(Nrange,-Records,sum))
This is the code for the plot:
to_plot<-xy %>%
filter(!is.na(Nrange))
ggplot(to_plot,aes(x=Nrange,y=Records,fill=Nrange))+
geom_col()+
theme(axis.text.x = element_text(angle=90, vjust=0.7), legend.position = "none") +
facet_wrap(~Order,ncol=3)+
labs(title="Insects recorded as alien-invasive to mainland Spain",
subtitle="Native ranges vs number of records",
caption="Data source: DAISIE (http://www.europe-aliens.org/)")
And this is the plot:
enter image description here

Assuming you're using the tidyverse (based on your code):
library(tidyverse)
# Drop the grouping, then rank the Order levels by their total Records
# (largest first) so the facets are drawn from busiest to least busy.
xy <- mutate(
  ungroup(x),
  Order = fct_reorder(Order, Records, sum, .desc = TRUE)
)

# Plot only the rows with a known native range, one panel per Order.
ggplot(
  filter(xy, !is.na(Nrange)),
  aes(x = Nrange, y = Records, fill = Nrange)
) +
  geom_col() +
  facet_wrap(~Order, ncol = 3)
fct_reorder comes from the forcats package, which I believe is now a part of the tidyverse.
Or, using base R, something like this:
xy <- x
# Total number of records per Order, named by factor level.
record_sums <- tapply(xy$Records, xy$Order, sum)
# Rebuild the factor with its levels sorted by total records, largest first.
# Assigning a permuted vector to levels() would only rename the existing
# levels in place and thereby relabel the rows; factor(levels = ...) changes
# the level order without touching the values.
xy$Order <- factor(
  xy$Order,
  levels = names(record_sums)[order(record_sums, decreasing = TRUE)]
)

drop from an intersection of two columns and based on a condition in a third column

I have the following data:
df <- structure(list(IDVar = 1:40, Major.sectors = structure(c(5L,
9L, 3L, 15L, 11L, 7L, 18L, 18L, 18L, 3L, 3L, 3L, 3L, 17L, 3L,
11L, 7L, 17L, 3L, 11L, 3L, 18L, 3L, 17L, 9L, 18L, 9L, 19L, 3L,
11L, 11L, 2L, 5L, 3L, 18L, 17L, 4L, 2L, 3L, 3L), .Label = c("Banks",
"Chemicals, rubber, plastics, non-metallic products", "Construction",
"Education, Health", "Food, beverages, tobacco", "Gas, Water, Electricity",
"Hotels & restaurants", "Insurance companies", "Machinery, equipment, furniture, recycling",
"Metals & metal products", "Other services", "Post & telecommunications",
"Primary sector", "Public administration & defense", "Publishing, printing",
"Textiles, wearing apparel, leather", "Transport", "Wholesale & retail trade",
"Wood, cork, paper"), class = "factor"), Region.in.country = structure(c(15L,
8L, 8L, 8L, 10L, 15L, 19L, 10L, 8L, 10L, 3L, 18L, 4L, 12L, 4L,
15L, 13L, 4L, 15L, 15L, 7L, 15L, 12L, 1L, 7L, 10L, 15L, 8L, 13L,
15L, 12L, 8L, 7L, 15L, 15L, 10L, 8L, 10L, 10L, 15L), .Label = c("Andalucia",
"Aragon", "Asturias", "Canary Islands", "Cantabria", "Castilla-La Mancha",
"Castilla y Leon", "Cataluna", "Ceuta", "Comunidad Valenciana",
"Extremadura", "Galicia", "Islas Baleares", "La Rioja", "Madrid",
"Melilla", "Murcia", "Navarra", "Pais Vasco"), class = "factor"),
EBIT.TA = c(-0.234432635519391, -0.884337466274593, -0.00446559204081373,
0.11109107677028, -0.137203773525798, -0.582114677880617,
0.0190497663203189, -3.04252763094666, 0.113157822682219,
-0.0255533180037229, 0.281767142199724, 0.0326641697396841,
-0.00879974750993553, 0.0542074697816672, -0.112104697294392,
-0.191945591325174, -0.00380586115226597, -0.0363239884169068,
-0.273949107908537, 0.435398668004486, -0.00563436099927988,
-2.75971618056051, -0.1047327709263, 0.151283793741506, -0.0373197549569126,
0.00912639083178201, -0.0386627754065697, -0.018235399636112,
-0.0118104711362467, -0.701299939137125, NA, 0.0191819361175666,
-0.0104887983706721, -0.801677105519484, -0.402194475974272,
-0.124125227730062, 0.143020458476649, -0.601186271451194,
0.0163269364787831, 5.09955167591238), EBIT.TA_l1 = c(-0.443687074746458,
-0.561864166134075, -0.0345769510044604, 0.0282541797531804,
-0.0181173929170762, 0.0147211350970115, 0.0588534950162799,
-1.14097109926961, 0.060100343733096, -0.0386426338471025,
0.049684095221329, 0.0558174150334904, 0.00214962169435867,
0.0399960114646072, 0.0402934579830171, -0.612359147433149,
-0.0115916125659674, 0.00739473610413031, 0.0174576615247567,
0.68624861825246, 0.0305807338940829, -3.88006243913616,
0.0410122725022661, -0.089491343996377, -0.215219123182103,
0.00967853324842811, -0.0336715197882038, 0.362424791356667,
0.221203934329637, -0.654387857513823, 0.0656934439915892,
0.0652005453654772, 0.0339559014267185, 0.0259085077216708,
-0.303606048856146, 0.0280113794301873, 0.109307291990628,
-0.470048555841697, -0.00157699300508027, -0.350519090107081
), EBIT.TA_l2 = c(-0.351308186716873, 0.00159428805074234,
-0.00604587147802615, 0.0761894448922952, -0.00348378141492824,
NA, 0.0346370866793768, -0.552226781084599, 0.00220031803369861,
-0.0285840972149053, 0.065316579236306, 0.4090851643341,
-0.0188362202518351, 0.0403848986306371, 0.091146090480032,
-0.0154168449752466, -0.0694803621032671, 0.0511978643139393,
-0.452924037757731, -0.0091835704914724, 0.0119918914092344,
0.0858960833880717, NA, 0.104901526886479, -0.23096183545392,
-0.0163058345980967, 0.100643431561465, 0.0527859573541712,
0.250207316117438, NA, 0.00193240515291123, 0.0624210741756767,
0.0178136227732972, -0.0321294913646274, -0.0699629484084657,
-0.00417176180400133, 0.209612573099415, 0.0285645570852926,
0.0551624216079071, 0.0172738293439595), Major.sectors.id = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 7L, 7L, 3L, 3L, 3L, 3L, 8L, 3L, 5L,
6L, 8L, 3L, 5L, 3L, 7L, 3L, 8L, 2L, 7L, 2L, 9L, 3L, 5L, 5L,
10L, 1L, 3L, 7L, 8L, 11L, 10L, 3L, 3L), Region.in.country.id = c(1L,
2L, 2L, 2L, 3L, 1L, 4L, 3L, 2L, 3L, 5L, 6L, 7L, 8L, 7L, 1L,
9L, 7L, 1L, 1L, 10L, 1L, 8L, 11L, 10L, 3L, 1L, 2L, 9L, 1L,
8L, 2L, 10L, 1L, 1L, 3L, 2L, 3L, 3L, 1L)), .Names = c("IDVar",
"Major.sectors", "Region.in.country", "EBIT.TA", "EBIT.TA_l1",
"EBIT.TA_l2", "Major.sectors.id", "Region.in.country.id"), row.names = c(NA,
40L), class = "data.frame")
I randomly generate a column of zero and ones for illustration.
x <- 40
df$x<- sample(c(0,1), replace=TRUE, size=x)
What I am trying to do is to drop rows which have zero values, based on a few conditions.
:If df$x == 1
and if intersect(region.id, sector.id) == 0 #i.e. there is no data
then drop
So, I want to group_by region and sector and if the intersect between both columns does not exist then drop that observation.
Consider the following image. I am basically looking to delete the intersections of the columns which have no data. So take sector.id: 1 and region.id: 5: there is no data, so I want to remove it. (However, my data is not grouped like the image below; it is structured as in the dput code.)

I used NA for missing values in the sample x.
# get ready
set.seed(123) # set seed for reproducibility
df$x <- sample(c(NA,1), 40, replace = TRUE) # sample values
Base solution
# split by ids, check for values, bind together nonempty combinations
# One data frame per sector/region combination; combinations that do not
# occur in the data come back as zero-row data frames.
dfs_split <- split(df, list(df$Major.sectors.id, df$Region.in.country.id))
# TRUE where a combination holds at least one non-missing x. A zero-row group
# yields FALSE, because all() of an empty vector is TRUE. vapply() guarantees
# one logical per group, whereas sapply() returns list() for an empty split.
# The argument is named `part` so it does not shadow the outer `df`.
has_value <- vapply(
  dfs_split,
  function(part) !all(is.na(part$x)),
  logical(1)
)
dfs_nonempty <- dfs_split[has_value]
# Stack the surviving groups back into a single data frame.
res <- do.call(rbind, dfs_nonempty)
Explanation:
split divides the data into the groups you specified
sapply applies the test for non-missing values on each group
do.call helps to rbind the groups (which actually form a list)
dplyr solution
This is the cleaner option.
library(dplyr)
# Keep only the sector/region combinations with at least one non-missing x.
res <- df %>%
  group_by(Major.sectors.id, Region.in.country.id) %>%
  filter(!all(is.na(x))) %>%
  # drop the grouping so later verbs on res do not silently run per group
  ungroup()

Count function in R [closed]

Closed. This question needs debugging details. It is not currently accepting answers.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
Closed 7 years ago.
Improve this question
structure(list(Date = structure(c(4L, 4L, 4L, 4L, 4L, 4L, 4L,
5L, 5L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 3L), .Label = c("13/09/14", "14/09/14", "15/09/14",
"16/08/14", "17/08/14", "18/08/14", "23/08/14", "24/08/14", "25/08/14",
"30/08/14", "31/08/14"), class = "factor"), HomeTeam = structure(c(1L,
8L, 11L, 13L, 15L, 19L, 20L, 9L, 12L, 3L, 2L, 4L, 5L, 6L, 14L,
17L, 7L, 16L, 18L, 10L, 3L, 6L, 10L, 12L, 13L, 17L, 20L, 2L,
8L, 18L, 1L, 4L, 5L, 9L, 14L, 15L, 16L, 19L, 11L, 7L), .Label = c("Arsenal",
"Aston Villa", "Burnley", "Chelsea", "Crystal Palace", "Everton",
"Hull", "Leicester", "Liverpool", "Man City", "Man United", "Newcastle",
"QPR", "Southampton", "Stoke", "Sunderland", "Swansea", "Tottenham",
"West Brom", "West Ham"), class = "factor"), AwayTeam = structure(c(5L,
6L, 17L, 7L, 2L, 16L, 18L, 14L, 10L, 4L, 12L, 8L, 20L, 1L, 19L,
3L, 15L, 11L, 13L, 9L, 11L, 4L, 15L, 5L, 16L, 19L, 14L, 7L, 1L,
9L, 10L, 17L, 3L, 2L, 12L, 8L, 18L, 6L, 13L, 20L), .Label = c("Arsenal",
"Aston Villa", "Burnley", "Chelsea", "Crystal Palace", "Everton",
"Hull", "Leicester", "Liverpool", "Man City", "Man United", "Newcastle",
"QPR", "Southampton", "Stoke", "Sunderland", "Swansea", "Tottenham",
"West Brom", "West Ham"), class = "factor"), FTR = structure(c(3L,
2L, 1L, 1L, 1L, 2L, 1L, 3L, 1L, 1L, 2L, 3L, 1L, 2L, 2L, 3L, 2L,
2L, 3L, 3L, 2L, 1L, 1L, 2L, 3L, 3L, 1L, 3L, 2L, 1L, 2L, 3L, 2L,
1L, 3L, 1L, 2L, 1L, 3L, 2L), .Label = c("A", "D", "H"), class = "factor"),
Referee = structure(c(4L, 10L, 9L, 3L, 1L, 12L, 2L, 8L, 7L,
11L, 9L, 6L, 8L, 5L, 15L, 3L, 4L, 7L, 1L, 11L, 2L, 4L, 6L,
10L, 16L, 14L, 9L, 8L, 1L, 13L, 8L, 5L, 9L, 6L, 2L, 11L,
3L, 1L, 13L, 7L), .Label = c("A Taylor", "C Foy", "C Pawson",
"J Moss", "K Friend", "L Mason", "M Atkinson", "M Clattenburg",
"M Dean", "M Jones", "M Oliver", "N Swarbrick", "P Dowd",
"P Tierney", "R East", "R Madley"), class = "factor")), .Names = c("Date",
"HomeTeam", "AwayTeam", "FTR", "Referee"), row.names = c(NA,
40L), class = "data.frame")
In the above dataset I am trying to find out the referee who served the most number of matches for each team. For example, which guy refereed for Aston Villa the most in home games and in away games and both.
Sorry about me being blunt with my question. I did make an attempt.
In order to find out how many times referee J Moss refereed for Arsenal I tried this,
awayref<-nrow(awayref<-(filter(fd,fd$Referee=='J Moss',fd$AwayTeam=='Arsenal')))
homeref<-nrow(hf<-(filter(fd,fd$Referee=='J Moss',fd$HomeTeam=='Arsenal')))
View(total<-homeref+awayref)
I needed some help with looping it to include all referees and all teams.

We can do
# Overall: the referee with the most matches in the whole data set
# (which.max() returns the first one in case of a tie).
tbl1 <- table(df1$Referee)
tbl1[which.max(tbl1)]

# Per team, as asked: count each match once for its home team and once for
# its away team, then pick the most frequent referee in every team's row.
team_ref <- table(
  team = c(as.character(df1$HomeTeam), as.character(df1$AwayTeam)),
  referee = rep(as.character(df1$Referee), times = 2L)
)
apply(team_ref, 1L, function(counts) names(counts)[which.max(counts)])
# For home-only or away-only counts, use table(df1$HomeTeam, df1$Referee) or
# table(df1$AwayTeam, df1$Referee) in place of team_ref.

Ggplot2 geom_line error

I have a dataset which consists of data points over a time series for the proportion of people living in urban/rural areas for a number of countries. Sadly, not all countries have data for the same years. I have been trying to produce a simple line plot to show the different proportions of people living in different locations by year, but as each country has a different number of data points I am running into trouble.
I think this is because some of the countries only have data for a single year and using geom_line from ggplot2 throws the following error:
geom_path: Each group consist of only one observation. Do you need to
adjust the group aesthetic?
I was hoping that there would be some way to override this, or perhaps just plot a single point where a COUNTRY only has data for a single year. Does anyone know if this is possible, or indeed, if this is actually what this error means?!!?
Any help greatly appreciated!!!
Thanks
Here is my data:
structure(list(COUNTRY = structure(c(1L, 2L, 2L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 5L, 5L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L,
8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 10L, 11L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 14L, 14L, 14L, 14L, 1L,
2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 6L, 6L, 6L, 6L, 7L,
7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 10L, 11L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 14L,
14L, 14L, 14L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L,
6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 9L, 9L,
9L, 9L, 10L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L,
13L, 13L, 13L, 14L, 14L, 14L, 14L, 1L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L, 4L, 5L, 5L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 8L,
8L, 8L, 9L, 9L, 9L, 9L, 9L, 10L, 11L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 13L, 13L, 13L, 13L, 14L, 14L, 14L, 14L, 1L, 2L,
2L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 6L, 6L, 6L, 6L, 7L, 7L,
7L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 10L, 11L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 14L, 14L,
14L, 14L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 6L,
6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 9L, 9L, 9L,
9L, 10L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L,
13L, 13L, 14L, 14L, 14L, 14L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 4L,
4L, 4L, 5L, 5L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L,
8L, 9L, 9L, 9L, 9L, 9L, 10L, 11L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 13L, 13L, 13L, 13L, 14L, 14L, 14L, 14L), class = "factor", .Label = c("Comoros",
"Eritrea", "Ethiopia", "Kenya", "Lesotho", "Madagascar", "Malawi",
"Namibia", "Rwanda", "South Africa", "Swaziland", "Tanzania",
"Zambia", "Zimbabwe")), Year = structure(c(5L, 12L, 4L, 25L,
16L, 9L, 22L, 13L, 7L, 2L, 23L, 15L, 22L, 14L, 6L, 1L, 24L, 15L,
9L, 1L, 13L, 6L, 19L, 9L, 1L, 24L, 21L, 16L, 9L, 1L, 7L, 19L,
24L, 13L, 8L, 5L, 1L, 18L, 10L, 4L, 20L, 11L, 5L, 1L, 24L, 17L,
8L, 3L, 5L, 12L, 4L, 25L, 16L, 9L, 22L, 13L, 7L, 2L, 23L, 15L,
22L, 14L, 6L, 1L, 24L, 15L, 9L, 1L, 13L, 6L, 19L, 9L, 1L, 24L,
21L, 16L, 9L, 1L, 7L, 19L, 24L, 13L, 8L, 5L, 1L, 18L, 10L, 4L,
20L, 11L, 5L, 1L, 24L, 17L, 8L, 3L, 5L, 12L, 4L, 25L, 16L, 9L,
22L, 13L, 7L, 2L, 23L, 15L, 22L, 14L, 6L, 1L, 24L, 15L, 9L, 1L,
13L, 6L, 19L, 9L, 1L, 24L, 21L, 16L, 9L, 1L, 7L, 19L, 24L, 13L,
8L, 5L, 1L, 18L, 10L, 4L, 20L, 11L, 5L, 1L, 24L, 17L, 8L, 3L,
5L, 12L, 4L, 25L, 16L, 9L, 22L, 13L, 7L, 2L, 23L, 15L, 22L, 14L,
6L, 1L, 24L, 15L, 9L, 1L, 13L, 6L, 19L, 9L, 1L, 24L, 21L, 16L,
9L, 1L, 7L, 19L, 24L, 13L, 8L, 5L, 1L, 18L, 10L, 4L, 20L, 11L,
5L, 1L, 24L, 17L, 8L, 3L, 5L, 12L, 4L, 25L, 16L, 9L, 22L, 13L,
7L, 2L, 23L, 15L, 22L, 14L, 6L, 1L, 24L, 15L, 9L, 1L, 13L, 6L,
19L, 9L, 1L, 24L, 21L, 16L, 9L, 1L, 7L, 19L, 24L, 13L, 8L, 5L,
1L, 18L, 10L, 4L, 20L, 11L, 5L, 1L, 24L, 17L, 8L, 3L, 5L, 12L,
4L, 25L, 16L, 9L, 22L, 13L, 7L, 2L, 23L, 15L, 22L, 14L, 6L, 1L,
24L, 15L, 9L, 1L, 13L, 6L, 19L, 9L, 1L, 24L, 21L, 16L, 9L, 1L,
7L, 19L, 24L, 13L, 8L, 5L, 1L, 18L, 10L, 4L, 20L, 11L, 5L, 1L,
24L, 17L, 8L, 3L, 5L, 12L, 4L, 25L, 16L, 9L, 22L, 13L, 7L, 2L,
23L, 15L, 22L, 14L, 6L, 1L, 24L, 15L, 9L, 1L, 13L, 6L, 19L, 9L,
1L, 24L, 21L, 16L, 9L, 1L, 7L, 19L, 24L, 13L, 8L, 5L, 1L, 18L,
10L, 4L, 20L, 11L, 5L, 1L, 24L, 17L, 8L, 3L), class = "factor", .Label = c("1992",
"1993", "1994", "1995", "1996", "1997", "1998", "1999", "2000",
"2000/1", "2001/2", "2002", "2003", "2003/4", "2004", "2005",
"2005/6", "2006", "2006/7", "2007", "2007/8", "2008/9", "2009",
"2010", "2011")), location = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L), .Label = c("Urban",
"Rural", "Total", "Capital.City", "Other.Cities.towns", "Urban.Non.slum",
"Urban.Slum"), class = "factor"), percent = c(63.0434782608696,
93.8, 87, 79.5642604795185, 65.4240807416892, 63.0791092522326,
90.448386469558, 85.9419999774024, 92.7603614781794, 84.0437368780105,
89.9792286718626, 91.0916571421351, 87.1132950026762, 73.8624315865239,
60.8311005575454, 66.7, 96, 86.8, 90.6243926153181, 90.6911141749493,
90.7602286016099, 93.0377175475414, 86.073106379954, 84.253722056373,
77.8178199148702, 97.3, 91.8332260789258, 89.612164524266, 89.9070989918367,
94.9, 85.1351949905457, 94.8358752154967, 92.9, 89.656599879838,
90.2634019334124, 94.4, 91.6241263241579, 76.7337303943862, 68.4233513070184,
74.15601627144, 88.4802888646634, 85.4643913454376, 89.7457528950664,
81.3025210084024, 83.0579155525397, 71.5857386620092, 86.2324062094295,
87.687478493975, 63.5379061371841, 78.5, 40.7, 51.7763728811622,
32.2441768813334, 22.3138981723172, 83.3699691175754, 69.6742912391579,
76.0526239692028, 83.7290062290807, 77.4758329101792, 83.8081963934296,
67.5805226154664, 55.8951299980461, 41.9921451192584, 52.2, 92.5,
77.6, 82.0322170392223, 85.2850090044269, 70.8031150919282, 47.108593681531,
82.2215412952297, 78.3643348536815, 74.4253468485616, 94.8, 90.1711142192198,
85.0338348718722, 86.3134329333052, 90.4, 79.2813256726705, 90.7077549957666,
82.5, 77.7236217339155, 75.3278238729086, 77.7, 78.4592126267142,
67.1145693585691, 55.3459024734839, 57.8463881286199, 83.5604620304044,
83.9259722574938, 84.4589780509803, 73.3992444632325, 77.544833952707,
63.0503715222555, 75.6808008503601, 85.6943513045284, 63.4, 84.2,
51, 55.7151220012609, 34.9, 26.6, 85, 72.5, 79.2, 83.8, 80.3,
84.9, 69.6, 59, 46, 54, 93, 78.7, 83.2, 85.9, 76.7, 57.5, 83.8,
80.4, 75.6, 95, 90.4, 85.6, 86.9, 90.6, 82.2, 91.5, 84.5, 79.9,
78.1, 80.9, 81.2, 68.1, 56.8, 59.6, 84.9, 84.4, 86.5, 77, 79.1337842548663,
65.6, 79.1, 86.3, 68.421052631579, 96.1, 93.3, 93.461209969107,
82.2712525836501, 88.2708936990495, 87.6298001816506, 87.6386027991385,
93.1818181818183, 86.6666666666668, 88.1030398041979, 90.4761904761904,
83.4297434324662, 86.3744073211853, 83.6107223166148, 78.3, NA,
72.8, 80.952380952381, 87.5, 96.9073193030442, 99.1348508752745,
85.5297651573129, 86.4793919321843, 79.4520547945208, 98.2, 92.4613307718678,
85.4590408924955, 83.9378238341966, 92.1, 81.1594202898552, 96.0232554251852,
NA, 88.0377726639494, 83.690767555447, 93.4, 90.0349966633017,
71.2508707571865, 72, 79.4082828804656, 91.8032786885246, 84.5238095238095,
87.8787878787881, 75.6097560975609, 81.0643061692494, 68.4708412135189,
84.9056603773584, 89.5522388059702, 61.6438356164384, 91.7, 79.5,
77.0004220956012, 61.061381883032, 58.756042602018, 91.2594694272412,
85.20149612163, 92.4956062313464, 82.622382662868, 91.4036416540165,
91.6169313256523, 89.2957214499669, 67.6757501795213, 48.1479760952102,
NA, NA, 94.2, 94.3553068539161, 91.8799748693178, 89.3739230258784,
92.1418739343887, 86.4757947454868, 81.0102236379536, 77.0100025126874,
NA, 91.3720851411616, 92.2, 92.5003150086683, 97.8260869565219,
87.1461797069698, 93.5168077834096, NA, 90.1780793791367, 92.9758067301415,
94.9, 91.8829499602467, 81.749280834314, 65.1853441661798, 69.0503609949116,
87.2562445664681, 85.8298270239758, 90.6673511683335, 83.2861189801694,
84.9006282245266, 73.65452177457, 87.3075692692965, 85.5310215524833,
83.3333333333333, NA, NA, 98.5990187756088, 84.4640706359058,
NA, 93.9158337759274, 91.5744358611439, 100, NA, NA, NA, 88.7824144772468,
85.1972665683085, 89.54493171236, NA, NA, 89.8, NA, 100, 97.6261376125643,
96.3196943955923, 92.0952338262334, 87.9266080431752, 80.9429968520701,
NA, NA, 92.8, 95.2886158200472, 100, 86.4199793410402, NA, NA,
89.9001648604344, NA, NA, 91.5033109800214, 83.8918470610424,
73.9339911532972, 88.6921281548131, 94.309068022859, 85.3299585067346,
93.7362934447331, 86.5384615384618, 83.7424288707868, NA, 86.3836615391687,
88.1866796344726, 58.1081081081081, NA, NA, 75.7976468146464,
62.1289432084197, NA, 88.1488735873722, 84.2108238885019, 89.8335978405451,
NA, NA, NA, 86.9222656846515, 70.3584041024493, 70.9023609260137,
NA, NA, 85.9, NA, 89.8689917369566, 90.3864925686512, 92.628169473785,
80.9468895007753, 78.7885741638367, 75.4005791241575, NA, NA,
88.4, 87.7139456942162, 92.3809523809525, 83.7645232075473, NA,
NA, 89.567507133125, NA, NA, 91.6433898994358, 73.6225283043976,
65.9223049858496, 72.3148320483822, 86.2596215693035, 85.6224026570651,
87.4940330171337, 78.7499999999997, 81.9949404453665, NA, 84.5563115043796,
87.0190820047277)), .Names = c("COUNTRY", "Year", "location",
"percent"), row.names = c(NA, -336L), class = "data.frame")
I want to produce a simple plot with ggplot2 that is facetted by COUNTRY. I can do this fine using geom_point:
ggplot(meas_melt, aes(Year, percent, colour=location))+ geom_point() + facet_wrap(~COUNTRY)
However, if I try and produce a line plot with geom_line (ggplot(meas_melt, aes(Year, percent, colour=location))+ geom_line() + facet_wrap(~COUNTRY))
I get the following error:
geom_path: Each group consist of only one observation. Do you need to
adjust the group aesthetic?
I had thought that this could be because a couple of the countries have only one year's worth of data, so I subsetted the data to remove these three countries like so:
ggplot(meas_melt, aes(Year, percent, colour=location))+ geom_line(data=meas_melt[!meas_melt$COUNTRY %in% c('Comoros','South Africa','Swaziland'),]) + facet_wrap(~COUNTRY)
However, I get the same error!

@Sven's answer is correct but fixes only part of the problem. Note how there's no plot for Comoros, South Africa, or Swaziland. This is because in your data, sometimes year is, e.g., 2006 or 2007, and sometimes it is "2006/7".
# Subset the same data frame that the row mask is built from.
meas_melt[meas_melt$COUNTRY == "Swaziland", ]
COUNTRY Year location percent
32 Swaziland 2006/7 Urban 94.83588
80 Swaziland 2006/7 Rural 90.70775
128 Swaziland 2006/7 Total 91.50000
176 Swaziland 2006/7 Capital.City 96.02326
224 Swaziland 2006/7 Other.Cities.towns 93.51681
272 Swaziland 2006/7 Urban.Non.slum NA
320 Swaziland 2006/7 Urban.Slum NA
Those countries really have only one "year" (hence, no line). More importantly, these odd year designations distort your x-axis. You can see that using the scales="free" argument to facet_wrap(...):
ggplot(meas_melt, aes(x=Year,y=percent, color=location)) +
geom_line(aes(group=location)) +facet_wrap(~COUNTRY, scales="free") +
theme(axis.text.x=element_text(angle=90, vjust=0.5, size=8),
legend.position="bottom")
Which produces this:

You have to specify aes(group = location) inside geom_line:
library(ggplot2)
ggplot(meas_melt, aes(Year, percent, colour=location)) +
geom_line(aes(group = location)) +
facet_wrap(~COUNTRY)

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

R - WordCloud2 does not always render the most frequent words - r

Reduce the font size so that all words fit the available page space: wordcloud2(DataCloud, size = .5)

Related

How to change range of heatmap using gheatmap in R

Facets: organising their order and organising the levels within facets

drop from an intersection of two columns and based on a condition in a third column

Count function in R [closed]

Ggplot2 geom_line error

Categories

Resources