I have a dataframe like this one:
> dput(df)
structure(list(OBBLIGATORIO = structure(c(2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("no",
"yes"), class = "factor"), COUNTRY = structure(c(16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L), .Label = c("Austria", "Belgium", "Bulgaria",
"Croatia", "Cyprus", "Czech Republic", "Denmark", "Estonia",
"Finland", "France", "Germany", "Greece", "Hungary", "Iceland",
"Ireland", "Italy", "Latvia", "Lithuania", "Luxembourg", "Malta",
"Norway", "Poland", "Portugal", "Romania", "Slovakia", "Slovenia",
"Spain", "Sweden", "United Kingdom of Great Britain and Northern Ireland"
), class = "factor"), YEAR = c(2003L, 2006L, 2007L, 2008L, 2009L,
2010L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L,
2003L, 2006L, 2007L, 2008L, 2009L, 2010L, 1995L, 1996L, 1997L,
1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2006L, 2007L, 2008L,
2009L, 2010L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L,
2002L, 2003L, 2006L, 2007L, 2008L, 2009L, 2010L, 1995L, 1996L,
1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2006L, 2007L,
2008L, 2009L, 2010L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L,
2001L, 2002L, 2003L, 2006L, 2007L, 2008L, 2009L, 2010L, 1995L,
1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2006L,
2007L, 2008L, 2009L, 2010L, 1995L, 1996L, 1997L, 1998L, 1999L,
2000L, 2001L, 2002L), AGE = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Total", class = "factor"),
`CAUSE OF DEATH` = c("Acute poliomyelitis", "Acute poliomyelitis",
"Acute poliomyelitis", "Acute poliomyelitis", "Acute poliomyelitis",
"Acute poliomyelitis", "Acute poliomyelitis", "Acute poliomyelitis",
"Acute poliomyelitis", "Acute poliomyelitis", "Acute poliomyelitis",
"Acute poliomyelitis", "Acute poliomyelitis", "Acute poliomyelitis",
"Diphtheria", "Diphtheria", "Diphtheria", "Diphtheria", "Diphtheria",
"Diphtheria", "Diphtheria", "Diphtheria", "Diphtheria", "Diphtheria",
"Diphtheria", "Diphtheria", "Diphtheria", "Diphtheria", "Measles",
"Measles", "Measles", "Measles", "Measles", "Measles", "Measles",
"Measles", "Measles", "Measles", "Measles", "Measles", "Measles",
"Measles", "Tetanus", "Tetanus", "Tetanus", "Tetanus", "Tetanus",
"Tetanus", "Tetanus", "Tetanus", "Tetanus", "Tetanus", "Tetanus",
"Tetanus", "Tetanus", "Tetanus", "Tuberculosis", "Tuberculosis",
"Tuberculosis", "Tuberculosis", "Tuberculosis", "Tuberculosis",
"Tuberculosis", "Tuberculosis", "Tuberculosis", "Tuberculosis",
"Tuberculosis", "Tuberculosis", "Tuberculosis", "Tuberculosis",
"Viral hepatitis", "Viral hepatitis", "Viral hepatitis",
"Viral hepatitis", "Viral hepatitis", "Viral hepatitis",
"Viral hepatitis", "Viral hepatitis", "Viral hepatitis",
"Viral hepatitis", "Viral hepatitis", "Viral hepatitis",
"Viral hepatitis", "Viral hepatitis", "Whooping cough", "Whooping cough",
"Whooping cough", "Whooping cough", "Whooping cough", "Whooping cough",
"Whooping cough", "Whooping cough", "Whooping cough", "Whooping cough",
"Whooping cough", "Whooping cough", "Whooping cough", "Whooping cough"
), VALUE = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 4L, 2L, 2L, 2L, 1L, 1L, 6L, 7L, 7L, 1L, 2L,
3L, 2L, 5L, 12L, 9L, 13L, 9L, 13L, 8L, 17L, 14L, 16L, 18L,
15L, 19L, 11L, 10L, 25L, 24L, 21L, 22L, 23L, 20L, 34L, 32L,
31L, 30L, 29L, 28L, 27L, 26L, 41L, 42L, 43L, 45L, 46L, 47L,
33L, 35L, 36L, 37L, 38L, 39L, 40L, 44L, 1L, 2L, 1L, 1L, 1L,
2L, 2L, 2L, 1L, 3L, 1L, 1L, 1L, 1L), .Label = c("0", "1",
"2", "3", "6", "7", "9", "17", "18", "19", "21", "22", "27",
"28", "30", "31", "37", "41", "42", "301", "329", "333",
"344", "350", "396", "413", "415", "460", "517", "558", "597",
"609", "622", "647", "681", "1087", "1349", "1413", "1448",
"1499", "1576", "1654", "1725", "1948", "2531", "2665", "2757"
), class = "factor"), ID = 1:98), .Names = c("OBBLIGATORIO",
"COUNTRY", "YEAR", "AGE", "CAUSE OF DEATH", "VALUE", "ID"), row.names = c(NA,
-98L), class = "data.frame")
I want to obtain a chart that:
- on the x axis there are the values from the YEAR column
- on the y axis there are the values from the VALUE column
- the data are divided by the CAUSE OF DEATH column
So something like:
I try:
# Open a new X11 graphics device for the plot.
x11()
# Attempted plot: YEAR on x, VALUE on y, fill and outline colour by cause of death.
ggplot(df, aes(x = df$`YEAR`, y = df$`VALUE`, fill = df$`CAUSE OF DEATH`, colour = df$`CAUSE OF DEATH`)) +
# NOTE(review): geom_density() draws a density estimate, which is presumably
# why the result differs from the value-per-year chart described above.
geom_density(alpha = 0.1) +
# Restrict the x axis to the years 1995-2010.
xlim(1995, 2010)
But the result is completely different from the one I want.
Thanks
I'm not sure what your actual question is, but one problem with your dataframe is that the VALUE column is currently defined as a factor, not as a numeric. I think that remedying this will go a long way towards solving your problem. I do this after the fact below (i.e. after the dataframe has already been created), but if you are getting the data into R via read.table() or a similar command, you can specify the class of your columns when the data frame is created, which is probably a better approach.
In my code below I use the dplyr package for manipulating dataframes. It's quite powerful, but for this particular example it isn't doing anything that base R couldn't do.
# library() stops with an error when a package is missing; require() would
# only return FALSE and let the script fail later with a confusing message.
library(ggplot2)
library(dplyr)
library(magrittr)

# Paste your dput() output here, e.g.:
# df <- structure(list(...), class = "data.frame")

# fix the problem with the `VALUE` column: convert through character so the
# numbers come from the factor labels ("301", "329", ...) and not from the
# internal integer level codes
df %<>% mutate(VALUE = as.numeric(as.character(VALUE)))
# equivalent in base R:
# df$VALUE <- as.numeric(as.character(df$VALUE))

# make a graph (is it the one you want?):
# one total per year and cause of death, drawn as one line per cause
df %>%
  group_by(YEAR, `CAUSE OF DEATH`) %>%
  summarise(value = sum(VALUE)) %>%
  ungroup() %>%
  ggplot(aes(x = YEAR, y = value, color = `CAUSE OF DEATH`)) +
  geom_line() +
  geom_point() +
  theme_bw()

# save graph for uploading to SO (ggsave() writes the last plot displayed)
ggsave("SO37230266.png")
The result is this graph:
Related
First of all, I would like to apologise if I do not use the correct jargon.
I have the dataset as below which contains a wide range of categories
Here some excerpt from dput (using droplevels)
structure(list(
x = c(2010L, 2010L, 2010L, 2010L, 2010L, 2010L,
2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L,
2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L,
2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L,
2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L,
2010L, 2010L), *[ME: there are more years than 2010...]*
y = c(7.85986, 185.81068, 107.24097, 7094.74649,
1.4982, 185.77319, 5090.79354, 167.58584, 4189.64609, 157.08277,
3927.06932, 2.86732, 71.683, 4.70123, 117.53085, 2.93452, 73.36292,
1.4982, 18.18734, 901.14744, 0.90268, 13.77532, 613.38298, 0.01845,
0.0681, 7.19925, 3.75315, 0.14333, 136.54008, 0.04766, 0.59077,
28.97255, 0.38608, 115.05258, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
x1 = structure(c(4L, 2L, 3L, 1L, 4L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 4L, 2L, 1L, 4L, 2L, 1L, 4L, 2L,
1L, 2L, 4L, 1L, 4L, 2L, 1L, 4L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L), .Label = c("All greenhouse gases - (CO2 equivalent)",
"CH4", "CO2", "N2O"), class = "factor"),
x2 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Austria",
class = "factor"),
x4 = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 4L,
4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 8L, 9L, 9L, 9L, 10L,
10L, 10L, 11L, 11L, 11L, 12L, 12L, 12L, 13L, 13L, 14L, 14L,
15L, 15L, 16L, 16L, 17L, 17L, 18L, 18L), .Label = c("3",
"3.1", "3.A", "3.A.1", "3.A.2", "3.A.3", "3.A.4", "3.B",
"3.B.1", "3.B.2", "3.B.3", "3.B.4", "3.B.5", "3.C", "3.C.1",
"3.C.2", "3.C.3", "3.C.4"), class = "factor")), class = "data.frame",
row.names = c(NA,
-44L))
I want to know whether the sum of the subcategories in x4 (e.g. 3.B.1 + 3.B.2 + ... + 3.B.n) equals the figure stated for the parent category (e.g. 3.B), i.e. the total given in the CSV, for a given year and country. In other words, I want to verify the sums.
To get the sum of the subcategories I have this:
# Sum of the 3.A sub-categories for Austria in 2010. Every column is qualified
# with df$ so the filter does not rely on free-standing x / x2 vectors, and
# the integer year column is compared with an integer.
sum(df$y[df$x4 %in% c("3.A.1", "3.A.2", "3.A.3", "3.A.4") &
  df$x == 2010L & df$x2 == "Austria"])
To receive the sum of the parent category I have this
# Stated total of the parent category 3.A for Austria in 2010 (all columns
# qualified with df$, year compared as an integer).
sum(df$y[df$x4 %in% c("3.A") & df$x == 2010L & df$x2 == "Austria"])
Next I would need an operation which checks whether the results of both pieces of code are equal (TRUE/FALSE). However, I have more than 20 countries, 20 years, and dozens of categories to check. With my newbie approach I would be writing code for ages...
Is there any way to automate this? Basically, I am looking for code which is able to do the following:
1) Run for one category, go to next one
2) once done with categories change year and start again with categories
3) ... same for countries....
Any sort of help would be appreciated, even suggestions on how to use the right jargon in the title. Thanks in any case.
Here's a potential solution using dplyr (might require some tweaking based on the full dataset):
library(dplyr)

df %>%
  # Create two columns:
  #   parent - the parent category number, i.e. the first two dot-separated
  #            segments of x4 (e.g. "3.A.1" -> "3.A");
  #   type   - "Parent" when the row is the category itself, else "Child".
  # Note that the regex makes some assumptions on the format of your data
  # (single-character segments).
  mutate(
    parent = gsub("(.?\\..?)\\..*", "\\1", x4),
    type = ifelse(parent == x4, "Parent", "Child")
  ) %>%
  # Sum the y's by category, parent/child type, year and country
  group_by(parent, type, x, x2) %>%
  summarise(total = sum(y)) %>%
  # Drop the remaining grouping so the comparison below is done row by row
  ungroup() %>%
  # One row per category/year/country with the Child and Parent sums side by side
  tidyr::pivot_wider(
    names_from = type,
    values_from = total,
    names_sort = TRUE
  ) %>%
  # See if the sum of the children is equal to the parent y. near() compares
  # element-wise with a small tolerance; a missing Child or Parent sum counts
  # as "not equal".
  mutate(equals = coalesce(near(Child, Parent), FALSE))
Result using your (new) data:
parent x x2 Child Parent equals
<chr> <int> <fct> <dbl> <dbl> <lgl>
1 3 2010 Austria NA 7396. FALSE
2 3.1 2010 Austria NA 5278. FALSE
3 3.A 2010 Austria 4357. 4357. TRUE
4 3.B 2010 Austria 921. 921. TRUE
5 3.C 2010 Austria 0 0 TRUE
I can see from your new data that you have two levels of parents. My solution will only work for the second level (e.g. 3.1 and its children), but can be easily tweaked to also work for the top level.
A ggplot2 novice here
I am trying to generate a time series of the sample data given below using ggplot2. The following short code does not give me what I want.
# Attempt: one line per period/season combination, one facet row per season
# with free axis scales.
# NOTE(review): x is the bare year while the data hold several months per
# year, so the observations within a year share one x position.
ggplot(dat, aes(x = year,y = data, fill = period,
group = interaction(period, season))) +
geom_line() +
facet_grid(season ~ ., scales = "free")
You can see that the lines appear awkward. How can I plot cu and fu together for each season? Use red color for cu and blue for fu.
dat=structure(list(period = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("cu", "fu"), class = "factor"),
season = structure(c(2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 1L,
1L, 1L, 1L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L,
1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L), .Label = c("DJF", "JJA",
"MAM", "SON"), class = "factor"), month = structure(c(7L,
6L, 2L, 7L, 12L, 11L, 10L, 12L, 3L, 5L, 4L, 3L, 8L, 1L, 9L,
8L, 7L, 6L, 2L, 7L, 12L, 11L, 10L, 12L, 3L, 5L, 4L, 3L, 8L,
1L, 9L, 8L), .Label = c("april", "august", "dec", "feb",
"jan", "july", "june", "march", "may", "nov", "oct", "sep"
), class = "factor"), year = c(2001L, 2001L, 2001L, 2002L,
2001L, 2001L, 2001L, 2002L, 2001L, 2001L, 2001L, 2002L, 2001L,
2001L, 2001L, 2002L, 2001L, 2001L, 2001L, 2002L, 2001L, 2001L,
2001L, 2002L, 2001L, 2001L, 2001L, 2002L, 2001L, 2001L, 2001L,
2002L), data = c(84.08969137, 76.4948428, 18.35492802, 101.8821712,
24.21773903, 16.44881361, 19.57283027, 48.27623315, 8.572824549,
12.97601394, 11.50496081, 15.14899058, 13.96396375, 27.21030149,
36.1606234, 23.35430348, 95.77643784, 94.84972642, 47.26900009,
2.385978093, 21.48062239, 24.67779645, 20.07044416, 43.09234771,
13.28295078, 19.27189857, 15.24661793, 21.75991334, 19.38239851,
39.93109491, 38.54500325, 33.77559647)), .Names = c("period",
"season", "month", "year", "data"), class = "data.frame", row.names = c(NA,
-32L))
Thanks for any suggestions.
I would do something like :
library(ggplot2)

# Build a real Date (first day of each month) for the x axis. Month names are
# mapped to month numbers through their first three letters ("april" -> 4,
# "sep" -> 9) instead of being parsed with "%b", whose month names depend on
# the session locale.
plot_dat <- transform(
  dat,
  date = as.Date(sprintf(
    "%d-%02d-01",
    year,
    match(tolower(substr(as.character(month), 1, 3)), tolower(month.abb))
  ))
)

# One line per period, one facet row per season; cu in red and fu in blue.
ggplot(plot_dat, aes(x = date, y = data, group = period, color = period)) +
  geom_line() +
  facet_grid(season ~ ., scales = "free") +
  scale_color_manual(values = c(cu = "red", fu = "blue")) +
  xlab("time")
In fact I am creating a regular date and grouping just by period.
I am making a bar plot using lattice in R where I have data for 4 different years on sources of irrigation for different states. using my code, the bar plot is coming fine but I wish the bar corresponding to the year 1996 to be plotted first followed by the bar corresponding to year 2001 etc. so as to show the increasing area being irrigated by tube-wells. However, I am unable to change the ordering. Here is my data and the R code. Many thanks for your help.
# sample data
irr_atlas <- structure(list(state = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("ANDHRA PRADESH",
"KARNATAKA", "MADHYA PRADESH", "RAJASTHAN"), class = "factor"),
st_code = c(28L, 28L, 28L, 28L, 28L, 28L, 28L, 28L, 28L,
28L, 28L, 28L, 28L, 28L, 28L, 28L, 29L, 29L, 29L, 29L, 29L,
29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 23L,
23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L,
23L, 23L, 23L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L), year = c(1996L, 1996L, 1996L, 1996L,
2001L, 2001L, 2001L, 2001L, 2006L, 2006L, 2006L, 2006L, 2011L,
2011L, 2011L, 2011L, 1996L, 1996L, 1996L, 1996L, 2001L, 2001L,
2001L, 2001L, 2006L, 2006L, 2006L, 2006L, 2011L, 2011L, 2011L,
2011L, 1996L, 1996L, 1996L, 1996L, 2001L, 2001L, 2001L, 2001L,
2006L, 2006L, 2006L, 2006L, 2011L, 2011L, 2011L, 2011L, 1996L,
1996L, 1996L, 1996L, 2001L, 2001L, 2001L, 2001L, 2006L, 2006L,
2006L, 2006L, 2011L, 2011L, 2011L, 2011L), irr_area = c(1.84066,
0.942819, 0.82886, 0.853502, 1.54922, 0.825659, 0.542492,
1.53412, 1.72969, 0.70271, 0.637221, 1.53894, 1.99893, 0.678425,
0.819829, 1.70708, 0.921594, 0.231669, 0.316999, 0.358529,
0.91339, 0.207157, 0.426549, 0.481061, 0.921255, 0.18192,
0.426145, 0.547193, 0.930802, 0.148065, 0.377149, 1.51843,
1.59425, 0.112145, 2.67683, 0.540054, 1.48056, 0.030502,
1.63696, 0.563948, 1.12595, 0.058667, 2.46494, 1.15004, 1.10444,
0.157069, 2.64378, 2.14177, 1.55814, 0.106623, 2.71347, 0.644683,
1.35746, 0.030586, 2.41845, 0.935234, 1.76933, 0.054374,
2.46197, 1.76918, 1.62587, 0.050299, 2.14737, 2.82708),irr_source = structure(c(1L,2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L, 2L, 4L, 3L,
1L, 2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L, 2L, 4L,
3L, 1L, 2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L, 2L,
4L, 3L, 1L, 2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L,
2L, 4L, 3L), .Label = c("Canal", "Tank", "Tube", "Well"), class = "factor")), .Names = c("state","st_code", "year", "irr_area", "irr_source"), class = "data.frame", row.names = c(NA, -64L))
Code for plot...
library(lattice)
# Horizontal bar chart of irrigated area, one panel per state and irrigation
# source, with one bar per year inside each panel. The grouping argument is
# spelled out as `groups` rather than relying on partial matching of `group`.
barchart(~irr_area | factor(state) + factor(irr_source),
  groups = year, data = irr_atlas, auto.key = list(space = "right"))
As mentioned, ordering of groups in R graphics is usually determined by the ordering of the factor variable. So, you can reorder your factors with factor and its levels argument.
library(lattice)
# The order of the grouped bars follows the level order of the grouping
# factor, so build the year factor with explicitly ordered levels.
# `groups` is spelled out in full and TRUE is used instead of the
# reassignable shortcut T.
barchart(
  ~ irr_area | factor(state) + factor(irr_source),
  groups = factor(year, levels = sort(unique(year), decreasing = TRUE)), # change the order of years
  data = irr_atlas,
  auto.key = list(space = "right")
)
You can switch back to the other order by setting decreasing=FALSE.
I'm trying to program something quite simple (I think) in R, but I can't seem to get it right. I have a dataset of 50 countries (1 to 50) for 15 years each and about 20 variables per country. For now I am only testing the effect of one variable (OS) on my dependent variable (SMD). I would like to do this with a loop, country by country, so that I get the output for each country instead of the overall output.
I thought it would be wise to create a subset first (to be able to look at country 1 first, after which my loop should increase the country number and test country 2). I believe the regression at the bottom of the page should give me the output for country 1 instead of the overall score for the entire dataset. However, I keep getting these errors:
> pdata <- plm.data(newdata, index=c("Country","Date"))
series are constants and have been removed
> pooling <- plm(Y ~ X, data=pdata, model= "pooling")
series Country, xRegion are constants and have been removed
Error in model.matrix.pFormula(formula, data, rhs = 1, model = model, :
NA in the individual index variable
> summary(pooling)
Error in summary(pooling) : object 'pooling' not found
I might be looking at this all wrong, but I believe that without getting this to work, there is no point in going further with programming the loop itself. Any advice on solving my errors, or other ways of programming a loop are really appreciated.
My code:
# Remove every object from the current workspace.
rm(list = ls())
# Read the data from an interactively chosen file; the file has a header row
# and uses "," as the decimal mark.
mydata <- read.table(file = file.choose(), header = TRUE, dec = ",")
names(mydata)
# Put the columns of mydata on the search path so they can be used by bare name.
attach(mydata)
# NOTE(review): Y and X are built here from the attached full data set, before
# the subset below is taken, so they are not restricted to country 1 and are
# not columns of pdata.
Y <- cbind(SMD)
X <- cbind(OS)
# Keep only the rows of country 1 and print them.
newdata <- subset(mydata, Country %in% c(1))
newdata
# Index the subset by Country and Date, then fit and summarise a pooling model.
pdata <- plm.data(newdata, index=c("Country","Date"))
pooling <- plm(Y ~ X, data=pdata, model= "pooling")
summary(pooling)
Edit: data sample of first 2 countries which causes same error
dput(mydata)
structure(list(Region = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("NAF", "SAME"), class = "factor"), Country = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L), Date = c(1995L, 1996L, 1997L, 1998L,
1999L, 2000L, 2001L, 2002L, 2003L, 2004L, 2005L, 2006L, 2007L,
2008L, 2009L, 2010L, 2011L, 2012L, 2013L, 2014L, 1995L, 1996L,
1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2004L, 2005L,
2006L, 2007L, 2008L, 2009L, 2010L, 2011L, 2012L, 2013L, 2014L
), OS = structure(c(19L, 25L, 27L, 15L, 22L, 20L, 23L, 9L, 7L,
5L, 2L, 1L, 4L, 3L, 6L, 10L, 11L, 13L, 11L, 8L, 26L, 25L, 31L,
29L, 28L, 21L, 30L, 24L, 24L, 16L, 11L, 14L, 12L, 17L, 18L, 29L,
32L, 32L, 33L, 34L), .Label = c("51.5", "52.2", "55.6", "56.4",
"56.7", "57.7", "57.8", "58.3", "59", "59.2", "59.6", "59.9",
"60.2", "60.4", "61.1", "61.2", "62.2", "62.3", "62.8", "63.2",
"63.3", "63.8", "63.9", "64.2", "64.3", "64.5", "64.7", "65.3",
"65.5", "65.6", "66.4", "68", "69.6", "70.7"), class = "factor"),
SMD = structure(c(7L, 12L, 20L, 21L, 17L, 15L, 13L, 10L,
14L, 22L, 23L, 33L, 1L, 32L, 29L, 34L, 28L, 25L, NA, NA,
9L, 6L, 8L, 4L, 2L, 35L, 3L, 36L, 5L, 11L, 16L, 18L, 24L,
19L, 26L, 31L, 27L, 30L, NA, NA), .Label = c("100.3565662",
"13.44788845", "13.45858747", "13.56815534", "15.05892471",
"17.63789658", "18.04088718", "18.3101351", "19.34226196",
"21.25530884", "21.54423145", "23.75898948", "24.08770926",
"26.39817342", "29.44079001", "31.40605191", "34.46667996",
"34.52913657", "35.66070947", "36.4419931", "39.16875621",
"44.0126137", "45.72949566", "49.13062679", "54.83730247",
"56.87886311", "59.80971583", "60.5658962", "69.20148901",
"70.91362874", "72.64845214", "73.97139238", "75.20140919",
"76.18378138", "9.570435019", "9.867635305"), class = "factor")), .Names = c("Region",
"Country", "Date", "OS", "SMD"), class = "data.frame", row.names = c(NA,
-40L))
Are you sure you need to use plm?? This produces a list of summaries by country.
# convert factors to numeric
# Convert factors to numeric. Go through as.character(): as.numeric() applied
# directly to a factor returns the internal level codes (1, 2, 3, ...), not
# the numbers stored in the labels.
mydata$SMD <- as.numeric(as.character(mydata$SMD))
mydata$OS <- as.numeric(as.character(mydata$OS))

# Fit SMD ~ OS on the rows of one country and return the model summary.
fit_country <- function(cntry) {
  summary(lm(SMD ~ OS, data = mydata[mydata$Country == cntry, ]))
}
countries <- unique(mydata$Country)

# Using lapply(...): one summary per country, named by country id
smry <- lapply(countries, fit_country)
names(smry) <- countries

# Same thing, using a for loop. The result list is preallocated and filled by
# position; wrapping it as list(smry, ...) on every pass would nest the
# previous results one level deeper instead of appending to them.
smry <- vector("list", length(countries))
names(smry) <- countries
for (i in seq_along(countries)) {
  smry[[i]] <- fit_country(countries[i])
}
In your dataset, SMD and OS are factors, which need to be converted to numeric first.
JD Long helped me with this: question about manual annotation.
But is it possible to do something similar on a facetted plot, such that the label style corresponds to the line style (aesthetics) and in a way that lets me annotate different facets individually?
Some data:
funny <- structure(list(Institution = structure(c(1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L), .Label = c("Q-branch",
"Some-Ville", "Spectre"), class = "factor"), Type = structure(c(5L,
6L, 1L, 3L, 5L, 6L, 2L, 4L, 5L, 6L, 2L, 4L, 5L, 6L, 2L, 4L, 5L,
6L, 2L, 4L, 5L, 6L, 2L, 4L, 5L, 6L, 2L, 4L, 5L, 6L, 2L, 4L, 5L,
6L, 2L, 4L, 5L, 6L, 2L, 4L, 5L, 6L, 2L, 4L, 5L, 6L, 2L, 4L), .Label = c("Korte videregående uddannelser",
"Mammas beer", "Mellemlange videregående uddannelser", "Tastes good",
"Unknown", "Your"), class = "factor"), År = c(2008L, 2008L,
2008L, 2008L, 2008L, 2008L, 2008L, 2008L, 2008L, 2008L, 2008L,
2008L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L), Mndr = c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 15L, 15L,
15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 27L, 27L, 27L,
27L, 27L, 27L, 27L, 27L, 27L, 27L, 27L, 27L), Data = c(159L,
NA, NA, 23L, 204L, NA, NA, 12L, 256L, NA, NA, 24L, 166L, 6L,
NA, 43L, 228L, NA, NA, 20L, 196L, 11L, NA, 37L, 99L, 14L, 9L,
96L, 147L, 7L, 5L, 91L, 100L, 10L, 7L, 126L, 60L, 17L, 6L, 106L,
78L, 18L, 13L, 140L, 48L, 23L, 5L, 136L)), .Names = c("Institution",
"Type", "År", "Mndr", "Data"), class = "data.frame", row.names = c(NA,
-48L))
And a facetted plot:
# Line chart of Data against Mndr with one coloured line per Type,
# split into one panel per Institution.
ggplot(
  data = funny,
  mapping = aes(x = Mndr, y = Data, group = Type, colour = Type)
) +
  facet_grid(. ~ Institution) +
  geom_line()
Thanks in advance for your help!
The idea is that for each manual annotation you have to define not only the label, but all the variables that define the panel, color, etc. The following code adds two labels in different panels.
# your plot; every "+" has to end a line - a line that starts with "+" is
# read as a new statement, so the facet would never be added to the plot
pl <- ggplot(funny, aes(Mndr, y = Data, group = Type, col = Type)) +
  geom_line() +
  facet_grid(. ~ Institution)

# one row per label: the variables that pick the panel and colour, the
# position of the label, and the text itself
nd <- data.frame(
  Institution = c("Q-branch", "Some-Ville"), # panel
  Type = c("Unknown", "Tastes good"),        # color
  Mndr = c(7, 12),                           # x-coordinate of label
  Data = c(170, 50),                         # y-coordinate of label
  Text = c("Label 1", "Label 2")             # label text
)

# add labels to plot; show.legend = FALSE keeps the text glyphs out of the
# legend (older ggplot2 releases called this argument `legend`)
pl <- pl +
  geom_text(aes(label = Text), data = nd, hjust = 0, show.legend = FALSE)
pl
The legend=FALSE option (named show.legend = FALSE in current ggplot2 releases) ensures that the small a's denoting the text are not added to the legend. You don't need a data frame for the labels; you could use a separate geom_text for each one, but I find this way simpler.