Sum variables using R by categories condition - r

I have a data frame that shows the number of publications by year. But I am interested just in Conference and Journals Publications. I would like to sum all other categories in Others type.
Examples of data frame:
year type n
1994 Conference 2
1994 Journal 3
1995 Conference 10
1995 Editorship 3
1996 Conference 20
1996 Editorship 2
1996 Books and Thesis 3
And the result would be:
year type n
1994 Conference 2
1994 Journal 3
1995 Conference 10
1995 Other 3
1996 Conference 20
1996 Other 5

With dplyr we can replace every type other than "Journal" or "Conference" with "Other" and then sum n by year and type.
library(dplyr)
# Keep "Journal" and "Conference" as they are and collapse every other
# publication type into "Other", then total the counts per year and type.
# as.character() lets this work whether `type` is a factor or a character
# column.
df %>%
  mutate(type = if_else(type %in% c("Journal", "Conference"),
                        as.character(type), "Other")) %>%
  group_by(year, type) %>%
  summarise(n = sum(n))
# year type n
# <int> <chr> <int>
#1 1994 Conference 2
#2 1994 Journal 3
#3 1995 Conference 10
#4 1995 Other 3
#5 1996 Conference 20
#6 1996 Other 5

We can use data.table
library(data.table)
library(stringr)
# Recode every type except "Journal"/"Conference" to "Other" inside the
# grouping expression, then sum n per (year, type). The negative lookahead
# makes the pattern match only strings that are NOT exactly one of the two
# kept types; as.character() covers a factor `type` column.
setDT(df1)[, .(n = sum(n)),
           .(year, type = str_replace(as.character(type),
                                      '^(?!(Journal|Conference)$).*', 'Other'))]
# year type n
#1: 1994 Conference 2
#2: 1994 Journal 3
#3: 1995 Conference 10
#4: 1995 Other 3
#5: 1996 Conference 20
#6: 1996 Other 5

# Collapse every level except the two of interest into "Other". Selecting by
# "not Conference/Journal" (instead of listing the unwanted names) also
# catches categories that are spelled differently or were not anticipated.
# as.factor() is a no-op for a factor and makes the levels<- step work when
# `type` was read in as character.
df$type <- as.factor(df$type)
levels(df$type)[!levels(df$type) %in% c("Conference", "Journal")] <- "Other"
# Sum n for each type within each year.
aggregate(n ~ type + year, data = df, FUN = sum)
# type year n
# 1 Conference 1994 2
# 2 Journal 1994 3
# 3 Other 1995 3
# 4 Conference 1995 10
# 5 Other 1996 5
# 6 Conference 1996 20
Input data:
# Seven-row example data frame: integer year, publication type (a factor with
# levels "Other", "Conference", "Journal") and integer count n.
df <- structure(list(year = c(1994L, 1994L, 1995L, 1995L, 1996L, 1996L,
1996L), type = structure(c(2L, 3L, 2L, 1L, 2L, 1L, 1L), .Label = c("Other",
"Conference", "Journal"), class = "factor"), n = c(2L, 3L, 10L,
3L, 20L, 2L, 3L)), .Names = c("year", "type", "n"), row.names = c(NA, -7L), class = "data.frame")

Related

Repeatedly compare same portion of dataset to other portions of dataset based on index value in R

I have a dataframe that looks like the following:
state year value
1 1980 4
1 1981 5
1 1982 4
2 1980 2
2 1981 3
2 1982 4
100 1980 3
100 1981 2
100 1982 5
In the actual dataset, there are more states than are shown here. I would like to make a comparison between state 100 and all other states.
Specifically, for each state, I would like to find the difference between the value given by that state for a particular year and the value given for state 100 for that same year. Below, I have shown how I could compare the value for year 1980 between state 1 and state 100.
df_1 <- df %>% filter(state == 1)
df_100 <- df %>% filter(state == 100)
df_1_1980 <- df_1 %>% filter(year == 1980)
df_100_1980 <- df_100 %>% filter(year == 1980)
difference <- df_1_1980$value - df_100_1980$value
How could I do this for all the other states and years in the dataframe?
One possibility I have considered is making a dataframe composed only of the data from state 100 and then connecting it to the original dataframe, like this:
state year value state100 year100 value100
1 1980 4 100 1980 3
1 1981 5 100 1981 2
1 1982 4 100 1982 5
2 1980 2 100 1980 3
2 1981 3 100 1981 2
2 1982 4 100 1982 5
I could then subtract df$value from df$value100 for each row. I assume there is a better way of doing this.
We can filter the 'state' that is not equal to 100, left_join with the dataset with 'state' 100, by 'year' and get the difference between the 'value' columns
library(dplyr)
# For every state other than 100, look up state 100's value for the same
# year and report how far the state's own value is from it.
df %>%
  filter(state != 100) %>%
  left_join(
    df %>% filter(state == 100) %>% select(-state),
    by = "year",
    # Keep the state's own columns unsuffixed; tag the reference columns.
    suffix = c("", "_100")
  ) %>%
  mutate(difference = value - value_100) %>%
  select(state, year, value, difference)
# state year value difference
#1 1 1980 4 1
#2 1 1981 5 3
#3 1 1982 4 -1
#4 2 1980 2 -1
#5 2 1981 3 1
#6 2 1982 4 -1
data
# Nine-row example data frame: states 1, 2 and 100, each with an integer
# value for the years 1980-1982.
df <- structure(list(state = c(1L, 1L, 1L, 2L, 2L, 2L, 100L, 100L,
100L), year = c(1980L, 1981L, 1982L, 1980L, 1981L, 1982L, 1980L,
1981L, 1982L), value = c(4L, 5L, 4L, 2L, 3L, 4L, 3L, 2L, 5L)),
class = "data.frame", row.names = c(NA,
-9L))

Transforming multiple columns structure using Dplyr in R

I have a dataset, df,
State Year 0 1 2 3 4 5
Georgia 2001 10,000 200 300 400 500 800
Georgia 2002 20,000 500 500 1,000 2,000 2,500
Georgia 2003 2,000 5,000 1,000 400 300 8,000
Washington 2001 1,000 10,000 6,000 8,000 9,900 10,000
Washington 2006 5,000 300 200 900 1,000 8,000
I would like my desired output to look like this:
State Year Age Population
Georgia 2001 0 10,000
Georgia 2002 0 20,000
Georgia 2003 0 2,000
Georgia 2001 1 200
Georgia 2002 1 500
Georgia 2003 1 5000
Georgia 2001 2 300
Georgia 2002 2 500
Georgia 2003 2 1000
Georgia 2001 3 400
Georgia 2002 3 1000
Georgia 2003 3 400
Georgia 2001 4 500
Georgia 2002 4 2000
Georgia 2003 4 300
Georgia 2001 5 800
Georgia 2002 5 2500
Georgia 2003 5 8000
Washington 2001 0 1000
Washington 2006 0 5000
Washington 2001 1 10000
Washington 2006 1 300
Washington 2001 2 6000
Washington 2006 2 200
Washington 2001 3 8000
Washington 2006 3 900
Washington 2001 4 9900
Washington 2006 4 1000
Washington 2001 5 10000
Washington 2006 5 8200
Here is my dput
structure(list(state = structure(c(1L, 1L, 1L, 2L, 2L), .Label = c("georgia",
"washington"), class = "factor"), year = c(2001L, 2002L, 2003L,
2001L, 2006L), X0 = structure(c(1L, 3L, 4L, 2L, 5L), .Label = c("10,000",
"1000", "20,000", "2000", "5000"), class = "factor"), X1 = structure(c(2L,
4L, 5L, 1L, 3L), .Label = c("10,000", "200", "300", "500", "5000"
), class = "factor"), X2 = c(300L, 500L, 1000L, 6000L, 200L),
X3 = c(400L, 1000L, 400L, 8000L, 900L), X4 = c(500L, 2000L,
300L, 99000L, 1000L), X5 = structure(c(3L, 2L, 4L, 1L, 4L
), .Label = c("10,000", "2500", "800", "8000"), class = "factor")), class = "data.frame", row.names
=
c(NA,
-5L))
This is what I have tried:
I know that I must groupby the state and the year as well as perform some type of pivot by possibly utilizing the gather() function
library(tidyr)
library(dplyr)
df1 <- gather(df, 0, 1, 2, 3, 4, 5 factor_key=TRUE)
df %>% groupby(State, Year) %>%
mutate('Age', 'Population')
We can first convert the column type to numeric by extracting the numeric part and then do the reshape
library(dplyr)
library(tidyr)
df %>%
  # The age columns (names ending in a digit) are partly text with thousands
  # separators such as "10,000"; parse all of them to plain numbers first.
  mutate_at(vars(matches('\\d+$')), ~readr::parse_number(as.character(.))) %>%
  # Long format: one row per state/year/age. names_prefix strips the leading
  # "X" from the column names so Age holds 0..5 rather than "X0".."X5".
  pivot_longer(cols = -c(state, year), names_to = "Age",
               values_to = "Population", names_prefix = "X") %>%
  mutate(Age = as.integer(Age)) %>%
  # Same row order as the requested result: by state, then age, then year.
  arrange(state, Age, year)

How do I combine dates, regardless of a third variable in R?

The following is a data example,
Month Year Tornado Location
January 1998 3 Illinois
February 1998 2 Illinois
March 1998 5 Illinois
January 1998 1 Florida
January 2010 3 Illinois
Here is what I want it to look like essentially,
Date Tornado
1998-01 4
1998-02 2
1998-03 5
2010-01 3
So, I want to combine the Year and Month into one, new column. The locations do not matter, I want to know the total number of tornadoes for January, 1998, and etc.
I have the following code, but do not know how to change it to incorporate both the variables I want, or if this is even the correct code for what I am attempting to do.
mydata$Date <- format(as.Date(mydata$month), "%m-%Y")
The real dataset is far too large to fix manually. I am basically attempting to make this data into time series data.
You need to transform the data first and then apply the approach from "How to sum a variable by group":
# Add a "YYYY-MM" key built from Month and Year (the day is fixed to the 1st
# only so the string can be parsed as a date), then total Tornado per key.
aggregate(
  Tornado ~ Date,
  data = within(df, {
    Date <- format(as.Date(paste("01", Month, Year), "%d %B %Y"), "%Y-%m")
  }),
  FUN = sum
)
# Date Tornado
#1 1998-01 4
#2 1998-02 2
#3 1998-03 5
#4 2010-01 3
data
# Five-row example data frame: Month and Location are factors, Year and
# Tornado are integers.
df <- structure(list(Month = structure(c(2L, 1L, 3L, 2L, 2L),
.Label = c("February", "January", "March"), class = "factor"),
Year = c(1998L, 1998L,1998L, 1998L, 2010L),
Tornado = c(3L, 2L, 5L, 1L, 3L), Location = structure(c(2L,
2L, 2L, 1L, 2L), .Label = c("Florida", "Illinois"), class = "factor")),
class = "data.frame", row.names = c(NA, -5L))
In the first place, I combined Month and Year into a single variable called Date, applied the appropriate format with zoo package, and grouped the results by Date.
library(tidyverse)
library(zoo)
df %>%
  # Join month name and year into one "Month_Year" string and parse it as a
  # zoo year-month value.
  mutate(Date = as.yearmon(paste(Month, Year, sep = "_"), format = "%B_%Y")) %>%
  # Total tornadoes per year-month, regardless of location.
  group_by(Date) %>%
  summarise(Tornado = sum(Tornado))
# A tibble: 4 x 2
Date Tornado
<yearmon> <int>
1 Jan 1998 4
2 Feb 1998 2
3 Mar 1998 5
4 Jan 2010 3
If the day doesn't matter, you can do:
#library (tidyverse)
library(lubridate)
# Add a Date column by pasting Year, Month and a fixed day "-01" together and
# converting the result with as_date().
# NOTE(review): the pasted string looks like "1998January-01" (no separator
# between year and month name) - confirm as_date() parses this form.
# NOTE(review): `x` presumably holds the question's data; verify before use.
x$Date<-as_date(paste0(x$Year,x$Month,"-01"))
# A tibble: 5 x 4
Month Year Tornados Date
<chr> <dbl> <dbl> <date>
1 January 1998 3 1998-01-01
2 February 1998 2 1998-02-01
3 March 1998 5 1998-03-01
4 January 1998 1 1998-01-01
5 January 2010 3 2010-01-01

Proper way to split data frames at multiple levels using ddply [duplicate]

This question already has answers here:
Efficient method to filter and add based on certain conditions (3 conditions in this case)
(3 answers)
Closed 6 years ago.
Let's say I have a data frame like the following:
year stint ID W
1 2003 1 abc 10
2 2003 2 abc 3
3 2003 1 def 16
4 2004 1 abc 15
5 2004 1 def 11
6 2004 2 def 7
I would like to combine the data so that it looks like
year ID W
1 2003 abc 13
3 2003 def 16
4 2004 abc 15
5 2004 def 18
I found a way to combine the data as desired, but I'm very sure that there's a better way.
combinedData = unique(ddply(data, "ID", function(x) {
ddply(x, "year", function(y) {
data.frame(ID=x$ID, W=sum(y$W))
})
}))
combinedData[order(combinedData$year),]
This produces the following output:
year ID W
1 2003 abc 13
7 2003 def 16
4 2004 abc 15
10 2004 def 18
Specifically I don't like that I had to use unique (otherwise I get each unique combo of year,ID,W three times in the outputted data), and I don't like that the row numbers aren't sequential. How can I do this more cleanly?
Do this with base R:
# Sum W over all stints for each year/ID combination; stint is simply not
# part of the formula, so it is dropped from the result.
aggregate(W ~ year + ID, data = df, FUN = sum)
# year ID W
#1 2003 abc 13
#2 2004 abc 15
#3 2003 def 16
#4 2004 def 18
data
# Six-row example data frame: integer year, stint and W, plus ID as a factor
# with levels "abc" and "def".
df <- structure(list(year = c(2003L, 2003L, 2003L, 2004L, 2004L, 2004L
), stint = c(1L, 2L, 1L, 1L, 1L, 2L), ID = structure(c(1L, 1L,
2L, 1L, 2L, 2L), .Label = c("abc", "def"), class = "factor"),
W = c(10L, 3L, 16L, 15L, 11L, 7L)), .Names = c("year", "stint",
"ID", "W"), class = "data.frame", row.names = c("1", "2", "3",
"4", "5", "6"))

Merging cases into one in R

I have a very newbie question. I'm using the Aid Worker Security Database, which records episodes of violence against aid workers, with incident reports from 1997 through the present. The events are marked independently in the dataset. I would like to merge all events that happened in a single country in a given year, sum the values of the other variables and create a simple time series with the same number of years for all countries (1997-2013). Any idea how to do it?
df
# year country totalnationals internationalskilled
# 1 1997 Rwanda 0 3
# 2 1997 Cambodia 1 0
# 3 1997 Somalia 0 1
# 4 1997 Rwanda 1 0
# 5 1997 DR Congo 10 0
# 6 1997 Somalia 1 0
# 7 1997 Rwanda 1 0
# 8 1998 Angola 5 0
Where "df" is defined as:
df <- structure(list(year = c(1997L, 1997L, 1997L, 1997L, 1997L, 1997L,
1997L, 1998L), country = c("Rwanda", "Cambodia", "Somalia", "Rwanda",
"DR Congo", "Somalia", "Rwanda", "Angola"), totalnationals = c(0L,
1L, 0L, 1L, 10L, 1L, 1L, 5L), internationalskilled = c(3L, 0L,
1L, 0L, 0L, 0L, 0L, 0L)), .Names = c("year", "country", "totalnationals",
"internationalskilled"), class = "data.frame", row.names = c(NA, -8L))
I would like to have something like that:
# year country totalnationals internationalskilled
# 1 1997 Rwanda 2 3
# 2 1997 Cambodia 1 0
# 3 1997 Somalia 1 1
# 4 1997 DR Congo 10 0
# 5 1997 Angola 0 0
# 6 1998 Rwanda 0 0
# 7 1998 Cambodia 0 0
# 8 1998 Somalia 0 0
# 9 1998 DR Congo 0 0
# 10 1998 Angola 5 0
Sorry for the very, very newbie question... but so far I couldn't figure out how to do it. Thanks! :-)
Updated after OP's comments -
# Keep only incidents inside the 1997-2013 study window.
df <- subset(df, year <= 2013 & year >= 1997)
# Make sure both count columns are integers before summing.
df$totalnationals <- as.integer(df$totalnationals)
df$internationalskilled <- as.integer(df$internationalskilled)
# One row per (year, country) with both counts summed.
df2 <- aggregate(
  cbind(totalnationals, internationalskilled) ~ year + country,
  data = df,
  FUN = sum
)
# To add 0s for years without a record: build the full year x country grid
# over the whole 1997-2013 window (not only the years that happen to occur in
# the data), so every country gets the same number of years. The grid columns
# are named so the merged result keeps "year" and "country" as column names.
df3 <- expand.grid(
  year = 1997:2013,
  country = unique(df$country),
  stringsAsFactors = FALSE
)
# Left join the sums onto the grid; combinations without incidents become NA
# and are then set to 0.
df3 <- merge(df3, df2, by = c("year", "country"), all.x = TRUE)
df3[is.na(df3)] <- 0
Same thing with data tables (can be faster on large datasets).
library(data.table)
# Keyed copy of the incidents, ordered by year and then country.
dt <- data.table(df, key = c("year", "country"))
# Yearly totals per country.
smry <- dt[, .(totalnationals = sum(totalnationals),
               internationalskilled = sum(internationalskilled)),
           by = c("year", "country")]
# Every year from 1997 to 2013 crossed with every country seen in the data;
# CJ() returns the combinations sorted and keyed on both columns.
countries <- unique(dt$country)
template <- CJ(year = 1997:2013, country = countries)
# Look each grid row up in the totals; year/country pairs without incidents
# come back as NA and are replaced by 0.
time.series <- smry[template]
time.series[is.na(time.series)] <- 0

Resources