Resetting the cumulative sum when a condition is met in R [duplicate] - r

This question already has answers here:
R Cumulative Sum with a condition and a reset
(3 answers)
Closed 3 years ago.
So I have a table that looks like this currently:
data_wrong <- data.table(
  State = c("NY", "NY", "NY", "NY", "PA", "PA", "PA", "NJ", "NJ", "NJ"),
  Year = c("1973", "1974", "1975", "2005", "1992", "1993", "2001", "1930", "1931", "1932"),
  Consecutive_Yrs = c(1, 2, 3, 1, 1, 6, 1, 1, 9, 10))
And I'd like it to look like this:
data <- data.table(
  State = c("NY", "NY", "NY", "NY", "PA", "PA", "PA", "NJ", "NJ", "NJ"),
  Year = c("1973", "1974", "1975", "2005", "1992", "1993", "2001", "1930", "1931", "1932"),
  Consecutive_Yrs = c(1, 2, 3, 1, 1, 2, 1, 1, 2, 3))
This is the code I'm using right now; the problem is that the cumsum runs over the whole column, so the count never resets after a gap in years:
data$diff <- NA
data <- data %>%
  group_by(State) %>%
  arrange(State) %>%
  mutate(diff = Year - lag(Year, default = first(Year)))
data$Consecutive_Yrs <- 1
data$Consecutive_Yrs <- ifelse(data$diff == 1, cumsum(data$Consecutive_Yrs), 1)
Any help would be greatly appreciated :)

As it is a data.table, an option is to use data.table methods:
library(data.table)
data_wrong[, grp := cumsum(c(TRUE, diff(as.numeric(Year)) > 1)),
.(State)][, Consecutive_Yrs := as.numeric(seq_len(.N)), .(State, grp)]
data_wrong
# State Year Consecutive_Yrs grp
# 1: NY 1973 1 1
# 2: NY 1974 2 1
# 3: NY 1975 3 1
# 4: NY 2005 1 2
# 5: PA 1992 1 1
# 6: PA 1993 2 1
# 7: PA 2001 1 2
# 8: NJ 1930 1 1
# 9: NJ 1931 2 1
#10: NJ 1932 3 1
Or use rowid
data_wrong[, Consecutive_Yrs2 := rowid(rleid(as.numeric(Year) -
  shift(as.numeric(Year), fill = as.numeric(Year[1])) > 1)), .(State)]
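For the dplyr route the question started from, here is a minimal sketch of the same reset logic (it assumes rows are already ordered by Year within State, as in the example data):
library(dplyr)
data_wrong %>%
  group_by(State) %>%
  # start a new group whenever the year gap exceeds 1
  mutate(grp = cumsum(c(TRUE, diff(as.numeric(Year)) > 1))) %>%
  group_by(State, grp) %>%
  # number the rows within each run of consecutive years
  mutate(Consecutive_Yrs = row_number()) %>%
  ungroup() %>%
  select(-grp)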

Related

Calculate median grouping in multiple year increments R

I'm trying to use dplyr to calculate medians grouped by 3 different columns, in 3-year increments.
My data looks like this:
data <- data.frame("Year" = c("1990","1990", "1992", "1993", "1994", "1990", "1991", "1990",
"1991", "1992", "1994", "1995"),"Type" = c("Al", "Al", "Al", "Al", "Al", "Al", "Al", "Cu",
"Cu", "Cu", "Cu", "Cu"), "Frac" = c("F", "F", "F", "F", "F", "UF", "UF", "F", "F", "UF",
"UF", "UF"), "Value" = c(0.1, 0.2, 0.3, 0.6, 0.7, 1.3, 1.5, 0.4, 0.2, 0.9, 2.3, 2.9))
I would like to calculate the median of "Value" in 3 year groupings and also grouping by "Type" and "Frac".
The problem is that sometimes there is a missing year, so I want it to group in 3-year increments based on the data that I have. To show what I mean with my example data, it would be grouped like this: (1990, 1992, 1993) for Al and F, then just (1994) for Al and F since there's no more data for Al and F, then (1990, 1991) for Al and UF since there's only 2 years' worth of data. So basically I want groups of 3 years where possible, and whatever is left over otherwise.
This is the end table I would like to have:
stats_wanted <- data.frame(
  "Year" = c("1990, 1992, 1993", "1994", "1990, 1991", "1990, 1991", "1992, 1994, 1995"),
  "Type" = c("Al", "Al", "Al", "Cu", "Cu"),
  "Frac" = c("F", "F", "UF", "F", "UF"),
  "Median" = c(0.25, 0.7, 1.4, 0.3, 2.3))
Hopefully this makes sense... let me know if you have any questions :)!
I do not know dplyr, but here is a data.table solution.
library(data.table)
setDT(data)
data = data[order(Type,Frac,Year)]
# data = data[order(Year)] also works fine
data[
  !duplicated(.SD, by = c('Year','Type','Frac')),
  yeargroup := 0:(.N-1) %/% 3,
  .(Type, Frac)]
# !duplicated... keeps only the first row per unique Year,Type,Frac
# 0:(.N-1) gives 0 to N-1 within each Type,Frac group
# %/% 3 is integer division, so each block of 3 distinct years shares a group id
> data
Year Type Frac Value yeargroup
1: 1990 Al F 0.1 0
2: 1990 Al F 0.2 NA <- NA because dupe Year,Type,Frac
3: 1992 Al F 0.3 0
4: 1993 Al F 0.6 0
5: 1994 Al F 0.7 1
6: 1990 Al UF 1.3 0
7: 1991 Al UF 1.5 0
8: 1990 Cu F 0.4 0
9: 1991 Cu F 0.2 0
10: 1992 Cu UF 0.9 0
11: 1994 Cu UF 2.3 0
12: 1995 Cu UF 2.9 0
# handle dupe Year,Type,Frac rows:
data[,yeargroup:=max(yeargroup,na.rm=T),.(Year,Type,Frac)]
> data
Year Type Frac Value yeargroup
1: 1990 Al F 0.1 0
2: 1990 Al F 0.2 0 <- fixed NA
3: 1992 Al F 0.3 0
4: 1993 Al F 0.6 0
5: 1994 Al F 0.7 1
6: 1990 Al UF 1.3 0
7: 1991 Al UF 1.5 0
8: 1990 Cu F 0.4 0
9: 1991 Cu F 0.2 0
10: 1992 Cu UF 0.9 0
11: 1994 Cu UF 2.3 0
12: 1995 Cu UF 2.9 0
stats_wanted = data[,
.(Year=paste0(unique(Year),collapse=', '),Median=median(Value)),
.(Type,Frac,yeargroup)]
> stats_wanted
Type Frac yeargroup Year Median
1: Al F 0 1990, 1992, 1993 0.25
2: Al F 1 1994 0.70
3: Al UF 0 1990, 1991 1.40
4: Cu F 0 1990, 1991 0.30
5: Cu UF 0 1992, 1994, 1995 2.30
PS: #ronak-shah posted a concise dplyr solution, which inspired me to post another data.table solution which is even more concise:
> data[
order(Year),
.(Year,Value,group=(rleid(Year)-1)%/%3),
.(Type,Frac)
][,
.(Year=paste0(unique(Year),collapse=', '),Median=median(Value)),
.(Type,Frac,group)
]
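The rleid() call is what makes this robust to duplicate years: it assigns one id per run of equal adjacent values, so repeated years share an id and each distinct year advances the counter. A quick illustration on a small hypothetical vector:
library(data.table)
rleid(c("1990", "1990", "1992", "1993", "1994"))
# [1] 1 1 2 3 4
(rleid(c("1990", "1990", "1992", "1993", "1994")) - 1) %/% 3
# [1] 0 0 0 0 1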
Here's a dplyr solution -
For each Type and Frac, we create a group column which assigns the same number to every 3 values. For each group, we concatenate the Year value and calculate the median.
library(dplyr)
data %>%
  group_by(Type, Frac) %>%
  mutate(group = match(Year, unique(Year)),
         group = ceiling(group/3)) %>%
  group_by(group, .add = TRUE) %>%
  summarise(Year = toString(unique(Year)),
            Median = median(Value), .groups = 'drop') %>%
  select(Year, Type, Frac, Median)
# Year Type Frac Median
# <chr> <chr> <chr> <dbl>
#1 1990, 1992, 1993 Al F 0.25
#2 1994 Al F 0.7
#3 1990, 1991 Al UF 1.4
#4 1990, 1991 Cu F 0.3
#5 1992, 1994, 1995 Cu UF 2.3
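Analogous to rleid() in the data.table answer, match(Year, unique(Year)) builds a dense index so duplicate years don't inflate the count; with the Al/F years:
yrs <- c("1990", "1990", "1992", "1993", "1994")
ceiling(match(yrs, unique(yrs)) / 3)
# [1] 1 1 1 1 2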

Merge two datasets in R based on column values [duplicate]

This question already has answers here:
How to join (merge) data frames (inner, outer, left, right)
(13 answers)
Closed 2 years ago.
I have two datasets I would like to merge in R: one is a long catch dataset and the other is a small effort dataset. I would like to join these so that I can multiply values for the same years AND industry together. E.g., the small effort columns will be repeated many times over, as they are industry-wide characteristics. I think this is a very simple merge but am having trouble making it work!
Catch <- data.frame(
Species = c("a", "a", "c", "c", "a", "b"),
Industry= c( "ag", "fi", "ag", "fi", "ag", "fi" ),
Year = c("1990", "1990", "1991", "1992", "1990", "1990"),
Catch = c(0,1,4,7,5,6))
Effort<-data.frame(
Industry= c( "ag", "ag", "ag" , "fi", "fi", "fi"),
Year = c("1990", "1991", "1992", "1990", "1991", "1992"),
Effort = c(0,1,4,7,5,6))
What I have tried so far:
effort_catch <- merge(Effort, Catch, by.x = Year, by.y = Year)
I am not sure which one you need. (Note that merge() expects the by.x/by.y column names as quoted strings, e.g. by = "Year", which is why your attempt errors.)
transform(
  merge(Catch, Effort, by = c("Industry", "Year"), all.x = TRUE),
  prod = Catch * Effort
)
Industry Year Species Catch Effort prod
1 ag 1990 a 0 0 0
2 ag 1990 a 5 0 0
3 ag 1991 c 4 1 4
4 fi 1990 a 1 7 7
5 fi 1990 b 6 7 42
6 fi 1992 c 7 6 42
or
transform(
  merge(Catch, Effort, by = c("Industry", "Year"), all = TRUE),
  prod = Catch * Effort
)
Industry Year Species Catch Effort prod
1 ag 1990 a 0 0 0
2 ag 1990 a 5 0 0
3 ag 1991 c 4 1 4
4 ag 1992 <NA> NA 4 NA
5 fi 1990 a 1 7 7
6 fi 1990 b 6 7 42
7 fi 1991 <NA> NA 5 NA
8 fi 1992 c 7 6 42
Here's a solution using dplyr
library(dplyr)
full_join(Catch, Effort) %>%
  mutate(Multiplied = Catch * Effort)
#> Joining, by = c("Industry", "Year")
#> Species Industry Year Catch Effort Multiplied
#> 1 a ag 1990 0 0 0
#> 2 a fi 1990 1 7 7
#> 3 c ag 1991 4 1 4
#> 4 c fi 1992 7 6 42
#> 5 a ag 1990 5 0 0
#> 6 b fi 1990 6 7 42
#> 7 <NA> ag 1992 NA 4 NA
#> 8 <NA> fi 1991 NA 5 NA
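To make the join keys explicit (and silence the Joining, by = ... message), you can pass by yourself; the result is the same:
full_join(Catch, Effort, by = c("Industry", "Year")) %>%
  mutate(Multiplied = Catch * Effort)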

Using summarise to obtain percentage of selected subgroups

I have a data frame as follows:
df <- data.frame(
  Mode = c("air", "water", "rail", "road", "air", "water", "rail", "road", "air", "water", "rail", "road"),
  Year = c("2000", "2000", "2000", "2000", "2001", "2001", "2001", "2001", "2002", "2002", "2002", "2002"),
  Country = c("USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", "Germany", "Germany", "Germany", "Germany"),
  VALUE = c(2, 3, NaN, 5, 1, NaN, 12, 19, 29, 30, 31, 32))
The objective is to get the percentage of specific/selected sub-groups (mode of transport). In this case I want, for a given year and country, the share of rail and water together, and also, in another case, the share of every subgroup except road (i.e. air + water + rail).
So in this case, for Year 2000 in USA, the share of rail and water is (3 + 0) / (2 + 3 + 5) * 100 = 30%, and the non-road share (ignoring NaN) is (2 + 3 + 0) / (2 + 3 + 5) * 100 = 50%.
My starting point is this - ignoring selected grouping, but then I am lost:
df %>%
  na.omit() %>%
  group_by(Year) %>%
  mutate(pct = (VALUE / sum(VALUE) * 100))
Any one can guide the logic/ or thinking in solving this?
I think you've got the logic. I replaced the missing values by 0 just in order to keep those rows in the df. Then group by year and country, as you would imagine. Finally, use [] to select only the relevant portion of the vector inside each sum.
library(dplyr)
library(tidyr)   # replace_na() comes from tidyr, not dplyr
df %>%
  mutate(VALUE = replace_na(VALUE, 0)) %>%
  group_by(Year, Country) %>%
  mutate(rail_water = sum(VALUE[Mode %in% c('rail','water')]) / sum(VALUE),
         non_road = sum(VALUE[!Mode %in% c('road')]) / sum(VALUE))
# A tibble: 12 x 6
# Groups: Year, Country [3]
Mode Year Country VALUE rail_water non_road
<fct> <fct> <fct> <dbl> <dbl> <dbl>
1 air 2000 USA 2 0.3 0.5
2 water 2000 USA 3 0.3 0.5
3 rail 2000 USA 0 0.3 0.5
4 road 2000 USA 5 0.3 0.5
5 air 2001 USA 1 0.375 0.406
6 water 2001 USA 0 0.375 0.406
7 rail 2001 USA 12 0.375 0.406
8 road 2001 USA 19 0.375 0.406
9 air 2002 Germany 29 0.5 0.738
10 water 2002 Germany 30 0.5 0.738
11 rail 2002 Germany 31 0.5 0.738
12 road 2002 Germany 32 0.5 0.738
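Since the title asks about summarise: if you want one row per Year and Country instead of the shares repeated on every row, the same logic works with summarise (a sketch, under the same replace-missing-with-0 assumption):
df %>%
  mutate(VALUE = replace_na(VALUE, 0)) %>%
  group_by(Year, Country) %>%
  summarise(rail_water = sum(VALUE[Mode %in% c('rail','water')]) / sum(VALUE),
            non_road = sum(VALUE[!Mode %in% c('road')]) / sum(VALUE),
            .groups = 'drop')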

R cumulative sum using dplyr with reset

I am trying to make a table that counts the number of consecutive years grouped by columns "state" and "p" that looks like this:
data_right <- data.table(
  state = c("NY", "NY", "NY", "NY", "NY", "NY", "PA", "PA", "PA", "PA", "PA", "PA"),
  p = c("n", "n", "n", "n", "p", "p", "n", "n", "n", "p", "p", "p"),
  Year = c("1973", "1974", "1977", "1978", "1988", "1989", "1991", "1992", "1993", "1920", "1929", "1931"),
  Consecutive_Yrs = c(1, 2, 1, 2, 1, 2, 1, 2, 3, 1, 1, 1))
The code I am using right now is not working properly. I'm trying mutate and group_by statements in dplyr but am having no luck. I also cannot use the data.table package because my R version is not up to date.
Any help to get this output is greatly appreciated!
library(dplyr)
data_right %>%
  group_by(state, p) %>%
  # start a new group whenever the gap between successive years exceeds 1
  mutate(grp = cumsum(c(TRUE, diff(as.integer(Year)) > 1))) %>%
  group_by(state, p, grp) %>%
  # number the rows within each run of consecutive years
  mutate(cy = row_number()) %>%
  ungroup() %>%
  select(-grp)
# # A tibble: 12 x 5
# state p Year Consecutive_Yrs cy
# <chr> <chr> <chr> <dbl> <int>
# 1 NY n 1973 1 1
# 2 NY n 1974 2 2
# 3 NY n 1977 1 1
# 4 NY n 1978 2 2
# 5 NY p 1988 1 1
# 6 NY p 1989 2 2
# 7 PA n 1991 1 1
# 8 PA n 1992 2 2
# 9 PA n 1993 3 3
# 10 PA p 1920 1 1
# 11 PA p 1929 1 1
# 12 PA p 1931 1 1
Assumes the data is already ordered by Year.
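Since the question rules out data.table, here is also a base R sketch of the same idea using ave (like the dplyr answer, it assumes rows are already ordered by Year within state and p):
# group id that increments at every gap of more than one year
data_right$grp <- with(data_right, ave(as.integer(Year), state, p,
  FUN = function(x) cumsum(c(TRUE, diff(x) > 1))))
# running count within each state/p/grp combination
data_right$cy <- with(data_right, ave(seq_along(Year), state, p, grp,
  FUN = seq_along))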

How to calculate percent differences in a table in R

I have a csv file where rows 1-5 represent one state, rows 6-10 another, etc. I also have a column with the years 1970, 1980, ..., 2010 repeated for each state. In R (although I'm not opposed to a solution in Excel if that is easier), I want, for each state, to calculate the percent difference between each year and 1970, i.e. for Alabama 1990 it would be (AL 1990 - AL 1970)/(AL 1970), and add it as a new column in the data table so I can export it to a csv.
State, Year, Num
AL, 1970, 1
AL, 1980, 2
AL, 1990, 3
AL, 2000, 4
AL, 2010, 6
Output would be a column
pct_change
0
1
2
3
5
The dplyr package includes the function first, which provides an easy way to get the first value of a group. If we arrange by Year so that 1970 is the first row of each group, then after group_by(State) we can use first(Num) to get each state's 1970 value:
# Example data with 2 states
df <- structure(list(State = c("AL", "AL", "AL", "AL", "AL", "TX",
"TX", "TX", "TX", "TX"), Year = c(1970L, 1980L, 1990L, 2000L,
2010L, 1970L, 1980L, 1990L, 2000L, 2010L), Num = c(1, 2, 3, 4,
6, 5, 2, 10, 12, 6)), class = "data.frame", row.names = c(NA,
-10L))
library(dplyr)
df %>%
  arrange(State, Year) %>%
  group_by(State) %>%
  mutate(perc_diff = 100 * (Num - first(Num)) / first(Num))
# A tibble: 10 x 4
# Groups: State [2]
State Year Num perc_diff
<chr> <int> <dbl> <dbl>
1 AL 1970 1 0
2 AL 1980 2 100
3 AL 1990 3 200
4 AL 2000 4 300
5 AL 2010 6 500
6 TX 1970 5 0
7 TX 1980 2 -60
8 TX 1990 10 100
9 TX 2000 12 140
10 TX 2010 6 20
We can use data.table. Convert the 'data.frame' to a 'data.table' (setDT(df)), order by 'State' and 'Year' in the i, and then, grouped by 'State', compute the percent difference of 'Num' from its first value and assign it (:=) to create 'perc_diff':
library(data.table)
setDT(df)[order(State, Year), perc_diff :=
100 * (Num - first(Num))/first(Num), State][]
# State Year Num perc_diff
# 1: AL 1970 1 0
# 2: AL 1980 2 100
# 3: AL 1990 3 200
# 4: AL 2000 4 300
# 5: AL 2010 6 500
# 6: TX 1970 5 0
# 7: TX 1980 2 -60
# 8: TX 1990 10 100
# 9: TX 2000 12 140
#10: TX 2010 6 20
Or using base R
v1 <- with(df, ave(Num, State, FUN = function(x) x[1]))
df$perc_diff <- with(df, 100 * (Num - v1)/v1)
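As a quick illustration of the ave() step: it returns a vector the same length as its input, with FUN applied within each State, so here it repeats each state's first (1970) Num:
with(df, ave(Num, State, FUN = function(x) x[1]))
# [1] 1 1 1 1 1 5 5 5 5 5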
Base R solution using tapply. Note this relies on df being sorted by State first, since tapply returns the groups in factor-level (alphabetical) order:
df <- df[with(df, order(State, Year)), ]
df$pct_change <- unlist(tapply(df$Num, df$State, function(x) 100 * (x - x[1]) / x[1]))
> df
State Year Num pct_change
1 AL 1970 1 0
2 AL 1980 2 100
3 AL 1990 3 200
4 AL 2000 4 300
5 AL 2010 6 500
6 TX 1970 5 0
7 TX 1980 2 -60
8 TX 1990 10 100
9 TX 2000 12 140
10 TX 2010 6 20
