I'd like to expand observations from single row-per-id to multiple rows-per-id based on a given time interval:
> dput(df)
structure(list(id = c(123, 456, 789), gender = c(0, 1, 1), yr.start = c(2005,
2010, 2000), yr.last = c(2007, 2012, 2000)), .Names = c("id",
"gender", "yr.start", "yr.last"), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -3L))
> df
# A tibble: 3 x 4
id gender yr.start yr.last
<dbl> <dbl> <dbl> <dbl>
1 123 0 2005 2007
2 456 1 2010 2012
3 789 1 2000 2000
I want to get id expanded into one row per year:
> dput(df_out)
structure(list(id = c(123, 123, 123, 456, 456, 456, 789), gender = c(0,
0, 0, 1, 1, 1, 1), yr = c(2005, 2006, 2007, 2010, 2011, 2012,
2000)), .Names = c("id", "gender", "yr"), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -7L))
> df_out
# A tibble: 7 x 3
id gender yr
<dbl> <dbl> <dbl>
1 123 0 2005
2 123 0 2006
3 123 0 2007
4 456 1 2010
5 456 1 2011
6 456 1 2012
7 789 1 2000
I know how to melt/reshape, but I'm not sure how I can expand the years.
Thanks.
Here is a base R method.
# expand years to a list
# mapply(":", ...) builds, for each row, the integer sequence yr.start:yr.last;
# the sequences have different lengths, so the result is a list with one
# vector of years per id.
yearList <- mapply(":", df$yr.start, df$yr.last)
Now, use this list to calculate the number of rows to repeat for each ID (the second argument of rep) and then append it as a vector (transformed from list with unlist) using cbind.
# get data.frame
# repeat each id/gender row once per year in its sequence (lengths() gives
# the per-id year counts), then attach the flattened years as column `yr`
cbind(df[rep(seq_along(df$id), lengths(yearList)), c("id", "gender")], yr=unlist(yearList))
id gender yr
1 123 0 2005
1.1 123 0 2006
1.2 123 0 2007
2 456 1 2010
2.1 456 1 2011
2.2 456 1 2012
3 789 1 2000
You could gather into long format and then fill in the missing rows via complete using tidyr.
library(dplyr)
library(tidyr)
# Reshape yr.start/yr.last into one long `yr` column, then fill in the
# intermediate years per id/gender group with complete() + full_seq().
# NOTE(review): gather() is superseded; pivot_longer() is the modern form.
df %>%
gather(group, yr, starts_with("yr") ) %>%
group_by(id, gender) %>%
complete(yr = full_seq(yr, period = 1) )
You can use select to get rid of the extra column.
# Same pipeline, dropping the helper `group` column before completing.
# NOTE(review): when yr.start == yr.last (id 789), gather() emits the same
# year twice and complete() keeps both copies -- see the duplicated last
# row in the printed result; adding distinct() would remove it.
df %>%
gather(group, yr, starts_with("yr") ) %>%
select(-group) %>%
group_by(id, gender) %>%
complete(yr = full_seq(yr, period = 1) )
# A tibble: 8 x 3
# Groups: id, gender [3]
id gender yr
<dbl> <dbl> <dbl>
1 123 0 2005
2 123 0 2006
3 123 0 2007
4 456 1 2010
5 456 1 2011
6 456 1 2012
7 789 1 2000
8 789 1 2000
Here is a tidyverse solution
library(tidyverse)
# Nest the year columns per id/gender, replace each nested data frame with
# the full integer sequence of years, then unnest back to one row per year.
# NOTE(review): unnest() without a `cols =` argument is deprecated in
# current tidyr; seq() returns integers, hence the <int> year column.
df %>%
group_by(id, gender) %>%
nest() %>%
mutate(data = map(data, ~ seq(.x$yr.start, .x$yr.last))) %>%
unnest() %>%
rename(year = data)
# A tibble: 7 x 3
id gender year
<dbl> <dbl> <int>
1 123 0 2005
2 123 0 2006
3 123 0 2007
4 456 1 2010
5 456 1 2011
6 456 1 2012
7 789 1 2000
As the OP mentions that his production data set has more than 1 M rows and he is benchmarking the different solutions, it might be worthwhile to try a data.table version:
library(data.table) # CRAN version 1.10.4 used
# For each id/gender group, expand yr.start:yr.last into one row per year.
# Assumes id is unique in the input (see note below).
data.table(DF)[, .(yr = yr.start:yr.last), by = .(id, gender)]
which returns
id gender yr
1: 123 0 2005
2: 123 0 2006
3: 123 0 2007
4: 456 1 2010
5: 456 1 2011
6: 456 1 2012
7: 789 1 2000
If there are more non-varying columns than just gender it might be more efficient to do a join rather than including all those columns in the grouping parameter by =:
data.table(DF)[DF[, .(yr = yr.start:yr.last), by = id], on = "id"]
id gender yr.start yr.last yr
1: 123 0 2005 2007 2005
2: 123 0 2005 2007 2006
3: 123 0 2005 2007 2007
4: 456 1 2010 2012 2010
5: 456 1 2010 2012 2011
6: 456 1 2010 2012 2012
7: 789 1 2000 2000 2000
Note that both approaches assume that id is unique in the input data.
Benchmarking
The OP has noted that he is surprised that above data.table solution is five times slower than lmo's base R solution, apparently with OP's production data set of more than 1 M rows.
Also, the question has attracted 5 different answers plus additional suggestions. So, it's worthwhile to compare the solutions in terms of processing speed.
Data
As the production data set isn't available, and problem size among other factors like the structure of the data is important for benchmarking, sample data sets are created.
# parameters
# n_rows: number of ids; yr_range: maximum span of years per id;
# start_yr: pool of possible first years (2000..2009)
n_rows <- 1E2
yr_range <- 10L
start_yr <- seq(2000L, length.out = 10L, by = 1L)
# create sample data set
# fixed seed so the benchmark data are reproducible
set.seed(123L)
library(data.table)
DT <- data.table(id = seq_len(n_rows),
gender = sample(0:1, n_rows, replace = TRUE),
yr.start = sample(start_yr, n_rows, replace = TRUE))
# yr.last is yr.start plus a random span of 0..yr_range years
DT[, yr.last := yr.start + sample(0:yr_range, n_rows, replace = TRUE)]
# plain data.frame copy for the non-data.table solutions
DF <- as.data.frame(DT)
str(DT)
Classes ‘data.table’ and 'data.frame': 100 obs. of 4 variables:
$ id : int 1 2 3 4 5 6 7 8 9 10 ...
$ gender : int 0 1 0 1 1 0 1 1 1 0 ...
$ yr.start: int 2005 2003 2004 2009 2004 2008 2009 2006 2004 2001 ...
$ yr.last : int 2007 2013 2010 2014 2008 2017 2013 2009 2005 2002 ...
- attr(*, ".internal.selfref")=<externalptr>
For the first run, 100 rows are created, the start year can vary between 2000 and 2009, and the span of years an individual id can cover is between 0 and 10 years. Thus, the result set should be expected to have approximately 100 * (10 + 1) / 2 rows.
Also, only one additional column gender is included although the OP has said that the production data may have 2 to 10 non-varying columns.
Code
library(magrittr)
# Benchmark all answers on the same DF/DT data. Each candidate stores its
# result in res_* so the outputs can be compared for equality afterwards.
bm <- microbenchmark::microbenchmark(
# base R: expand via mapply + rep/unlist
lmo = {
yearList <- mapply(":", DF$yr.start, DF$yr.last)
res_lmo <- cbind(DF[rep(seq_along(DF$id), lengths(yearList)), c("id", "gender")],
yr=unlist(yearList))
},
# tidyverse: nest / map / unnest
hao = {
res_hao <- DF %>%
dplyr::group_by(id, gender) %>%
tidyr::nest() %>%
dplyr::mutate(data = purrr::map(data, ~ seq(.x$yr.start, .x$yr.last))) %>%
tidyr::unnest() %>%
dplyr::rename(yr = data)
},
# tidyr: gather + complete/full_seq
aosmith = {
res_aosmith <- DF %>%
tidyr::gather(group, yr, dplyr::starts_with("yr") ) %>%
dplyr::select(-group) %>%
dplyr::group_by(id, gender) %>%
tidyr::complete(yr = tidyr::full_seq(yr, period = 1) )
},
# dplyr: per-group do()
jason = {
res_jason <- DF %>%
dplyr::group_by(id, gender) %>%
dplyr::do(data.frame(yr=.$yr.start:.$yr.last))
},
# data.table: grouped range expansion
uwe1 = {
res_uwe1 <- DT[, .(yr = yr.start:yr.last), by = .(id, gender)]
},
# data.table: expand by id, then self-join and drop the range columns
uwe2 = {
res_uwe2 <- DT[DT[, .(yr = yr.start:yr.last), by = id], on = "id"
][, c("yr.start", "yr.last") := NULL]
},
# data.table: row replication + flattened year vector
frank1 = {
res_frank1 <- DT[rep(1:.N, yr.last - yr.start + 1L),
.(id, gender, yr = DT[, unlist(mapply(":", yr.start, yr.last))])]
},
# data.table: mapply inside j with .SD replication
frank2 = {
res_frank2 <- DT[, {
m = mapply(":", yr.start, yr.last); c(.SD[rep(.I, lengths(m))], .(yr = unlist(m)))},
.SDcols=id:gender]
},
times = 3L
)
Note that references to tidyverse functions are explicit in order to avoid name conflicts due to a cluttered name space.
First run
Unit: microseconds
expr min lq mean median uq max neval
lmo 655.860 692.6740 968.749 729.488 1125.193 1520.899 3
hao 40610.776 41484.1220 41950.184 42357.468 42619.887 42882.307 3
aosmith 319715.984 336006.9255 371176.437 352297.867 396906.664 441515.461 3
jason 77525.784 78197.8795 78697.798 78869.975 79283.804 79697.634 3
uwe1 834.079 870.1375 894.869 906.196 925.264 944.332 3
uwe2 1796.910 1810.8810 1880.482 1824.852 1922.268 2019.684 3
frank1 981.712 1057.4170 1086.680 1133.122 1139.164 1145.205 3
frank2 994.172 1003.6115 1081.016 1013.051 1124.438 1235.825 3
For the given problem size of 100 rows, the timings clearly indicate that the dplyr/ tidyr solutions are magnitudes slower than base R or data.table solutions.
The results are essentially consistent:
# verify all candidates produce the same rows (lmo's data.frame result is
# coerced to data.table first so the classes are comparable)
all.equal(as.data.table(res_lmo), res_uwe1)
all.equal(res_hao, res_uwe1)
all.equal(res_jason, res_uwe1)
all.equal(res_uwe2, res_uwe1)
all.equal(res_frank1, res_uwe1)
all.equal(res_frank2, res_uwe1)
return TRUE except all.equal(res_aosmith, res_uwe1) which returns
[1] "Incompatible type for column yr: x numeric, y integer"
Second run
Due to the long execution times, the tidyverse solutions are skipped when benchmarking larger problem sizes.
With the modified parameters
# larger problem: 10,000 ids spanning up to 100 years each
# (expected result size roughly n_rows * (yr_range + 1) / 2 rows)
n_rows <- 1E4
yr_range <- 100L
the result set is expected to consist of about 500'000 rows.
Unit: milliseconds
expr min lq mean median uq max neval
lmo 425.026101 447.716671 455.85324 470.40724 471.26681 472.12637 3
uwe1 9.555455 9.796163 10.05562 10.03687 10.30571 10.57455 3
uwe2 18.711805 18.992726 19.40454 19.27365 19.75091 20.22817 3
frank1 22.639031 23.129131 23.58424 23.61923 24.05685 24.49447 3
frank2 13.989016 14.124945 14.47987 14.26088 14.72530 15.18973 3
For the given problem size and structure the data.table solutions are the fastest while the base R approach is a magnitude slower. The most concise solution uwe1 is also the fastest, here.
Note that the results depend on the structure of the data, in particular the parameters n_rows and yr_range and the number of non-varying columns. If there are more of those columns than just gender the timings might look differently.
The benchmark results are in contradiction to the OP's observation on execution speed which needs to be further investigated.
Another way using do in dplyr, but it's slower than the base R method.
# do() builds one data.frame of years per id/gender group; `.` is the
# current group. NOTE(review): do() is superseded in current dplyr;
# reframe() is the modern equivalent.
df %>%
group_by(id, gender) %>%
do(data.frame(yr=.$yr.start:.$yr.last))
# # A tibble: 7 x 3
# # Groups: id, gender [3]
# id gender yr
# <dbl> <dbl> <int>
# 1 123 0 2005
# 2 123 0 2006
# 3 123 0 2007
# 4 456 1 2010
# 5 456 1 2011
# 6 456 1 2012
# 7 789 1 2000
Related
Hi I have two data frames (df1 and df2) with two shared variables (ID and Yr). I want to update the values in a third variable (value) in df1 with the new data in the respective value in df2. But below code does not update the value in df1, it seems the values are not passed to the corresponding cels in df1.
# example data: df1 holds the current values, df2 the replacements keyed by
# ID and Yr (note df2's payload column is named `valuenew`)
df1 = data.frame(ID = c("a","b","c","d","e") ,
Yr = c(2000,2001,2002,2003,2004),
value= c(100,100,100,100, 100))
df2 = data.frame(ID = c("a","b","c") ,
Yr = c(2000,2001,2002),
valuenew= c(200,150,120))
# row-wise update: for each row of df2, find the df1 row with the same
# ID and Yr and overwrite its value.
# NOTE(review): under R < 4.0 the character columns become factors with
# different level sets in df1/df2, which can break the == comparison --
# worth checking the column classes if this loop appears to do nothing.
for (i in 1:nrow(df2)){
id <- df2[i,'ID']
year <- df2[i, 'Yr']
valuenew<- df2[i, 'valuenew']
df1[which (df1$ID == id & df1$Yr == year), 'value'] <- valuenew
}
the desired result
ID Yr value
a 2000 200
b 2001 150
c 2002 120
d 2003 100
e 2004 100
The real data I use with which none of these solutions works
df1
head(df1, 5)
CoreID Yr FluxTot
1 Asmund2000_Greenland coast_4001 1987 0.3239693
2 Asmund2000_Greenland coast_4001 1986 0.2864100
3 Asmund2000_Greenland coast_4001 1985 0.2488508
4 Asmund2000_Greenland coast_4001 1984 0.2964794
5 Asmund2000_Greenland coast_4001 1983 0.3441080
df2
head(df2, 5)
CoreID Yr GamfitHgdep
1 Beal2015_Mount Logan 2000 0.01105077
2 Eyrikh2017_Belukha glacier 2000 0.02632597
3 Zheng2014_Mt. Oxford 2000 0.01377599
4 Zheng2014_Agassiz 2000 0.01940151
5 Zheng2014_NEEM-2010-S3 2000 -0.01483026
#merged database
m<-merge(df1, df2)
head(m,5)
CoreID Yr FluxTot GamfitHgdep
1 Beal2014_Yanacocha 2000 0.003365556 0.024941373
2 Beal2014_Yanacocha 2001 0.003423333 0.027831253
3 Beal2014_Yanacocha 2002 0.003481111 -0.002908330
4 Beal2014_Yanacocha 2003 0.003538889 -0.004591100
5 Beal2014_Yanacocha 2004 0.003596667 0.005189858
Below is the exact code I used to do the trick but failed. No difference if the value assigning part is replaced with any other solutions. No warning, no error raised.
library(readxl)
library(dplyr)
metal = 'Hg'
# read the raw core data and keep only rows for the chosen metal
df = read_excel('All core data.xlsx','Sheet1')
df = data.frame(df)
df1 <- df[which (df$Metal==metal),]
# renumber rows 1..n after the subset
rownames(df1) = seq(length=nrow(df1))
head(df1, 5)
# read the GAM predictions to be patched into df1
dfgam = read_excel('GAM prediction.xlsx','Sheet1')
df2 <- data.frame(dfgam)
head(df2, 5)
# row-wise update: overwrite FluxTot where CoreID and Yr both match.
# NOTE(review): if nothing changes, CoreID/Yr likely differ in type or
# whitespace between the two sheets -- compare class() and trimws() of
# both key columns before the loop.
for (i in 1:nrow(df2)){
coreid <- df2[i,'CoreID']
year <- df2[i, 'Yr']
predicted<- df2[i, 'GamfitHgdep']
df1[which (df1$CoreID == coreid & df1$Yr == year), 'FluxTot'] <- predicted
}
after running the code, the values in df1 have not changed, for instance
the value should be 0.024941373 as shown in head(m,5)
Since dplyr version 1.0.0, you can use rows_update for this:
# Available since dplyr 1.0.0: rows_update() overwrites the matching rows
# of df1 with the values from df2, matched on the key columns in `by`.
# df2's payload column must carry the same name as in df1, hence the rename.
# FIX: qualify rename() with dplyr:: too -- the bare call only works if
# dplyr happens to be attached, unlike the qualified rows_update() call.
dplyr::rows_update(
  df1,
  dplyr::rename(df2, value = valuenew),
  by = c("ID", "Yr")
)
# ID Yr value
# 1 a 2000 200
# 2 b 2001 150
# 3 c 2002 120
# 4 d 2003 100
# 5 e 2004 100
We could use a join for this: For example left_join
library(dplyr)
# Left-join the updates onto df1, then prefer valuenew where present.
# NOTE(review): joining by "ID" alone relies on ID uniquely identifying
# rows; if ID/Yr pairs are the real key, use by = c("ID", "Yr") instead.
# Yr.x is df1's copy of Yr after the join duplicates the column.
left_join(df1, df2, by="ID") %>%
mutate(value = ifelse(!is.na(valuenew), valuenew, value)) %>%
select(ID, Yr=Yr.x, value)
ID Yr value
1 a 2000 200
2 b 2001 150
3 c 2002 120
4 d 2003 100
5 e 2004 100
Option using data.table:
# same example data as in the question
df1 = data.frame(ID = c("a","b","c","d","e") ,
Yr = c(2000,2001,2002,2003,2004),
value= c(100,100,100,100, 100))
df2 = data.frame(ID = c("a","b","c") ,
Yr = c(2000,2001,2002),
valuenew= c(200,150,120))
library(data.table)
# update join: rows of df1 matching df2 on (ID, Yr) get value overwritten
# by reference with df2's valuenew (the i. prefix refers to columns of the
# joined table df2); unmatched rows keep their old value
setDT(df1)[df2, value := i.valuenew, on = .(ID, Yr)]
df1
#> ID Yr value
#> 1: a 2000 200
#> 2: b 2001 150
#> 3: c 2002 120
#> 4: d 2003 100
#> 5: e 2004 100
Created on 2022-07-05 by the reprex package (v2.0.1)
Your example is working and updating df1 just fine.
However, to add one more solution, you can try the lines below without using a for loop or attaching extra packages:
# Base-R lookup without a loop:
# 1) build a composite "ID Yr" key for every row of df1;
# 2) build a named vector mapping df2's "ID Yr" keys to the replacement
#    values and index it by df1's keys -- unmatched keys yield NA;
# 3) overwrite df1$value only where a replacement exists.
# BUG FIX: the lookup must read df2$valuenew -- df2 has no `value` column,
# so the original setNames(df2$value, ...) received NULL and errored.
key <- paste(df1$ID, df1$Yr)
values <- setNames(df2$valuenew, paste(df2$ID, df2$Yr))[key]
df1$value[!is.na(values)] <- values[!is.na(values)]
Maybe something worth mentioning in general for your problem: make sure you don't have any duplicated ID/Yr combinations in df2...
EDIT:
Sorry, I was terrible at helping you! Providing just another working solution is not helpful at all. So here's my attempt to help you further.
First, check that you have the classes/types that you expect for the columns that you compare.
Next - usually I'd recommend placing a browser() in your code (e.g. before your assignment/last line in your example:
# Same loop as in the question, with browser() inserted so each iteration
# can be inspected interactively before the assignment runs
for (i in 1:nrow(df2)){
id <- df2[i,'ID']
year <- df2[i, 'Yr']
valuenew<- df2[i, 'valuenew']
browser()
df1[which (df1$ID == id & df1$Yr == year), 'value'] <- valuenew
}
This is especially helpful if you need to debug a function. However in your case you can step through your for loop manually, which is a bit simpler to handle:
Assign the first value to your iterator i <- 1 and run the code inside your for loop. Is which(df1$ID == id & df1$Yr == year) really returning what you expect?
If you can't find any issues, increment i by 1 and proceed with debugging...
You can try this for loop
# Walk the target rows and pull the matching update (if any) from df2.
# BUG FIX: match on both Yr AND ID -- matching on Yr alone only works when
# years happen to be unique in df2, and assigns a wrong value (or fails
# with a length > 1 index) as soon as two IDs share a year.
# Also use seq_len() so an empty df1 does not iterate over c(1, 0).
for(i in seq_len(nrow(df1))){
  y <- which(df1$Yr[i] == df2$Yr & df1$ID[i] == df2$ID)
  if(length(y) > 0) df1$value[i] <- df2$valuenew[y]
}
Output
ID Yr value
1 a 2000 200
2 b 2001 150
3 c 2002 120
4 d 2003 100
5 e 2004 100
The data that looks like
Month Location Money
1 Miami 12
1 Cal 15
2 Miami 5
2 Cal 3
...
12 Miami 6
12 Cal 8
I want to transform it so it looks like
Month Location Money
Spring Miami sum(from month=1,2,3)
spring Cal sum (from month= 1,2,3)
summer...
summer...
fall...
fall...
winter...
winter...
I don't know how to ask the question directly (merging rows, aggregating rows?) but googling it only returns dplyr::group_by and summarize, which collapse the rows based on a single value of the row.
I want to collapse/summarise the data based on multiple row values.
Is there an easy way? Any help would be appreciated Thanks!
It sounds like you want to
assign season to each record,
group_by season,
summarize.
If this is where you are going, you can either create a new column, Or you can do it directly. You could also create a separate table with month and season and left_join to your data.
library(dplyr)
## simulate data
## 12 months x 4 rows each, alternating Cal/Miami, with random money values.
## NOTE(review): no set.seed() call, so money differs on every run and the
## printed results below are not exactly reproducible.
df = tibble(
month = rep(1:12, each = 4),
location = rep(c("Cal", "Miami"), times = 24),
money = as.integer(runif(48, 10, 100 ))
)
head(df)
# # A tibble: 6 x 3
# month location money
# <int> <chr> <int>
# 1 1 Cal 69
# 2 1 Miami 84
# 3 1 Cal 38
# 4 1 Miami 44
# 5 2 Cal 33
# 6 2 Miami 64
## Create season based on month in groups of 3
## (month-1) %/% 3 + 1 maps months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4
df %>%
mutate(season = (month-1) %/% 3 +1) %>%
group_by(season, location) %>%
summarize(Monthly_Total = sum(money))
# # A tibble: 8 x 3
# # Groups: season [4]
# season location Monthly_Total
# <dbl> <chr> <int>
# 1 1 Cal 360
# 2 1 Miami 265
# 3 2 Cal 392
# 4 2 Miami 380
# 5 3 Cal 348
# 6 3 Miami 278
# 7 4 Cal 358
# 8 4 Miami 411
Using the same data you can skip the column creation and include it in group_by:
# season can be computed directly inside group_by, skipping mutate()
df %>%
group_by(season = (month-1) %/% 3 +1, location) %>%
summarize(Monthly_Total = sum(money))
## results identical to above.
It may make more sense to just create a season table:
# Lookup table mapping each month to a season label.
# BUG FIX: the labels must be in calendar order -- with months 1-3 as
# Spring, months 7-9 are Fall and 10-12 are Winter; the original rep()
# order labelled 7-9 "Winter" and 10-12 "Fall".
seasons = tibble(
  month = 1:12,
  season = rep(c("Spring", "Summer", "Fall", "Winter"), each = 3)
)
# join the label onto the data, then aggregate as before
df %>%
  left_join(seasons) %>%
  group_by(season, location) %>%
  summarize(Monthly_Total = sum(money))
## totals match the numeric-season versions above; only the labels differ
The latter has the advantage of being more transparent.
You could aggregate after transforming the Month variable:
aggregate(Money ~ Month + Location, transform(data, Month = (Month - 1) %/% 3), sum)
Let's assume I have a dataset similar to this:
library(tidyverse)
# 4 countries x 9 consecutive days; lag(..., default = 0) front-pads each
# country with zero-gyros days so the first gyros occurs partway through.
# NOTE(review): runif() is unseeded, so the counts differ on every run.
country <- c(rep("Germany", 9), rep("Greece", 9), rep("Turkey", 9), rep("Austria", 9))
date <- rep(seq(ymd('2012-04-07'),ymd('2012-04-15'),by='days'), 4)
gyros_eaten <- c(lag(1:3, n = 2, default = 0), floor(runif(6, min=1, max=4)),
lag(1:6, n = 5, default = 0), floor(runif(3, min=1, max=4)),
lag(1:2, n = 1, default = 0), floor(runif(7, min=1, max=4)),
lag(1:3, n = 2, default = 0), floor(runif(6, min=1, max=4)))
df <- data.frame(country, date, gyros_eaten)
I want to create a new column, name it days_since_first_local_gyros, which would tell me how many days have passed since eating the first gyros, in that particular country. That is, it would take the value of zero if no gyros was eaten yet, or if on that day I ate my first gyros in e.g. Greece. After that, it would run from 1 to infinity, increasing by 1 on each day. Hence each group (i.e. country) would have a distinct index, and I have rows for each possible country x day dyads in my dataset, within a particular time range.
How should I create this column? I assume it's some combination of case_when, group_by and lag, but I just can't wrap my head around it.
We could get the first date where gyros_eaten > 0 and subtract all the dates from that date in the country. We use pmax to keep max value of 0 in days_since_first_local_gyros.
library(dplyr)
# Within each country (rows sorted by date), which.max(gyros_eaten > 0)
# finds the first day with a gyros; subtracting that date from every date
# gives the day offset, and pmax(..., 0) clamps the days before the first
# gyros to zero.
# NOTE(review): if a country never eats a gyros, which.max() of an
# all-FALSE vector returns 1, so days would be counted from the first row
# -- confirm that case cannot occur in the real data.
df %>%
arrange(country, date) %>%
group_by(country) %>%
mutate(days_since_first_local_gyros = pmax(as.integer(date -
date[which.max(gyros_eaten > 0)]), 0))
# country date gyros_eaten days_since_first_local_gyros
# <chr> <date> <dbl> <dbl>
# 1 Austria 2012-04-07 0 0
# 2 Austria 2012-04-08 0 0
# 3 Austria 2012-04-09 1 0
# 4 Austria 2012-04-10 1 1
# 5 Austria 2012-04-11 1 2
# 6 Austria 2012-04-12 2 3
# 7 Austria 2012-04-13 2 4
# 8 Austria 2012-04-14 2 5
# 9 Austria 2012-04-15 3 6
#10 Germany 2012-04-07 0 0
# … with 26 more rows
I am looking for a way to check whether two columns in a data frame contain the same elements for one or more rows, then eliminate the row containing more NAs.
Lets assume we have a data frame as such:
# Example data: Sweden/2017 appears twice -- row 1 has one NA (Expenses),
# row 2 has three NAs (Campaigns, Satisfaction, Expenses); the duplicate
# with more NAs is the one to eliminate
x <- data.frame("Year" = c(2017,2017,2017,2018,2018),
"Country" = c("Sweden", "Sweden", "Norway", "Denmark", "Finland"),
"Sales" = c(15, 15, 18, 13, 12),
"Campaigns" = c(3, NA, 4, 1, 1),
"Employees" = c(15, 15, 12, 8, 9),
"Satisfaction" = c(0.8, NA, 0.9, 0.95, 0.87),
"Expenses" = c(NA, NA, 9000, 7500, 4300))
Note that the entry for Sweden in the year 2017 is there twice, but the first row has one entry with NA while the other one contains NAs in three places. Now I would like to check whether two rows contain the same "Year" and "Country", then proceed to eliminate the row containing the higher amount of NAs, in this case the second row. I did some research but I could not seem to find a solution for this particular case.
Thank you very much in advance.
Using dplyr:
library(dplyr)
# For each Year/Country pair keep the single row with the fewest NAs.
# arrange() is a stable sort, so after sorting by n_na the first row of
# each group is the minimum-NA row; ties keep the earlier original row.
x %>%
mutate(n_na = rowSums(is.na(.))) %>% ## calculate NAs for each row
group_by(Year, Country) %>% ## for each year/country
arrange(n_na) %>% ## sort by number of NAs
slice(1) %>% ## take the first row
select(-n_na) ## remove the NA counter column
# A tibble: 4 x 7
# Groups: Year, Country [4]
Year Country Sales Campaigns Employees Satisfaction Expenses
<dbl> <fctr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2017 Norway 18 4 12 0.90 9000
2 2017 Sweden 15 3 15 0.80 NA
3 2018 Denmark 13 1 8 0.95 7500
4 2018 Finland 12 1 9 0.87 4300
We can use a data.table approach
library(data.table)
# Per (Year, Country) group: i1 is the NA count of each row; .I collects
# the global row numbers of rows that have at least one NA AND the group's
# maximum NA count -- those rows are then dropped via x[-ind].
# NOTE(review): rows tying on a positive NA count are all dropped, and a
# group's only row is dropped if it contains any NA -- fine for this
# example, but worth confirming for general data.
ind <- setDT(x)[, {
i1 <- Reduce(`+`, lapply(.SD, is.na))
.I[i1 > 0 & (i1 == max(i1))]
}, .(Year, Country)]$V1
x[-ind]
# Year Country Sales Campaigns Employees Satisfaction Expenses
#1: 2017 Sweden 15 3 15 0.80 NA
#2: 2017 Norway 18 4 12 0.90 9000
#3: 2018 Denmark 13 1 8 0.95 7500
#4: 2018 Finland 12 1 9 0.87 4300
Base R solution:
# Count NAs per row, then within each Year/Country group keep the single
# row with the fewest NAs (order + head).
# NOTE(review): the helper `nas` column is added to x and remains in the
# output.
x$nas <- rowSums(sapply(x, is.na))
do.call(rbind,
by(x, x[c("Year","Country")],
function(df) head(df[order(df$nas),,drop=FALSE], n=1)))
# Year Country Sales Campaigns Employees Satisfaction Expenses nas
# 4 2018 Denmark 13 1 8 0.95 7500 0
# 5 2018 Finland 12 1 9 0.87 4300 0
# 3 2017 Norway 18 4 12 0.90 9000 0
# 1 2017 Sweden 15 3 15 0.80 NA 1
Not too surprisingly, the data.table implementation is the fastest, though I'm a little surprised by how much faster it was than base R. Being a small dataset could affect this. (In the benchmarking, I had to create a copy of the original, since data.table modifies the data in-place, so x is no longer a data.frame.)
# Benchmark the three de-duplication approaches on the same data.
# x is copied for the mutating variants (setDT / new column) so each
# candidate starts from the untouched data.frame.
microbenchmark(
data.table = {
x0 <- copy(x)
ind <- setDT(x0)[, {
i1 <- Reduce(`+`, lapply(.SD, is.na))
.I[i1 > 0 & (i1 == max(i1))]
}, .(Year, Country)]$V1
x0[-ind]
},
dplyr = {
x %>%
mutate(n_na = rowSums(is.na(.))) %>% ## calculate NAs for each row
group_by(Year, Country) %>% ## for each year/country
arrange(n_na) %>% ## sort by number of NAs
slice(1) %>% ## take the first row
select(-n_na) ## remove the NA counter column
},
base = {
x0 <- x
x0$nas <- rowSums(sapply(x0, is.na))
do.call(rbind,
by(x0, x0[c("Year","Country")],
function(df) head(df[order(df$nas),,drop=FALSE], n=1)))
}
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# data.table 1.223477 1.441005 1.973714 1.582861 1.919090 12.837569 100
# dplyr 2.675239 2.901882 4.465172 3.079295 3.806453 42.261540 100
# base 2.039615 2.209187 2.737758 2.298714 2.570760 8.586946 100
I have a data.frame with two columns: year and score. The years go from 2000-2012 and each year can be listed multiple times. In the score column I list all the scores for each year with each row having a different score.
What I'd like to do is filter the data.frame so only the rows with the maximum scores for each year remain.
So as a tiny example if I have
year score
2000 18
2001 22
2000 21
I would want to return just
year score
2001 22
2000 21
If you know sql this is easier to understand
library(sqldf)
# run SQL directly against the data frame: one row per year with its
# maximum score
sqldf('select year, max(score) from mydata group by year')
Update (2016-01): Now you can also use dplyr
library(dplyr)
# group by year, then reduce each group to its maximum score
mydata %>% group_by(year) %>% summarise(max = max(score))
using plyr
require(plyr)
# fixed seed so the sampled example data are reproducible
set.seed(45)
df <- data.frame(year=sample(2000:2012, 25, replace=T), score=sample(25))
# split df by year and summarise each piece to its maximum score
# NOTE(review): plyr is retired; dplyr::summarise is the modern equivalent
ddply(df, .(year), summarise, max.score=max(score))
using data.table
require(data.table)
# keying by year sorts the table and speeds up grouped operations
dt <- data.table(df, key="year")
# one row per year holding the maximum score
dt[, list(max.score=max(score)), by=year]
using aggregate:
# aggregate splits score by year and applies max; the default column names
# (Group.1, x) are then replaced with meaningful ones
o <- aggregate(df$score, list(df$year) , max)
names(o) <- c("year", "max.score")
using ave:
# ave() attaches each year's maximum to every row of that year; dropping
# duplicated years then leaves one row per year (in first-appearance
# order, with the original score column still present)
df1 <- df
df1$max.score <- ave(df1$score, df1$year, FUN=max)
df1 <- df1[!duplicated(df1$year), ]
Edit: In case of more columns, a data.table solution would be the best (my opinion :))
# reproducible example with extra columns that must be kept
set.seed(45)
df <- data.frame(year=sample(2000:2012, 25, replace=T), score=sample(25),
alpha = sample(letters[1:5], 25, replace=T), beta=rnorm(25))
# convert to data.table with key=year
dt <- data.table(df, key="year")
# get the subset of data that matches this criterion
# .SD holds the group's rows; score %in% max(score) keeps every row tied
# for the maximum, preserving all columns
dt[, .SD[score %in% max(score)], by=year]
# year score alpha beta
# 1: 2000 20 b 0.8675148
# 2: 2001 21 e 1.5543102
# 3: 2002 22 c 0.6676305
# 4: 2003 18 a -0.9953758
# 5: 2004 23 d 2.1829996
# 6: 2005 25 b -0.9454914
# 7: 2007 17 e 0.7158021
# 8: 2008 12 e 0.6501763
# 9: 2011 24 a 0.7201334
# 10: 2012 19 d 1.2493954
using base packages
> df
year score
1 2000 18
2 2001 22
3 2000 21
> aggregate(score ~ year, data=df, max)
year score
1 2000 21
2 2001 22
EDIT
If you have additional columns that you need to keep, then you can user merge with aggregate to get those columns
> df <- data.frame(year = c(2000, 2001, 2000), score = c(18, 22, 21) , hrs = c( 10, 11, 12))
> df
year score hrs
1 2000 18 10
2 2001 22 11
3 2000 21 12
> merge(aggregate(score ~ year, data=df, max), df, all.x=T)
year score hrs
1 2000 21 12
2 2001 22 11
# Build the example data, then keep one row per year carrying that year's
# highest score; years appear in first-occurrence order, as before.
data <- data.frame(year = c(2000, 2001, 2000), score = c(18, 22, 21))
distinct_years <- unique(data$year)
top_scores <- vapply(
  distinct_years,
  function(yr) max(data$score[data$year == yr]),
  numeric(1)
)
data <- data.frame(year = distinct_years, score = top_scores)
one liner,
df_2<-data.frame(year=sort(unique(df$year)),score = tapply(df$score,df$year,max));