I have a dataset of this form:
set.seed(4561) # Make the results reproducible
df <- data.frame(
colour=rep(c("green","red","blue"),each=3),
year=rep("2017",9),
month=rep(c(1,2,3),3),
price=c(200,254,188,450,434,490,100,99,97),
work=ceiling(runif(9,30,60)),
gain=ceiling(runif(9,1,10)),
work_weighed_price=NA,
gain_weighed_price=NA
)
For each colour, year, and month I have a price (the output variable) and two input variables called gain and work. In reality I have many more input variables, but this suffices to show what I want to do with my dataframe.
> df
colour year month price work gain work_weighed_price gain_weighed_price
1 green 2017 1 200 33 9 NA NA
2 green 2017 2 254 56 5 NA NA
3 green 2017 3 188 42 8 NA NA
4 red 2017 1 450 39 3 NA NA
5 red 2017 2 434 45 2 NA NA
6 red 2017 3 490 36 8 NA NA
7 blue 2017 1 100 50 8 NA NA
8 blue 2017 2 99 45 8 NA NA
9 blue 2017 3 97 56 4 NA NA
I wish to calculate the weighted gain and work (and also the weighted price), where the weight is the price for that month and year, divided by the sum of price across colours:
desired_output <- data.frame(
year=rep("2017",3),
month=rep(c(1,2,3),1),
price=c(200*(200/(200+450+100))+450*(450/(200+450+100))+100*(100/(200+450+100)),
254*(254/(254+434+99))+434*(434/(254+434+99))+99*(99/(254+434+99)),
188*(188/(188+490+97))+490*(490/(188+490+97))+97*(97/(188+490+97))),
work_weighed_price=c(47*(200/(200+450+100))+44*(450/(200+450+100))+52*(100/(200+450+100)),
44*(254/(254+434+99))+42*(434/(254+434+99))+32*(99/(254+434+99)),
38*(188/(188+490+97))+52*(490/(188+490+97))+52*(97/(188+490+97))) ,
gain_weighed_price=c(5*(200/(200+450+100))+8*(450/(200+450+100))+10*(100/(200+450+100)),
3*(254/(254+434+99))+7*(434/(254+434+99))+9*(99/(254+434+99)),
2*(188/(188+490+97))+4*(490/(188+490+97))+9*(97/(188+490+97)))
)
> desired_output
year month price work_weighed_price gain_weighed_price
1 2017 1 336.6667 45.86667 7.466667
2 2017 2 333.7649 41.38755 5.960610
3 2017 3 367.5523 48.60387 4.140645
How would I attack this in R?
You can use the weighted.mean function:
library(dplyr)

df %>%
group_by(year, month) %>%
summarise_at(vars(price, work, gain),
funs(price_weighted = weighted.mean(., price)))
# # A tibble: 3 x 5
# # Groups: year [?]
# year month price_price_weighted work_price_weighted gain_price_weighted
# <int> <int> <dbl> <dbl> <dbl>
# 1 2017 1 337 45.9 7.47
# 2 2017 2 334 41.4 5.96
# 3 2017 3 368 48.6 4.14
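In current dplyr, summarise_at()/funs() are superseded by across(); a minimal sketch of the same computation, assuming the df above:
library(dplyr)

df %>%
  group_by(year, month) %>%
  summarise(across(c(price, work, gain),
                   ~ weighted.mean(.x, w = price),     # weight every variable by price
                   .names = "{.col}_price_weighted"),  # new names keep the original price column intact
            .groups = "drop")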
Or, in data.table
library(data.table)
setDT(df)
df[, lapply(.SD, weighted.mean, price)
, .SDcols = c('price', 'work', 'gain')
, by = .(year, month)]
# year month price work gain
# 1: 2017 1 336.6667 45.86667 7.466667
# 2: 2017 2 333.7649 41.38755 5.960610
# 3: 2017 3 367.5523 48.60387 4.140645
An approach using dplyr. Your example df is generated with runif, and its values don't line up with your desired output, which causes some confusion. In the code below, I use a df that's consistent with your desired output.
library(dplyr)
df %>%
group_by(year, month) %>%
mutate(weight = price / sum(price)) %>%
mutate_at(vars(price, work, gain), funs(weighed_price = . * weight)) %>%
summarise_at(vars(ends_with("weighed_price")), sum)
# # A tibble: 3 x 5
# # Groups: year [?]
# year month work_weighed_price gain_weighed_price price_weighed_price
# <int> <int> <dbl> <dbl> <dbl>
# 1 2017 1 45.9 7.47 337.
# 2 2017 2 41.4 5.96 334.
# 3 2017 3 48.6 4.14 368.
df:
structure(list(colour = c("green", "green", "green", "red", "red",
"red", "blue", "blue", "blue"), year = c(2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L), month = c(1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L), price = c(200L, 254L, 188L, 450L,
434L, 490L, 100L, 99L, 97L), work = c(47L, 44L, 38L, 44L, 42L,
52L, 52L, 32L, 52L), gain = c(5L, 3L, 2L, 8L, 7L, 4L, 10L, 9L,
9L), work_weighed_price = c(NA, NA, NA, NA, NA, NA, NA, NA, NA
), gain_weighed_price = c(NA, NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("colour",
"year", "month", "price", "work", "gain", "work_weighed_price",
"gain_weighed_price"), class = "data.frame", row.names = c(NA,
-9L))
A base R solution could be the following sequence of tapply instructions.
fun_price <- function(x){
s <- sum(x)
sum(x*(x/s))
}
fun_weighted <- function(x, w){
s <- sum(w)
sum(x*(w/s))
}
desired <- data.frame(year = unique(df$year), month = sort(unique(df$month)))
desired$price <- with(df, tapply(price, month, FUN = fun_price))
desired$work_weighed_price <- with(df, tapply(work, month, FUN = fun_weighted, w = price))
desired$gain_weighed_price <- with(df, tapply(gain, month, FUN = fun_weighted, w = price))
desired
# year month price work_weighed_price gain_weighed_price
#1 2017 1 336.6667 40.74092 6.622405
#2 2017 2 333.7649 48.56834 4.984429
#3 2017 3 367.5523 44.65052 6.659170
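Note that fun_weighted is essentially base R's weighted.mean(), so the same result can be had more compactly with split(); a hedged sketch, assuming the df above:
# For each month, take the price-weighted mean of each variable
do.call(rbind, lapply(split(df, df$month), function(d) {
  data.frame(year = d$year[1],
             month = d$month[1],
             price = weighted.mean(d$price, d$price),
             work_weighed_price = weighted.mean(d$work, d$price),
             gain_weighed_price = weighted.mean(d$gain, d$price))
}))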
This question already has answers here:
Aggregate by multiple columns and reshape from long to wide
(4 answers)
Closed 2 years ago.
I have a table similar to this
Year Month Purchase_ind Value
2018 1 1 100
2018 1 1 100
2018 1 0 100
2018 2 1 2
2018 2 0 198
2018 3 1 568
2019 1 0 230
.
.
.
And I want to make a matrix with:
Month on the Y axis
Year on the X axis
In each cell, I need (Value where Purchase_ind = 1) / total Value
The result would look like this:
2018 2019 2020
1 0.66 0 x
2 0.01 x x
3 1 x x
Thanks a lot for your help!
You can calculate the proportion for each Year and Month and cast the data to wide format:
library(dplyr)
df %>%
group_by(Year, Month) %>%
summarise(Value = sum(Value[Purchase_ind == 1])/sum(Value)) %>%
tidyr::pivot_wider(names_from = Year, values_from = Value)
#Add values_fill = 0 if you want 0's instead of `NA`'s
#tidyr::pivot_wider(names_from = Year, values_from = Value, values_fill = 0)
# Month `2018` `2019`
# <int> <dbl> <dbl>
#1 1 0.667 0
#2 2 0.01 NA
#3 3 1 NA
data
df <- structure(list(Year = c(2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2019L), Month = c(1L, 1L, 1L, 2L, 2L, 3L, 1L), Purchase_ind = c(1L,
1L, 0L, 1L, 0L, 1L, 0L), Value = c(100L, 100L, 100L, 2L, 198L,
568L, 230L)), class = "data.frame", row.names = c(NA, -7L))
Using data.table:
library(data.table)

DT <- data.table(year = c(2018,2018,2018,2018,2018,2018,2019),
month = c(1,1,1,2,2,3,1),
purchase_ind = c(1,1,0,1,0,1,0),
value = c(100,100,100,2,198,568,230))
DT[, value_ind := fifelse(purchase_ind == 1, value, 0)]
DT <- DT[, .(calculate_session = sum(value_ind) / sum(value)), by = .(year, month)]
dcast(DT, month ~ year, value.var = 'calculate_session')
Output:
month 2018 2019
1: 1 0.6666667 0
2: 2 0.0100000 NA
3: 3 1.0000000 NA
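As with values_fill in the tidyr version, dcast() takes a fill argument if you prefer 0s to NAs; assuming the DT above:
dcast(DT, month ~ year, value.var = 'calculate_session', fill = 0)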
In base R you could do:
(a <- prop.table(xtabs(Value ~ Month + Year + Purchase_ind, df), c(1, 2)))
, , Purchase_ind = 0
Year
Month 2018 2019
1 0.3333333 1.0000000
2 0.9900000
3 0.0000000
, , Purchase_ind = 1
Year
Month 2018 2019
1 0.6666667 0.0000000
2 0.0100000
3 1.0000000
Of course, if you only need Purchase_ind = 1, you can subset it:
a[, , "1"] #or even a[, , 2]
Year
Month 2018 2019
1 0.6666667 0.0000000
2 0.0100000
3 1.0000000
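If you need the result as a plain data.frame rather than a table, one option, assuming the a computed above:
as.data.frame.matrix(a[, , "1"])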
I'm dealing with a large dataset that has some cleanliness issues. For your sanity, I've produced a much simpler example. Let's say the dataset looks like this:
A B C D E F G H
1 Albania 2015 10 NA NA NA 60 NA
2 Albania 2015 NA NA 50 NA NA 10
3 Greece 2016 30 NA 20 NA NA NA
4 Greece 2016 NA 400 NA 30 30 10
5 Greece 2017 NA 40 NA NA NA NA
6 Greece 2017 20 NA 30 30 50 10
7 Albania 2015 NA 200 NA 40 NA NA
Excuse the complexity, but I'm hoping a solution will work for my practical application. As you can see, duplicate entries of rows (e.g. Albania 2015) have caused the variable observations to be spread across multiple rows. I'm looking to combine rows with matching values in the first two columns (country and year) and unify the spread-out variable values (C-H) into a single row. In the end, the data frame should look like this:
A B C D E F G H
1 Albania 2015 10 200 50 40 60 10
2 Greece 2016 30 400 20 30 30 10
3 Greece 2017 20 40 30 30 50 10
Can someone carry me to a solution here? Thanks!
We can group by 'A' and 'B', and summarise the rest of the columns with any of max/sum/min, since there is at most a single non-NA element per column/group:
library(dplyr)
df1 %>%
group_by(A, B) %>%
summarise_at(vars(-group_cols()), ~ if(all(is.na(.))) NA
else max(., na.rm = TRUE))
# A tibble: 3 x 8
# Groups: A [2]
# A B C D E F G H
# <chr> <int> <int> <int> <int> <int> <int> <int>
#1 Albania 2015 10 200 50 40 60 10
#2 Greece 2016 30 400 20 30 30 10
#3 Greece 2017 20 40 30 30 50 10
data
df1 <- structure(list(A = c("Albania", "Albania", "Greece", "Greece",
"Greece", "Greece", "Albania"), B = c(2015L, 2015L, 2016L, 2016L,
2017L, 2017L, 2015L), C = c(10L, NA, 30L, NA, NA, 20L, NA), D = c(NA,
NA, NA, 400L, 40L, NA, 200L), E = c(NA, 50L, 20L, NA, NA, 30L,
NA), F = c(NA, NA, NA, 30L, NA, 30L, 40L), G = c(60L, NA, NA,
30L, NA, 50L, NA), H = c(NA, 10L, NA, 10L, NA, 10L, NA)),
class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7"))
I want to make groups of data where measurements were taken in multiple Years on the same Species at the same Lat and Long. Then, I want to run a linear regression on all those groups (using N as the dependent variable and Year as the independent variable).
Practice dataset:
Species Year Lat Long N
1 1 1999 1 1 5
2 1 2001 2 1 5
3 2 2010 3 3 4
4 2 2010 3 3 2
5 2 2011 3 3 5
6 2 2012 3 3 8
7 3 2007 8 7 -10
8 3 2019 8 7 100
9 2 2000 1 1 5
First, I averaged the data where multiple measurements were taken in the same Year on the same Species at the same latitude and longitude. Then, I split the data based on Lat, Long and Species. However, this still groups rows together where Lat, Long and Species are not equal ($`4`). Furthermore, I want to remove $`1`, since I only want to use data where multiple measurements were taken over a number of Years. How do I do this?
Data <- read.table("Dataset.txt", header = TRUE)
Agr_Data <- aggregate(N ~ Lat + Long + Year + Species, data = Data, mean)
Split_Data <- split(Agr_Data, Agr_Data$Lat + Agr_Data$Long + Agr_Data$Species)
Regression_Data <- lapply(Split_Data, function(Split_Data) lm(N~Year, data = Split_Data) )
Split_Data
$`3`
Lat Long Year Species N
1 1 1 1999 1 5
$`4`
Lat Long Year Species N
2 2 1 2001 1 5
3 1 1 2000 2 5
$`8`
Lat Long Year Species N
4 3 3 2010 2 3
5 3 3 2011 2 5
6 3 3 2012 2 8
$`18`
Lat Long Year Species N
7 8 7 2007 3 -10
8 8 7 2019 3 100
Desired output:
Lat Long Species Coefficients
3 3 2 2.5
8 7 3 9.167
Base R solution:
# 1. Import data:
df <- structure(list(Species = c(1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 2L ),
Year = c(1999L, 2001L, 2010L, 2010L, 2011L, 2012L, 2007L, 2019L, 2000L),
Lat = c(1L, 2L, 3L, 3L, 3L, 3L, 8L, 8L, 1L),
Long = c(1L, 1L, 3L, 3L, 3L, 3L, 7L, 7L, 1L),
N = c(5L, 5L, 4L, 2L, 5L, 8L, -10L, 100L, 5L)),
class = "data.frame", row.names = c(NA, -9L ))
# 2. Aggregate data:
df <- aggregate(N ~ Lat + Long + Year + Species, data = df, mean)
# 3. Concatenate vecs to create grouping vec:
df$grouping_var <- paste(df$Species, df$Lat, df$Long, sep = ", ")
# 4. Split-apply-combine: per group, regress N on Year and keep the slope
#    (groups with a single row get NA):
split_list <- split(df, df$grouping_var)
coeff_n <- sapply(split_list, function(x) {
  if (nrow(x) > 1) coef(lm(N ~ Year, data = x))[["Year"]] else NA
})
# 5. Create a dataframe of coefficients; names(split_list) keeps the
#    grouping values aligned with their fitted slopes:
coeff_df <- data.frame(grouping_var = names(split_list), Coefficients = coeff_n)
# 6. Merge the dataframes together:
df <- merge(df, coeff_df, by = "grouping_var", all.x = TRUE)
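For comparison, a hedged dplyr sketch of the same pipeline, assuming the raw df from step 1: average duplicate-year measurements, keep only groups observed in more than one year, then fit the per-group slope of N on Year.
library(dplyr)

df %>%
  group_by(Species, Lat, Long, Year) %>%
  summarise(N = mean(N), .groups = "drop_last") %>%  # average repeated measurements
  filter(n_distinct(Year) > 1) %>%                   # keep multi-year groups only
  summarise(Coefficients = coef(lm(N ~ Year))[["Year"]], .groups = "drop")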
I have a data frame that looks like the following:
Year Day ID V1 V2 ....
2003 35 1102 3 6
2003 35 1103 5 NA
2003 35 1104 8 100
.....
2003 40 1102 NA 8
2003 40 1103 NA 10
2003 40 1104 9 NA
.....
.....
2018 49 1104 5 NA
.....
2018 50 1102 3 6
2018 50 1103 7 NA
2018 50 1104 NA 100
I would like to build a data frame that extracts, for each combination of Year and ID, the latest (highest value in the Day column) non-NA value in V1, V2, ... Based on the above data set, for Year = 2018 and ID = 1104, I would like to extract V1 = 5 (on Day = 49) and V2 = 100 (on Day = 50). If all values for that Year and ID combination are NA, then I would like it to return NA.
We can create a function which gives us the latest non-NA value, based on Day, for each Vn column:
get_last_non_NA_value <- function(x) {
x[which.max(cumsum(!is.na(x)))]
}
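For instance (which.max of an all-zero cumsum falls back to the first element, so an all-NA vector returns NA):
get_last_non_NA_value(c(3, NA, 7, NA))  # 7
get_last_non_NA_value(c(NA, NA))        # NA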
We then apply that function for each Year and ID:
library(dplyr)
df %>%
group_by(Year, ID) %>%
summarise_at(vars(V1:V2), funs(get_last_non_NA_value(.[order(Day)])))
# Year ID V1 V2
# <int> <int> <int> <int>
#1 2003 1102 3 8
#2 2003 1103 5 10
#3 2003 1104 9 100
#4 2018 1102 3 6
#5 2018 1103 7 NA
#6 2018 1104 5 100
EDIT
If we also want to extract the corresponding Day for each value, we can change the function to return both values as a comma-separated string
get_last_non_NA_value <- function(x, y) {
ind <- which.max(cumsum(!is.na(x[order(y)])))
paste(x[ind], y[ind], sep = ",")
}
and then use cSplit to separate these comma-separated values into different columns:
library(dplyr)
library(splitstackshape)
cols <- c("V1", "V2")
df %>%
group_by(Year, ID) %>%
summarise_at(cols, funs(get_last_non_NA_value(., Day))) %>%
cSplit(cols) %>%
rename_at(vars(contains("_1")), funs(sub("_1", "_last_value", .))) %>%
rename_at(vars(contains("_2")), funs(sub("_2", "_days", .)))
# Year ID V1_last_value V1_days V2_last_value V2_days
#1: 2003 1102 3 35 8 40
#2: 2003 1103 5 35 10 40
#3: 2003 1104 9 40 100 35
#4: 2018 1102 3 50 6 50
#5: 2018 1103 7 50 NA 50
#6: 2018 1104 5 49 100 50
Note that the rename_at part renames the columns to better reflect what each value holds; you can skip that part if you are not interested in renaming the columns.
data
df <- structure(list(Year = c(2003L, 2003L, 2003L, 2003L, 2003L, 2003L,
2018L, 2018L, 2018L, 2018L), Day = c(35L, 35L, 35L, 40L, 40L,
40L, 49L, 50L, 50L, 50L), ID = c(1102L, 1103L, 1104L, 1102L,
1103L, 1104L, 1104L, 1102L, 1103L, 1104L), V1 = c(3L, 5L, 8L,
NA, NA, 9L, 5L, 3L, 7L, NA), V2 = c(6L, NA, 100L, 8L, 10L, NA,
NA, 6L, NA, 100L)), .Names = c("Year", "Day", "ID", "V1", "V2"
), class = "data.frame", row.names = c(NA, -10L))
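A data.table variant of the same idea (a hedged sketch, assuming the df above): order by Day within each Year/ID group and take the last non-NA value, falling back to NA when a group is all NA.
library(data.table)
setDT(df)
df[order(Day),
   lapply(.SD, function(x) {
     v <- x[!is.na(x)]
     if (length(v)) v[length(v)] else x[1L]  # x[1L] is NA when the group is all NA
   }),
   by = .(Year, ID), .SDcols = c("V1", "V2")]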
You can use dplyr. Assuming you want the max for V1 and V2 (note that max(..., na.rm = TRUE) returns -Inf with a warning for an all-NA group):
library(dplyr)
df %>%
group_by(Year, ID) %>%
summarise(Day = max(Day, na.rm = TRUE),
V1 = max(V1, na.rm = TRUE),
V2 = max(V2, na.rm = TRUE))
If, for V1 and V2, you want the first non-NA value instead, then:
df %>%
group_by(Year, ID) %>%
summarise(Day = max(Day, na.rm = TRUE),
V1 = first(setdiff(V1, NA)),
V2 = first(setdiff(V2, NA)))
This question already has answers here:
How to sum a variable by group
(18 answers)
Closed 7 years ago.
I have a dataset that looks something like this:
Type Age count1 count2 Year Pop1 Pop2 TypeDescrip
A 35 1 1 1990 30000 50000 alpha
A 35 3 1 1990 30000 50000 alpha
A 45 2 3 1990 20000 70000 alpha
B 45 2 1 1990 20000 70000 beta
B 45 4 5 1990 20000 70000 beta
I want to add up the counts of rows that match in the Type and Age columns. So ideally I would end up with a dataset that looks like this:
Type Age count1 count2 Year Pop1 Pop2 TypeDescrip
A 35 4 2 1990 30000 50000 alpha
A 45 2 3 1990 20000 70000 alpha
B 45 6 6 1990 20000 70000 beta
I've tried using nested duplicated() statements such as below:
typedup = duplicated(df$Type)
bothdup = duplicated(df[(typedup == TRUE),]$Age)
but this returns indices for which age or type are duplicated, not necessarily when one row has duplicates of both.
I've also tried tapply:
tapply(c(df$count1, df$count2), c(df$Age, df$Type), sum)
but this output is difficult to work with. I want to have a data.frame when I'm done.
I don't want to use a for-loop because my dataset is quite large.
Try
library(dplyr)
df1 %>%
group_by(Type, Age) %>%
summarise_each(funs(sum))
# Type Age count1 count2
#1 A 35 4 2
#2 A 45 2 3
#3 B 45 6 6
In the newer versions of dplyr
df1 %>%
group_by(Type, Age) %>%
summarise_all(sum)
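And in dplyr 1.0+, the same is idiomatically written with across(); a hedged equivalent:
df1 %>%
  group_by(Type, Age) %>%
  summarise(across(everything(), sum), .groups = "drop")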
Or using base R
aggregate(.~Type+Age, df1, FUN=sum)
# Type Age count1 count2
#1 A 35 4 2
#2 A 45 2 3
#3 B 45 6 6
Or
library(data.table)
setDT(df1)[, lapply(.SD, sum), .(Type, Age)]
# Type Age count1 count2
#1: A 35 4 2
#2: A 45 2 3
#3: B 45 6 6
Update
Based on the new dataset,
df2 %>%
group_by(Type, Age,Pop1, Pop2, TypeDescrip) %>%
summarise_each(funs(sum), matches('^count'))
# Type Age Pop1 Pop2 TypeDescrip count1 count2
#1 A 35 30000 50000 alpha 4 2
#2 A 45 20000 70000 alpha 2 3
#3 B 45 20000 70000 beta 6 6
data
df1 <- structure(list(Type = c("A", "A", "A", "B", "B"), Age = c(35L,
35L, 45L, 45L, 45L), count1 = c(1L, 3L, 2L, 2L, 4L), count2 = c(1L,
1L, 3L, 1L, 5L)), .Names = c("Type", "Age", "count1", "count2"
), class = "data.frame", row.names = c(NA, -5L))
df2 <- structure(list(Type = c("A", "A", "A", "B", "B"), Age = c(35L,
35L, 45L, 45L, 45L), count1 = c(1L, 3L, 2L, 2L, 4L), count2 = c(1L,
1L, 3L, 1L, 5L), Year = c(1990L, 1990L, 1990L, 1990L, 1990L),
Pop1 = c(30000L, 30000L, 20000L, 20000L, 20000L), Pop2 = c(50000L,
50000L, 70000L, 70000L, 70000L), TypeDescrip = c("alpha",
"alpha", "beta", "beta", "beta")), .Names = c("Type", "Age",
"count1", "count2", "Year", "Pop1", "Pop2", "TypeDescrip"),
class = "data.frame", row.names = c(NA, -5L))
@hannah, you can also use SQL via the sqldf package:
library(sqldf)

sqldf("select
Type,Age,
sum(count1) as sum_count1,
sum(count2) as sum_count2
from
df
group by
Type,Age
")