Count number of occurrences for every column in dataframe - R

I have a dataframe with an unknown number of columns (it can change frequently), and I need to count the number of observations for a given ID and year for every column, creating a custom "n" column for each column of my dataframe that tells me how many observations were made for that specific column.
I have tried:
library(dplyr)
count <- tally(group_by(final_database,ID,Year))
But that counts unique combinations of ID + Year, while I need to know how many times, over the years, my ID was observed for each characteristic. Example:
ID Year CHAR1 n_CHAR1
A  2016     0       3
A  2017     5       3
A  2018     2       3
A  2019             3
B  2016     1       2
B  2017             2
B  2018             2
B  2019     1       2
And so on for all characteristics. I would insert the "n_CHAR" columns to the original dataframe.
It doesn't need to be tidy.
Thanks!

Try:
transform(final_database, n_CHAR1 = ave(CHAR1, ID, FUN = function(x) sum(x != "")))
If the blank rows are actually NA, then just replace sum(x != "") with sum(!is.na(x)).
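For example, with NA blanks the same call becomes:
transform(final_database, n_CHAR1 = ave(CHAR1, ID, FUN = function(x) sum(!is.na(x))))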
Edit:
If you'd need multiple n columns for multiple NCHAR columns, you could do:
library(dplyr)
final_database %>%
  group_by(ID) %>%
  mutate_at(vars(starts_with("CHAR")),
            list(n = ~ sum(. != "")))
This example assumes that all the relevant columns start with the string CHAR (e.g. CHAR1, CHAR2, CHAR3, etc.).
If the columns you're referring to run from the 3rd to the last, then you can do:
library(dplyr)
finalDatabase <- final_database %>%
  group_by(ID) %>%
  mutate_at(vars(3:ncol(.)), # if you don't have many other vars besides the CHAR columns, you can also use vars(-ID, -Year), as suggested by @camille
            list(n = ~ sum(. != ""))) %>%
  select(ID, Year, ends_with("_n"))
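As a side note: in current dplyr (1.0+), mutate_at() is superseded; a sketch of the same computation with across(), assuming the CHAR-prefixed names as above:
library(dplyr)
final_database %>%
  group_by(ID) %>%
  mutate(across(starts_with("CHAR"), ~ sum(. != ""), .names = "{.col}_n"))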

We can also do this with data.table:
library(data.table)
setDT(df)[, n_CHAR1 := sum(CHAR1 != ""), by = "ID"]
Output:
   ID Year CHAR1 n_CHAR1
1:  A 2016     0       3
2:  A 2017     5       3
3:  A 2018     2       3
4:  A 2019             3
5:  B 2016     1       2
6:  B 2017             2
7:  B 2018             2
8:  B 2019     1       2
Data:
df <- structure(list(ID = c("A", "A", "A", "A", "B", "B", "B", "B"),
Year = c(2016L, 2017L, 2018L, 2019L, 2016L, 2017L, 2018L,
2019L), CHAR1 = c("0", "5", "2", "", "1", "", "", "1")), row.names = c(NA,
-8L), class = "data.frame")
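If you need an n column for several CHAR columns at once with data.table, here is a minimal sketch (assuming, as above, that the relevant columns all start with "CHAR"):
library(data.table)
char_cols <- grep("^CHAR", names(df), value = TRUE)
# one n_* counter per CHAR column, computed within each ID
setDT(df)[, paste0("n_", char_cols) := lapply(.SD, function(x) sum(x != "")),
          by = ID, .SDcols = char_cols]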

Delete duplicates between groups in R

Thanks in advance for any help.
My data looks like this:
|year|class|
|---|----|
|2007|a|
|2007|b|
|2007|c|
|2007|d|
|2008|a|
|2008|b|
|2008|e|
|2008|f|
|2009|c|
|2009|d|
|2009|e|
|2009|g|
The goal would be to delete any classes which occur in the previous year, so the final data looks like this:
|year|class|
|---|----|
|2007|a|
|2007|b|
|2007|c|
|2007|d|
|2008|e|
|2008|f|
|2009|c|
|2009|d|
|2009|g|
I tried this code; I intended to group the data and then delete all within-group duplicates, but it did not remove everything, just a few rows.
Instead of duplicated() I also tried unique(), which did not work.
d %>%
  group_by(class, Group = c(0, cumsum(diff(year) != 1))) %>%
  filter(!(duplicated(class, fromLast = TRUE) | duplicated(class))) %>%
  ungroup() %>%
  select(-Group)
Is there maybe another R function which can look at group differences?
Thanks for any help
Edit: Thanks to all for your very helpful answers!
Left join DF to itself on class and a year difference of 1 and retain only those rows for which there is no such match.
library(sqldf)
sqldf("select a.*
from DF a
left join DF b on b.class = a.class and b.year = a.year - 1
where b.year is null")
giving:
year class
1 2007 a
2 2007 b
3 2007 c
4 2007 d
5 2008 e
6 2008 f
7 2009 c
8 2009 d
9 2009 g
Note
Lines <- "|year|class|
|2007|a|
|2007|b|
|2007|c|
|2007|d|
|2008|a|
|2008|b|
|2008|e|
|2008|f|
|2009|c|
|2009|d|
|2009|e|
|2009|g|"
DF <- read.table(text = Lines, sep = "|", header = TRUE)[2:3]
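The same anti-join idea can also be sketched in dplyr: shift every year forward by one, then keep the rows with no match.
library(dplyr)
# rows whose (year, class) pair does not appear as "previous year's class"
DF %>%
  anti_join(mutate(DF, year = year + 1), by = c("year", "class"))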
Using data.table:
library(data.table)
setDT(df)[, .(class = setdiff(class, df[year==y-1, class])), by=.(y=year)]
# y class
# 1: 2007 a
# 2: 2007 b
# 3: 2007 c
# 4: 2007 d
# 5: 2008 e
# 6: 2008 f
# 7: 2009 c
# 8: 2009 d
# 9: 2009 g
# sort so rows of the same class are adjacent, ordered by year
df = df[order(df$class, df$year), ]
# gap to the previous row's year, and the previous row's class
df$y_diff = c(0, diff(df$year))
df$c_lag = c("x", head(df$class, -1))
# keep a row unless it repeats the same class exactly one year later
df[df$y_diff != 1 | df$class != df$c_lag, 1:2]
year class
1 2007 a
2 2007 b
3 2007 c
9 2009 c
4 2007 d
10 2009 d
7 2008 e
8 2008 f
12 2009 g
Here are some base R solutions:
split + for loop
dflst <- unname(split(df, df$year))
for (k in seq_along(dflst)[-1]) {
  dflst[[k]] <- subset(dflst[[k]], !class %in% dflst[[k - 1]]$class)
}
dfout <- do.call(rbind, dflst)
merge + subset + is.na
dfout <- subset(
  merge(
    df,
    transform(df, yr = year + 1),
    by.x = c("year", "class"),
    by.y = c("yr", "class"),
    all.x = TRUE
  ),
  is.na(year.y),
  select = -year.y
)
which gives
year class
1 2007 a
2 2007 b
3 2007 c
4 2007 d
7 2008 e
8 2008 f
9 2009 c
10 2009 d
12 2009 g
data
> dput(df)
structure(list(year = c(2007L, 2007L, 2007L, 2007L, 2008L, 2008L,
2008L, 2008L, 2009L, 2009L, 2009L, 2009L), class = c("a", "b",
"c", "d", "a", "b", "e", "f", "c", "d", "e", "g")), class = "data.frame", row.names = c(NA,
-12L))
An analysis of all the current answers
df=structure(list(year = c(2007L, 2007L, 2007L, 2007L, 2008L, 2008L,
2008L, 2008L, 2009L, 2009L, 2009L, 2009L), class = c("a", "b",
"c", "d", "a", "b", "e", "f", "c", "d", "e", "g")), class = "data.frame", row.names = c(NA,
-12L))
library(sqldf)
library(data.table)
library(dplyr)
library(purrr)
library(microbenchmark)
groth = function() {
  sqldf("select a.*
         from df a
         left join df b on b.class = a.class and b.year = a.year - 1
         where b.year is null")
}
thomas1 = function() {
  dflst <- unname(split(df, df$year))
  for (k in seq_along(dflst)[-1]) {
    dflst[[k]] <- subset(dflst[[k]], !class %in% dflst[[k - 1]]$class)
  }
  dfout <- do.call(rbind, dflst)
}
thomas2 = function() {
  dfout <- subset(
    merge(
      df,
      transform(df, yr = year + 1),
      by.x = c("year", "class"),
      by.y = c("yr", "class"),
      all.x = TRUE
    ),
    is.na(year.y),
    select = -year.y
  )
}
dww = function() {
  setDT(df)[, .(class = setdiff(class, df[year == y - 1, class])), by = .(y = year)]
}
user29 = function() {
  df = df[order(df$class, df$year), ]
  df$y_diff = c(0, diff(df$year))
  df$c_lag = c("x", head(df$class, -1))
  df[df$y_diff != 1 | df$class != df$c_lag, 1:2]
}
anous = function() {
  df %>%
    group_by(class) %>%
    mutate(dup = n() > 1) %>%
    group_split() %>%
    map_dfr(~ if (unique(.x$dup) & (.x$year[2] - .x$year[1]) == 1) {
      .x %>% slice_head(n = 1)
    } else {
      .x
    }) %>%
    select(-dup) %>%
    arrange(year)
}
benchmark
set.seed(1)
microbenchmark::microbenchmark(
groth(), thomas1(), thomas2(), dww(), user29(), anous(), times=10)
Unit: microseconds
      expr       min        lq      mean    median        uq       max neval
   groth()  8864.702  9532.502 10885.691  9774.151 11628.401 14432.101    10
 thomas1()   792.801   836.001  1666.511  1024.651  1065.601  7921.401    10
 thomas2()  1758.700  2024.700  3172.011  2371.601  3348.701  8032.301    10
     dww()  3876.201  4280.400  4953.251  4383.701  5320.101  8807.501    10
  user29()   464.601   494.502  1249.081   542.951   643.300  7562.401    10
   anous() 10506.801 11091.602 12232.101 11424.801 12889.401 17279.201    10
With a much bigger dataframe, I had to remove thomas2 because it did not work:
df=data.frame(
"year"=sample(2000:2020,1e5,replace=T),
"class"=sample(LETTERS[1:20],1e5,replace=T)
)
microbenchmark::microbenchmark(
groth(), thomas1(), dww(), user29(), anous(), times=10)
Unit: milliseconds
      expr       min          lq       mean     median        uq       max neval
   groth() 1217.9176 1270.225702 1290.86323 1305.06580 1322.3443 1341.0451    10
 thomas1()   13.6828   14.331401   17.94286   17.76540   21.2913   23.5265    10
     dww()   31.3091   36.660201   41.31367   40.27055   44.5629   54.6295    10
  user29()    7.8137    9.481402   11.97380   11.31740   14.2235   16.9593    10
   anous()   12.7733   13.266902   14.60760   13.50610   15.1067   19.9610    10
General assumptions
Table is ordered by Year
Case one
For each group of records (grouped by Year), remove a Class value if it appeared in the previous Year.
Solution
Transform the data so that, for each Year in the table, Class becomes a list of all Class values that appeared during that Year (chop());
For each Year, remove a Class value (setdiff) if it appeared during the previous Year (lag(Class));
Transform Class from a list of lists back to an atomic vector (unchop()).
Code
library(tidyverse)
dat %>%
  chop(Class) %>%
  mutate(Class = map2(Class, lag(Class), setdiff)) %>%
  unchop(Class)
Output
# Year Class
#1 2007 a
#2 2007 b
#3 2007 c
#4 2007 d
#5 2008 e
#6 2008 f
#7 2009 c
#8 2009 d
#9 2009 g
Case two
This case is more interesting than the previous one, because solving it requires comparing the current list of Class values to all of the Class values that appeared during previous years (sic!).
Solution
Transform the data so that, for each Year in the table, Class becomes a list of all Class values that appeared during that Year (chop());
Create a list of Class values in which each entry contains the unique set of Class values that appeared during a particular Year and all the Years before (accumulate(Class, union));
For each Year, remove a Class value (setdiff) if it appeared during previous Years (lag(...)), as calculated at step 2;
Transform Class from a list of lists back to an atomic vector (unchop()).
Code
library(tidyverse)
dat %>%
  chop(Class) %>%
  mutate(Class = map2(Class, lag(accumulate(Class, union)), setdiff)) %>%
  unchop(Class)
Output
# Year Class
#1 2007 a
#2 2007 b
#3 2007 c
#4 2007 d
#5 2008 e
#6 2008 f
#7 2009 g
Data
I have changed the names of the variables, capitalizing the first letter. It is against the concept of tidy data, and it bothers me a lot. However, the fact that you use the name class, which is the name of one of R's primitive functions, bothers me even more.
dat <- structure(
  list(
    Year = c(2007, 2007, 2007, 2007, 2008, 2008, 2009, 2009, 2009),
    Class = c("a", "b", "c", "d", "e", "f", "c", "d", "g")
  ),
  class = "data.frame", row.names = c(NA, -9L)
)
You can also use the following tidyverse solution. I would like to thank @ThomasIsCoding for the data:
library(dplyr)
library(purrr)
df %>%
  group_by(class) %>%
  mutate(dup = n() > 1) %>%
  group_split() %>%
  map_dfr(~ if (unique(.x$dup) & (.x$year[2] - .x$year[1]) == 1) {
    .x %>% slice_head(n = 1)
  } else {
    .x
  }) %>%
  select(-dup) %>%
  arrange(year)
# A tibble: 9 x 2
year class
<int> <chr>
1 2007 a
2 2007 b
3 2007 c
4 2007 d
5 2008 e
6 2008 f
7 2009 c
8 2009 d
9 2009 g

Finding minimum by groups and among columns

I am trying to find the minimum value among different columns and group.
A small sample of my data looks something like this:
group cut group_score_1 group_score_2
1 a 1 3 5.0
2 b 2 2 4.0
3 a 0 2 2.5
4 b 3 5 4.0
5 a 2 3 6.0
6 b 1 5 1.0
I want to group by group and, for each group, find the row which contains the minimum score across both group-score columns, and also get the name of the column which contains that minimum (group_score_1 or group_score_2),
so basically my result should be something like this:
group cut group_score_1 group_score_2
1 a 0 2 2.5
2 b 1 5 1.0
I tried a few ideas and eventually ended up dividing the data into several new data frames, filtering by group, selecting the relevant columns, and then using which.min(), but I'm sure there's a much more efficient way to do it. Not sure what I am missing.
We can use data.table methods
library(data.table)
# .I[which.min(...)] gives, per group, the global row index of the smallest
# row-wise minimum (pmin) across the group_score columns
setDT(df)[df[, .I[which.min(do.call(pmin, .SD))],
             group, .SDcols = patterns('^group_score')]$V1]
# group cut group_score_1 group_score_2
#1: a 0 2 2.5
#2: b 1 5 1.0
For each group, you can calculate the min value and select the row in which that value exists in one of the columns.
library(dplyr)
df %>%
  group_by(group) %>%
  filter({tmp = min(group_score_1, group_score_2);
          group_score_1 == tmp | group_score_2 == tmp})
# group cut group_score_1 group_score_2
# <chr> <int> <int> <dbl>
#1 a 0 2 2.5
#2 b 1 5 1
The above works well when you have only two group_score columns. If you have many such columns, it is not possible to list each one of them as group_score_1 == tmp | group_score_2 == tmp, etc. In that case, get the data in long format, get the cut value corresponding to the minimum value, and join the data back. This assumes cut is unique in each group.
df %>%
  tidyr::pivot_longer(cols = starts_with('group_score')) %>%
  group_by(group) %>%
  summarise(cut = cut[which.min(value)]) %>%
  left_join(df, by = c("group", "cut"))
Here is a base R option using pmin + ave + subset
subset(
  df,
  as.logical(ave(
    do.call(pmin, df[grep("group_score_\\d+", names(df))]),
    group,
    FUN = function(x) x == min(x)
  ))
)
which gives
group cut group_score_1 group_score_2
3 a 0 2 2.5
6 b 1 5 1.0
Data
> dput(df)
structure(list(group = c("a", "b", "a", "b", "a", "b"), cut = c(1L,
2L, 0L, 3L, 2L, 1L), group_score_1 = c(3L, 2L, 2L, 5L, 3L, 5L
), group_score_2 = c(5, 4, 2.5, 4, 6, 1)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6"))

Set name of variable defined in formula

This just popped into my head. Let's take this example from a recent question:
data:
df1<-
structure(list(Year = c(2015L, 2015L, 2015L, 2015L, 2016L, 2016L,
2016L, 2016L), Category = c("a", "1", "2", "3", "1", "2", "3",
"1"), Value = c(2L, 3L, 2L, 1L, 7L, 2L, 1L, 1L)), row.names = c(NA,
-8L), class = "data.frame")
code:
aggregate( Value ~ Year + c(MY_NAME = c("OneTwo", "three")[Category %in% 1:2 + 1]), data=df1, FUN=sum )
current output (note the long, ugly name of the new variable):
# Year c(MY_NAME = c("OneTwo", "three")[Category %in% 1:2 + 1]) Value
#1 2015 OneTwo 3
#2 2016 OneTwo 1
#3 2015 three 5
#4 2016 three 10
desired output:
# Year MY_NAME Value
#1 2015 OneTwo 3
#2 2016 OneTwo 1
#3 2015 three 5
#4 2016 three 10
please note:
One could (possibly should) declare a new variable.
This question is about how to set the name of the new variable DIRECTLY by adding code to the one-liner in code: section.
Instead of c, we need cbind, which results in a one-column matrix with the column name 'MY_NAME', whereas c gives a named vector whose names are made unique (make.unique) from 'MY_NAME':
aggregate(Value ~ Year +
            cbind(MY_NAME = c("OneTwo", "three")[Category %in% 1:2 + 1]),
          data = df1, FUN = sum)
# Year MY_NAME Value
#1 2015 OneTwo 3
#2 2016 OneTwo 1
#3 2015 three 5
#4 2016 three 10
In ?aggregate, the use of cbind in the formula method is mentioned:
formula - a formula, such as y ~ x or cbind(y1, y2) ~ x1 + x2, where
the y variables are numeric data to be split into groups according to
the grouping x variables (usually factors).
An option with tidyverse would be
library(dplyr)
df1 %>%
  group_by(Year, MY_NAME = c("OneTwo", "three")[Category %in% 1:2 + 1]) %>%
  summarise(Value = sum(Value))
1) aggregate.data.frame Use aggregate.data.frame rather than aggregate.formula:
by <- with(df1,
  list(
    Year = Year,
    MY_NAME = c("OneTwo", "three")[Category %in% 1:2 + 1]
  )
)
aggregate(df1["Value"], by, FUN = sum)
giving:
Year MY_NAME Value
1 2015 OneTwo 3
2 2016 OneTwo 1
3 2015 three 5
4 2016 three 10
2) 2 step It might be a bit cleaner to split this into two parts: (1) create a new data frame in which Category is transformed, and (2) perform the aggregate.
df2 <- transform(df1, MY_NAME = c("OneTwo", "three")[Category %in% 1:2 + 1])
aggregate(Value ~ Year + MY_NAME, df2, sum)
2a) or expressing (2) in terms of a magrittr pipeline:
library(magrittr)
df1 %>%
  transform(MY_NAME = c("OneTwo", "three")[Category %in% 1:2 + 1]) %>%
  aggregate(Value ~ Year + MY_NAME, ., sum)

Create columns from aggregated row data in R

I have a data frame that contains historical price returns. The data is organized with date columns and many asset columns (denoted as A1, A2, ...). Each asset column contains price return data for each unique historical date. I would like to process this data to create a data frame with many asset columns and only one row of data, with the data row containing the aggregate/average of the rows for the new columns. The new columns need headers that are the original asset name concatenated with date information. A simplified example of the original data follows:
> df <- read.csv("data.csv", header=T)
> df
Year Month A1 A2 A3
1 2015 Jan 1 1 1
2 2015 Feb 2 2 2
3 2015 Mar 3 3 3
4 2016 Jan 1 1 1
5 2016 Feb 2 2 2
6 2016 Mar 3 3 3
I used simple repeating numbers for the returns here. I am using a function that requires the data to be organized as follows:
> df2 <- read.csv("data2.csv", header=T)
> df2
Returns A1.Jan A1.Feb A1.Mar A2.Jan A2.Feb A2.Mar A3.Jan A3.Feb A3.Mar
1 Average 1 2 3 1 2 3 1 2 3
For clarity, A1.Jan contains the average of all years' Jan returns. Thanks in advance for the insight and/or solution.
Take a look at the base function reshape. This is basically the same task as is solved by the last example on its help page:
reshape(df, idvar="Year", direction="wide", timevar="Month")
Year A1.Jan A2.Jan A3.Jan A1.Feb A2.Feb A3.Feb A1.Mar A2.Mar A3.Mar
1 2015 1 1 1 2 2 2 3 3 3
4 2016 1 1 1 2 2 2 3 3 3
You wanted the Year variable to remain as a column identifier but wanted the Month variable to act as a sequence that gets spread "wide".
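To also get the single "Average" row the question asks for, one possible follow-up (a sketch, assuming the three asset columns A1:A3) is to aggregate by Month first and then reshape wide; the column order follows the sorted Month values, so it may differ from the target:
# average each asset by month, then spread months wide
avg <- aggregate(cbind(A1, A2, A3) ~ Month, data = df, FUN = mean)
reshape(transform(avg, Returns = "Average"),
        idvar = "Returns", direction = "wide", timevar = "Month")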
With data.table you can do
library(data.table)
setDT(df)
df[, lapply(.SD, mean), .SDcols = names(df)[grep("^A", names(df))], by = Month
][, Returns := "Average"
][, melt(.SD, id = c("Month", "Returns"))
][, dcast(.SD, Returns ~ variable + Month, value.var = 'value', sep = ".")]
# Returns A1.Feb A1.Jan A1.Mar A2.Feb A2.Jan A2.Mar A3.Feb A3.Jan A3.Mar
#1: Average 2 1 3 2 1 3 2 1 3
In the first line we aggregate the data by Month. The part names(df)[grep("^A", names(df))] ensures that we only aggregate variables that start with the letter "A".
The second line creates variable Returns that contains the value "Average".
melt gathers your data into long format and dcast finally spreads it into the desired output.
data
df <- structure(list(Year = c(2015L, 2015L, 2015L, 2016L, 2016L, 2016L
), Month = c("Jan", "Feb", "Mar", "Jan", "Feb", "Mar"), A1 = c(1L,
2L, 3L, 1L, 2L, 3L), A2 = c(1L, 2L, 3L, 1L, 2L, 3L), A3 = c(1L,
2L, 3L, 1L, 2L, 3L)), .Names = c("Year", "Month", "A1", "A2",
"A3"), class = "data.frame", row.names = c("1", "2", "3", "4",
"5", "6"))
Here's a tidyverse solution. I factored the months so they can be ordered, then used tidyr::gather() to convert to long format so I could group by month with dplyr::group_by() and find the average with dplyr::summarise():
library(dplyr)
library(tidyr)
df <- read.table(text = "
Year Month A1 A2 A3
1 2015 Jan 1 1 1
2 2015 Feb 2 2 2
3 2015 Mar 3 3 3
4 2016 Jan 1 1 1
5 2016 Feb 2 2 2
6 2016 Mar 3 3 3", header = T) %>%
tbl_df()
df$Month <- df$Month %>%
factor(levels = format(ISOdate(2000, 1:12, 1), "%b"))
df_tidy <- df %>%
  gather(asset, value, -Year, -Month) %>%
  group_by(Month, asset) %>%
  summarise(Average = mean(value)) %>%
  arrange(asset, Month)
df_tidy
# # A tibble: 9 x 3
# # Groups: Month [3]
# Month asset Average
# <fct> <chr> <dbl>
# 1 Jan A1 1
# 2 Feb A1 2
# 3 Mar A1 3
# 4 Jan A2 1
# 5 Feb A2 2
# 6 Mar A2 3
# 7 Jan A3 1
# 8 Feb A3 2
# 9 Mar A3 3
# convert to wide format, as in OP - not sure of 'easy' way
# to order columns by asset.month other than using 'select()'
# (it currently sorts alphabetically).
df_tidy %>%
  unite(Returns, c(asset, Month), sep = ".") %>%
  spread(Returns, Average)
# # A tibble: 1 x 9
# A1.Feb A1.Jan A1.Mar A2.Feb A2.Jan A2.Mar A3.Feb A3.Jan A3.Mar
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2 1 3 2 1 3 2 1 3

Count # of IDs that meet both criteria

I have a dataset that has two columns. One is userid, the other is company type, like below:
userid company.type
1 A
2 A
3 C
1 B
2 B
3 B
4 A
I want to know how many unique userids there are that have company.type of A and B, or A and C (but not B and C).
I'm assuming it's some sort of aggregate function, but I'm not sure how to place the qualifier that company.type has to be A and B, or A and C only.
We can do this with base R using table
tbl <- table(df1) > 0
# columns of tbl are the company types in alphabetical order: A, B, C
sum(((tbl[, 1] & tbl[, 2]) | (tbl[, 1] & tbl[, 3])) & (!(tbl[, 2] & tbl[, 3])))
#[1] 2
Here's an idea with dplyr. setequal checks if two vectors are composed of the same elements, regardless of ordering:
library(dplyr)
df %>%
  group_by(userid) %>%
  summarize(temp = setequal(company.type, c("A", "B")) |
                   setequal(company.type, c("A", "C"))) %>%
  pull(temp) %>%
  sum()
# [1] 2
Data:
df <- structure(list(userid = c(1L, 2L, 3L, 1L, 2L, 3L, 4L), company.type = c("A",
"A", "C", "B", "B", "B", "A")), .Names = c("userid", "company.type"
), class = "data.frame", row.names = c(NA, -7L))
See: Check whether two vectors contain the same (unordered) elements in R
Sort DF and reduce it to one row per userid, with a types column consisting of a comma-separated string of company types. Then filter it using the indicated condition. Finally, use tally to get the number of rows left after filtering. To see the details, omit the tally line.
library(dplyr)
DF %>%
  arrange(userid, company.type) %>%
  group_by(userid) %>%
  summarize(types = toString(company.type)) %>%
  ungroup %>%
  filter(grepl("A.*B|A.*C", types) & !grepl("B.*C", types)) %>%
  tally
giving:
# A tibble: 1 x 1
n
<int>
1 2
Note
The input used, in reproducible form, is:
Lines <- "userid company.type
1 A
2 A
3 C
1 B
2 B
3 B
4 A"
DF <- read.table(text = Lines, header = TRUE)
