Filtering using dplyr package - r

My dataset is set up as follows:
User Day
10 2
1 3
15 1
3 1
1 2
15 3
1 1
I'm trying to find out which users are present on all three days. I'm using the code below with the dplyr package:
MAU%>%
group_by(User)%>%
filter(c(1,2,3) %in% Day)
# but get this error message:
# Error in filter_impl(.data, quo) : Result must have length 12, not 3
any idea how to fix?

Using the input shown reproducibly in the Note at the end, remove duplicate rows, count the remaining rows (i.e. the distinct days) for each User, and keep the Users that have 3 days:
library(dplyr)
# De-duplicate (User, Day) rows so that each remaining row is one distinct
# day for a user, then keep the users counted on exactly 3 days.
DF %>%
  distinct() %>%
  count(User) %>%
  filter(n == 3) %>%
  select(User)
giving:
# A tibble: 1 x 1
User
<int>
1 1
Note
Lines <- "
User Day
10 2
1 3
15 1
3 1
1 2
15 3
1 1"
DF <- read.table(text = Lines, header = TRUE)

We can use all to get a single TRUE/FALSE from the logical vector 1:3 %in% Day
library(dplyr)
# all() collapses the per-day membership test into a single TRUE/FALSE per
# User, which lets filter() keep or drop the whole group at once.
MAU %>%
  group_by(User) %>%
  filter(all(c(1L, 2L, 3L) %in% Day))
# A tibble: 3 x 2
# Groups: User [1]
# User Day
# <int> <int>
#1 1 3
#2 1 2
#3 1 1
data
MAU <- structure(list(User = c(10L, 1L, 15L, 3L, 1L, 15L, 1L), Day = c(2L,
3L, 1L, 1L, 2L, 3L, 1L)), class = "data.frame", row.names = c(NA,
-7L))

Related

Creating loop to count the number of unique values in column based on values in another column

So, for example, I have the following dataframe, data:
col1
col2
1
5
1
5
1
3
2
10
2
11
3
11
Now, I want to make a new column, col3, which gives me the number of unique values in col2 for every grouping in col1.
So far, I have the following code:
length(unique(data$col2[data$col1 == 1]))
Which would here return the number 2.
However, I'm having a hard time making a loop that goes through all the values in col1 to create the new column, col3.
We can use n_distinct after grouping
library(dplyr)
# Attach to every row the number of distinct col2 values in its col1 group,
# then drop the grouping so later steps operate on the whole table.
data <- data %>%
  group_by(col1) %>%
  mutate(col3 = n_distinct(col2)) %>%
  ungroup()
-output
data
# A tibble: 6 × 3
col1 col2 col3
<int> <int> <int>
1 1 5 2
2 1 5 2
3 1 3 2
4 2 10 2
5 2 11 2
6 3 11 1
Or with data.table
library(data.table)
setDT(data)[, col3 := uniqueN(col2), col1]
data
data <- structure(list(col1 = c(1L, 1L, 1L, 2L, 2L, 3L), col2 = c(5L,
5L, 3L, 10L, 11L, 11L)), class = "data.frame", row.names = c(NA,
-6L))
You want the counts for every row, so using a for loop you would do
# Create the column first, then fill it one row at a time.
data$col3 <- NA_real_
for (idx in seq_len(nrow(data))) {
  # Rows sharing this row's col1 value make up its group.
  in_group <- data$col1 == data$col1[idx]
  data$col3[idx] <- length(unique(data$col2[in_group]))
}
data
# col1 col2 col3
# 1 1 5 2
# 2 1 5 2
# 3 1 3 2
# 4 2 10 2
# 5 2 11 2
# 6 3 11 1
However, using for loops in R is mostly inefficient, and in this case we can use the grouping function ave which comes with R.
# ave() applies the function within each col1 group and returns a vector as
# long as col2, so each row receives its own group's unique-value count.
data <- transform(
  data,
  col3 = ave(col2, col1, FUN = function(x) length(unique(x)))
)
data
# col1 col2 col3
# 1 1 5 2
# 2 1 5 2
# 3 1 3 2
# 4 2 10 2
# 5 2 11 2
# 6 3 11 1
Data:
data <- structure(list(col1 = c(1L, 1L, 1L, 2L, 2L, 3L), col2 = c(5L,
5L, 3L, 10L, 11L, 11L)), class = "data.frame", row.names = c(NA,
-6L))

How to keep the true uniques row? [duplicate]

This question already has answers here:
How can I remove all duplicates so that NONE are left in a data frame?
(3 answers)
Closed 12 months ago.
Here an example of a matrix,
A
B
C
1
1
1
1
1
4
1
2
4
2
1
1
3
1
1
3
1
2
I would like to extract only the rows whose combination of A and B is unique.
I can't use unique, duplicated, etc., because they always retain one of my duplicated rows.
In the final result I wish to obtain:
A
B
C
1
2
4
2
1
1
How can I do it?
Thank you
Here are couple of options -
Base R -
# Keep a row only if its (A, B) pair is not flagged as a duplicate when
# scanning from the top and not flagged when scanning from the bottom,
# i.e. the pair occurs exactly once.
cols <- c("A", "B")
res <- df[!duplicated(df[cols]) & !duplicated(df[cols], fromLast = TRUE), ]
res
# A B C
#3 1 2 4
#4 2 1 1
dplyr -
library(dplyr)
# Keep only the (A, B) groups that consist of a single row.
df %>%
  group_by(A, B) %>%
  filter(n() == 1) %>%
  ungroup()
# A tibble: 2 x 3
# A B C
# <int> <int> <int>
#1 1 2 4
#2 2 1 1
data.table
df <- data.frame(
A = c(1L, 1L, 1L, 2L, 3L, 3L),
B = c(1L, 1L, 2L, 1L, 1L, 1L),
C = c(1L, 4L, 4L, 1L, 1L, 2L)
)
library(data.table)
setDT(df)[, .SD[.N == 1], by = list(A, B)]
#> A B C
#> 1: 1 2 4
#> 2: 2 1 1
Created on 2022-02-28 by the reprex package (v2.0.1)

Cumulative sums by month in R

I want to transform my data from this
Month Expenditures
1 1
1 2
2 3
2 6
3 2
3 5
to this:
Month Cumulative_expenditures
1 3
2 12
3 19
, but can't seem to figure out how to do it.
I tried using the cumsum() function, but it counts each observation - it doesn't distinguish between groups.
Any help would be much appreciated!
A two-step base R solution would be:
# Total the expenditures within each month, then replace the monthly totals
# with their running total across months.
df1 <- transform(
  aggregate(Expenditures ~ Month, data = mydf, FUN = sum),
  Expenditures = cumsum(Expenditures)
)
Output:
Month Expenditures
1 1 3
2 2 12
3 3 19
Some data used:
#Data
mydf <- structure(list(Month = c(1L, 1L, 2L, 2L, 3L, 3L), Expenditures = c(1L,
2L, 3L, 6L, 2L, 5L)), class = "data.frame", row.names = c(NA,
-6L))
Using dplyr:
library(dplyr)
df %>%
group_by(Month) %>%
summarise(Expenditures = sum(Expenditures), .groups = "drop") %>%
mutate(Expenditures = cumsum(Expenditures))
#> # A tibble: 3 x 2
#> Month Expenditures
#> <int> <int>
#> 1 1 3
#> 2 2 12
#> 3 3 19
Or in base R:
# Sum expenditures per month with tapply(), then accumulate across months.
# The column is referenced by its full name (Expenditures) instead of relying
# on `$` partial matching, which does not work on tibbles.
# tapply() returns its groups in sorted order, so Month is sorted as well to
# keep labels and totals aligned when the input is not ordered by Month.
data.frame(
  Month = sort(unique(df$Month)),
  Expenditure = cumsum(tapply(df$Expenditures, df$Month, sum))
)
#> Month Expenditure
#> 1 1 3
#> 2 2 12
#> 3 3 19
Here is another base R option using subset + ave
# Replace Expenditures with its running total over all rows, then keep only
# the last row of each Month. That row holds the cumulative total through
# that month, assuming the rows are ordered by Month.
subset(
transform(df, Expenditures = cumsum(Expenditures)),
# Logical mask that is TRUE only at the final position within each Month.
ave(rep(FALSE, nrow(df)), Month, FUN = function(x) seq_along(x) == length(x))
)
which gives
Month Expenditures
2 1 3
4 2 12
6 3 19
We can use base R
out <- with(df1, rowsum(Expenditures, Month))
data.frame(Month = row.names(out), Expenditure = cumsum(out))
# Month Expenditure
#1 1 3
#2 2 12
#3 3 19
Or more compactly
with(df1, stack(cumsum(rowsum(Expenditures, Month)[,1])))[2:1]
data
df1 <- structure(list(Month = c(1L, 1L, 2L, 2L, 3L, 3L), Expenditures = c(1L,
2L, 3L, 6L, 2L, 5L)), class = "data.frame", row.names = c(NA,
-6L))

Subset data from the last time a condition is met till the last row of a data frame - applied to each subject

I have a data frame like this:
ID TIME AMT CONC
1 0 10 2
1 1 0 1
1 5 20 15
1 10 0 30
1 12 0 16
I want to subset data for each subject ID, from the last time when AMT > 0 till the last row of the data frame for that individual.
output should be this:
ID TIME AMT CONC
1 5 20 15
1 10 0 30
1 12 0 16
I am using RStudio.
We can use slice and create a sequence between the max index where AMT > 0 and the last index for each ID.
library(dplyr)
# For each ID keep the rows from the last AMT > 0 through the group's end.
# A group with no AMT > 0 yields no rows: without the guard,
# max(which(AMT > 0)) would be -Inf there and the `:` sequence would fail.
df %>%
  group_by(ID) %>%
  slice(
    if (any(AMT > 0, na.rm = TRUE)) max(which(AMT > 0)):n() else integer(0)
  )
# ID TIME AMT CONC
# <int> <int> <int> <int>
#1 1 5 20 15
#2 1 10 0 30
#3 1 12 0 16
We can use filter
library(dplyr)
df %>%
group_by(ID) %>%
mutate(ind = cumsum(AMT > 0)) %>%
filter(ind == max(ind), ind > 0) %>%
select(-ind)
# A tibble: 3 x 4
# Groups: ID [1]
# ID TIME AMT CONC
# <int> <int> <int> <int>
#1 1 5 20 15
#2 1 10 0 30
#3 1 12 0 16
NOTE: This also works well when all the elements of 'AMT' are 0 for a particular group (that group is then dropped)
df$ID[4:5] <- 2
df$AMT <- 0
df$AMT[4:5] <- c(1, 0)
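Or another option in fewer steps (note that for a group in which AMT is never greater than 0, which.max() returns 1, so this version keeps all rows of that group)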
df %>%
group_by(ID) %>%
filter(row_number() >= which.max(cumsum(AMT > 0)))
data
df <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L), TIME = c(0L, 1L, 5L,
10L, 12L), AMT = c(10L, 0L, 20L, 0L, 0L), CONC = c(2L, 1L, 15L,
30L, 16L)), class = "data.frame", row.names = c(NA, -5L))

counting number of times an id has duplicated years

I have the following data frame:
df =
id Year Value
1 1 3
1 2 4
2 1 6
2 2 2
2 2 3
3 1 7
3 2 3
I want to count the number of times an individual id has a duplicating year.
Desired Outcome:
1
Id 2 has year 2 twice, that's why 1 is the outcome
So far I have tried:
library("dplyr")
df %>% group_by(id, Year) %>% summarize(count=n())
but I cannot get a single number with the count
Cheers
We can use table and create counts of observation for each id and year and then calculate the ones which occur more than 1 time.
sum(table(df$id, df$Year) > 1)
#[1] 1
Just for completion, if we want to do this in dplyr
library(dplyr)
# Count rows per (id, Year) pair, then count the pairs seen more than once.
# .groups = "drop" returns an ungrouped result directly, so no separate
# ungroup() step is needed and summarise() emits no regrouping message.
df %>%
  group_by(id, Year) %>%
  summarise(count = n(), .groups = "drop") %>%
  summarise(new_count = sum(count > 1))
# new_count
# <int>
#1 1
Just for fun:
data.table solution:
data:
dt<-
fread("id Year Value
1 1 3
1 2 4
2 1 6
2 2 2
2 2 3
3 1 7
3 2 3")
code:
# .N > 1 flags each (id, Year) pair that occurs more than once; data.table
# names the unnamed result column V1. Plain sum() is used so the snippet does
# not depend on a package providing %>% being attached.
sum(dt[, .N > 1, by = c("id", "Year")]$V1)
A (fast) alternative:
# Count the ids that contain at least one repeated Year.
# vapply() always returns a logical vector, so the sum is 0 for a data frame
# with no rows, where sapply() would return an empty list and sum() would fail.
sum(vapply(split(df$Year, df$id), function(x) any(duplicated(x)), logical(1)))
Where:
df <- data.frame(
id = c(1L, 1L, 2L, 2L, 2L, 3L, 3L),
Year = c(1L, 2L, 1L, 2L, 2L, 1L, 2L),
Value = c(3L, 4L, 6L, 2L, 3L, 7L, 3L)
)

Resources