Counting the number of times an id has duplicated years - R

I have the following data frame:
df =
  id Year Value
   1    1     3
   1    2     4
   2    1     6
   2    2     2
   2    2     3
   3    1     7
   3    2     3
I want to count the number of times an individual id has a duplicated year.
Desired Outcome:
1
Id 2 has year 2 twice, which is why the outcome is 1.
So far I have tried:
library("dplyr")
df %>% group_by(id, Year) %>% summarize(count=n())
but I cannot get a single number out of the counts.
Cheers

We can use table to create counts of observations for each id and year, and then count the combinations that occur more than once.
sum(table(df$id, df$Year) > 1)
#[1] 1
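For reference, the intermediate contingency table for the example data looks roughly like this; the > 1 comparison is TRUE for exactly one cell (id 2 in Year 2):
table(df$id, df$Year)
#     1 2
#   1 1 1
#   2 1 2
#   3 1 1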
For completeness, if we want to do this in dplyr:
library(dplyr)
df %>%
  group_by(id, Year) %>%
  summarise(count = n()) %>%
  ungroup() %>%
  summarise(new_count = sum(count > 1))
#   new_count
#       <int>
# 1         1

Just for fun:
data.table solution:
data:
library(data.table)
dt <- fread("id Year Value
1 1 3
1 2 4
2 1 6
2 2 2
2 2 3
3 1 7
3 2 3")
code:
# note: the %>% pipe needs magrittr (or dplyr) to be attached
dt[, .N > 1, by = c("id", "Year")]$V1 %>% sum

A (fast) alternative:
sum(sapply(split(df$Year, df$id), function(x) any(duplicated(x))))
Where:
df <- data.frame(
  id    = c(1L, 1L, 2L, 2L, 2L, 3L, 3L),
  Year  = c(1L, 2L, 1L, 2L, 2L, 1L, 2L),
  Value = c(3L, 4L, 6L, 2L, 3L, 7L, 3L)
)
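To see what this does with the example data, split() groups the years by id and duplicated() flags repeats within each group; only id 2 has one, so the sum is 1. Roughly:
split(df$Year, df$id)
# $`1`
# [1] 1 2
# $`2`
# [1] 1 2 2
# $`3`
# [1] 1 2
sapply(split(df$Year, df$id), function(x) any(duplicated(x)))
#     1     2     3
# FALSE  TRUE FALSE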

Related

How to remove rows of dataframe after a particular string is detected, using filter and dplyr

I have data like the example below. For each participant, if a particular string ("trial_end") is present in the my_strings column, I would like to remove all the rows after its appearance.
library(dplyr)
library(stringr)
library(tibble)
df1 <- tibble::tribble(
  ~participant_id, ~timestamp, ~my_strings,
  1L, 1L, "other_string",
  1L, 2L, "other_string",
  1L, 3L, "trial_end",
  1L, 4L, "other_string",
  2L, 1L, "other_string",
  2L, 2L, "other_string",
  2L, 3L, "other_string",
  2L, 4L, "other_string",
  3L, 1L, "other_string",
  3L, 2L, "trial_end",
  3L, 3L, "other_string",
  3L, 4L, "other_string"
)
My first attempt was to use str_detect to look for the presence of the string and which() to provide the row number, then use filter to keep only that row and all those before it:
df2 <- df1 %>%
  group_by(participant_id) %>%
  filter(row_number() < (which(str_detect(my_strings, "trial_end"))) + 1)
This seems to throw an error when the string is not detected (e.g. participant 2 in the example here).
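(I think this is because which() returns integer(0) for a group with no match, so the comparison inside filter() has length zero instead of one value per row:)
which(str_detect(c("other_string", "other_string"), "trial_end"))
# integer(0)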
My next attempt was to add a conditional if_else, trying to effectively say 'IF the target string is detected THEN remove all the rows after it for that participant, ELSE if the string is not detected, do nothing'.
df3 <- df1 %>%
  group_by(participant_id) %>%
  if_else(str_detect(my_strings, "trial_end"),
          filter(row_number() < (which(str_detect(my_strings, "trial_end"))) + 1),
          filter(timestamp < max(timestamp)))
This also returned an error:
Error: condition must be a logical vector, not a grouped_df/tbl_df/tbl/data.frame object.
My final attempt tried to make use of another answer already here, by placing the conditional if/else inside filter, but this too produced an error.
df4 <- df1 %>%
group_by(participant_id) %>%
filter(if(str_detect(my_strings, "trial_end") < (which(str_detect(my_strings, "trial_end")) + 1))
else < n())
Can anyone point out the best approach for this problem? Is filter the wrong way to go about it?
Many thanks.
For clarity, the desired outcome would look like this:
desired_output <- tibble::tribble(
  ~participant_id, ~timestamp, ~my_strings,
  1L, 1L, "other_string",
  1L, 2L, "other_string",
  1L, 3L, "trial_end",
  2L, 1L, "other_string",
  2L, 2L, "other_string",
  2L, 3L, "other_string",
  2L, 4L, "other_string",
  3L, 1L, "other_string",
  3L, 2L, "trial_end"
)
One option could be:
df1 %>%
  group_by(participant_id) %>%
  slice(if (all(my_strings != "trial_end")) 1:n() else 1:which(my_strings == "trial_end"))
  participant_id timestamp my_strings
           <int>     <int> <chr>
1              1         1 other_string
2              1         2 other_string
3              1         3 trial_end
4              2         1 other_string
5              2         2 other_string
6              2         3 other_string
7              2         4 other_string
8              3         1 other_string
9              3         2 trial_end
You could compute the cumulative sum of matches up to the prior row and filter to include only rows up to and including the first match per participant:
df1 %>%
  group_by(participant_id) %>%
  filter(lag(cumsum(my_strings == "trial_end"), default = 0) < 1) %>%
  ungroup()
# A tibble: 9 x 3
  participant_id timestamp my_strings
           <int>     <int> <chr>
1              1         1 other_string
2              1         2 other_string
3              1         3 trial_end
4              2         1 other_string
5              2         2 other_string
6              2         3 other_string
7              2         4 other_string
8              3         1 other_string
9              3         2 trial_end
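To see why the trial_end row itself is kept, here is roughly what the intermediate vectors look like for participant 1:
# my_strings == "trial_end"            -> FALSE FALSE  TRUE FALSE
# cumsum(my_strings == "trial_end")    -> 0 0 1 1
# lag(..., default = 0)                -> 0 0 0 1  (only the last row fails < 1)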
You can write a small helper function to drop rows after a string is detected; if the string is not detected, it does not drop anything.
library(dplyr)
drop_string_after <- function(string_vec, string) {
  i <- match(string, string_vec)
  if (is.na(i)) seq_along(string_vec) else seq_len(i)
}
and apply this function for each participant:
df1 %>%
  group_by(participant_id) %>%
  slice(drop_string_after(my_strings, 'trial_end')) %>%
  ungroup
#   participant_id timestamp my_strings
#            <int>     <int> <chr>
# 1              1         1 other_string
# 2              1         2 other_string
# 3              1         3 trial_end
# 4              2         1 other_string
# 5              2         2 other_string
# 6              2         3 other_string
# 7              2         4 other_string
# 8              3         1 other_string
# 9              3         2 trial_end
To use filter instead, you need to change the return value of the function:
drop_string_after <- function(string_vec, string) {
  i <- match(string, string_vec)
  if (is.na(i)) TRUE else row_number() <= i
}
df1 %>%
  group_by(participant_id) %>%
  filter(drop_string_after(my_strings, 'trial_end')) %>%
  ungroup
A base R for loop alternative: blank out every row that comes after a participant's "trial_end" and then drop the blanked rows.
# remember which participant has already hit "trial_end"
critpart <- 0
for (i in 1:nrow(df1)) {
  # skip rows that have already been blanked out
  if (is.na(df1$participant_id[i])) next
  if (df1$my_strings[i] == "trial_end") {
    critpart <- df1$participant_id[i]
    next
  }
  # rows for that participant after the match are set to NA
  if (df1$participant_id[i] == critpart) {
    df1[i, ] <- NA
  }
}
df1 <- df1 %>% filter(!is.na(participant_id))

Cumulative sums by month in R

I want to transform my data from this
Month Expenditures
    1            1
    1            2
    2            3
    2            6
    3            2
    3            5
to this:
Month Cumulative_expenditures
    1                       3
    2                      12
    3                      19
but I can't seem to figure out how to do it.
I tried using the cumsum() function, but it counts each observation - it doesn't distinguish between groups.
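For example (assuming the data frame above is called df), a plain cumsum() just gives one running total across all rows:
cumsum(df$Expenditures)
# [1]  1  3  6 12 14 19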
Any help would be much appreciated!
A two-step base R solution would be:
# Step 1: sum Expenditures by Month
df1 <- aggregate(Expenditures ~ Month, data = mydf, sum)
# Step 2: take the cumulative sum of the monthly totals
df1$Expenditures <- cumsum(df1$Expenditures)
Output:
  Month Expenditures
1     1            3
2     2           12
3     3           19
Some data used:
#Data
mydf <- structure(list(Month = c(1L, 1L, 2L, 2L, 3L, 3L), Expenditures = c(1L,
2L, 3L, 6L, 2L, 5L)), class = "data.frame", row.names = c(NA,
-6L))
Using dplyr:
library(dplyr)
df %>%
  group_by(Month) %>%
  summarise(Expenditures = sum(Expenditures), .groups = "drop") %>%
  mutate(Expenditures = cumsum(Expenditures))
#> # A tibble: 3 x 2
#>   Month Expenditures
#>   <int>        <int>
#> 1     1            3
#> 2     2           12
#> 3     3           19
Or in base R:
data.frame(Month = unique(df$Month),
           Expenditures = cumsum(tapply(df$Expenditures, df$Month, sum)))
#>   Month Expenditures
#> 1     1            3
#> 2     2           12
#> 3     3           19
Here is another base R option using subset + ave. It relies on the rows already being ordered by Month, so the running cumsum at the last row of each month equals the cumulative monthly total:
subset(
  transform(df, Expenditures = cumsum(Expenditures)),
  ave(rep(FALSE, nrow(df)), Month, FUN = function(x) seq_along(x) == length(x))
)
which gives
  Month Expenditures
2     1            3
4     2           12
6     3           19
We can use base R
out <- with(df1, rowsum(Expenditures, Month))
data.frame(Month = row.names(out), Expenditure = cumsum(out))
#   Month Expenditure
# 1     1           3
# 2     2          12
# 3     3          19
Or more compactly
with(df1, stack(cumsum(rowsum(Expenditures, Month)[,1])))[2:1]
data
df1 <- structure(list(Month = c(1L, 1L, 2L, 2L, 3L, 3L), Expenditures = c(1L,
2L, 3L, 6L, 2L, 5L)), class = "data.frame", row.names = c(NA,
-6L))

Counting patients that have not had medication [duplicate]

I have a dataframe in which patients have multiple observations of medication use over time. Some patients have consistently used medication and others have gaps; I am trying to count the patients who have never used medication.
I can't show the actual data but here is an example data frame of what I am working with.
patid meds
    1    0
    1    1
    1    1
    2    0
    2    0
    3    1
    3    1
    3    1
    4    0
    5    1
    5    0
So from this, two patients (2 and 4) never used medication. That is the count I'm looking for.
I'm fairly new to R and have no idea how to do this; any help would be appreciated.
Here is another alternative using the dplyr package: keep the distinct patid/meds combinations, sort so that the meds == 1 rows come first, and then keep only the patients whose first remaining row has meds == 0.
library(dplyr)
df <- data.frame(patid = c(1, 1, 1, 2, 2, 3, 3, 3, 4, 5, 5),
                 meds  = c(0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0))
df %>%
  distinct(patid, meds) %>%
  arrange(desc(meds)) %>%
  filter(meds == 0 & !duplicated(patid))
#   patid meds
# 1     2    0
# 2     4    0
Try this:
library(dplyr)
#Data
df <- structure(list(patid = c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L,
5L, 5L), meds = c(0L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-11L))
#Code
df %>%
  group_by(patid) %>%
  summarise(sum = sum(meds, na.rm = TRUE)) %>%
  filter(sum == 0)
# A tibble: 2 x 2
  patid   sum
  <int> <int>
1     2     0
2     4     0
A Base R solution could be
subset(aggregate(meds ~ patid, df, sum), meds == 0)
which returns
  patid meds
2     2    0
4     4    0

Filtering using dplyr package

My dataset is set up as follows:
User Day
  10   2
   1   3
  15   1
   3   1
   1   2
  15   3
   1   1
I'm trying to find the users that are present on all three days. I'm using the code below with the dplyr package:
MAU %>%
  group_by(User) %>%
  filter(c(1, 2, 3) %in% Day)
# but get this error message:
# Error in filter_impl(.data, quo) : Result must have length 12, not 3
Any idea how to fix this?
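(As far as I can tell, the error arises because the condition passed to filter() must have length 1 or one value per row of the group, while c(1, 2, 3) %in% Day always has length 3:)
c(1, 2, 3) %in% c(2, 3, 1, 1, 2, 3, 1)
# [1] TRUE TRUE TRUE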
Using the input shown reproducibly in the Note at the end, count the distinct User/Day rows and keep only the Users that appear on 3 days:
library(dplyr)
DF %>%
  distinct %>%
  count(User) %>%
  filter(n == 3) %>%
  select(User)
giving:
# A tibble: 1 x 1
   User
  <int>
1     1
Note
Lines <- "
User Day
10 2
1 3
15 1
3 1
1 2
15 3
1 1"
DF <- read.table(text = Lines, header = TRUE)
We can use all to collapse the logical vector 1:3 %in% Day into a single TRUE/FALSE per group:
library(dplyr)
MAU %>%
  group_by(User) %>%
  filter(all(1:3 %in% Day))
# A tibble: 3 x 2
# Groups: User [1]
#    User   Day
#   <int> <int>
# 1     1     3
# 2     1     2
# 3     1     1
data
MAU <- structure(list(User = c(10L, 1L, 15L, 3L, 1L, 15L, 1L), Day = c(2L,
3L, 1L, 1L, 2L, 3L, 1L)), class = "data.frame", row.names = c(NA,
-7L))

Finding the max number of occurrences from the available result

I have a dataframe which looks like this:
Id Result
 A      1
 B      2
 C      1
 B      1
 C      1
 A      2
 B      1
 B      2
 C      1
 A      1
 B      2
Now I need to calculate how many 1s and 2s there are for each Id and then select the value whose frequency of occurrence is the greatest.
Id Result
 A      1
 B      2
 C      1
How can I do that? I have tried using the table function but have not been able to use it effectively. Any help would be appreciated.
Here you can use aggregate in one step:
df <- structure(list(Id = structure(c(1L, 2L, 3L, 2L, 3L, 1L, 2L, 2L,
3L, 1L, 2L), .Label = c("A", "B", "C"), class = "factor"),
Result = c(1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L)),
.Names = c("Id", "Result"), class = "data.frame", row.names = c(NA, -11L)
)
# which.max() returns the index (1 or 2) of the larger count, which here
# coincides with the Result values themselves
res <- aggregate(Result ~ Id, df, FUN = function(x) which.max(c(sum(x == 1), sum(x == 2))))
res
Result:
  Id Result
1  A      1
2  B      2
3  C      1
With data.table you can try (df is your data.frame):
require(data.table)
dt <- as.data.table(df)
dt[, list(times = .N), by = list(Id, Result)][, list(Result = Result[which.max(times)]), by = Id]
# Id Result
#1: A 1
#2: B 2
#3: C 1
Using dplyr, you can try
library(dplyr)
df %>%
  group_by(Id, Result) %>%
  summarize(n = n()) %>%
  group_by(Id) %>%
  filter(n == max(n)) %>%
  summarize(Result = Result)
  Id Result
1  A      1
2  B      2
3  C      1
An option using table and ave
subset(as.data.frame(table(df)), ave(Freq, Id, FUN = max) == Freq, select = -3)
#   Id Result
# 1  A      1
# 3  C      1
# 5  B      2
