Return IDs for which a column satisfies a condition - r

I have a simple data frame consisting of (account) IDs and default rates for five years. Many default rates are missing. The data can be generated as follows:
ID  <- rep(1:50, each = 5)
def <- rnorm(n = 250, mean = 0.5, sd = 0.2)
ind <- which(def %in% sample(def, 100))  # pick 100 values to blank out
def[ind] <- NA
df  <- data.frame(ID = ID, Def = def)
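Since rnorm() and sample() are random, your values will differ from the output shown below on each run. To make the example reproducible, you could place a seed call before the block above (not part of the original post):
set.seed(1)  # any fixed seed makes the draws repeatable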
It looks like this:
> head(df, 20)
ID Def
1 1 0.39506938
2 1 NA
3 1 0.42946603
4 1 NA
5 1 NA
6 2 0.45125199
7 2 0.40519126
8 2 NA
9 2 0.65082718
10 2 NA
11 3 NA
12 3 0.46132736
13 3 0.06324983
14 3 0.72630862
15 3 0.63996092
16 4 0.72093890
17 4 NA
18 4 NA
19 4 0.61471461
20 4 0.51788498
How can I show the ID numbers for which at least 4 of the 5 default rates are not NA?

You may try:
library(dplyr)
df %>%
  group_by(ID) %>%
  dplyr::summarize(p = sum(!is.na(Def)) / n()) %>%
  filter(p >= 0.8) %>%  # each ID has 5 rows, so "at least 4 of 5" means p >= 0.8; > 0.8 would require all 5
  pull(ID)
[1] 2 8 9 11 13 17 20 23 25 27 28 29 33 38 44 45 47 49
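Equivalently, you can count the non-missing values directly instead of computing a proportion (a small variation on the same answer):
df %>%
  group_by(ID) %>%
  summarize(n_ok = sum(!is.na(Def))) %>%
  filter(n_ok >= 4) %>%
  pull(ID)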

If you need to keep the data frame you can do:
library(dplyr)
df |>
  group_by(ID) |>
  mutate(m = sum(is.na(Def))) |>
  filter(m <= 1) |>
  select(ID, Def)
EDIT:
This can be simplified further:
df |>
  group_by(ID) |>
  filter(sum(is.na(Def)) <= 1)
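In base R, the same set of IDs can be obtained with tapply() (a sketch, not from the original answers):
ok <- tapply(!is.na(df$Def), df$ID, sum)  # non-NA count per ID
as.integer(names(ok)[ok >= 4])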

Here is an alternative:
library(dplyr)
df %>%
  group_by(ID) %>%
  mutate(x = 5 - sum(is.na(Def))) %>%  # each ID has 5 rows, so x is the non-NA count
  slice(1) %>%
  filter(x >= 4) %>%
  pull(ID)

R rowwise replace the first instance of the minimum

How can I do the following:
1. replace all values < 6 with NA, then
2. if there is only one NA in the row, replace the first instance of the minimum value with -99?
Some data that includes an ID variable and a total column:
library(tidyverse)
df <- data.frame(id = c(1,2,3,4,5), a = c(10,12,4,17,3), b = c(9,12,3,20,6), c = c(2,2,10,10,10), d = c(12,16,12,10,12))
df$total <- apply(df[,c(2:5)], 1, sum)
Giving
id a b c d total
1 10 9 2 12 33
2 12 12 2 16 42
3 4 3 10 12 29
4 17 20 10 10 57
5 3 6 10 12 31
My desired output is
id a b c d total
1 10 -99 NA 12 33
2 -99 12 NA 16 42
3 NA NA 10 12 29
4 17 20 10 10 57
5 NA -99 10 12 31
My attempt
df_mod <- df %>%
  # Make values < 6 NA
  mutate(
    across(
      .cols = 'a':'total',
      ~ case_when(
        .x < 6 ~ as.numeric(NA),
        TRUE ~ .x
      )
    )
  ) %>%
  # Add a count of NAs per row
  rowwise() %>%
  mutate(Count_NA = sum(is.na(cur_data()))) %>%
  ungroup()
# Transpose and get row minimum
df_mod2 <- t(df_mod[, -c(1, ncol(df_mod))]) %>%
  apply(., 2, function(a) {
    min <- min(a, na.rm = TRUE)
  }) %>%
  cbind(df_mod, .) %>%
  rename(., min = .) %>%
  tibble(.)
# If Count_NA = 1, replace the first instance of the minimum
df_mod2 %>%
  rowwise() %>%
  mutate(
    across(
      .cols = 'a':'total',
      ~ case_when(
        Count_NA == 1 & .x == min ~ replace(.x, first(match(min, .x)), -99),
        TRUE ~ .x
      )
    )
  ) %>%
  select(-'Count_NA', -'min')
Which gives the following; note row id 2, where both instances of the tied minimum were replaced but only the first should have been:
id a b c d total
1 10 -99 NA 12 33
2 -99 -99 NA 16 42
3 NA NA 10 12 29
4 17 20 10 10 57
5 NA -99 10 12 31
Thanks
If you're willing to pivot rather than work rowwise, then this solution will work. The trick is to rank the values before any replacement: when exactly one value in a row becomes NA (its minimum), the value with original rank 2 is the first instance of the minimum that remains.
library(dplyr)
library(tidyr)  # for pivot_longer()/pivot_wider()
df %>%
  pivot_longer(names_to = 'col',
               values_to = 'val',
               -c(id, total)) %>%
  group_by(id) %>%
  mutate(val2 = rank(val, ties.method = 'first'),
         val = ifelse(val < 6, NA, val),
         val = ifelse(sum(is.na(val)) == 1 & val2 == 2, -99, val)) %>%
  select(-val2) %>%
  pivot_wider(names_from = col,
              values_from = val) %>%
  relocate(total, .after = "d")
Here's the result:
# A tibble: 5 × 6
# Groups: id [5]
id a b c d total
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 10 -99 NA 12 33
2 2 -99 12 NA 16 42
3 3 NA NA 10 12 29
4 4 17 20 10 10 57
5 5 NA -99 10 12 31
It is not clear what you mean by the '2nd' minimum value, because you replace the minimum value itself. You can use data.table:
library(data.table)
cols <- setdiff(names(df), "id")  # every column except id
setDT(df)[
  ,
  (cols) := transpose(
    lapply(
      transpose(lapply(.SD, function(x) fifelse(x < 6, NA_real_, x))),
      function(x) if (sum(is.na(x)) == 1) replace(x, which.min(x), -99) else x
    )
  ),
  .SDcols = cols
]
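For comparison, the same two steps in base R might look like this (a sketch, not from the original answers; it assumes df is still the original data frame, not the data.table from the block above):
vals <- df[-1]        # everything except id
vals[vals < 6] <- NA  # step 1: values < 6 become NA
fix_row <- function(x) {
  if (sum(is.na(x)) == 1) x[which.min(x)] <- -99  # step 2: first minimum gets -99
  x
}
df[-1] <- t(apply(vals, 1, fix_row))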

Counting all unique strings in a data frame containing strings and numeric values

I have a number of large data frames which have the occasional string value, and I would like to know what the unique string values are (ignoring the numeric values) and, if possible, count them.
df <- data.frame(1:16)
df$A <- c("Name",0,0,0,0,0,12,12,0,14,NA_real_,14,NA_real_,NA_real_,16,16)
df$B <- c(10,0,"test",0,12,12,12,12,0,14,NA_real_,14,16,16,16,16)
df$C <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14,16)
X1.16 A B C
1 1 Name 10 10
2 2 0 0 12
3 3 0 test 14
4 4 0 0 16
5 5 0 12 10
6 6 0 12 12
7 7 12 12 14
8 8 12 12 16
9 9 0 0 10
10 10 14 14 12
11 11 <NA> <NA> 14
12 12 14 14 16
13 13 <NA> 16 10
14 14 <NA> 16 12
15 15 16 16 14
16 16 16 16 16
I know I can use the count function in dplyr, but I have too many unique numeric values, so this is not a great solution. In the code below I was able to filter my data to retain only rows that contain an alphabetical character (although this isn't a solution either).
df %>% filter_all(any_vars(str_detect(., pattern = "[:alpha:]")))
X1.16 A B C
1 1 Name 10 10
2 3 0 test 14
My desired output would be something to the effect of:
Variable n
"Name" 1
"test" 1
You can get the string values with grep and count them using table:
stack(table(grep('[[:alpha:]]', unlist(df), value = TRUE)))[2:1]
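Unpacking that one-liner step by step (same logic, just spelled out):
chars <- grep('[[:alpha:]]', unlist(df), value = TRUE)  # values containing a letter
tab   <- table(chars)                                   # count of each unique string
stack(tab)[2:1]                                         # to a data frame, names first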
If you want a tidyverse answer, you can get the data in long format, keep only the rows with characters in them, and count those.
library(dplyr)
df %>%
  mutate(across(.fns = as.character)) %>%
  tidyr::pivot_longer(cols = everything()) %>%
  filter(grepl('[[:alpha:]]', value)) %>%
  count(value)
# value n
# <chr> <int>
#1 Name 1
#2 test 1
@Ronak and @akrun above beat me to the punch; my solution is very similar - with an extension if you want a count within columns.
library(dplyr)
library(tidyr)
library(stringr)
# Coerce to tibble for ease of reading
df <- df %>%
  as_tibble() %>%
  mutate(across(.fns = as.character))
df %>%
  pivot_longer(cols = everything()) %>%
  summarise(Variable = str_subset(value, "[:alpha:]")) %>%
  count(Variable, sort = TRUE)
# A tibble: 2 x 2
Variable n
<chr> <int>
1 Name 1
2 test 1
# str_subset() is a convenient wrapper around x[str_detect(x, pattern)]
Add some extra words to test
# Test on extra word counts - replace 12 and 14 with words
df2 <- df
df2[df2 == 12] <- 'Name'
df2[df2 == 14] <- 'test'
df2
df2 %>%
  pivot_longer(cols = everything()) %>%
  summarise(Variable = str_subset(value, "[:alpha:]")) %>%
  count(Variable, sort = TRUE)
# A tibble: 2 x 2
Variable n
<chr> <int>
1 Name 12
2 test 10
If you want counts by column
df2 %>%
  select(-1) %>%
  pivot_longer(everything(), names_to = 'col') %>%
  group_by(col) %>%
  summarise(Variable = str_subset(value, "[:alpha:]")) %>%
  count(col, Variable)
# A tibble: 6 x 3
# Groups: col [3]
col Variable n
<chr> <chr> <int>
1 A Name 3
2 A test 2
3 B Name 4
4 B test 3
5 C Name 4
6 C test 4
We can use filter with across
library(dplyr)
library(tidyr)
library(stringr)
library(purrr)
df %>%
  select(-1) %>%
  mutate(across(everything(), as.character)) %>%
  filter(across(everything(), ~ str_detect(., '[:alpha:]')) %>% reduce(`|`)) %>%
  pivot_longer(everything()) %>%
  filter(str_detect(value, '[:alpha:]')) %>%
  count(value)
# A tibble: 2 x 2
# value n
# <chr> <int>
#1 Name 1
#2 test 1
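In current dplyr (>= 1.0.4), the across()-plus-reduce() idiom inside filter() can be written more directly with if_any(); a sketch of the same pipeline, not part of the original answer:
df %>%
  select(-1) %>%
  mutate(across(everything(), as.character)) %>%
  filter(if_any(everything(), ~ str_detect(., '[:alpha:]'))) %>%
  pivot_longer(everything()) %>%
  filter(str_detect(value, '[:alpha:]')) %>%
  count(value)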

How to summarize several filter-mutates in a case_when in dplyr?

I want to calculate some quantiles for a variable and capture which quantile each observation belongs to in a new variable. I can do every step individually but wonder how to combine them; I tried several case_when versions, and none worked. How can I shorten the following?
paneldata <- data.frame(id = c(1,1,1,2,2,2,3,3,3,4,4,4), time = seq(1:3),
                        x = c(21,22,23,24,25,26,27,28,29,30,31,32))
quants <- boxplot(paneldata$x, outline = FALSE)$stats
library(dplyr)
paneldata %>%
  filter(x <= quants[2]) %>%
  mutate(quantile = 1)
paneldata %>%
  filter(x > quants[2] & x < quants[3]) %>%
  mutate(quantile = 2)
paneldata %>%
  filter(x > quants[3] & x < quants[4]) %>%
  mutate(quantile = 3)
paneldata %>%
  filter(x >= quants[4]) %>%
  mutate(quantile = 4)
You can do it all in a single step with ntile() from dplyr.
library(dplyr)
paneldata %>%
  mutate(quantile = ntile(x, 4))
id time x quantile
1 1 1 21 1
2 1 2 22 1
3 1 3 23 1
4 2 1 24 2
5 2 2 25 2
6 2 3 26 2
7 3 1 27 3
8 3 2 28 3
9 3 3 29 3
10 4 1 30 4
11 4 2 31 4
12 4 3 32 4
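Note that ntile() splits the rows into four equal-sized groups, which matches the desired output here but is not the same rule as the boxplot-stat cutoffs stored in quants. If you specifically want those cutoffs combined in a single case_when(), as the title asks, a sketch based on the four filter/mutate steps above might be:
paneldata %>%
  mutate(quantile = case_when(
    x <= quants[2]                ~ 1,
    x > quants[2] & x < quants[3] ~ 2,
    x > quants[3] & x < quants[4] ~ 3,
    x >= quants[4]                ~ 4
  ))
(As written, a value exactly equal to quants[3] falls through to NA, mirroring the gap in the original filters.)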

Summarize values by group, but keep original data

I am trying to figure out how to sum the values belonging to categories a and b within each level of the factor file, while also keeping the original data.
library(dplyr)
df <- data.frame(ID = 1:20, values = runif(20), category = rep(letters[1:5], 4),
                 file = as.factor(sort(rep(1:5, 4))))
ID values category file
1 1 0.65699229 a 1
2 2 0.70506478 b 1
3 3 0.45774178 c 1
4 4 0.71911225 d 1
5 5 0.93467225 e 1
6 6 0.25542882 a 2
7 7 0.46229282 b 2
8 8 0.94001452 c 2
9 9 0.97822643 d 2
10 10 0.11748736 e 2
11 11 0.47499708 a 3
12 12 0.56033275 b 3
13 13 0.90403139 c 3
14 14 0.13871017 d 3
15 15 0.98889173 e 3
16 16 0.94666823 a 4
17 17 0.08243756 b 4
18 18 0.51421178 c 4
19 19 0.39020347 d 4
20 20 0.90573813 e 4
so that
df[1,2] and df[2,2] are summed into category 'ab' for file 1,
df[6,2] and df[7,2] are summed into category 'ab' for file 2,
etc.
So far I have this:
df %>%
  filter(category %in% c('a', 'b')) %>%
  group_by(file) %>%
  summarise(values = sum(values))
Problem
I would like to change the category of the summed values to "ab" and append it to the original data frame in the same pipeline.
Desired output:
ID values category file
1 1 0.65699229 a 1
2 2 0.70506478 b 1
3 3 0.45774178 c 1
4 4 0.71911225 d 1
5 5 0.93467225 e 1
6 6 0.25542882 a 2
7 7 0.46229282 b 2
8 8 0.94001452 c 2
9 9 0.97822643 d 2
10 10 0.11748736 e 2
11 11 0.47499708 a 3
12 12 0.56033275 b 3
13 13 0.90403139 c 3
14 14 0.13871017 d 3
15 15 0.98889173 e 3
16 16 0.94666823 a 4
17 17 0.08243756 b 4
18 18 0.51421178 c 4
19 19 0.39020347 d 4
20 20 0.90573813 e 4
21 21 1.25486225 ab 1
22 22 1.87216325 ab 2
23 23 1.36548126 ab 3
This will get you the result:
df %>% bind_rows(
  df %>%
    filter(category %in% c('a', 'b')) %>%
    group_by(file) %>%
    mutate(values = sum(values), category = paste0(category, collapse = '')) %>%
    filter(row_number() == 1 & n() > 1)
) %>% mutate(ID = row_number())
By the way, the code to produce the data frame shown in the example is actually this one (note rep(1:4, 5) rather than rep(1:5, 4) for file):
df <- data.frame(ID = 1:20, values = runif(20), category = rep(letters[1:5], 4),
                 file = as.factor(sort(rep(1:4, 5))))
Now let's say you want to sum multiple columns; you can provide the column names in a vector:
cols <- c("values")  # columns to be summed
df %>% bind_rows(
  df %>%
    filter(category %in% c('a', 'b')) %>%
    group_by(file) %>%
    mutate_at(vars(cols), sum) %>%
    mutate(category = paste0(category, collapse = '')) %>%
    filter(row_number() == 1 & n() > 1)
) %>% mutate(ID = row_number())
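In current dplyr, mutate_at() is superseded; an across()/all_of() version of the same step might look like this (a sketch, not part of the original answer):
df %>% bind_rows(
  df %>%
    filter(category %in% c('a', 'b')) %>%
    group_by(file) %>%
    mutate(across(all_of(cols), sum),
           category = paste0(category, collapse = '')) %>%
    filter(row_number() == 1 & n() > 1)
) %>% mutate(ID = row_number())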
library(dplyr)
# here df1 is the example data frame from the question
df1 %>%
  filter(category %in% c('a', 'b')) %>%
  group_by(file) %>%
  filter(n_distinct(category) > 1) %>%
  summarise(values = sum(values)) %>%
  mutate(category = "ab",
         ID = max(df1$ID) + 1:n()) %>%
  bind_rows(df1, .)
#> Warning in bind_rows_(x, .id): binding factor and character vector,
#> coercing into character vector
#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector
#> ID values category file
#> 1 1 0.62585921 a 1
#> 2 2 0.61865851 b 1
#> 3 3 0.05274456 c 1
#> 4 4 0.68156961 d 1
#> ...
#> 19 19 0.43239411 d 5
#> 20 20 0.85886314 e 5
#> 21 21 1.24451773 ab 1
#> 22 22 0.99001810 ab 2
#> 23 23 1.25331943 ab 3
This data.table approach uses a self-join to get all of the possible two-character combinations.
library(data.table)
setDT(df)
df_self_join <- df[df, on = .(file), allow.cartesian = TRUE
                   ][category != i.category,
                     .(category = paste0(i.category, category), values = values + i.values, file)
                   ][order(category), .(ID = .I + nrow(df), values, category, file)]
rbindlist(list(df, df_self_join))
ID values category file
1: 1 0.76984382 a 1
2: 2 0.54311583 b 1
3: 3 0.23462016 c 1
4: 4 0.60179043 d 1
...
20: 20 0.03534223 e 5
21: 21 1.31295965 ab 1
22: 22 0.51666175 ab 2
23: 23 1.02305754 ab 3
24: 24 1.00446399 ac 1
25: 25 0.96910373 ac 2
26: 26 0.87795389 ac 4
#total of 80 rows
Here is a pretty close dplyr translation:
library(dplyr)
tib <- as_tibble(df)
inner_join(tib, tib, by = 'file') %>%
  filter(ID.x != ID.y) %>%
  transmute(category = paste0(category.x, category.y),
            values = values.x + values.y,
            file) %>%
  arrange(category) %>%
  bind_rows(tib, .) %>%
  mutate(ID = row_number()) %>%
  filter(category == 'ab')  # filter added to show the "ab" rows
# A tibble: 3 x 4
ID values category file
<int> <dbl> <chr> <fct>
1 21 1.31 ab 1
2 22 0.517 ab 2
3 23 1.02 ab 3

dplyr calculate a new column by applying summarise function on another dataframe

I want to create a new column (CNT) in a data frame called df. The value will be calculated using the summarise function from the dplyr package. It should return a number, since I need to count rows in another data frame (cars); the filter conditions are determined by the values in two columns of df.
dataframe:
library(dplyr)
df <- data.frame("my_speed" = 11:20, "my_dist" = c(17,20,15,17,21,23,28,36,50,80))
As an example, this is the calculation for the first row of df.
x <- df[1, 1]
y <- df[1, 2]
cars %>%
  group_by(speed) %>%
  filter(speed == x & dist == y) %>%
  summarise(count = n()) %>%
  select(count)
I am trying to figure out how I can use summarise() or another method to do this easily. NOTE that if summarise() returns no records, we should show zero.
df %>%
  rowwise() %>%
  filter(speed == my_speed & dist == my_dist) %>%  # speed/dist live in cars, not df, so this fails as written
  summarise(count = n()) %>%
  select(count) %>%
  mutate(CNT = count)
With rowwise, we can get the sum of the logical expression directly instead of doing additional operations:
df %>%
  rowwise %>%
  mutate(CNT = sum((cars$speed == my_speed) & (cars$dist == my_dist)))
# A tibble: 10 x 3
# my_speed my_dist CNT
# <int> <dbl> <int>
# 1 11 17 1
# 2 12 20 1
# 3 13 15 0
# 4 14 17 0
# 5 15 21 0
# 6 16 23 0
# 7 17 28 0
# 8 18 36 0
# 9 19 50 0
#10 20 80 0
We can define a function
library(tidyverse)
get_count <- function(x, y) {
  cars %>%
    summarise(count = sum(speed == x & dist == y)) %>%
    pull(count)
}
and apply it for every row using map2_int (plain map2 would return a list column; the _int variant gives an integer vector):
df %>%
  mutate(CNT = map2_int(my_speed, my_dist, get_count))
# my_speed my_dist CNT
#1 11 17 1
#2 12 20 1
#3 13 15 0
#4 14 17 0
#5 15 21 0
#6 16 23 0
#7 17 28 0
#8 18 36 0
#9 19 50 0
#10 20 80 0
The base R equivalent of the same using apply would be
get_count <- function(x) {
  nrow(subset(cars, speed == x[1] & dist == x[2]))
}
df$CNT <- apply(df, 1, get_count)
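Because the (my_speed, my_dist) pairs in df are unique here, the per-row loop can also be avoided entirely with match() and tabulate(); a vectorised base R sketch, not from the original answers:
key_cars <- paste(cars$speed, cars$dist)    # one key per row of cars
key_df   <- paste(df$my_speed, df$my_dist)  # one key per row of df
df$CNT   <- tabulate(match(key_cars, key_df), nbins = nrow(df))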
Solution
library(dplyr)
cars %>%
  count(speed, dist) %>%                   # count unique (speed, dist) pairs
  right_join(dat) %>%                      # join to dat, drop pairs not in dat
  mutate(CNT = coalesce(n, 0L), n = NULL)  # replace NA with 0, create CNT, drop n
Data
dat <- data.frame(
  speed = 11:20,
  dist = c(17, 20, 15, 17, 21, 23, 28, 36, 50, 80)
)
Output
# A tibble: 10 x 3
speed dist CNT
<dbl> <dbl> <int>
1 11 17 1
2 12 20 1
3 13 15 0
4 14 17 0
5 15 21 0
6 16 23 0
7 17 28 0
8 18 36 0
9 19 50 0
10 20 80 0
