finding outliers and counting number of occurrence - r

I have a data frame as below
raw_data <- data.frame(
"id" = c(1, 1, 1, 2, 2),
"salary" = c(10000,15000,20000,40000,50000),
"expenditure" = c(10000,15000,20000,30000,40000))
if a salary greater than 15000 would be flaged as an outlier, and if an expenditure is greater than 10000, it should be flaged as an outlier. But the problem is now, how to count how many times an outlier (both spearately) occured by a specific id. The output should look like the following df
output <- data.frame(
"id"=c(1,1,1,2,2,2,2),
"question_name"=c("expenditure", "salary","expenditure","salary","expenditure","salary","expenditure"),
"values"=c(15000,20000,20000,30000,40000,500000,40000),
"count"=c(1,1,1,1,1,1,1))

Here's a dplyr solution:
raw_data %>%
mutate(salary_flag =
ifelse(salary > 15000, 1, 0),
expenditure_flag = ifelse(expenditure > 10000, 1, 0)) %>%
group_by(id) %>%
mutate(total_outlier = sum(salary_flag) + sum(expenditure_flag))
You are flagging for salary and expenditure, then grouping by id and calculating the sum of all salary_flag and the sum of all expenditure_flag for each id.
id salary expenditure salary_flag expenditure_flag total_outlier
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 10000 10000 0 0 0
2 2 15000 15000 0 1 1
3 3 20000 20000 1 1 2
4 4 40000 30000 1 1 2
5 5 50000 40000 1 1 2
If you're only concerned with the total outliers, #MartinGal provided a very nice option:
raw_data %>%
group_by(id) %>%
mutate(total_outlier = sum(salary>15000, expenditure>10000))
Gives us:
id salary expenditure total_outlier
<int> <dbl> <dbl> <int>
1 1 10000 10000 0
2 2 15000 15000 1
3 3 20000 20000 2
4 4 40000 30000 2
5 5 50000 40000 2
edit:
This seems to get the end result that you're looking for:
raw_data %>%
group_by(id) %>%
summarise(count = sum(salary>15000, expenditure>10000),
value = min(salary)) %>%
mutate(title = "salary") %>%
select(id, title, value, count)
Which gives you:
id title value count
<int> <chr> <dbl> <int>
1 1 salary 10000 0
2 2 salary 15000 1
3 3 salary 20000 2
4 4 salary 40000 2
5 5 salary 50000 2

Raw data is :
raw_data <- data.frame("id" = c(1, 1, 1, 2, 2),
"salary" = c(10000,15000,20000,40000,50000),
"expenditure" = c(10000,15000,15000,30000,40000))
And the solution is :
raw_data <- raw_data %>% filter(salary>15000 | expenditure>10000)
entry_variables <- raw_data %>%select(id,salary,expenditure) %>%
pivot_longer(cols = -id,
names_to = "Question_name", values_to= "Value",
values_drop_na = TRUE) %>%
count(id, Question_name, Value)

You can try the following
raw_data <- data.frame("id" = 1:5,
"salary" = c(10000,15000,20000,40000,50000),
"expenditure" = c(10000,15000,20000,30000,40000))
raw_data$SaleryOutlier <- ifelse(
raw_data$salary > 15000, TRUE, FALSE)
raw_data$ExpenditureOutlier <- ifelse(
raw_data$expenditure > 10000, TRUE, FALSE)
You can then use aggregate function to summarize the data, e.g. for each id by using FUN=sum. This should look like
aggregate(raw_data, by=list(id = raw_data$id), FUN=sum)
This works because TRUE=1.
I hope this helps.
EDIT
Based on your comment, I guess you are looking for
raw_data <- data.frame("id" = c(1, 1, 1, 2, 2),
"salary" = c(10000,15000,20000,40000,50000),
"expenditure" = c(10000,15000,20000,30000,40000))
raw_data$SaleryOutlier <- ifelse(
raw_data$salary > 15000, TRUE, FALSE)
raw_data$ExpenditureOutlier <- ifelse(
raw_data$expenditure > 10000, TRUE, FALSE)
raw_data_aggregate <- aggregate(raw_data, by=list(id = raw_data$id), FUN=sum)
raw_data_aggregate$count <- raw_data_aggregate$SaleryOutlier + raw_data_aggregate$ExpenditureOutlier
EDIT TWO
If you want to aggregate over two variables, just exchange the above aggregate with
raw_data_aggregate <- aggregate(
SalaryOutlier + ExpenditureOutlier ~ id + salary + expenditure, raw_data, FUN=sum)
EDIT THREE
Based on the comments below, I created the following code
raw_data <- data.frame(
"id" = c(1, 1, 1, 2, 2),
"salary" = c(10000,15000,20000,40000,50000),
"expenditure" = c(10000,15000,20000,30000,40000))
# Identify salary outliers
raw_data$SalaryOutlier <- ifelse(
raw_data$salary > 15000, TRUE, FALSE)
# Identify expenditure outliers
raw_data$ExpenditureOutlier <- ifelse(
raw_data$expenditure > 10000, TRUE, FALSE)
# Aggregate over id + salay
raw_data_aggregate_salary <- aggregate(
SalaryOutlier ~ id + salary, raw_data, FUN=sum)
# Aggregate over id + expenditure
raw_data_aggregate_expenditure <- aggregate(
ExpenditureOutlier ~ id + expenditure, raw_data, FUN=sum)
# Just some renaming to fit with desired output data frame.
raw_data_aggregate_salary$question_name <- "salary"
raw_data_aggregate_expenditure$question_name <- "expenditure"
colnames(raw_data_aggregate_salary)[2] <- "values"
colnames(raw_data_aggregate_expenditure)[2] <- "values"
colnames(raw_data_aggregate_salary)[3] <- "count"
colnames(raw_data_aggregate_expenditure)[3] <- "count"
# Bind result together into one df.
raw_data_aggregate <- rbind(
raw_data_aggregate_salary, raw_data_aggregate_expenditure)
# Only select entries where we actually have a count.
raw_data_aggregate <- subset(
raw_data_aggregate,
raw_data_aggregate$count > 0)
# Order to fit with desired output
raw_data_aggregate <- raw_data_aggregate[ order(raw_data_aggregate$id), ]

In a data.table this would look like
raw_data[, flag0 := (salary > 15000) + (expenditure > 10000)]
raw_data[, flag := sum(flag0), by = "id"]
Here flag0 is the flag by row (which can later be deleted if you like) and flag would be the final result.
Edit: Seeing your reply to #Matt, you seem to want the total amount by salary and expenditure seperately. You can do something like
raw_data[, flag_salary := as.integer(salary > 15000)]
raw_data[, flag_expenditure := as.integer(expenditure > 10000)]
raw_data[, flag_salary := sum(flag_salary), by = "id"]
raw_data[, flag_expenditure := sum(flag_expenditure), by = "id"]

Related

Filter using Dataset Position in R

I'm not really familiar with dplyr function in R. However, I want to filter my dataset into certain conditions.
Let's say I've more than 100 of attributes in my dataset. And I want to perform filter with multiple condition.
Can I put my coding filter the position of the column instead of their name as follow:
y = filter(retag, c(4:50) != 8 & c(90:110) == 8)
I've tried few times similar with this coding, however still haven't get the result.
I also did tried coding as follow, but not sure how to add another conditions into the rowSums function.
retag[rowSums((retag!=8)[,c(4:50)])>=1,]
The only example that I found was using the dataset names instead of the position.
Or is there any way to filter using the dataset position as my data quite huge.
You can use a combination of filter() and across(). I didn't have your version of the retag dataframe so I created my own as an example
set.seed(2000)
retag <- tibble(
col1 = runif(n = 1000, min = 0, max = 10) %>% round(0),
col2 = runif(n = 1000, min = 0, max = 10) %>% round(0),
col3 = runif(n = 1000, min = 0, max = 10) %>% round(0),
col4 = runif(n = 1000, min = 0, max = 10) %>% round(0),
col5 = runif(n = 1000, min = 0, max = 10) %>% round(0)
)
# filter where the first, second, and third column all equal 5 and the fourth column does not equal 5
retag %>%
filter(
across(1:3, function(x) x == 5),
across(4, function(x) x != 5)
)
if_all() and if_any() were recently introduced into the tidyverse for the purpose of filtering across multiple variables.
library(dplyr)
filter(retag, if_all(X:Y, ~ .x > 10 & .x < 35))
# # A tibble: 5 x 2
# X Y
# <int> <int>
# 1 11 30
# 2 12 31
# 3 13 32
# 4 14 33
# 5 15 34
filter(retag, if_any(X:Y, ~ .x == 2 | .x == 25))
# # A tibble: 2 x 2
# X Y
# <int> <int>
# 1 2 21
# 2 6 25
Data
retag <- structure(list(X = 1:20, Y = 20:39), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
Here's a base R option.
This will select rows where there is no 8 in column 4 to 50 and there is at least one 8 in column 90 to 110.
result <- retag[rowSums(retag[4:50] == 8, na.rm = TRUE) == 0 &
rowSums(retag[90:110] == 8,na.rm = TRUE) > 0, ]

Conditional statement that modifies two variables

I have the following data frame:
dat<-data.frame(site=c(rep("A", 3), rep("B", 3)),
landuse= rep(c("urban", "dev", "undev"),2),
percent= c(30,30,40, 50, 30, 20))
For each site, I want to filter for rows where the percent is greater than or equal to 50, but for sites where all landuse categories do not meet the minimum criteria, the landuse entry is changed to "mixed" and the percent is changed to 100.
The result data frame would look like this:
result<- data.frame(site= c("A", "B"), landuse=c("mixed", "urban"), percent= c(100, 50))
With dplyr you can try :
library(dplyr)
dat %>%
group_by(site) %>%
summarise(landuse = if (all(percent < 50)) "mixed" else landuse[percent >= 50],
percent = ifelse(landuse == 'mixed', 100, percent))
# site landuse percent
# <chr> <chr> <dbl>
#1 A mixed 100
#2 B urban 50
Does this work:
library(dplyr)
dat %>% group_by(site) %>% mutate(landuse = case_when(all(percent < 50) ~ 'mixed', TRUE ~ landuse),
percent= case_when(landuse == 'mixed' ~ 100, TRUE ~ percent)) %>%
filter(percent >= 50) %>% distinct()
# A tibble: 2 x 3
# Groups: site [2]
site landuse percent
<chr> <chr> <dbl>
1 A mixed 100
2 B urban 50

How do you aggregate rows to a factor variable with three levels?

I have a dataset where some participants have multiple rows and I need to aggregate the data in a way that every participant has only one row. The dataset contains different variable types (e.g., factors, date, age etc.) I have made a code that works and looks like this:
example4 <- SMARTdata_50j_diagc_2016 %>%
group_by( Patient_Id ) %>%
summarise( Groep = first( Groep ),
Ziekenhuis_Nr = first( Ziekenhuis_Nr ),
Ziekenhuistype = first( Ziekenhuistype ),
aantalDBC = n(),
aantalVervolg = sum( as.numeric( Zorgtype_Code ) ),
Leeftijd = mean( Lft_patient_openenDBC ),
MRI_nee_ja = max( ifelse( MRI_nee_ja == 0, 0, 1 ) ),
aantalMRI = sum( MRI_Aantal ),
Artroscopie_nee_ja = max( ifelse( Artroscopie_nee_jaz_jam == 0, 0, 1 ) ),
aantalArtroscopie = sum( Artroscopie_aantal ),
overigDBC = mean( Aantal_overigeDBC_bijopenen ),
DBC_open = min( open_DBC ),
DBC_sluiten = max( sluiten_DBC ) ) %>%
as.data.frame()
This code gives me a single row for each participant. However, I have one more variable that I need to include in the new dataframe, but I do not know how to do that. The variable that I need to add is called 'Diagnose_Code' and is factor with two levels, namely 0 (standing for 1801) and 1 (standing for 1805).
For the participants that have multiple rows (in the original dataframe), there are participants that have both a 0 and a 1 for that variable. Now, in my new dataframe, I want to make a variable for 'Diagnose_Code' with three levels: 0 for if all rows of that participant are 0, 1 for if all rows of that participant are 1, and 2 for if the rows of that participant have both a 0 and a 1.
I do not know how to make this work. I struggled a bit with the ifelse code, but without success. Does anyone know how I can make this work in my code? Thank you in advance!
Using a toy dataset this can be achieved like so:
library(dplyr)
df <- data.frame(
id = rep(1:3, each = 3),
diagnosis_code = c(rep(1,3), rep(0, 3), c(1, 0, 1)),
stringsAsFactors = FALSE
)
df %>%
group_by(id) %>%
summarise(diagnosis_code = case_when(
all(diagnosis_code == 1) ~ 1,
all(diagnosis_code == 0) ~ 0,
TRUE ~ 2
))
#> # A tibble: 3 x 2
#> id diagnosis_code
#> <int> <dbl>
#> 1 1 1
#> 2 2 0
#> 3 3 2
Created on 2020-03-29 by the reprex package (v0.3.0)
Using ifelse should work:
df %>%
group_by(id) %>%
summarise(diag=ifelse(max(diag)!=min(diag), 2,
ifelse(max(diag==1), 1, 0)))
# A tibble: 3 x 2
id diag
<dbl> <dbl>
1 1 2
2 2 1
3 3 0
Data:
df <- data.frame(id=c(1,1,1,2,2,2,3,3,3), diag=c(1,0,0,1,1,1,0,0,0))
df %>%
group_by(Patient_Id) %>%
summarise(Diagnose_Code = case_when(n_distinct(Diagnose_Code) == 2 ~ 3,
sum(Diagnose_Code) == 1 ~ 1,
TRUE ~ 0 ))

R: How to update on data table 1 with occurrence of value in table 2

I have two tables:
Table 1:
ID, Name, ...
1, A
2,B
Table 2:
ID, PRODUCT(3 products), COLOR(3 option), number(1)
1, fan, white,1
1, fan, white,1
1, bed, red, 1
2, fan, white,1
I want to Add number 9 more column to table 1. and put count of each combo in the row.
ID, Name, Fan-white, fan-black, fan-blue, bed-white, bed-black, bed-blue, show-white,
1, A, 2,0....
I tried to count and merge each occurrence. It took a lot of time and code. any Any quick ways?
Here are two different methods to approach this (both involve combining the product and color into a single column). The first uses table which is simpler
df1 <- data.frame(id = c(1, 2),
name = c("A", "B"))
df2 <- data.frame(id = c(1, 1, 1, 2),
prod = c("fan", "fan", "bed", "fan"),
color = c("white", "white", "red", "white"),
number = 1)
df3 <- data.frame(id = df2$id,
prod.color = paste0(df2$prod, "-", df2$color),
number = df2$number)
table(df3)
, , number = 1
prod.color
id bed-red fan-white
1 1 2
2 0 1
The second uses two libraries, but might be closer to what you're looking for as number can be any value
library(dplyr)
library(tidyr)
inner_join(df1, df2, by="id") %>%
mutate(prod.color = paste0(prod, "-", color)) %>%
select(-one_of(c("prod", "color"))) %>%
group_by(id, name, prod.color) %>%
summarise(total = sum(number)) %>%
spread(prod.color, total, fill = 0)
Source: local data frame [2 x 4]
Groups: id, name [2]
id name `bed-red` `fan-white`
* <dbl> <fctr> <dbl> <dbl>
1 1 A 1 2
2 2 B 0 1

multiply columns of data frames

I have been scratching my head over this. I have two data frames: df
df <- data.frame(group = 1:3,
age = seq(30, 50, length.out = 3),
income = seq(100, 500, length.out = 3),
assets = seq(500, 800, length.out = 3))
and weights
weights <- data.frame(age = 5, income = 10)
I would like to multiply these two data frames only for the same column names. I tried something like this:
colwise(function(x) {x * weights[names(x)]})(df)
but that obviously didn't work as colwise does not keep the column name inside the function. I looked at various mapply solutions (example), but I am unable to come up with an answer.
The resulting data.frame should look like this:
structure(list(group = 1:3, age = c(150, 200, 250), income = c(1000,
3000, 5000), assets = c(500, 650, 800)), .Names = c("group",
"age", "income", "assets"), row.names = c(NA, -3L), class = "data.frame")
group age income assets
1 1 150 1000 500
2 2 200 3000 650
3 3 250 5000 800
sweep() is your friend here, for this particular example. It relies upon the names in df and weights being in the right order, but that can be arranged.
> nams <- names(weights)
> df[, nams] <- sweep(df[, nams], 2, unlist(weights), "*")
> df
group age income assets
1 1 150 1000 500
2 2 200 3000 650
3 3 250 5000 800
If the variable names in weights and df are not in the same order, you can make them so:
> df2 <- data.frame(group = 1:3,
+ age = seq(30, 50, length.out = 3),
+ income = seq(100, 500, length.out = 3),
+ assets = seq(500, 800, length.out = 3))
> nams <- c("age", "income") ## order in df2
> weights2 <- weights[, rev(nams)]
> weights2 ## wrong order compared to df2
income age
1 10 5
> df2[, nams] <- sweep(df2[, nams], 2, unlist(weights2[, nams]), "*")
> df2
group age income assets
1 1 150 1000 500
2 2 200 3000 650
3 3 250 5000 800
In other words we reorder all objects so that age and income are in the right order.
Someone might have a slick way to do it with plyr, but this is probably the most straight forward way in base R.
shared.names <- intersect(names(df), names(weights))
cols <- sapply(names(df), USE.NAMES=TRUE, simplify=FALSE, FUN=function(name)
if (name %in% shared.names) df[[name]] * weights[[name]] else df[[name]])
data.frame(do.call(cbind, cols))
# group age income assets
# 1 1 150 1000 500
# 2 2 200 3000 650
# 3 3 250 5000 800
Your data:
df <- data.frame(group = 1:3,
age = seq(30, 50, length.out = 3),
income = seq(100, 500, length.out = 3),
assets = seq(500, 800, length.out = 3))
weights <- data.frame(age = 5, income = 10)
The logic:
# Basic name matching looks like this
names(df[names(df) %in% names(weights)])
# [1] "age" "income"
# Use that in `sapply()`
sapply(names(df[names(df) %in% names(weights)]),
function(x) df[[x]] * weights[[x]])
# age income
# [1,] 150 1000
# [2,] 200 3000
# [3,] 250 5000
The implementation:
# Put it all together, replacing the original data
df[names(df) %in% names(weights)] <- sapply(names(df[names(df) %in% names(weights)]),
function(x) df[[x]] * weights[[x]])
The result:
df
# group age income assets
# 1 1 150 1000 500
# 2 2 200 3000 650
# 3 3 250 5000 800
Here is a data.table solution
library(data.table)
DT <- data.table(df)
W <- data.table(weights)
Use mapply (or Map) to calculate the new columns and add then both at once
by reference.
DT <- data.table(df)
W <- data.table(weights)
DT[, `:=`(names(W), Map('*', DT[,names(W), with = F], W)), with = F]
You could also do this in a for loop using an index resulting from which(%in%). The above approach is much more efficient but this is an alternative.
results <- list()
for ( i in 1:length(which(names(df) %in% names(weights))) ) {
idx1 <- which(names(df) %in% names(weights))[i]
idx2 <- which(names(weights) %in% names(df))[i]
results[[i]] <- dat[,idx1] * weights[idx2]
}
unlist(results)

Resources