How to select customized TOP10 rows by variable? - r

I have sales and cost data by model. The code below selects the TOP10 models by sale; all the rest are summed up in a new category, "Others", which becomes the 11th row.
df <- data.frame(model = c("A","B","C","D","E","F","G","H","I","J","K","L","M","N"),
                 sale = c(100,300,140,456,345,456,456,780,40,560,560,456,350,500),
                 cost = c(1340,330,440,443,365,437,478,700,30,460,160,456,650,100))
#TOP10 by sale
order <- df %>%
  type.convert(as.is = TRUE) %>%
  mutate(pos = row_number(desc(sale)),
         model = ifelse(pos > 10, 'Others', model),
         pos = ifelse(pos > 10, 11, pos)) %>%
  group_by(model, pos) %>%
  summarise(cost = sum(cost), sale = sum(sale), .groups = 'drop') %>%
  arrange(pos)
Output:
model pos cost sale
1 H 1 700 780
2 J 2 460 560
3 K 3 160 560
4 N 4 100 500
5 D 5 443 456
6 F 6 437 456
7 G 7 478 456
8 L 8 456 456
9 M 9 650 350
10 E 10 365 345
11 Others 11 2140 580
The sale of category A is put in "Others" since it has a low sale (100) and is not in the TOP10.
Now I want to include A in this TOP10 in any case, no matter how much sale it has. So the output should be TOP9 + 'A' + 'Others':
Expected output:
model pos cost sale
1 H 1 700 780
2 J 2 460 560
3 K 3 160 560
4 N 4 100 500
5 D 5 443 456
6 F 6 437 456
7 G 7 478 456
8 L 8 456 456
9 M 9 650 350
10 A 10 1340 100
11 Others 11 2140 580
(Thus the change must be made in the given code.)

To automate this, I created a function for you. You can easily specify the parameters and get the output quickly. The function can also be modified at a later stage based on your requirements:
Top10BySales = function(DataFrame, TopN = 10, IncludeModels = c()) {
  ## DataFrame -> User needs to specify the data frame
  ## TopN -> Do you want to limit the analysis to the top 10 or change it? Default value is 10
  ## IncludeModels -> Here you specify which models you want to force into the result
  ## Ranking the models based on sales
  df1 = DataFrame %>%
    arrange(desc(sale)) %>%
    mutate(Ranking = 1:nrow(DataFrame),
           Include = ifelse(Ranking <= TopN, model, "Other"))
  ## Grouping the models
  df2 = df1 %>%
    mutate(Ranking = ifelse(Include != "Other", Ranking, TopN + 1)) %>%
    group_by(Include, Ranking) %>%
    summarise(cost = sum(cost), sale = sum(sale), .groups = 'drop')
  ## Checking the length of IncludeModels
  if (length(IncludeModels) == 0) {
    df2 = df2
  } else {
    df3 = data.frame(ModelNames = IncludeModels)
    df3$Inclusion = ifelse(df3$ModelNames %in% df1$Include, "Yes", "No")
    df3 = df3 %>% filter(Inclusion == "No")
    df_original = df1 %>% filter(model %in% df3$ModelNames)
    df_original$Num = 1:nrow(df_original)
    for (i in 1:nrow(df_original)) {
      df2[nrow(df2) - df_original[i, ]$Num, ] = df_original[i, c(1, 4, 3, 2)] ## Replacing the values
    }
  }
  return(df2 %>% arrange(Ranking))
}
Using the function
To use this function, pass in the data frame, the cut-off, and the models you want to force into the result. The usage was originally shown only in a picture, so the call below is a minimal sketch with the sample data:
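Top10BySales(DataFrame = df, TopN = 10, IncludeModels = c("A")) ## force model A into the result regardless of its sale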
Hope this helps!

One way could be to make use of bind_rows after removing the 10th line and adding only the row where model == "A":
library(tidyverse)
#TOP10 by sale
df %>%
  type.convert(as.is = TRUE) %>%
  mutate(pos = row_number(desc(sale)),
         model = ifelse(pos > 10, 'Others', model),
         pos = ifelse(pos > 10, 11, pos)) %>%
  group_by(model, pos) %>%
  summarise(cost = sum(cost), sale = sum(sale), .groups = 'drop') %>%
  arrange(pos) %>%
  slice(-10) %>%
  bind_rows(df %>%
              filter(model == "A")) %>%
  mutate(pos = replace_na(pos, 10)) %>%
  arrange(pos)
model pos cost sale
<chr> <dbl> <dbl> <dbl>
1 H 1 700 780
2 J 2 460 560
3 K 3 160 560
4 N 4 100 500
5 D 5 443 456
6 F 6 437 456
7 G 7 478 456
8 L 8 456 456
9 M 9 650 350
10 A 10 1340 100
11 Others 11 2140 580

You can mutate in two steps before grouping and manipulate the pos variable to fix the order. This solves the problem raised in the comments on the other answer.
order <- df %>%
  type.convert(as.is = TRUE) %>%
  mutate(pos = row_number(desc(sale))) %>%
  mutate(pos = ifelse(model == "A" & pos > 10, 11, ifelse(pos > 10, 12, pos)),
         model = ifelse(pos > 11, 'Others', model)) %>%
  group_by(model, pos) %>%
  summarise(cost = sum(cost), sale = sum(sale), .groups = 'drop') %>%
  arrange(pos)
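For the sample data this keeps the full top 10 and then appends A and the collapsed remainder, so the tail of the result should look like:
11 A 11 1340 100
12 Others 12 800 480
(Others is now B, C and I: cost 330 + 440 + 30 = 800, sale 300 + 140 + 40 = 480.)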

Related

Conditional statement in apply function does not work correctly in R

I had a data frame whose ID column had many duplicated names, so I used the table() function to get the frequency of the IDs, like this:
library(dplyr)
id <- runif(1000,1000,3000) %>% round() %>% as.character()
freq <- rep(1:50,20)
data <- data.frame(id,freq)
GetID <- function(a) {
  if (a[2] == 1) newid <- a[1] else newid <- paste(a[1], 1:a[2], sep = "-")
  return(newid)
}
idlist <- data %>% apply(., 1, GetID)
idlist2 <- unlist(idlist) %>% as.data.frame()
I wanted to get a new ID vector: if freq equals 1, the new ID equals the old one; if freq is larger than 1, the new ID is the old ID combined with its order.
However, the if statement didn't seem to work correctly: all of the new IDs had an order number appended.
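The likely culprit: apply() turns the data frame into a character matrix first, and as.matrix() runs format() on the numeric freq column, which pads single digits with spaces. The comparison a[2] == 1 then coerces the numeric 1 to the string "1", which never matches " 1". A minimal sketch of the coercion (the toy data here is my own illustration):
as.matrix(data.frame(id = "1002", freq = c(1L, 10L)))
#      id     freq
# [1,] "1002" " 1"
# [2,] "1002" "10"
" 1" == 1
# [1] FALSE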
Do you have to use a function? If not:
id <- runif(1000,1000,3000)
freq <- rep(1:50,20)
num <- 1:length(id)
data <- data.frame(num,id,freq)
data2 <- data %>% filter(freq == 1) %>% mutate(newid = id)
data3 <- data %>% filter(freq != 1) %>% mutate(newid = paste(id,freq,sep = "-"))
result <- rbind(data2,data3) %>% arrange(num)
You can group_by id and, if the number of rows is greater than 1, paste row_number() onto id; otherwise just use id.
library(dplyr)
data %>%
  group_by(id) %>%
  mutate(newID = if (n() > 1) paste(id, row_number(), sep = '-')
                 else as.character(id)) %>%
  arrange(id)
# id freq newID
# <chr> <int> <chr>
# 1 1002 49 1002-1
# 2 1002 31 1002-2
# 3 1003 26 1003
# 4 1005 11 1005-1
# 5 1005 28 1005-2
# 6 1007 37 1007
# 7 1013 33 1013
# 8 1016 7 1016
# 9 1020 11 1020
#10 1024 28 1024
# … with 990 more rows
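If you prefer base R, an equivalent one-liner with ave() should also work (a sketch of an alternative, not taken from the answers above):
data$newID <- ave(as.character(data$id), data$id, FUN = function(x)
  if (length(x) > 1) paste(x, seq_along(x), sep = "-") else x)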

R: Calculate distance between consecutive points per group and group them

I am having a hard time with this one... I am trying to find points per group that are close to each other and then to group them. Let me explain on the basis of the example data below:
Group X Y Z
1 110 3762 431 10
2 112 4950 880 10
3 113 5062 873 20
4 113 5225 874 30
5 113 5262 875 10
6 113 5300 874 20
structure(list(Group = c(110, 112, 113, 113, 113, 113), X = c(3762,
4950, 5062, 5225, 5262, 5300), Y = c(431, 880, 873, 874, 875,
874), Z = c(10, 10, 20, 30, 10, 20)), row.names = c(NA, -6L), class = "data.frame")
As we can see, we have the grouping column Group, the X & Y columns are our coordinates, and the Z column should be summarised further when points are defined as "close" (Euclidean distance < 100).
What i have tried:
I have successfully calculated the Euclidean distance between consecutive points using this loop:
for (i in 1:nrow(test)) {
  if (i > 1 && test$Group[i] == test$Group[i-1]) {
    test$Distance[i] <- sqrt(((test$X[i] - test$X[i-1])^2) + ((test$Y[i] - test$Y[i-1])^2))
  } else {
    test$Distance[i] <- NA
  }
}
Which gives me this:
Group X Y Z Distance
1 110 3762 431 10 NA
2 112 4950 880 10 NA
3 113 5062 873 20 NA
4 113 5225 874 30 163.00307
5 113 5262 875 10 37.01351
6 113 5300 874 20 38.01316
And here everything gets complicated, as there are NAs in the first row of each Group, and so on.
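As an aside, the same distances can be computed without a loop using dplyr's lag(); this is a sketch equivalent to the loop above, assuming the data frame is called test:
library(dplyr)
test <- test %>%
  group_by(Group) %>%
  mutate(Distance = sqrt((X - lag(X))^2 + (Y - lag(Y))^2)) %>%
  ungroup()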
What I want to achieve:
I would like to find points per group whose distance is not greater than 100 (Distance < 100) and, on that basis, summarise them (a simple sum of the Z column). Done manually:
Group Z Grouped
1 110 10 no
2 112 10 no
3 113 20 no
4 113 60 yes
Thanks for help!
That was difficult. I'm not sure I have figured it out completely.
#get data and libraries
library(tidyverse)
library(data.table)  # for rleid(), used in the update below
library(reshape2)    # for melt()
df <- read.table(text = "
Group X Y Z Distance
1 110 3762 431 10 NA
2 112 4950 880 10 NA
3 113 5062 873 20 NA
4 113 5225 874 30 163.00307
5 113 5262 875 10 37.01351
6 113 5300 874 20 38.01316", header = T, stringsAsFactors = F)
df %>%
  group_by(Group) %>%
  do(melt(outer(.$Distance, .$Distance, `-`))) %>%
  filter(between(value, -100, 0) | between(value, 0, 100)) %>%
  distinct(Var1) %>%
  mutate(grouped = 1) %>%
  rename(row = Var1) -> rows
df %>%
  group_by(Group) %>%
  mutate(row = row_number()) %>%
  left_join(rows, by = c("row", "Group")) %>%
  mutate(grouped = ifelse(is.na(grouped), "no", "yes")) %>%
  group_by(Group, grouped) %>%
  mutate(Z = ifelse(!is.na(grouped), sum(Z), Z)) %>%
  distinct(Group, Z, grouped)
# A tibble: 4 x 3
# Groups: Group, grouped [4]
Group Z grouped
<int> <int> <chr>
1 110 10 no
2 112 10 no
3 113 20 no
4 113 60 yes
Hope it's what you were looking for; if not, maybe it gave you some new ideas.
UPDATE: And now what I hope will really help you:
df %>%
  group_by(Group) %>%
  mutate(int1 = lead(Distance) < 100 | Distance < 100,
         int1 = replace(int1, is.na(int1), FALSE),
         int2 = rleid(int1),
         int2 = replace(int2, !int1 | is.na(int1), NA)) -> df2
df2 %>%
  filter(int1) %>%
  group_by(Group, int2) %>%
  summarise(Z = sum(Z),
            Grouped = "yes") %>%
  select(Group, Z, Grouped) %>%
  bind_rows(df2 %>%
              filter(!int1) %>%
              mutate(Grouped = "no") %>%
              select(Group, Z, Grouped)) %>%
  arrange(Group)
# A tibble: 4 x 3
# Groups: Group [3]
Group Z Grouped
<int> <int> <chr>
1 110 10 no
2 112 10 no
3 113 60 yes
4 113 20 no
I worked out a little use case that can get you started. It is a base R approach using a for loop, with aggregation driven by a vector of columns and a paired vector of functions to aggregate each column by.
df <- read.table(text = "
Group X Y Z Distance
1 110 3762 431 10 NA
2 112 4950 880 10 NA
3 113 5062 873 20 NA
4 113 5225 874 30 163.00307
5 113 5262 875 10 37.01351
6 113 5300 874 20 38.01316
7 114 5300 874 30 NA
8 114 5300 874 20 38.01316", header = T, stringsAsFactors = F)
aggregateIt <- function(df = data,     # data.frame
                        returnRaw = F, # return the raw unaggregated df (only the first case from column `grouped` by `subgroup` is usable in this application)
                        colsToAgg = c("Z1", "Z2", "Z3"), # cols to aggregate
                        how = c("sum", "sum", "max"))    # how to aggregate the columns: `Z1` by sum, `Z2` by sum and `Z3` by max
{
  count <- 1L
  result <- vector("integer", nrow(df))
  grouped <- vector("character", nrow(df))
  for (i in seq_len(length(result) - 1L)) {
    if (df$Group[i] != df$Group[i+1L]) {
      result[i] <- count
      grouped[i] <- "no"
      count <- count + 1L
      if ((i+1L) == length(result)) {
        result[i+1L] <- count
        grouped[i+1L] <- "no"
      }
    } else {
      if (df$Distance[i+1L] > 100L) {
        result[i] <- count
        grouped[i] <- "no"
        count <- count + 1L
        if ((i+1L) == length(result)) {
          result[i+1L] <- count
          grouped[i+1L] <- "no"
        }
      } else {
        result[i] <- count
        grouped[i] <- "yes"
        if ((i+1L) == length(result)) {
          result[i+1L] <- count
          grouped[i+1L] <- "yes"
        }
      }
    }
  }
  df <- within(df, {subgroup <- result; grouped <- grouped})
  if (returnRaw) return(df)
  A <- Reduce(function(a, b) merge(a, b, by = "subgroup"),
              lapply(seq_along(how), function(x) aggregate(. ~ subgroup, df[, c(colsToAgg[x], "subgroup")], how[x])))
  B <- df[!duplicated(df$subgroup, fromLast = F), c("Group", "subgroup", "grouped")]
  out <- merge(A, B, by = "subgroup")
  return(out[, c("Group", colsToAgg, "grouped")])
}
aggregateIt(df = df, colsToAgg = "Z", how = "sum")
# Group Z grouped
#1 110 10 no
#2 112 10 no
#3 113 20 no
#4 113 60 yes
#5 114 50 yes
I am not claiming this is the most efficient solution, but it points the way. Hope this helps!
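If you want to inspect the intermediate subgroup assignment before aggregation, the returnRaw argument already supports that, e.g.:
aggregateIt(df = df, colsToAgg = "Z", how = "sum", returnRaw = TRUE)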

Apply Bins to Data Frame Groups without making subset Data Frames

I have a data frame containing fish population sampling data. I would like to create bins to count how many fish are in a given length group for each species.
The code below accomplishes this task for 2 species, but repeating it for every species in the data frame doesn't seem like the most elegant way to achieve this goal.
Plus I would like to apply this code to other lakes with different species. It would be great to find an "automated" way to apply these bins to each species group in the data frame.
The data frame looks like:
Species TL WT
BLG 75 6
BLG 118 27
LMB 200 98
LMB 315 369
RBS 112 23
RES 165 73
SPB 376 725
YEP 155 33
ss = read.csv("SS_West Point.csv", na.strings = ".", header = T)
blg = ss %>% subset(Species == "BLG")
lmb = ss %>% subset(Species == "LMB")
blgn = blg %>% summarise(n = n())
lmbn = lmb %>% summarise(n = n())
### 20mm Length Groups - BLG ###
blg20 = blg %>% group_by(gr = cut(TL, breaks = seq(0, 1000, by = 20))) %>%
  summarise(n = n()) %>% mutate(freq = n, percent = ((n/blgn$n)*100),
                                cumfreq = cumsum(freq), cumpercent = cumsum(percent))
### 20mm Length Groups - LMB ###
lmb20 = lmb %>% group_by(gr = cut(TL, breaks = seq(0, 1000, by = 20))) %>%
  summarise(n = n()) %>% mutate(freq = n, percent = ((n/lmbn$n)*100),
                                cumfreq = cumsum(freq), cumpercent = cumsum(percent))
I've successfully used do() to run linear models on this data frame but can't seem to get it to work with cut(). Here is how I used do() with lm():
ssl = ss %>% mutate(lTL = log10(TL) , lWT = log10(WT)) %>% group_by(Species)
m = ssl %>% do(lm(lWT~lTL , data =.)) %>% mutate(wp = 10^(.fitted))
Does this do what you expect?
ss20 <- ss %>%
  add_count(Species) %>%
  rename(Species_count = n) %>%
  # I added Species_count to the grouping so it goes along for the ride in summarization
  group_by(Species, Species_count, gr = cut(TL, breaks = seq(0, 1000, by = 20))) %>%
  summarise(n = n()) %>%
  mutate(freq = n, percent = ((n/Species_count)*100),
         cumfreq = cumsum(freq), cumpercent = cumsum(percent)) %>%
  ungroup()
> ss20
# A tibble: 8 x 8
Species Species_count gr n freq percent cumfreq cumpercent
<chr> <int> <fct> <int> <int> <dbl> <int> <dbl>
1 BLG 2 (60,80] 1 1 50 1 50
2 BLG 2 (100,120] 1 1 50 2 100
3 LMB 2 (180,200] 1 1 50 1 50
4 LMB 2 (300,320] 1 1 50 2 100
5 RBS 1 (100,120] 1 1 100 1 100
6 RES 1 (160,180] 1 1 100 1 100
7 SPB 1 (360,380] 1 1 100 1 100
8 YEP 1 (140,160] 1 1 100 1 100
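To push the "automated" idea further, the same pipeline can be wrapped in a small function parameterised by bin width, so it can be reused for other lakes. The function name and arguments below are my own sketch building on the answer's code, not part of the answer itself:
length_groups <- function(data, bin_width = 20, max_len = 1000) {
  data %>%
    add_count(Species) %>%
    rename(Species_count = n) %>%
    group_by(Species, Species_count,
             gr = cut(TL, breaks = seq(0, max_len, by = bin_width))) %>%
    summarise(n = n(), .groups = "drop_last") %>%  # stay grouped by Species for cumsum()
    mutate(freq = n, percent = (n / Species_count) * 100,
           cumfreq = cumsum(freq), cumpercent = cumsum(percent)) %>%
    ungroup()
}
length_groups(ss, bin_width = 20)  # 20mm length groups
length_groups(ss, bin_width = 10)  # finer 10mm groups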

How to filter out rows where the value is very different from the other values?

I would like to remove rows where the value of a particular variable, HEIGHT_CM is very different from the other values for the same id, PATIENT_ID.
Difference of >2 from the other values
Data:
df <- read.table(text = "PATIENT_ID MEASUREMENT_TAKEN_DATE HEIGHT_CM
1 20140305 163
1 20140409 163
1 20140528 164
1 20140730 164
1 20141210 99.7
2 20140305 155
2 20140527 157
2 20141111 78
3 20140721 90
4 20140528 168
4 20140627 167
4 20140917 167
4 20141002 70", header = TRUE, stringsAsFactors = FALSE)
Desired Output:
df <- read.table(text = "PATIENT_ID MEASUREMENT_TAKEN_DATE HEIGHT_CM
1 20140305 163
1 20140409 163
1 20140528 164
1 20140730 164
2 20140305 155
2 20140527 157
3 20140721 90
4 20140528 168
4 20140627 167
4 20140917 167", header = TRUE, stringsAsFactors = FALSE)
library(dplyr)
df <- df %>%
  group_by(PATIENT_ID) %>%
  mutate(difference = abs(mean(HEIGHT_CM, na.rm = T) - HEIGHT_CM)) %>%
  filter(difference <= 50)
Here you keep only the rows whose difference is at most 50, where difference is the absolute distance between the group mean and the individual value. This yields the output you showed.
A statistical way to remove outliers that gives your exact result:
library(dplyr)
df1 <- df %>% group_by(PATIENT_ID) %>% summarize(s = sd(HEIGHT_CM), m = mean(HEIGHT_CM))
df %>% inner_join(df1, by = "PATIENT_ID") %>%
  filter(is.na(s) | ((HEIGHT_CM > m - s) & (HEIGHT_CM < m + s))) %>%
  select(-c(4, 5))
The is.na(s) term keeps patients with a single measurement, for whom sd() returns NA.
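The same filter can also be written without the join by computing the group statistics inline; a minimal equivalent sketch:
df %>%
  group_by(PATIENT_ID) %>%
  mutate(s = sd(HEIGHT_CM), m = mean(HEIGHT_CM)) %>%
  filter(is.na(s) | (HEIGHT_CM > m - s & HEIGHT_CM < m + s)) %>%
  ungroup() %>%
  select(-s, -m)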

R - dplyr - filter top_n rows based on multiple conditions

Hi, this is my first post; I hope to get it right and reproducible.
I was wondering if there is a more elegant solution than my approach below.
I have a data frame and would like to use conditional filters to extract rows that meet these conditions.
As output I would like the top_n rows that meet the conditional criteria (different conditions for the top_n output from different columns), whilst preserving all other columns.
Example dataframe:
set.seed(123)
df1 <- data.frame(
  A = as.numeric(1:10),
  B = sample(seq(as.Date('2000/01/01'), as.Date('2018/01/01'), by = "day"), size = 10),
  C = as.numeric(sample(20:90, size = 10)),
  D = sample(c("yes", "no"), size = 10, replace = TRUE),
  E = as.numeric(sample(1000:2000, size = 10))
)
df1 #check output
> df1 #check output
A B C D E
1 1 2005-03-06 87 no 1963
2 2 2014-03-11 51 no 1902
3 3 2007-05-12 66 no 1690
4 4 2015-11-22 58 no 1793
5 5 2016-12-02 26 no 1024
6 6 2000-10-26 79 no 1475
7 7 2009-07-01 35 no 1754
8 8 2016-01-19 22 no 1215
9 9 2009-11-30 40 yes 1315
10 10 2008-03-17 85 yes 1229
Conditions I would like to use for filtering:
A) if column E is between 1000 and 1500 return top 2 rows weighted on column A
B) if column E is between 1000 and 2000 return top 2 rows weighted on column B
C) if column E is between 1000 and 1400 return top 2 rows weighted on column C
I have come up with the following solution but it is cumbersome and I wondered if there is a better approach.
library("dplyr")
library("tidyr")
A<- df1 %>% dplyr::filter(E >= 1000 & E <= 1500) %>% top_n( n = 2, wt = A) %>% arrange(-A) %>% mutate(condition = "-cond_A")
B<- df1 %>% dplyr::filter(E >= 1000 & E <= 2000) %>% top_n( n = 2, wt = B) %>% arrange(B) %>% mutate(condition = "cond_B")
C<- df1 %>% dplyr::filter(E >= 1000 & E <= 1400) %>% top_n( n = 2, wt = C) %>% arrange(-C) %>% mutate(condition = "-cond_C")
my desired output is the following:
spread(as.data.frame(distinct(bind_rows(A,B,C))),condition, condition)
A B C D E -cond_A -cond_C cond_B
1 5 2016-12-02 26 no 1024 <NA> <NA> cond_B
2 8 2016-01-19 22 no 1215 <NA> <NA> cond_B
3 9 2009-11-30 40 yes 1315 -cond_A -cond_C <NA>
4 10 2008-03-17 85 yes 1229 -cond_A -cond_C <NA>
Would be great if you could tell me a better approach!
In my comments I asked if you could pass more arguments to map2, and I realised that pmap can do just that.
pmap(list(c(1500, 2000, 1400), c(1000, 1700, 1300), names(df1)[1:3]),
     ~ df1 %>%
         filter(E >= ..2 & E <= ..1) %>%
         top_n(n = 2, wt = !! rlang::sym(..3)) %>%
         arrange_at(..3, funs(desc(.))) %>%
         mutate(condition = paste0("-cond", ..3))) %>%
  bind_rows %>%
  distinct %>%
  spread(condition, condition)
We could use map2 from purrr to loop through the <= condition, which changes, and also through the wt argument that takes the column names (based on the OP's code):
library(purrr)
library(dplyr)
library(tidyr)
map2(c(1500, 2000, 1400), names(df1)[1:3],
     ~ df1 %>%
         filter(E >= 1000 & E <= .x) %>%
         top_n(n = 2, wt = !! rlang::sym(.y)) %>%
         arrange_at(.y, funs(desc(.))) %>%
         mutate(condition = paste0("-cond", .y))) %>%
  bind_rows %>%
  distinct %>%
  spread(condition, condition)
# A B C D E -condA -condB -condC
#1 5 2016-12-02 26 no 1024 <NA> -condB <NA>
#2 8 2016-01-19 22 no 1215 <NA> -condB <NA>
#3 9 2009-11-30 40 yes 1315 -condA <NA> -condC
#4 10 2008-03-17 85 yes 1229 -condA <NA> -condC
