find highest value within factor levels - r

if I have the following dataframe:
value factorA factorB
1 a e
2 a f
3 a g
1 b k
2 b l
3 b m
1 c e
2 c g
How can I get, for each level of factorA, the highest value and the factorB entry associated with it, i.e.
value factorA factorB
3 a g
3 b m
2 c g
Is this possible without first using
blocks<-split(factorA, list(), drop=TRUE)
and then sorting each block$a? This will be performed many times, and the number of blocks will always change.

Here is one option, using base R functions:
maxRows <- by(df, df$factorA, function(X) X[which.max(X$value),])
do.call("rbind", maxRows)
# value factorA factorB
# a 3 a g
# b 3 b m
# c 2 c g

With your data
df<- structure(list(value = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L), factorA = structure(c(1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L), .Label = c("a", "b", "c"), class = "factor"),
factorB = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 1L, 3L), .Label = c("e",
"f", "g", "k", "l", "m"), class = "factor")), .Names = c("value",
"factorA", "factorB"), class = "data.frame", row.names = c(NA,
-8L))
Using the ddply function from the plyr package
> df2<-ddply(df,c('factorA'),function(x) x[which(x$value==max(x$value)),])
value factorA factorB
1 3 a g
2 3 b m
3 2 c g
Or,
> rownames(df2) <- df2$factorA
> df2
value factorA factorB
a 3 a g
b 3 b m
c 2 c g

Related

Subset specific rows per group [closed]

Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 3 years ago.
Improve this question
I have a dataframe such that:
Group V2 V3 V4
1 D F W
1 T A L
1 P F P
2 T F L
2 R R O
2 D Y L
2 D F I
...
And I have a list such that:
[1] 1 3
[2] 4
[3] 2 3 4
Each element of the list indicates which rows I want to keep for each group. So I only want to keep rows 1 and 3 of Group==1 in the dataframe; the 4th row for the second group; rows 2, 3 and 4 for the 3rd group, etc.
I have tried hard but I haven't found a straightforward way although I'm pretty sure there must be one using apply or something similar.
You can do,
do.call(rbind, Map(function(x, y) x[y,], split(df, df$Group), l1))
# Group V2 V3 V4
#1.1 1 D F W
#1.3 1 P F P
#2 2 D F I
where,
l1 <- list(c(1, 3), 4)
Having the following objects to work with, a data.frame and a list, similar to yours:
df <- read.table(text = "Group V2 V3 V4
1 D F W
1 T A L
1 P F P
2 T F L
2 R R O
2 D Y L
2 D F I
3 E F I
3 F F I
3 G F I
3 T F I", header = T)
l <- list(c(1, 3), 4, c(2:4))
do.call(rbind, lapply(seq_along(l), function(i) df[df$Group == i,][l[[i]],]))
# Group V2 V3 V4
#1 1 D F W
#3 1 P F P
#7 2 D F I
#9 3 F F I
#10 3 G F I
#11 3 T F I
yields the same result as the simpler data.table approach:
library(data.table)
dt <- as.data.table(df)
dt[, .SD[l[[.GRP]]], Group]
or
dt[, .SD[l[[unlist(.BY)]]], Group]
# Group V2 V3 V4
#1: 1 D F W
#2: 1 P F P
#3: 2 D F I
#4: 3 F F I
#5: 3 G F I
#6: 3 T F I
An option using tidyverse
library(tidyverse)
df %>%
group_split(Group) %>%
map2_df(l, ~ .x %>%
slice(.y))
# A tibble: 6 x 4
# Group V2 V3 V4
# <int> <fct> <fct> <fct>
#1 1 D F W
#2 1 P F P
#3 2 D F I
#4 3 F F I
#5 3 G F I
#6 3 T F I
data
df <- structure(list(Group = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L), V2 = structure(c(1L, 7L, 5L, 7L, 6L, 1L, 1L, 2L, 3L,
4L, 7L), .Label = c("D", "E", "F", "G", "P", "R", "T"), class = "factor"),
V3 = structure(c(2L, 1L, 2L, 2L, 3L, 4L, 2L, 2L, 2L, 2L,
2L), .Label = c("A", "F", "R", "Y"), class = "factor"), V4 = structure(c(5L,
2L, 4L, 2L, 3L, 2L, 1L, 1L, 1L, 1L, 1L), .Label = c("I",
"L", "O", "P", "W"), class = "factor")), class = "data.frame",
row.names = c(NA,
-11L))
l <- list(c(1, 3), 4, 2:4)

How to keep values that occur in only one group?

I am working on a dataframe like:
groups . values
a . 1
a . 1
a 2
b . 2
b . 3
b . 3
c . 4
c . 5
c . 6
d . 6
d . 7
d . 2
The problem is to turn it into something like:
groups . values
a . 1
a . 1
b . 3
b . 3
c . 4
c . 5
d . 7
I want to keep rows whose values occur in only ONE group. For example, value 2 is deleted because it occurs in three different groups, but value 1 is kept although it occurs twice, because it does so in ONLY ONE group.
Are there any functions from the dplyr package that can handle this problem, or do I have to write my own function?
As you asked for a dplyr solution:
df %>% group_by(values) %>% filter(n_distinct(groups) == 1)
# # A tibble: 7 x 2
# # Groups: values [5]
# groups values
# <chr> <int>
#1 a 1
#2 a 1
#3 b 3
#4 b 3
#5 c 4
#6 c 5
#7 d 7
with
df <- structure(list(groups = c("a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d", "d"),
values = c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 5L, 6L, 6L, 7L, 2L)),
row.names = c(NA, -12L), class = "data.frame")
Group by values and see if column groups has only one element. This can be done with ave.
i <- as.logical(with(df1, ave(as.numeric(groups), values, FUN = function(x) length(unique(x)) == 1)))
df1[i, ]
# groups values
#1 a 1
#2 a 1
#5 b 3
#6 b 3
#7 c 4
#8 c 5
#11 d 7
Data in dput format.
df1 <-
structure(list(groups = structure(c(1L, 1L, 1L, 2L,
2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L), .Label = c("a", "b",
"c", "d"), class = "factor"), values = c(1L, 1L, 2L,
2L, 3L, 3L, 4L, 5L, 6L, 6L, 7L, 2L)),
class = "data.frame", row.names = c(NA, -12L))
x[x$values %in% names(which(colSums(table(x)>0)==1)),]
where
x = structure(list(groups = c("a", "a", "a", "b", "b", "b", "c",
"c", "c", "d", "d", "d"), values = c(1L, 1L, 2L, 2L, 3L, 3L,
4L, 5L, 6L, 6L, 7L, 2L)), row.names = c(NA, -12L), class = "data.frame")
or, a data.table solution:
setDT(x)[, .SD[uniqueN(groups)==1], values]
Using sqldf package for your original data frame df:
library(sqldf)
result <- sqldf("SELECT * FROM df
WHERE `values` IN (
SELECT `values` from (
SELECT `values`, groups, count(*) as num from df
GROUP BY `values`, groups) t
GROUP BY `values`
HAVING COUNT(1) = 1
)")

R - how to avoid repeating filter & row bind

Because I am working on a very large dataset, I need to slice my dataset by groups in order to pursue my computations.
I have a person-period (melt) dataset that looks like this
group id var time
1 A 1 a 1
2 A 1 b 2
3 A 1 a 3
4 A 2 b 1
5 A 2 b 2
6 A 2 b 3
7 B 1 a 1
8 B 1 a 2
9 B 1 a 3
10 B 2 c 1
11 B 2 c 2
12 B 2 c 3
I need to do this simple transformation
library(reshape2)
library(dplyr)
dt %>% dcast(group + id ~ time, value.var = 'var')
In order to get
group id 1 2 3
1 A 1 a b a
2 A 2 b b b
3 B 1 a a a
4 B 2 c c c
So far, so good.
However, because my database is too big, I need to do this separately for each group, such as
a = dt %>% filter(group == 'A') %>% dcast(group + id ~ time, value.var ='var')
b = dt %>% filter(group == 'B') %>% dcast(group + id ~ time, value.var = 'var')
bind_rows(a,b)
My problem is that I would like to avoid doing it by hand. I mean, having to store each group separately, a = ..., b = ..., c = ..., and so on.
Any idea how I could have a single pipe stream that would separate each group, compute the transformation and put it back together in a dataframe ?
dt = structure(list(group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor"),
id = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor"), var = structure(c(1L,
2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 3L, 3L, 3L), .Label = c("a",
"b", "c"), class = "factor"), time = structure(c(1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("1",
"2", "3"), class = "factor")), .Names = c("group", "id",
"var", "time"), row.names = c(NA, -12L), class = "data.frame")
Package purrr can be useful for working with lists. First split the dataset by group and then use map_df to dcast each list but return everything in a single data.frame.
library(purrr)
dt %>%
split(.$group) %>%
map_df(~dcast(.x, group + id ~ time, value.var = "var"))
group id 1 2 3
1 A 1 a b a
2 A 2 b b b
3 B 1 a a a
4 B 2 c c c
lapply is your friend here:
do.call(rbind, lapply(unique(dt$group), function(grp, dt){
dt %>% filter(group == grp) %>% dcast(group + id ~ time, value.var = "var")
}, dt = dt))

Remove the rows that have the same column A value but different column B value from df (but not vice-versa) in R

I’m trying to remove all the rows that have the same value in the "lan" column of my dataframe but different value for my "id" column (but not vice-versa).
Using an example dataset:
require(dplyr)
t <- structure(list(id = c(1L, 2L, 2L, 3L, 3L, 4L, 4L, 4L, 4L, 4L,
4L), lan = structure(c(1L, 2L, 3L, 4L, 4L, 5L, 5L, 5L, 6L, 1L,
7L), .Label = c("a", "b", "c", "d", "e", "f", "g"), class = "factor"),
value = c(0.22988498, 0.848989831, 0.538065821, 0.916571913,
0.304183372, 0.983348167, 0.356128559, 0.054102854, 0.400934593,
0.001026817, 0.488452667)), .Names = c("id", "lan", "value"
), class = "data.frame", row.names = c(NA, -11L))
t
I need to get rid of rows 1 and 10 because they have the same lan (a) but different id.
I've tried the following, without success:
a<-t[(!duplicated(t$id)),]
c<-a[duplicated(a$lan)|duplicated(a$lan, fromLast=TRUE),]
d<-t[!(t$lan %in% c$lan),]
Thanks for your help!
And an alternative using dplyr:
t2 <- t %>%
group_by(lan,id) %>%
summarise(value=sum(value)) %>%
group_by(lan) %>%
summarise(number=n()) %>%
filter(number>1) %>%
select(lan)
> t[!t$lan %in% t2$lan ,]
id lan value
2 2 b 0.84898983
3 2 c 0.53806582
4 3 d 0.91657191
5 3 d 0.30418337
6 4 e 0.98334817
7 4 e 0.35612856
8 4 e 0.05410285
9 4 f 0.40093459
11 4 g 0.48845267
You could use duplicated on "lan", to get the logical index of all elements that are duplicates, repeat the same with both columns together ('id', 'lan'), to get the elements not duplicated, check which of these elements are TRUE in both cases, negate, and subset.
indx1 <- with(t, duplicated(lan)|duplicated(lan,fromLast=TRUE))
indx2 <- !(duplicated(t[1:2])|duplicated(t[1:2],fromLast=TRUE))
t[!(indx1 & indx2),]
# id lan value
#2 2 b 0.84898983
#3 2 c 0.53806582
#4 3 d 0.91657191
#5 3 d 0.30418337
#6 4 e 0.98334817
#7 4 e 0.35612856
#8 4 e 0.05410285
#9 4 f 0.40093459
#11 4 g 0.48845267

ggplot2 stacked bar not ordering by manually-defined factor order

I would like to manually define the order of the items in each stacked bar. From all of the research I've done, I should be able to do this by manually defining the order of these factors prior to plotting. For some reason I have not been successful.
Here is the raw data:
df <- structure(list(cross_valid = structure(c(1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("1",
"2"), class = "factor"), variable = structure(c(1L, 1L, 2L, 2L,
3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L), .Label = c("a",
"b", "c", "d", "e", "f", "g", "h", "i"), class = "factor"), value = c(0,
0, 0, 0, 3.546, 0, 0, 0, 28.0927688833, 4.689, 0.4887, 1.134,
20.886690705, 16.8620595883, 14.8086, 18.648, 20.5713, 44.046
)), row.names = c(NA, -18L), class = "data.frame", .Names = c("cross_valid",
"variable", "value"))
Looks like:
> head (df)
cross_valid variable value
1 1 a 0.000
2 2 a 0.000
3 1 b 0.000
4 2 b 0.000
5 1 c 3.546
6 2 c 0.000
Current order and levels of df$variable:
> df$variable
[1] a a b b c c d d e e f f g g h h i i
Levels: a b c d e f g h i
Now I change the order of df$variable:
df$variable <- factor(df$variable, levels = unique(c("i","a","b","e","g","f","h")),ordered=TRUE)
Now plot the graph:
library(ggplot2)
p <- ggplot() + geom_bar(data=df,aes(x=cross_valid,y=value,fill=variable),stat='identity')
p <- p + scale_fill_manual("",values=c('a'='darkred','b'='blue','c'='black','d'='darkolivegreen1','e'='green','f'='darkorchid','g'='yellow',
'h'='snow4','i'='darkgray'),
breaks=c('i','h','g','f','e','d','c','b','a'),
labels=c('i','h','g','f','e','d','c','b','a'))
p
Which produces the following plot:
I defined 'i' and 'h' to be on opposite ends of the bar, yet they are still next to each other. Are there any ideas on why this might be happening? Maybe something is odd with my data?
Thanks
-al
EDIT 1:
Following @MrFlick's advice I removed breaks, but I still find that 'i' and 'h' are next to each other in the bar, even though the levels have defined them to be at opposite ends of the bar.
> df$variable
[1] a a b b c c d d e e f f g g h h i i
Levels: i < a < b < c < d < e < g < f < h
Edited plot code:
p <- ggplot() + geom_bar(data=df,aes(x=cross_valid,y=value,fill=variable),stat='identity')
p <- p + scale_fill_manual("",values=c('a'='darkred','b'='blue','c'='black','d'='darkolivegreen1','e'='green','f'='darkorchid','g'='yellow',
'h'='snow4','i'='darkgray'))
p
Produces:
The following works for me:
df$variable <- relevel(df$variable, "i")
library(ggplot2)
p <- ggplot() +
geom_bar(data=df,aes(x=cross_valid,y=value,fill=variable,order=variable),stat='identity') +
scale_fill_manual("",
values=c('a'='darkred','b'='blue','c'='black','d'='darkolivegreen1','e'='green','f'='darkorchid','g'='yellow','h'='snow4','i'='darkgray'),
breaks=c('h','g','f','e','d','c','b','a','i'),
labels=c('h','g','f','e','d','c','b','a','i'))
p
I used relevel to change factor level order and added an order parameter to the aes.
Edit: Changing the order of breaks and labels also adjusts the legend accordingly.
Second Edit: Sorry, solution was already posted in comments above, didn't see that before answering...

Resources