Reorder one row in tibble - move it to the last row - r

How do I rearrange the rows in tibble?
I wish to reorder the rows so that the row with x = "c" goes to the bottom of the tibble and everything else remains the same.
library(dplyr)
tbl <- tibble(x = c("a", "b", "c", "d", "e", "f", "g", "h"),
              y = 1:8)

An alternative to dplyr::arrange(), using base R:
tbl[order(tbl$x == "c"), ] # Thanks to Merijn van Tilborg
Output:
# x y
# <chr> <int>
# 1 a 1
# 2 b 2
# 3 d 4
# 4 e 5
# 5 f 6
# 6 g 7
# 7 h 8
# 8 c 3

The dplyr equivalent:
tbl |> dplyr::arrange(x == "c")

Using forcats, convert x to a factor with "c" as the last level, then arrange. This doesn't change the class of the column x, because the releveling happens only inside arrange().
library(forcats)
tbl %>%
  arrange(fct_relevel(x, "c", after = Inf))
# # A tibble: 8 x 2
# x y
# <chr> <int>
# 1 a 1
# 2 b 2
# 3 d 4
# 4 e 5
# 5 f 6
# 6 g 7
# 7 h 8
# 8 c 3
If the order of x is important downstream, it is better to keep x as a factor. The following changes the class of x from character to factor, with "c" as the last level:
tbl %>%
  mutate(x = fct_relevel(x, "c", after = Inf)) %>%
  arrange(x)
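
The same idea generalizes: arrange() on a logical pushes TRUE rows to the bottom because FALSE sorts before TRUE. A minimal helper sketch (move_last is a hypothetical name, not from the answers above):

library(dplyr)

# Move all rows where `col` equals `value` to the bottom, keeping the rest in order.
move_last <- function(data, col, value) {
  arrange(data, {{ col }} == value)
}

move_last(tbl, x, "c")  # same result as the answers above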

Enumerate a grouping variable in a tibble

I would like to know how to use row_number or anything else to transform a variable group into a integer
tibble_test <- tibble(A = letters[1:10], group = c("A", "A", "A", "B", "B", "C", "C", "C", "C", "D"))
# to get the enumeration inside each group of 'group'
tibble_test %>%
  group_by(group) %>%
  mutate(G1 = row_number())
But I would like to have this output:
# A tibble: 10 x 4
A group G1 G2
<chr> <chr> <dbl> <dbl>
1 a A 1 1
2 b A 2 1
3 c A 3 1
4 d B 1 2
5 e B 2 2
6 f C 1 3
7 g C 2 3
8 h C 3 3
9 i C 4 3
10 j D 1 4
My question is: how do I get the column G2? I know I could transform the 'group' variable into a factor and then an integer (after the tibble is arranged), but I would like to know if it can be done with a counting approach.
You just need one more step and include the group indices with group_indices(). Be aware that how your data is arranged/sorted will affect the index.
library(dplyr)
tibble_test <- tibble(A = letters[1:10], group = c("A", "A", "A", "B", "B", "C", "C", "C", "C", "D"))
# to get the enumeration inside each group of 'group'
tibble_test %>%
  group_by(group) %>%
  mutate(G1 = row_number(),
         G2 = group_indices())
# A tibble: 10 x 4
# Groups: group [4]
A group G1 G2
<chr> <chr> <int> <int>
1 a A 1 1
2 b A 2 1
3 c A 3 1
4 d B 1 2
5 e B 2 2
6 f C 1 3
7 g C 2 3
8 h C 3 3
9 i C 4 3
10 j D 1 4
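
In dplyr 1.0 and later, group_indices() inside a verb is superseded; cur_group_id() returns the same per-group index directly. A minimal sketch of the equivalent call:

library(dplyr)

tibble_test %>%
  group_by(group) %>%
  mutate(G1 = row_number(),
         G2 = cur_group_id()) %>%
  ungroup()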

How to add a row to data frame based on a condition

I have a data frame to which I want to add rows based on the following condition: column a is equal to "C" and column b is equal to 3 or 5.
Here is my dataframe
df <- data.frame(a = c("A", "B", "C", "D", "C", "A", "C", "E"),
                 b = seq(8), stringsAsFactors = TRUE)
Whenever the condition is TRUE, I want to add a row (with a = "add" and b = 3) directly below the row where the condition is met. I have tried the following:
rbind(df, data.frame(a="add", b = "3"))
# a b
# 1 A 1
# 2 B 2
# 3 C 3
# 4 D 4
# 5 C 5
# 6 A 6
# 7 C 7
# 8 E 8
# 9 add 3
This is not the output I want. The output I want is
# a b
# 1 A 1
# 2 B 2
# 3 C 3
# 4 add 3
# 5 D 4
# 6 C 5
# 7 add 3
# 8 A 6
# 9 C 7
# 10 E 8
How can I do that? I am new to R and thank you for your help.
# give matching rows length 2 so they get duplicated, all others length 1
lens = ifelse(df$b %in% c(3, 5) & df$a == "C", 2, 1)
ind = rep(1:NROW(df), lens)
df2 = df[ind, ]
df2$a = as.character(df2$a)
# overwrite the duplicated copies (the second row of each pair) with the new values
df2$a[cumsum(lens)[which(lens == 2)]] = "add"
df2$b[cumsum(lens)[which(lens == 2)]] = 3
df2
# a b
#1 A 1
#2 B 2
#3 C 3
#3.1 add 3
#4 D 4
#5 C 5
#5.1 add 3
#6 A 6
#7 C 7
#8 E 8
A solution using the tidyverse package.
library(tidyverse)
df2 <- df %>%
  mutate(Group = lag(cumsum(a == "C" & b %in% c(3, 5)), default = FALSE)) %>%
  group_split(Group) %>%
  map_dfr(~ .x %>% bind_rows(tibble(a = "add", b = 3))) %>%
  slice(-n()) %>%
  select(-Group)
df2
# # A tibble: 10 x 2
# a b
# <chr> <dbl>
# 1 A 1
# 2 B 2
# 3 C 3
# 4 add 3
# 5 D 4
# 6 C 5
# 7 add 3
# 8 A 6
# 9 C 7
# 10 E 8
In base R, we can find the positions where a == "C" and b is 3 or 5, repeat those rows in the data frame, and then replace the duplicated copies with the required values.
pos <- which(df$a == "C" & df$b %in% c(3, 5))
df <- df[sort(c(seq(nrow(df)), pos)), ]      # duplicate the matching rows
df[seq_along(pos) + pos, ] <- list("add", 3) # overwrite each duplicate with the new row
row.names(df) <- NULL
df
# a b
#1 A 1
#2 B 2
#3 C 3
#4 add 3
#5 D 4
#6 C 5
#7 add 3
#8 A 6
#9 C 7
#10 E 8
data
df <- data.frame(a = c("A", "B", "C", "D", "C", "A", "C", "E"),
                 b = seq(8), stringsAsFactors = FALSE)
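
Another option, not from the answers above, is a tidyr::uncount() sketch: duplicate the matching rows and then overwrite the copies. This assumes a is character (stringsAsFactors = FALSE, as in the data block above):

library(dplyr)
library(tidyr)

df %>%
  mutate(n = if_else(a == "C" & b %in% c(3, 5), 2L, 1L)) %>%  # matching rows get a second copy
  uncount(n, .id = "copy") %>%                                # duplicate them; copy is 1 or 2
  mutate(a = if_else(copy == 2L, "add", a),                   # turn each copy into the new row
         b = if_else(copy == 2L, 3L, b)) %>%
  select(-copy)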

RStudio dplyr group_by multiple columns

In RStudio, I have a data frame with 4 columns, and I need the list of every distinct triplet from the first 3 columns, sorted in decreasing order by the sum of the 4th column. For example, with:
A B C 2
D E F 5
A B C 4
G H I 5
D E F 3
I need as a result:
D E F 8
A B C 6
G H I 5
I've tried the following approaches, but I can't manage to get exactly the result I need:
df_list <- df_raw_data %>%
  group_by(param1, param2, param3) %>%
  summarise_all(total = sum(param4))
arrange(df_list, desc(total))
and:
df_list<-unique(df_raw_data[, c('param1', 'param2', 'param3')])
cbind(df_list, total)
for(i in 1:nrow(df_raw_data))
{
filter ???????????
}
I would prefer to use the dplyr package since it's a more elegant solution.
EDIT: Okay, thanks for your working answers. I think that I've lost some time figuring out that the plyr package shouldn't be loaded after dplyr...
We can use group_by_at to select the columns to group.
library(dplyr)
dat2 <- dat %>%
  group_by_at(vars(-V4)) %>%
  summarise(V4 = sum(V4)) %>%
  ungroup()
dat2
# # A tibble: 3 x 4
# V1 V2 V3 V4
# <chr> <chr> <chr> <int>
# 1 A B C 6
# 2 D E F 8
# 3 G H I 5
Or use group_by_if to select columns to group based on column types.
dat2 <- dat %>%
  group_by_if(is.character) %>%
  summarise(V4 = sum(V4)) %>%
  ungroup()
dat2
# # A tibble: 3 x 4
# V1 V2 V3 V4
# <chr> <chr> <chr> <int>
# 1 A B C 6
# 2 D E F 8
# 3 G H I 5
DATA
dat <- read.table(text = "A B C 2
D E F 5
A B C 4
G H I 5
D E F 3",
header = FALSE, stringsAsFactors = FALSE)
Would this be what you are looking for?
df <- tibble(var1 = c("A", "D", "A", "G", "D"),
             var2 = c("B", "E", "B", "H", "E"),
             var3 = c("C", "F", "C", "I", "F"),
             var4 = c(2, 5, 4, 5, 3))
df %>%
  group_by(var1, var2, var3) %>%
  summarise(sum = sum(var4)) %>%
  arrange(desc(sum))
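
In current dplyr, group_by_at() and group_by_if() are superseded by across(). A sketch of the same grouping, adding the decreasing sort the question asks for:

library(dplyr)

dat %>%
  group_by(across(where(is.character))) %>%  # group by all character columns
  summarise(V4 = sum(V4), .groups = "drop") %>%
  arrange(desc(V4))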

How do I remove offsetting rows in a tibble?

I am trying to remove rows that have offsetting values.
library(dplyr)
a <- c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2)
b <- c("a", "b", "b", "b", "c", "c", "c", "d", "d", "d")
d <- c(10, 10, -10, 10, 20, -20, 20, 30, -30, 30)
o <- c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")
df <- tibble(ID = a, SEQ = b, VALUE = d, OTHER = o)
This generates the following table, ordered by ID and SEQ:
> df
# A tibble: 10 x 4
ID SEQ VALUE OTHER
<dbl> <chr> <dbl> <chr>
1 1 a 10 A
2 1 b 10 B
3 1 b -10 C
4 1 b 10 D
5 2 c 20 E
6 2 c -20 F
7 2 c 20 G
8 2 d 30 H
9 2 d -30 I
10 2 d 30 J
I want to drop the row pairs (2,3), (5,6), (8,9) because VALUE negates the VALUE in the matching previous row.
I want the resulting table to be
> df2
# A tibble: 4 x 4
ID SEQ VALUE OTHER
<dbl> <chr> <dbl> <chr>
1 1 a 10 A
2 1 b 10 D
3 2 c 20 G
4 2 d 30 J
I know that I can't use group_by %>% summarize because I need to keep the value that is in OTHER. I've looked at dplyr::lag(), but I don't see how it can help. I could loop through the table with some kind of for-each loop and build a logical vector for dropping the rows, but I was hoping for a more elegant solution.
What about:
# column 1: VALUE[i] + VALUE[i + 1]; column 2: VALUE[i] + VALUE[i - 1]
# (9999 pads the ends so the first and last rows are never flagged)
vec <- cbind(
  c(head(df$VALUE, -1) + df$VALUE[-1], 9999),
  df$VALUE + c(9999, head(df$VALUE, -1))
)
vec <- apply(vec, 1, prod)  # product is 0 if either neighbouring sum is 0
vec <- vec != 0
df[vec, ]
# A tibble: 4 x 4
ID SEQ VALUE OTHER
<dbl> <chr> <dbl> <chr>
1 1 a 10 A
2 1 b 10 D
3 2 c 20 G
4 2 d 30 J
The idea is to add VALUE to a shifted version of itself (once against the next value, once against the previous one). When either sum is 0, the row is removed.
Here's another solution with dplyr. Not sure about the edge case you mentioned in the comments, but feel free to test it with my solution:
library(dplyr)
df %>%
  group_by(ID, SEQ) %>%
  mutate(diff = VALUE + lag(VALUE),
         diff2 = VALUE + lead(VALUE)) %>%
  mutate_at(vars(diff:diff2), ~ coalesce(., 1)) %>%
  filter(diff != 0 & diff2 != 0) %>%
  select(-diff, -diff2)
Result:
# A tibble: 4 x 4
# Groups: ID, SEQ [4]
ID SEQ VALUE OTHER
<dbl> <chr> <dbl> <chr>
1 1 a 10 A
2 1 b 10 D
3 2 c 20 G
4 2 d 30 J
Note:
This solution first creates two diff columns, one adding the lag of VALUE and the other adding the lead of VALUE to each VALUE. Only the offsetting rows have a zero in either diff or diff2, so those rows are filtered out, giving the desired output.
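
Both answers flag a row by its neighbouring sums, so a value that offsets its predecessor while also matching its successor (as with -10 followed by 10 inside group b) can be ambiguous. A greedy sketch that pairs each offsetting row with the row immediately before it and drops exactly those pairs (drop_offsets is a hypothetical helper, not from the answers above):

library(dplyr)

# Walk each group's VALUEs left to right; when two adjacent values cancel,
# mark both for removal and jump past the pair.
drop_offsets <- function(v) {
  keep <- rep(TRUE, length(v))
  i <- 1
  while (i < length(v)) {
    if (v[i] + v[i + 1] == 0) {
      keep[c(i, i + 1)] <- FALSE
      i <- i + 2
    } else {
      i <- i + 1
    }
  }
  keep
}

df %>%
  group_by(ID, SEQ) %>%
  filter(drop_offsets(VALUE)) %>%
  ungroup()
# keeps rows A, D, G and J (VALUE 10, 10, 20, 30)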

Group data frame by elements from a variable containing lists of elements

I would like to perform a non-trivial group_by, grouping and summarizing a data frame by the individual elements of the lists found in one of its variables.
df <- data.frame(x = 1:5)
df$y <- list("A", c("A", "B"), "C", c("B", "D", "C"), "E")
df
x y
1 1 A
2 2 A, B
3 3 C
4 4 B, D, C
5 5 E
Now, grouping by y (and, say, counting the number of rows), where y is a variable holding lists of elements, the required end result should be:
data.frame(group = c("A", "B", "C", "D", "E"), n = c(2,2,2,1,1))
group n
1 A 2
2 B 2
3 C 2
4 D 1
5 E 1
Because "A" appears in 2 rows, "B" in 2 rows, etc.
Note: the sum of n is not necessarily equal to number of rows in the data frame.
We can use a simple base R solution with table() to calculate the frequencies after unlisting the list column, and then create a data.frame based on that table object:
tbl <- table(unlist(df$y))
data.frame(group = names(tbl), n = as.vector(tbl))
# group n
#1 A 2
#2 B 2
#3 C 2
#4 D 1
#5 E 1
Or another option with tidyverse
library(dplyr)
library(tidyr)
unnest(df, cols = y) %>%
  group_by(group = y) %>%
  summarise(n = n())
# group n
# <chr> <int>
#1 A 2
#2 B 2
#3 C 2
#4 D 1
#5 E 1
Or as #alexis_laz mentioned in the comments, an alternative is as.data.frame.table
as.data.frame(table(group = unlist(df$y)), responseName = "n")
A simple base R solution (this is actually a duplicate question, though I'm unable to locate the original):
sapply(unique(unlist(df$y)), function(x) sum(grepl(x, df$y)))
# A B C D E
# 2 2 2 1 1
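
Note that grepl() does substring matching on the deparsed list elements, so multi-character group names could over-count. A hedged variant (a sketch) that tests membership with %in% instead:

# count, for each group label, how many list elements contain it
sapply(sort(unique(unlist(df$y))),
       function(g) sum(vapply(df$y, function(el) g %in% el, logical(1))))
# A B C D E
# 2 2 2 1 1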
