I have a list here, and I wish to mutate a new column with unique values for each list relative to the mutation. For example, I want to mutate a column named ID as n >= 1.
Naturally, on a dataframe I would do this:
dat %>% mutate(id = row_number())
For a list, I would do this:
dat%>% map(~ mutate(., ID = row_number()))
And I would get an output likeso:
dat <- list(data.frame(x=c("a", "b" ,"c", "d", "e" ,"f" ,"g") ), data.frame(y=c("p", "lk", "n", "m", "g", "f", "t")))
[[1]]
x id
1 a 1
2 b 2
3 c 3
4 d 4
5 e 5
6 f 6
7 g 7
[[2]]
y id
1 p 1
2 lk 2
3 n 3
4 m 4
5 g 5
6 f 6
7 t 7
Though, how would I mutate a new column ID such that the row number continues from the first list.
Expected output:
[[1]]
x id
1 a 1
2 b 2
3 c 3
4 d 4
5 e 5
6 f 6
7 g 7
[[2]]
y id
1 p 8
2 lk 9
3 n 10
4 m 11
5 g 12
6 f 13
7 t 14
An option is to bind them into a single dataset, create the 'id' with row_number(), split by 'grp', loop over the list and remove any columns that have all NA values
library(dplyr)
library(purrr)
dat %>%
bind_rows(.id = 'grp') %>%
mutate(id = row_number()) %>%
group_split(grp) %>%
map(~ .x %>%
select(where(~ any(!is.na(.))), -grp))
-output
#[[1]]
# A tibble: 7 x 2
# x id
# <chr> <int>
#1 a 1
#2 b 2
#3 c 3
#4 d 4
#5 e 5
#6 f 6
#7 g 7
#[[2]]
# A tibble: 7 x 2
# y id
# <chr> <int>
#1 p 8
#2 lk 9
#3 n 10
#4 m 11
#5 g 12
#6 f 13
#7 t 14
Or an easier approach is to unlist (assuming single column), get the sequence, add a new column with map2
map2(dat, relist(seq_along(unlist(dat)), skeleton = dat),
~ .x %>% mutate(id = .y))
Or using a for loop
dat[[1]]$id <- seq_len(nrow(dat[[1]]))
for(i in seq_along(dat)[-1]) dat[[i]]$id <-
seq(tail(dat[[i-1]]$id, 1) + 1, length.out = nrow(dat[[i]]), by = 1)
Related
Let' say I have two data.frames
name_df = read.table(text = "player_name
a
b
c
d
e
f
g", header = T)
game_df = read.table(text = "game_id winner_name loser_name
1 a b
2 b a
3 a c
4 a d
5 b c
6 c d
7 d e
8 e f
9 f a
10 g f
11 g a
12 f e
13 a d", header = T)
name_df contains a unique list of all the winner_name or loser_name values in game_df. I want to create a new data.frame that has, for each person in the name_df a row if a given name (e.g. a) appears in either the winner_name or loser_name column
So I essentially want to merge game_df with name_df, but the key column (name) can appear in either winner_name or loser_name.
So, for just a and b the final output would look something like:
final_df = read.table(text = "player_name game_id winner_name loser_name
a 1 a b
a 2 b a
a 3 a c
a 4 a d
a 9 f a
a 11 g a
a 13 a d
b 1 a b
b 2 b a
b 5 b c", header = T)
We can loop over the elements in 'name_df' for 'player_name', filter the rows from 'game_df' for either the 'winner_name' or 'loser_name'
library(dplyr)
library(purrr)
map_dfr(setNames(name_df$player_name, name_df$player_name),
~ game_df %>%
filter(winner_name %in% .x|loser_name %in% .x), .id = 'player_name')
Or if there are many columns, use if_any
map_dfr(setNames(name_df$player_name, name_df$player_name),
~ {
nm1 <- .x
game_df %>%
filter(if_any(c(winner_name, loser_name), ~ . %in% nm1))
}, .id = 'player_name')
Dedicated to our teacher and mentor dear #akrun
I think we can also make use of the add_row() function you first taught me the other day. Unbelievable!!!
library(dplyr)
library(purrr)
library(tibble)
game_df %>%
rowwise() %>%
mutate(player_name = winner_name) %>%
group_split(game_id) %>%
map_dfr(~ add_row(.x, game_id = .x$game_id, winner_name = .x$winner_name,
loser_name = .x$loser_name, player_name = .x$loser_name)) %>%
arrange(player_name) %>%
relocate(player_name)
# A tibble: 26 x 4
player_name game_id winner_name loser_name
<chr> <int> <chr> <chr>
1 a 1 a b
2 a 2 b a
3 a 3 a c
4 a 4 a d
5 a 9 f a
6 a 11 g a
7 a 13 a d
8 b 1 a b
9 b 2 b a
10 b 5 b c
# ... with 16 more rows
This can be directly expressed in SQL:
library(sqldf)
sqldf("select *
from name_df
left join game_df on winner_name = player_name or loser_name = player_name")
Without using purrr. I think this is appropriate use case of tidyr::unite with argument remove = F where we can first unite the winners' and losers' names and then use tidyr::separate_rows to split new column into rows.
library(tidyr)
library(dplyr)
game_df %>% unite(Player_name, winner_name, loser_name, remove = F, sep = ', ') %>%
separate_rows(Player_name) %>%
relocate(Player_name) %>%
arrange(Player_name)
# A tibble: 26 x 4
Player_name game_id winner_name loser_name
<chr> <int> <chr> <chr>
1 a 1 a b
2 a 2 b a
3 a 3 a c
4 a 4 a d
5 a 9 f a
6 a 11 g a
7 a 13 a d
8 b 1 a b
9 b 2 b a
10 b 5 b c
# ... with 16 more rows
A Base R approach :
result <- do.call(rbind, lapply(name_df$player_name, function(x)
cbind(plaername = x,
subset(game_df, winner_name == x | loser_name == x))))
rownames(result) <- NULL
result
# playername game_id winner_name loser_name
#1 a 1 a b
#2 a 2 b a
#3 a 3 a c
#4 a 4 a d
#5 a 9 f a
#6 a 11 g a
#7 a 13 a d
#8 b 1 a b
#...
#...
I have a single row data frame like this:
X1 X2 X3
1 [['1','2','3'], ['4','6','5'], ['7','8']] ['9','10','11','12','13']
I would like create a new dataframe from that using columns X2 and X3 that looks like this:
ID Group
1 A
2 A
3 A
4 B
5 B
6 B
7 C
8 C
9 D
10 D
11 D
12 D
13 D
So each number in the dataframe is grouped by the square brackets in the orignal dataframe.
Can anyone recommend a good way of doing this in R.
One option would be to split the 'X2' at the , followed by the ], concatenate with 'X3', extract the numeric elements with str_extract_all into a list, stack it to a two column data.frame
library(stringr)
v1 <- c(strsplit(df1$X2, "\\],\\s*")[[1]], df1$X3)
out <- stack(setNames(str_extract_all(v1, "\\d+"), LETTERS[1:4]))
names(out) <- c("ID", "Group")
out
# ID Group
#1 1 A
#2 2 A
#3 3 A
#4 4 B
#5 6 B
#6 5 B
#7 7 C
#8 8 C
#9 9 D
#10 10 D
#11 11 D
#12 12 D
#13 13 D
Or using tidyverse
library(dplyr)
library(tidyr)
df1 %>%
pivot_longer(cols = -X1) %>%
separate_rows(value, sep="(?<=\\]),\\s*") %>%
transmute(Group = LETTERS[row_number()], ID = value) %>%
mutate(ID = str_extract_all(ID, "\\d+")) %>%
unnest(c(ID))
# A tibble: 13 x 2
# Group ID
# <chr> <chr>
# 1 A 1
# 2 A 2
# 3 A 3
# 4 B 4
# 5 B 6
# 6 B 5
# 7 C 7
# 8 C 8
# 9 D 9
#10 D 10
#11 D 11
#12 D 12
#13 D 13
data
df1 <- structure(list(X1 = 1L, X2 = "[['1','2','3'], ['4','6','5'], ['7','8']]",
X3 = "['9','10','11','12','13']"), class = "data.frame", row.names = c(NA,
-1L))
I was wondering if there's a more elegant way of taking a dataframe, grouping by x to see how many x's occur in the dataset, then mutating to find the first occurrence of every x (y)
test <- data.frame(x = c("a", "b", "c", "d",
"c", "b", "e", "f", "g"),
y = c(1,1,1,1,2,2,2,2,2))
x y
1 a 1
2 b 1
3 c 1
4 d 1
5 c 2
6 b 2
7 e 2
8 f 2
9 g 2
Current Output
output <- test %>%
group_by(x) %>%
summarise(count = n())
x count
<fct> <int>
1 a 1
2 b 2
3 c 2
4 d 1
5 e 1
6 f 1
7 g 1
Desired Output
x count first_seen
<fct> <int> <dbl>
1 a 1 1
2 b 2 1
3 c 2 1
4 d 1 1
5 e 1 2
6 f 1 2
7 g 1 2
I can filter the test dataframe for the first occurrences then use a left_join but was hoping there's a more elegant solution using mutate?
# filter for first occurrences of y
right <- test %>%
group_by(x) %>%
filter(y == min(y)) %>%
slice(1) %>%
ungroup()
# bind to the output dataframe
left_join(output, right, by = "x")
We can use first after grouping by 'x' to create a new column, use that also in group_by and get the count with n()
library(dplyr)
test %>%
group_by(x) %>%
group_by(first_seen = first(y), add = TRUE) %>%
summarise(count = n())
# A tibble: 7 x 3
# Groups: x [7]
# x first_seen count
# <fct> <dbl> <int>
#1 a 1 1
#2 b 1 2
#3 c 1 2
#4 d 1 1
#5 e 2 1
#6 f 2 1
#7 g 2 1
I have a question. Why not keep it simple? for example
test %>%
group_by(x) %>%
summarise(
count = n(),
first_seen = first(y)
)
#> # A tibble: 7 x 3
#> x count first_seen
#> <chr> <int> <dbl>
#> 1 a 1 1
#> 2 b 2 1
#> 3 c 2 1
#> 4 d 1 1
#> 5 e 1 2
#> 6 f 1 2
#> 7 g 1 2
While trying to work out this question Identify duplicates of one value with different values in another column; I felt that the solution was closer but I couldn't because the dplyr mutate function refers to the pre-mutated state's max when I use max(ID) in the below code and not post-mutated value (like recursively).
The objective is to assign a new unique ID value for the rows where the current Address has mismatch with the previous Address of the same ID value.
The code I tried:
df <- read.table(text = 'ID Address
1 X
1 X
1 Y
2 Z
2 Z
3 A
3 B
4 C
4 D
4 E
5 F
5 F
5 F
', header= T, stringsAsFactors = F)
df %>% group_by(ID) %>% mutate(flag = ifelse(lag(Address)==Address,F,T)) %>%
mutate(flag = ifelse(is.na(flag),F,flag)) %>% ungroup() %>%
mutate(newID = ifelse(flag | is.na(flag), max(ID)+1,ID))%>%
select(ID = newID,Address)
Received Output:
# A tibble: 13 x 2
ID Address
<dbl> <chr>
1 1 X
2 1 X
3 6 Y
4 2 Z
5 2 Z
6 3 A
7 6 B
8 4 C
9 6 D
10 6 E
11 5 F
12 5 F
13 5 F
Expected Output:
ID Address
1 X
1 X
6 Y
2 Z
2 Z
3 A
7 B
4 C
8 D
9 E
5 F
5 F
5 F
Any help would be appreciated!
Edit:
Ideal code: Where I should've been able to use newID which is the current mutating variable to use.
> df %>% group_by(ID) %>% mutate(flag = ifelse(lag(Address)==Address,F,T)) %>%
+ mutate(flag = ifelse(is.na(flag),F,flag)) %>% ungroup() %>%
+ mutate(newID = ifelse(flag | is.na(flag), max(newID)+1,ID))%>%
+ select(ID = newID,Address)
One problem is the max(ID) + 1 which will give the constant value and the second problem is the ifelse itself which requires equal length vector for 'yes' and 'no'. In the below solution, we replace the max(ID) + 1 with max(ID) + seq_len(sum(flag)) and instead of ifelse used replace
df %>%
group_by(ID) %>%
mutate(flag = lag(Address, default = Address[1])!= Address) %>%
ungroup() %>%
mutate(newID = replace(ID, flag, max(ID) + seq_len(sum(flag))))%>%
select(ID = newID,Address)
# A tibble: 13 x 2
# ID Address
# <dbl> <chr>
# 1 1 X
# 2 1 X
# 3 6 Y
# 4 2 Z
# 5 2 Z
# 6 3 A
# 7 7 B
# 8 4 C
# 9 8 D
#10 9 E
#11 5 F
#12 5 F
#13 5 F
In addition, the two ifelse statements to create the 'flag' can be replaced by a single statement
Here is sample data:
df <- data.frame(t(data.frame(seq(1,10,1)))); rownames(df) <- NULL;
colnames(df) <- letters[1:ncol(df)]
df
I would like to arrange the new data.frame so that it always has 6 columns, the next row (after splinting since ncol>6) would contain the next 6 column names and next row their values. The last row if ncol<6 the values are filled with empty string including the column names.
Here is desired output:
a b c d e f
1 1 2 3 4 5 6
2 g h i j
3 7 8 9 10
Another example:
df <- data.frame(t(data.frame(seq(1,15,1)))); rownames(df) <- NULL;
colnames(df) <- letters[1:ncol(df)]
df
a b c d e f
1 1 2 3 4 5 6
2 g h i j k l
3 7 8 9 10 11 12
4 m n o
5 13 14 15
EDIT:
The way to approach it possibly is to:
n <- 6
ncl <- nrow(df)
s <- split(df, rep(1:ceiling(ncl/n), each=n, length.out=ncl))
s
s1 <- split(rownames(df), rep(1:ceiling(ncl/n), each=n, length.out=ncl))
s1
combine every second split of s and s1
s1[c(TRUE,FALSE)]
Here's a way, not so pretty, but this is an ugly question :D
library(tibble)
library(dplyr)
df1 <- matrix(c(names(df),rep('',6 - ncol(df)%%6)) %>% unlist, ncol=6,byrow=T) %>% as_tibble %>% rowid_to_column()
df2 <- matrix(c(df ,rep('',6 - ncol(df)%%6)) %>% unlist, ncol=6,byrow=T) %>% as_tibble %>% rowid_to_column()
bind_rows(df1,df2) %>% arrange(rowid) %>% select(-1) %>% setNames(.[1,]) %>% slice(-1)
# # A tibble: 3 x 6
# a b c d e f
# <chr> <chr> <chr> <chr> <chr> <chr>
# 1 1 2 3 4 5 6
# 2 g h i j
# 3 7 8 9 10
For the life of me I can't figure out a use-case for this... but for sake of the provided examples...
seq(1, ncol(df), by = 6) %>% {
starts <- .
ends <- c(lead(.,1,NULL)-1, ncol(df))
base_df <- df[,starts[[1]]:ends[[1]]]
rbind(base_df, rbind.pages(Map(function(s, e){
d <- df[,seq(s, e)]
data.frame(rbind(colnames(d), d)) %>% setNames(colnames(base_df)[1:length(.)])
}, s = starts[-1], e = ends[-1]))
) %>%
mutate_all(function(x){
ifelse(!is.na(x), x, "")
})
}
a b c d e f
1 1 2 3 4 5 6
2 g h i j k l
3 7 8 9 10 11 12
4 m n o
5 13 14 15
EDIT to coerce NA to 'empty string'