I have a single row data frame like this:
X1 X2 X3
1 [['1','2','3'], ['4','6','5'], ['7','8']] ['9','10','11','12','13']
I would like create a new dataframe from that using columns X2 and X3 that looks like this:
ID Group
1 A
2 A
3 A
4 B
5 B
6 B
7 C
8 C
9 D
10 D
11 D
12 D
13 D
So each number in the dataframe is grouped by the square brackets in the orignal dataframe.
Can anyone recommend a good way of doing this in R.
One option would be to split the 'X2' at the , followed by the ], concatenate with 'X3', extract the numeric elements with str_extract_all into a list, stack it to a two column data.frame
library(stringr)
v1 <- c(strsplit(df1$X2, "\\],\\s*")[[1]], df1$X3)
out <- stack(setNames(str_extract_all(v1, "\\d+"), LETTERS[1:4]))
names(out) <- c("ID", "Group")
out
# ID Group
#1 1 A
#2 2 A
#3 3 A
#4 4 B
#5 6 B
#6 5 B
#7 7 C
#8 8 C
#9 9 D
#10 10 D
#11 11 D
#12 12 D
#13 13 D
Or using tidyverse
library(dplyr)
library(tidyr)
df1 %>%
pivot_longer(cols = -X1) %>%
separate_rows(value, sep="(?<=\\]),\\s*") %>%
transmute(Group = LETTERS[row_number()], ID = value) %>%
mutate(ID = str_extract_all(ID, "\\d+")) %>%
unnest(c(ID))
# A tibble: 13 x 2
# Group ID
# <chr> <chr>
# 1 A 1
# 2 A 2
# 3 A 3
# 4 B 4
# 5 B 6
# 6 B 5
# 7 C 7
# 8 C 8
# 9 D 9
#10 D 10
#11 D 11
#12 D 12
#13 D 13
data
df1 <- structure(list(X1 = 1L, X2 = "[['1','2','3'], ['4','6','5'], ['7','8']]",
X3 = "['9','10','11','12','13']"), class = "data.frame", row.names = c(NA,
-1L))
Related
data <- tibble(time = c(1,1,2,2), a = c(1,2,3,4), b =c(4,3,2,1), c = c(1,1,1,1))
The result will look like this
result <- tibble(
t = c(1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2),
firm1 = c("a","a","a","b","b","b","c","c","c","a","a","a","b","b","b","c","c","c"),
firm2 = c("a","b","c","a","b","c","a","b","c","a","b","c","a","b","c","a","b","c"),
value = c(6,10,5,10,14,9,5,9,4,14,10,9,10,6,5,9,5,4))
result
The function could be
function(x, y){sum(x, y)}
Basically I am looking for a tidy solution to expand.grid data at each point of time and apply functions across columns. Can anyone help?
I tried this, but I could not have time in front of the pairs.
expected_result<-expand.grid(names(data[-1]), names(data[-1])) %>%
mutate(value = map2(Var1, Var2, ~ fun1(data[.x], data[.y])))
expected_result
Use exand.grid you get all possible combination of columns, split the data by time and apply fun for each row of tmp.
library(dplyr)
library(purrr)
tmp <- expand.grid(firm1 = names(data[-1]), firm2 = names(data[-1]))
fun <- function(x, y) sum(x, y)
result <- data %>%
group_split(time) %>%
map_df(~cbind(time = .x$time[1], tmp,
value = apply(tmp, 1, function(x) fun(.x[[x[1]]], .x[[x[2]]]))))
result
# time firm1 firm2 value
#1 1 a a 6
#2 1 b a 10
#3 1 c a 5
#4 1 a b 10
#5 1 b b 14
#6 1 c b 9
#7 1 a c 5
#8 1 b c 9
#9 1 c c 4
#10 2 a a 14
#11 2 b a 10
#12 2 c a 9
#13 2 a b 10
#14 2 b b 6
#15 2 c b 5
#16 2 a c 9
#17 2 b c 5
#18 2 c c 4
You may also do this in base R -
result <- do.call(rbind, by(data, data$time, function(x) {
cbind(time = x$time[1], tmp,
value = apply(tmp, 1, function(y) fun(x[[y[1]]], x[[y[2]]])))
}))
We may use
library(dplyr)
library(tidyr)
library(purrr)
data1 <- data %>%
group_by(time) %>%
summarise(across(everything(), sum, na.rm = TRUE), .groups = 'drop') %>%
pivot_longer(cols = -time) %>%
group_split(time)
map_dfr(data1, ~ {dat <- .x
crossing(firm1 = dat$name, firm2 = dat$name) %>%
mutate(value = c(outer(dat$value, dat$value, FUN = `+`))) %>%
mutate(time = first(dat$time), .before = 1)})
-output
# A tibble: 18 × 4
time firm1 firm2 value
<dbl> <chr> <chr> <dbl>
1 1 a a 6
2 1 a b 10
3 1 a c 5
4 1 b a 10
5 1 b b 14
6 1 b c 9
7 1 c a 5
8 1 c b 9
9 1 c c 4
10 2 a a 14
11 2 a b 10
12 2 a c 9
13 2 b a 10
14 2 b b 6
15 2 b c 5
16 2 c a 9
17 2 c b 5
18 2 c c 4
Let' say I have two data.frames
name_df = read.table(text = "player_name
a
b
c
d
e
f
g", header = T)
game_df = read.table(text = "game_id winner_name loser_name
1 a b
2 b a
3 a c
4 a d
5 b c
6 c d
7 d e
8 e f
9 f a
10 g f
11 g a
12 f e
13 a d", header = T)
name_df contains a unique list of all the winner_name or loser_name values in game_df. I want to create a new data.frame that has, for each person in the name_df a row if a given name (e.g. a) appears in either the winner_name or loser_name column
So I essentially want to merge game_df with name_df, but the key column (name) can appear in either winner_name or loser_name.
So, for just a and b the final output would look something like:
final_df = read.table(text = "player_name game_id winner_name loser_name
a 1 a b
a 2 b a
a 3 a c
a 4 a d
a 9 f a
a 11 g a
a 13 a d
b 1 a b
b 2 b a
b 5 b c", header = T)
We can loop over the elements in 'name_df' for 'player_name', filter the rows from 'game_df' for either the 'winner_name' or 'loser_name'
library(dplyr)
library(purrr)
map_dfr(setNames(name_df$player_name, name_df$player_name),
~ game_df %>%
filter(winner_name %in% .x|loser_name %in% .x), .id = 'player_name')
Or if there are many columns, use if_any
map_dfr(setNames(name_df$player_name, name_df$player_name),
~ {
nm1 <- .x
game_df %>%
filter(if_any(c(winner_name, loser_name), ~ . %in% nm1))
}, .id = 'player_name')
Dedicated to our teacher and mentor dear #akrun
I think we can also make use of the add_row() function you first taught me the other day. Unbelievable!!!
library(dplyr)
library(purrr)
library(tibble)
game_df %>%
rowwise() %>%
mutate(player_name = winner_name) %>%
group_split(game_id) %>%
map_dfr(~ add_row(.x, game_id = .x$game_id, winner_name = .x$winner_name,
loser_name = .x$loser_name, player_name = .x$loser_name)) %>%
arrange(player_name) %>%
relocate(player_name)
# A tibble: 26 x 4
player_name game_id winner_name loser_name
<chr> <int> <chr> <chr>
1 a 1 a b
2 a 2 b a
3 a 3 a c
4 a 4 a d
5 a 9 f a
6 a 11 g a
7 a 13 a d
8 b 1 a b
9 b 2 b a
10 b 5 b c
# ... with 16 more rows
This can be directly expressed in SQL:
library(sqldf)
sqldf("select *
from name_df
left join game_df on winner_name = player_name or loser_name = player_name")
Without using purrr. I think this is appropriate use case of tidyr::unite with argument remove = F where we can first unite the winners' and losers' names and then use tidyr::separate_rows to split new column into rows.
library(tidyr)
library(dplyr)
game_df %>% unite(Player_name, winner_name, loser_name, remove = F, sep = ', ') %>%
separate_rows(Player_name) %>%
relocate(Player_name) %>%
arrange(Player_name)
# A tibble: 26 x 4
Player_name game_id winner_name loser_name
<chr> <int> <chr> <chr>
1 a 1 a b
2 a 2 b a
3 a 3 a c
4 a 4 a d
5 a 9 f a
6 a 11 g a
7 a 13 a d
8 b 1 a b
9 b 2 b a
10 b 5 b c
# ... with 16 more rows
A Base R approach :
result <- do.call(rbind, lapply(name_df$player_name, function(x)
cbind(plaername = x,
subset(game_df, winner_name == x | loser_name == x))))
rownames(result) <- NULL
result
# playername game_id winner_name loser_name
#1 a 1 a b
#2 a 2 b a
#3 a 3 a c
#4 a 4 a d
#5 a 9 f a
#6 a 11 g a
#7 a 13 a d
#8 b 1 a b
#...
#...
I have a list here, and I wish to mutate a new column with unique values for each list relative to the mutation. For example, I want to mutate a column named ID as n >= 1.
Naturally, on a dataframe I would do this:
dat %>% mutate(id = row_number())
For a list, I would do this:
dat%>% map(~ mutate(., ID = row_number()))
And I would get an output likeso:
dat <- list(data.frame(x=c("a", "b" ,"c", "d", "e" ,"f" ,"g") ), data.frame(y=c("p", "lk", "n", "m", "g", "f", "t")))
[[1]]
x id
1 a 1
2 b 2
3 c 3
4 d 4
5 e 5
6 f 6
7 g 7
[[2]]
y id
1 p 1
2 lk 2
3 n 3
4 m 4
5 g 5
6 f 6
7 t 7
Though, how would I mutate a new column ID such that the row number continues from the first list.
Expected output:
[[1]]
x id
1 a 1
2 b 2
3 c 3
4 d 4
5 e 5
6 f 6
7 g 7
[[2]]
y id
1 p 8
2 lk 9
3 n 10
4 m 11
5 g 12
6 f 13
7 t 14
An option is to bind them into a single dataset, create the 'id' with row_number(), split by 'grp', loop over the list and remove any columns that have all NA values
library(dplyr)
library(purrr)
dat %>%
bind_rows(.id = 'grp') %>%
mutate(id = row_number()) %>%
group_split(grp) %>%
map(~ .x %>%
select(where(~ any(!is.na(.))), -grp))
-output
#[[1]]
# A tibble: 7 x 2
# x id
# <chr> <int>
#1 a 1
#2 b 2
#3 c 3
#4 d 4
#5 e 5
#6 f 6
#7 g 7
#[[2]]
# A tibble: 7 x 2
# y id
# <chr> <int>
#1 p 8
#2 lk 9
#3 n 10
#4 m 11
#5 g 12
#6 f 13
#7 t 14
Or an easier approach is to unlist (assuming single column), get the sequence, add a new column with map2
map2(dat, relist(seq_along(unlist(dat)), skeleton = dat),
~ .x %>% mutate(id = .y))
Or using a for loop
dat[[1]]$id <- seq_len(nrow(dat[[1]]))
for(i in seq_along(dat)[-1]) dat[[i]]$id <-
seq(tail(dat[[i-1]]$id, 1) + 1, length.out = nrow(dat[[i]]), by = 1)
I have a list of data frames, I want to add a column to each data frame and this column would be the concatenation of the row number and another variable.
I have managed to do that using a for loop but it is taking a lot of time when dealing with a large dataset, is there a way to avoid a for loop?
my_data_vcf <-lapply(my_vcf_files,read.table, stringsAsFactors = FALSE)
for i in 1:length(my_data_vcf){
for(j in 1:length(my_data_vcf[[i]]){
my_data_vcf[[i]] <- cbind(my_data_vcf[[i]], "Id" = paste(c(variable,j), collapse = "_"))}}
You can use lapply; since you don't provide a minimal sample dataset, I'm generating some sample data.
# Sample list of data.frame's
lst <- list(
data.frame(one = letters[1:10], two = 1:10),
data.frame(one = letters[11:20], two = 11:20))
# Concatenate row number with entries in second column
lapply(lst, function(x) { x$three <- paste(1:nrow(x), x$two, sep = "_"); x })
#[1]]
# one two three
#1 a 1 1_1
#2 b 2 2_2
#3 c 3 3_3
#4 d 4 4_4
#5 e 5 5_5
#6 f 6 6_6
#7 g 7 7_7
#8 h 8 8_8
#9 i 9 9_9
#10 j 10 10_10
#
#[[2]]
# one two three
#1 k 11 1_11
#2 l 12 2_12
#3 m 13 3_13
#4 n 14 4_14
#5 o 15 5_15
#6 p 16 6_16
#7 q 17 7_17
#8 r 18 8_18
#9 s 19 9_19
#10 t 20 10_20
One way we can do this is to create a nested data frame using enframe from the tibble package. Once we've done that, we can unnest the data and use mutate to concatenate the row number and a column:
library(tidyverse)
# using Maurits Evers' data, treating stringsAsFactors
lst <- list(
data.frame(one = letters[1:10], two = 1:10, stringsAsFactors = F),
data.frame(one = letters[11:20], two = 11:20, stringsAsFactors = F)
)
lst %>%
enframe() %>%
unnest(value) %>%
group_by(name) %>%
mutate(three = paste(row_number(), two, sep = "_")) %>%
nest()
Returns:
# A tibble: 2 x 2
name data
<int> <list>
1 1 <tibble [10 × 3]>
2 2 <tibble [10 × 3]>
If we unnest the data, we can see that var three is the concatenation of var two and the row number:
lst %>%
enframe() %>%
unnest(value) %>%
group_by(name) %>%
mutate(three = paste(row_number(), two, sep = "_")) %>%
nest() %>%
unnest(data)
Returns:
# A tibble: 20 x 4
name one two three
<int> <chr> <int> <chr>
1 1 a 1 1_1
2 1 b 2 2_2
3 1 c 3 3_3
4 1 d 4 4_4
5 1 e 5 5_5
6 1 f 6 6_6
7 1 g 7 7_7
8 1 h 8 8_8
9 1 i 9 9_9
10 1 j 10 10_10
11 2 k 11 1_11
12 2 l 12 2_12
13 2 m 13 3_13
14 2 n 14 4_14
15 2 o 15 5_15
16 2 p 16 6_16
17 2 q 17 7_17
18 2 r 18 8_18
19 2 s 19 9_19
20 2 t 20 10_20
I have a data frame like so
Type Number Species
A 1 G
A 2 R
A 7 Q
A 4 L
B 4 S
B 5 T
B 3 H
B 9 P
C 12 K
C 11 T
C 6 U
C 5 Q
Where I have used group_by(Type)
My goal is to collapse this data by having NUMBER be the top 2 values in the number column, and then making a new column(Number_2) that is the second 2 values.
Also I would want the Species values for the bottom two numbers to be deleted, so that the species corresponds to the higher number in the row
I would like to use dplyr and the final would look like this
Type Number Number_2 Species
A 7 1 Q
A 4 2 L
B 5 3 T
B 9 4 P
C 12 6 K
C 11 5 T
as of now the order that number_2 is in doesn't matter, as long as it is in the same type....
I don't know if this is possible but if it is does anyone know how...
thanks!
You can try
library(data.table)
setDT(df1)[order(-Number), list(Number1=Number[1:2],
Number2=Number[3:4],
Species=Species[1:2]), keyby = Type]
# Type Number1 Number2 Species
#1: A 7 2 Q
#2: A 4 1 L
#3: B 9 4 P
#4: B 5 3 T
#5: C 12 6 K
#6: C 11 5 T
Or using dplyr with do
library(dplyr)
df1 %>%
group_by(Type) %>%
arrange(desc(Number)) %>%
do(data.frame(Type=.$Type[1L],
Number1=.$Number[1:2],
Number2 = .$Number[3:4],
Species=.$Species[1:2], stringsAsFactors=FALSE))
# Type Number1 Number2 Species
#1 A 7 2 Q
#2 A 4 1 L
#3 B 9 4 P
#4 B 5 3 T
#5 C 12 6 K
#6 C 11 5 T
Here's a different dplyr approach.
library(dplyr)
# Start creating the data set with top 2 values and store as df1:
df1 <- df %>%
group_by(Type) %>%
top_n(2, Number) %>%
ungroup() %>%
arrange(Type, Number)
# Then, get the anti-joined data (the not top 2 values), arrange, rename and select
# the number colummn and cbind to df1:
out <- df %>%
anti_join(df1, c("Type","Number")) %>%
arrange(Type, Number) %>%
select(Number2 = Number) %>%
cbind(df1, .)
This results in:
> out
# Type Number Species Number2
#1 A 4 L 1
#2 A 7 Q 2
#3 B 5 T 3
#4 B 9 P 4
#5 C 11 T 5
#6 C 12 K 6
This could be another option using ddply
library(plyr)
ddply(dat[order(Number)], .(Type), summarize,
Number1 = Number[4:3], Number2 = Number[2:1], Species = Species[4:3])
# Type Number1 Number2 Species
#1 A 7 2 Q
#2 A 4 1 L
#3 B 9 4 P
#4 B 5 3 T
#5 C 12 6 K
#6 C 11 5 T