Converting a row of data into a data frame in R

Converting a row of data into a data frame in R - r

I have a single row data frame like this:
X1 X2 X3
1 [['1','2','3'], ['4','6','5'], ['7','8']] ['9','10','11','12','13']
I would like create a new dataframe from that using columns X2 and X3 that looks like this:
ID Group
1 A
2 A
3 A
4 B
5 B
6 B
7 C
8 C
9 D
10 D
11 D
12 D
13 D
So each number in the dataframe is grouped by the square brackets in the orignal dataframe.
Can anyone recommend a good way of doing this in R.

One option would be to split the 'X2' at the , followed by the ], concatenate with 'X3', extract the numeric elements with str_extract_all into a list, stack it to a two column data.frame
library(stringr)
v1 <- c(strsplit(df1$X2, "\\],\\s*")[[1]], df1$X3)
out <- stack(setNames(str_extract_all(v1, "\\d+"), LETTERS[1:4]))
names(out) <- c("ID", "Group")
out
# ID Group
#1 1 A
#2 2 A
#3 3 A
#4 4 B
#5 6 B
#6 5 B
#7 7 C
#8 8 C
#9 9 D
#10 10 D
#11 11 D
#12 12 D
#13 13 D
Or using tidyverse
library(dplyr)
library(tidyr)
df1 %>%
pivot_longer(cols = -X1) %>%
separate_rows(value, sep="(?<=\\]),\\s*") %>%
transmute(Group = LETTERS[row_number()], ID = value) %>%
mutate(ID = str_extract_all(ID, "\\d+")) %>%
unnest(c(ID))
# A tibble: 13 x 2
# Group ID
# <chr> <chr>
# 1 A 1
# 2 A 2
# 3 A 3
# 4 B 4
# 5 B 6
# 6 B 5
# 7 C 7
# 8 C 8
# 9 D 9
#10 D 10
#11 D 11
#12 D 12
#13 D 13
data
df1 <- structure(list(X1 = 1L, X2 = "[['1','2','3'], ['4','6','5'], ['7','8']]",
X3 = "['9','10','11','12','13']"), class = "data.frame", row.names = c(NA,
-1L))

Related

R: expand grid of all possible combinations within groups and apply functions across all the pairs

data <- tibble(time = c(1,1,2,2), a = c(1,2,3,4), b =c(4,3,2,1), c = c(1,1,1,1))
The result will look like this
result <- tibble(
t = c(1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2),
firm1 = c("a","a","a","b","b","b","c","c","c","a","a","a","b","b","b","c","c","c"),
firm2 = c("a","b","c","a","b","c","a","b","c","a","b","c","a","b","c","a","b","c"),
value = c(6,10,5,10,14,9,5,9,4,14,10,9,10,6,5,9,5,4))
result
The function could be
function(x, y){sum(x, y)}
Basically I am looking for a tidy solution to expand.grid data at each point of time and apply functions across columns. Can anyone help?
I tried this, but I could not have time in front of the pairs.
expected_result<-expand.grid(names(data[-1]), names(data[-1])) %>%
mutate(value = map2(Var1, Var2, ~ fun1(data[.x], data[.y])))
expected_result

Use exand.grid you get all possible combination of columns, split the data by time and apply fun for each row of tmp.
library(dplyr)
library(purrr)
tmp <- expand.grid(firm1 = names(data[-1]), firm2 = names(data[-1]))
fun <- function(x, y) sum(x, y)
result <- data %>%
group_split(time) %>%
map_df(~cbind(time = .x$time[1], tmp,
value = apply(tmp, 1, function(x) fun(.x[[x[1]]], .x[[x[2]]]))))
result
# time firm1 firm2 value
#1 1 a a 6
#2 1 b a 10
#3 1 c a 5
#4 1 a b 10
#5 1 b b 14
#6 1 c b 9
#7 1 a c 5
#8 1 b c 9
#9 1 c c 4
#10 2 a a 14
#11 2 b a 10
#12 2 c a 9
#13 2 a b 10
#14 2 b b 6
#15 2 c b 5
#16 2 a c 9
#17 2 b c 5
#18 2 c c 4
You may also do this in base R -
result <- do.call(rbind, by(data, data$time, function(x) {
cbind(time = x$time[1], tmp,
value = apply(tmp, 1, function(y) fun(x[[y[1]]], x[[y[2]]])))
}))

We may use
library(dplyr)
library(tidyr)
library(purrr)
data1 <- data %>%
group_by(time) %>%
summarise(across(everything(), sum, na.rm = TRUE), .groups = 'drop') %>%
pivot_longer(cols = -time) %>%
group_split(time)
map_dfr(data1, ~ {dat <- .x
crossing(firm1 = dat$name, firm2 = dat$name) %>%
mutate(value = c(outer(dat$value, dat$value, FUN = `+`))) %>%
mutate(time = first(dat$time), .before = 1)})
-output
# A tibble: 18 × 4
time firm1 firm2 value
<dbl> <chr> <chr> <dbl>
1 1 a a 6
2 1 a b 10
3 1 a c 5
4 1 b a 10
5 1 b b 14
6 1 b c 9
7 1 c a 5
8 1 c b 9
9 1 c c 4
10 2 a a 14
11 2 a b 10
12 2 a c 9
13 2 b a 10
14 2 b b 6
15 2 b c 5
16 2 c a 9
17 2 c b 5
18 2 c c 4

Extract rows where value appears in any of multiple columns

Let' say I have two data.frames
name_df = read.table(text = "player_name
a
b
c
d
e
f
g", header = T)
game_df = read.table(text = "game_id winner_name loser_name
1 a b
2 b a
3 a c
4 a d
5 b c
6 c d
7 d e
8 e f
9 f a
10 g f
11 g a
12 f e
13 a d", header = T)
name_df contains a unique list of all the winner_name or loser_name values in game_df. I want to create a new data.frame that has, for each person in the name_df a row if a given name (e.g. a) appears in either the winner_name or loser_name column
So I essentially want to merge game_df with name_df, but the key column (name) can appear in either winner_name or loser_name.
So, for just a and b the final output would look something like:
final_df = read.table(text = "player_name game_id winner_name loser_name
a 1 a b
a 2 b a
a 3 a c
a 4 a d
a 9 f a
a 11 g a
a 13 a d
b 1 a b
b 2 b a
b 5 b c", header = T)

We can loop over the elements in 'name_df' for 'player_name', filter the rows from 'game_df' for either the 'winner_name' or 'loser_name'
library(dplyr)
library(purrr)
map_dfr(setNames(name_df$player_name, name_df$player_name),
~ game_df %>%
filter(winner_name %in% .x|loser_name %in% .x), .id = 'player_name')
Or if there are many columns, use if_any
map_dfr(setNames(name_df$player_name, name_df$player_name),
~ {
nm1 <- .x
game_df %>%
filter(if_any(c(winner_name, loser_name), ~ . %in% nm1))
}, .id = 'player_name')

Dedicated to our teacher and mentor dear #akrun
I think we can also make use of the add_row() function you first taught me the other day. Unbelievable!!!
library(dplyr)
library(purrr)
library(tibble)
game_df %>%
rowwise() %>%
mutate(player_name = winner_name) %>%
group_split(game_id) %>%
map_dfr(~ add_row(.x, game_id = .x$game_id, winner_name = .x$winner_name,
loser_name = .x$loser_name, player_name = .x$loser_name)) %>%
arrange(player_name) %>%
relocate(player_name)
# A tibble: 26 x 4
player_name game_id winner_name loser_name
<chr> <int> <chr> <chr>
1 a 1 a b
2 a 2 b a
3 a 3 a c
4 a 4 a d
5 a 9 f a
6 a 11 g a
7 a 13 a d
8 b 1 a b
9 b 2 b a
10 b 5 b c
# ... with 16 more rows

This can be directly expressed in SQL:
library(sqldf)
sqldf("select *
from name_df
left join game_df on winner_name = player_name or loser_name = player_name")

Without using purrr. I think this is appropriate use case of tidyr::unite with argument remove = F where we can first unite the winners' and losers' names and then use tidyr::separate_rows to split new column into rows.
library(tidyr)
library(dplyr)
game_df %>% unite(Player_name, winner_name, loser_name, remove = F, sep = ', ') %>%
separate_rows(Player_name) %>%
relocate(Player_name) %>%
arrange(Player_name)
# A tibble: 26 x 4
Player_name game_id winner_name loser_name
<chr> <int> <chr> <chr>
1 a 1 a b
2 a 2 b a
3 a 3 a c
4 a 4 a d
5 a 9 f a
6 a 11 g a
7 a 13 a d
8 b 1 a b
9 b 2 b a
10 b 5 b c
# ... with 16 more rows

A Base R approach :
result <- do.call(rbind, lapply(name_df$player_name, function(x)
cbind(plaername = x,
subset(game_df, winner_name == x | loser_name == x))))
rownames(result) <- NULL
result
# playername game_id winner_name loser_name
#1 a 1 a b
#2 a 2 b a
#3 a 3 a c
#4 a 4 a d
#5 a 9 f a
#6 a 11 g a
#7 a 13 a d
#8 b 1 a b
#...
#...

Mutate new column with unique values for each list

I have a list here, and I wish to mutate a new column with unique values for each list relative to the mutation. For example, I want to mutate a column named ID as n >= 1.
Naturally, on a dataframe I would do this:
dat %>% mutate(id = row_number())
For a list, I would do this:
dat%>% map(~ mutate(., ID = row_number()))
And I would get an output likeso:
dat <- list(data.frame(x=c("a", "b" ,"c", "d", "e" ,"f" ,"g") ), data.frame(y=c("p", "lk", "n", "m", "g", "f", "t")))
[[1]]
x id
1 a 1
2 b 2
3 c 3
4 d 4
5 e 5
6 f 6
7 g 7
[[2]]
y id
1 p 1
2 lk 2
3 n 3
4 m 4
5 g 5
6 f 6
7 t 7
Though, how would I mutate a new column ID such that the row number continues from the first list.
Expected output:
[[1]]
x id
1 a 1
2 b 2
3 c 3
4 d 4
5 e 5
6 f 6
7 g 7
[[2]]
y id
1 p 8
2 lk 9
3 n 10
4 m 11
5 g 12
6 f 13
7 t 14

An option is to bind them into a single dataset, create the 'id' with row_number(), split by 'grp', loop over the list and remove any columns that have all NA values
library(dplyr)
library(purrr)
dat %>%
bind_rows(.id = 'grp') %>%
mutate(id = row_number()) %>%
group_split(grp) %>%
map(~ .x %>%
select(where(~ any(!is.na(.))), -grp))
-output
#[[1]]
# A tibble: 7 x 2
# x id
# <chr> <int>
#1 a 1
#2 b 2
#3 c 3
#4 d 4
#5 e 5
#6 f 6
#7 g 7
#[[2]]
# A tibble: 7 x 2
# y id
# <chr> <int>
#1 p 8
#2 lk 9
#3 n 10
#4 m 11
#5 g 12
#6 f 13
#7 t 14
Or an easier approach is to unlist (assuming single column), get the sequence, add a new column with map2
map2(dat, relist(seq_along(unlist(dat)), skeleton = dat),
~ .x %>% mutate(id = .y))
Or using a for loop
dat[[1]]$id <- seq_len(nrow(dat[[1]]))
for(i in seq_along(dat)[-1]) dat[[i]]$id <-
seq(tail(dat[[i-1]]$id, 1) + 1, length.out = nrow(dat[[i]]), by = 1)

Manipulating columns of a list of dataframes in R

I have a list of data frames, I want to add a column to each data frame and this column would be the concatenation of the row number and another variable.
I have managed to do that using a for loop but it is taking a lot of time when dealing with a large dataset, is there a way to avoid a for loop?
my_data_vcf <-lapply(my_vcf_files,read.table, stringsAsFactors = FALSE)
for i in 1:length(my_data_vcf){
for(j in 1:length(my_data_vcf[[i]]){
my_data_vcf[[i]] <- cbind(my_data_vcf[[i]], "Id" = paste(c(variable,j), collapse = "_"))}}

You can use lapply; since you don't provide a minimal sample dataset, I'm generating some sample data.
# Sample list of data.frame's
lst <- list(
data.frame(one = letters[1:10], two = 1:10),
data.frame(one = letters[11:20], two = 11:20))
# Concatenate row number with entries in second column
lapply(lst, function(x) { x$three <- paste(1:nrow(x), x$two, sep = "_"); x })
#[1]]
# one two three
#1 a 1 1_1
#2 b 2 2_2
#3 c 3 3_3
#4 d 4 4_4
#5 e 5 5_5
#6 f 6 6_6
#7 g 7 7_7
#8 h 8 8_8
#9 i 9 9_9
#10 j 10 10_10
#
#[[2]]
# one two three
#1 k 11 1_11
#2 l 12 2_12
#3 m 13 3_13
#4 n 14 4_14
#5 o 15 5_15
#6 p 16 6_16
#7 q 17 7_17
#8 r 18 8_18
#9 s 19 9_19
#10 t 20 10_20

One way we can do this is to create a nested data frame using enframe from the tibble package. Once we've done that, we can unnest the data and use mutate to concatenate the row number and a column:
library(tidyverse)
# using Maurits Evers' data, treating stringsAsFactors
lst <- list(
data.frame(one = letters[1:10], two = 1:10, stringsAsFactors = F),
data.frame(one = letters[11:20], two = 11:20, stringsAsFactors = F)
)
lst %>%
enframe() %>%
unnest(value) %>%
group_by(name) %>%
mutate(three = paste(row_number(), two, sep = "_")) %>%
nest()
Returns:
# A tibble: 2 x 2
name data
<int> <list>
1 1 <tibble [10 × 3]>
2 2 <tibble [10 × 3]>
If we unnest the data, we can see that var three is the concatenation of var two and the row number:
lst %>%
enframe() %>%
unnest(value) %>%
group_by(name) %>%
mutate(three = paste(row_number(), two, sep = "_")) %>%
nest() %>%
unnest(data)
Returns:
# A tibble: 20 x 4
name one two three
<int> <chr> <int> <chr>
1 1 a 1 1_1
2 1 b 2 2_2
3 1 c 3 3_3
4 1 d 4 4_4
5 1 e 5 5_5
6 1 f 6 6_6
7 1 g 7 7_7
8 1 h 8 8_8
9 1 i 9 9_9
10 1 j 10 10_10
11 2 k 11 1_11
12 2 l 12 2_12
13 2 m 13 3_13
14 2 n 14 4_14
15 2 o 15 5_15
16 2 p 16 6_16
17 2 q 17 7_17
18 2 r 18 8_18
19 2 s 19 9_19
20 2 t 20 10_20

Rearranging data frame columns in R (mutate, dplyr)

I have a data frame like so
Type Number Species
A 1 G
A 2 R
A 7 Q
A 4 L
B 4 S
B 5 T
B 3 H
B 9 P
C 12 K
C 11 T
C 6 U
C 5 Q
Where I have used group_by(Type)
My goal is to collapse this data by having NUMBER be the top 2 values in the number column, and then making a new column(Number_2) that is the second 2 values.
Also I would want the Species values for the bottom two numbers to be deleted, so that the species corresponds to the higher number in the row
I would like to use dplyr and the final would look like this
Type Number Number_2 Species
A 7 1 Q
A 4 2 L
B 5 3 T
B 9 4 P
C 12 6 K
C 11 5 T
as of now the order that number_2 is in doesn't matter, as long as it is in the same type....
I don't know if this is possible but if it is does anyone know how...
thanks!

You can try
library(data.table)
setDT(df1)[order(-Number), list(Number1=Number[1:2],
Number2=Number[3:4],
Species=Species[1:2]), keyby = Type]
# Type Number1 Number2 Species
#1: A 7 2 Q
#2: A 4 1 L
#3: B 9 4 P
#4: B 5 3 T
#5: C 12 6 K
#6: C 11 5 T
Or using dplyr with do
library(dplyr)
df1 %>%
group_by(Type) %>%
arrange(desc(Number)) %>%
do(data.frame(Type=.$Type[1L],
Number1=.$Number[1:2],
Number2 = .$Number[3:4],
Species=.$Species[1:2], stringsAsFactors=FALSE))
# Type Number1 Number2 Species
#1 A 7 2 Q
#2 A 4 1 L
#3 B 9 4 P
#4 B 5 3 T
#5 C 12 6 K
#6 C 11 5 T

Here's a different dplyr approach.
library(dplyr)
# Start creating the data set with top 2 values and store as df1:
df1 <- df %>%
group_by(Type) %>%
top_n(2, Number) %>%
ungroup() %>%
arrange(Type, Number)
# Then, get the anti-joined data (the not top 2 values), arrange, rename and select
# the number colummn and cbind to df1:
out <- df %>%
anti_join(df1, c("Type","Number")) %>%
arrange(Type, Number) %>%
select(Number2 = Number) %>%
cbind(df1, .)
This results in:
> out
# Type Number Species Number2
#1 A 4 L 1
#2 A 7 Q 2
#3 B 5 T 3
#4 B 9 P 4
#5 C 11 T 5
#6 C 12 K 6

This could be another option using ddply
library(plyr)
ddply(dat[order(Number)], .(Type), summarize,
Number1 = Number[4:3], Number2 = Number[2:1], Species = Species[4:3])
# Type Number1 Number2 Species
#1 A 7 2 Q
#2 A 4 1 L
#3 B 9 4 P
#4 B 5 3 T
#5 C 12 6 K
#6 C 11 5 T

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Converting a row of data into a data frame in R - r

Related

R: expand grid of all possible combinations within groups and apply functions across all the pairs

Extract rows where value appears in any of multiple columns

Mutate new column with unique values for each list

Manipulating columns of a list of dataframes in R

Rearranging data frame columns in R (mutate, dplyr)

Categories

Resources