I have a list of data frames, I want to add a column to each data frame and this column would be the concatenation of the row number and another variable.
I have managed to do that using a for loop but it is taking a lot of time when dealing with a large dataset, is there a way to avoid a for loop?
my_data_vcf <-lapply(my_vcf_files,read.table, stringsAsFactors = FALSE)
for i in 1:length(my_data_vcf){
for(j in 1:length(my_data_vcf[[i]]){
my_data_vcf[[i]] <- cbind(my_data_vcf[[i]], "Id" = paste(c(variable,j), collapse = "_"))}}
You can use lapply; since you don't provide a minimal sample dataset, I'm generating some sample data.
# Sample list of data.frame's
lst <- list(
data.frame(one = letters[1:10], two = 1:10),
data.frame(one = letters[11:20], two = 11:20))
# Concatenate row number with entries in second column
lapply(lst, function(x) { x$three <- paste(1:nrow(x), x$two, sep = "_"); x })
#[1]]
# one two three
#1 a 1 1_1
#2 b 2 2_2
#3 c 3 3_3
#4 d 4 4_4
#5 e 5 5_5
#6 f 6 6_6
#7 g 7 7_7
#8 h 8 8_8
#9 i 9 9_9
#10 j 10 10_10
#
#[[2]]
# one two three
#1 k 11 1_11
#2 l 12 2_12
#3 m 13 3_13
#4 n 14 4_14
#5 o 15 5_15
#6 p 16 6_16
#7 q 17 7_17
#8 r 18 8_18
#9 s 19 9_19
#10 t 20 10_20
One way we can do this is to create a nested data frame using enframe from the tibble package. Once we've done that, we can unnest the data and use mutate to concatenate the row number and a column:
library(tidyverse)
# using Maurits Evers' data, treating stringsAsFactors
lst <- list(
data.frame(one = letters[1:10], two = 1:10, stringsAsFactors = F),
data.frame(one = letters[11:20], two = 11:20, stringsAsFactors = F)
)
lst %>%
enframe() %>%
unnest(value) %>%
group_by(name) %>%
mutate(three = paste(row_number(), two, sep = "_")) %>%
nest()
Returns:
# A tibble: 2 x 2
name data
<int> <list>
1 1 <tibble [10 × 3]>
2 2 <tibble [10 × 3]>
If we unnest the data, we can see that var three is the concatenation of var two and the row number:
lst %>%
enframe() %>%
unnest(value) %>%
group_by(name) %>%
mutate(three = paste(row_number(), two, sep = "_")) %>%
nest() %>%
unnest(data)
Returns:
# A tibble: 20 x 4
name one two three
<int> <chr> <int> <chr>
1 1 a 1 1_1
2 1 b 2 2_2
3 1 c 3 3_3
4 1 d 4 4_4
5 1 e 5 5_5
6 1 f 6 6_6
7 1 g 7 7_7
8 1 h 8 8_8
9 1 i 9 9_9
10 1 j 10 10_10
11 2 k 11 1_11
12 2 l 12 2_12
13 2 m 13 3_13
14 2 n 14 4_14
15 2 o 15 5_15
16 2 p 16 6_16
17 2 q 17 7_17
18 2 r 18 8_18
19 2 s 19 9_19
20 2 t 20 10_20
Related
data <- tibble(time = c(1,1,2,2), a = c(1,2,3,4), b =c(4,3,2,1), c = c(1,1,1,1))
The result will look like this
result <- tibble(
t = c(1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2),
firm1 = c("a","a","a","b","b","b","c","c","c","a","a","a","b","b","b","c","c","c"),
firm2 = c("a","b","c","a","b","c","a","b","c","a","b","c","a","b","c","a","b","c"),
value = c(6,10,5,10,14,9,5,9,4,14,10,9,10,6,5,9,5,4))
result
The function could be
function(x, y){sum(x, y)}
Basically I am looking for a tidy solution to expand.grid data at each point of time and apply functions across columns. Can anyone help?
I tried this, but I could not have time in front of the pairs.
expected_result<-expand.grid(names(data[-1]), names(data[-1])) %>%
mutate(value = map2(Var1, Var2, ~ fun1(data[.x], data[.y])))
expected_result
Use exand.grid you get all possible combination of columns, split the data by time and apply fun for each row of tmp.
library(dplyr)
library(purrr)
tmp <- expand.grid(firm1 = names(data[-1]), firm2 = names(data[-1]))
fun <- function(x, y) sum(x, y)
result <- data %>%
group_split(time) %>%
map_df(~cbind(time = .x$time[1], tmp,
value = apply(tmp, 1, function(x) fun(.x[[x[1]]], .x[[x[2]]]))))
result
# time firm1 firm2 value
#1 1 a a 6
#2 1 b a 10
#3 1 c a 5
#4 1 a b 10
#5 1 b b 14
#6 1 c b 9
#7 1 a c 5
#8 1 b c 9
#9 1 c c 4
#10 2 a a 14
#11 2 b a 10
#12 2 c a 9
#13 2 a b 10
#14 2 b b 6
#15 2 c b 5
#16 2 a c 9
#17 2 b c 5
#18 2 c c 4
You may also do this in base R -
result <- do.call(rbind, by(data, data$time, function(x) {
cbind(time = x$time[1], tmp,
value = apply(tmp, 1, function(y) fun(x[[y[1]]], x[[y[2]]])))
}))
We may use
library(dplyr)
library(tidyr)
library(purrr)
data1 <- data %>%
group_by(time) %>%
summarise(across(everything(), sum, na.rm = TRUE), .groups = 'drop') %>%
pivot_longer(cols = -time) %>%
group_split(time)
map_dfr(data1, ~ {dat <- .x
crossing(firm1 = dat$name, firm2 = dat$name) %>%
mutate(value = c(outer(dat$value, dat$value, FUN = `+`))) %>%
mutate(time = first(dat$time), .before = 1)})
-output
# A tibble: 18 × 4
time firm1 firm2 value
<dbl> <chr> <chr> <dbl>
1 1 a a 6
2 1 a b 10
3 1 a c 5
4 1 b a 10
5 1 b b 14
6 1 b c 9
7 1 c a 5
8 1 c b 9
9 1 c c 4
10 2 a a 14
11 2 a b 10
12 2 a c 9
13 2 b a 10
14 2 b b 6
15 2 b c 5
16 2 c a 9
17 2 c b 5
18 2 c c 4
I have a list here, and I wish to mutate a new column with unique values for each list relative to the mutation. For example, I want to mutate a column named ID as n >= 1.
Naturally, on a dataframe I would do this:
dat %>% mutate(id = row_number())
For a list, I would do this:
dat%>% map(~ mutate(., ID = row_number()))
And I would get an output likeso:
dat <- list(data.frame(x=c("a", "b" ,"c", "d", "e" ,"f" ,"g") ), data.frame(y=c("p", "lk", "n", "m", "g", "f", "t")))
[[1]]
x id
1 a 1
2 b 2
3 c 3
4 d 4
5 e 5
6 f 6
7 g 7
[[2]]
y id
1 p 1
2 lk 2
3 n 3
4 m 4
5 g 5
6 f 6
7 t 7
Though, how would I mutate a new column ID such that the row number continues from the first list.
Expected output:
[[1]]
x id
1 a 1
2 b 2
3 c 3
4 d 4
5 e 5
6 f 6
7 g 7
[[2]]
y id
1 p 8
2 lk 9
3 n 10
4 m 11
5 g 12
6 f 13
7 t 14
An option is to bind them into a single dataset, create the 'id' with row_number(), split by 'grp', loop over the list and remove any columns that have all NA values
library(dplyr)
library(purrr)
dat %>%
bind_rows(.id = 'grp') %>%
mutate(id = row_number()) %>%
group_split(grp) %>%
map(~ .x %>%
select(where(~ any(!is.na(.))), -grp))
-output
#[[1]]
# A tibble: 7 x 2
# x id
# <chr> <int>
#1 a 1
#2 b 2
#3 c 3
#4 d 4
#5 e 5
#6 f 6
#7 g 7
#[[2]]
# A tibble: 7 x 2
# y id
# <chr> <int>
#1 p 8
#2 lk 9
#3 n 10
#4 m 11
#5 g 12
#6 f 13
#7 t 14
Or an easier approach is to unlist (assuming single column), get the sequence, add a new column with map2
map2(dat, relist(seq_along(unlist(dat)), skeleton = dat),
~ .x %>% mutate(id = .y))
Or using a for loop
dat[[1]]$id <- seq_len(nrow(dat[[1]]))
for(i in seq_along(dat)[-1]) dat[[i]]$id <-
seq(tail(dat[[i-1]]$id, 1) + 1, length.out = nrow(dat[[i]]), by = 1)
I have a single row data frame like this:
X1 X2 X3
1 [['1','2','3'], ['4','6','5'], ['7','8']] ['9','10','11','12','13']
I would like create a new dataframe from that using columns X2 and X3 that looks like this:
ID Group
1 A
2 A
3 A
4 B
5 B
6 B
7 C
8 C
9 D
10 D
11 D
12 D
13 D
So each number in the dataframe is grouped by the square brackets in the orignal dataframe.
Can anyone recommend a good way of doing this in R.
One option would be to split the 'X2' at the , followed by the ], concatenate with 'X3', extract the numeric elements with str_extract_all into a list, stack it to a two column data.frame
library(stringr)
v1 <- c(strsplit(df1$X2, "\\],\\s*")[[1]], df1$X3)
out <- stack(setNames(str_extract_all(v1, "\\d+"), LETTERS[1:4]))
names(out) <- c("ID", "Group")
out
# ID Group
#1 1 A
#2 2 A
#3 3 A
#4 4 B
#5 6 B
#6 5 B
#7 7 C
#8 8 C
#9 9 D
#10 10 D
#11 11 D
#12 12 D
#13 13 D
Or using tidyverse
library(dplyr)
library(tidyr)
df1 %>%
pivot_longer(cols = -X1) %>%
separate_rows(value, sep="(?<=\\]),\\s*") %>%
transmute(Group = LETTERS[row_number()], ID = value) %>%
mutate(ID = str_extract_all(ID, "\\d+")) %>%
unnest(c(ID))
# A tibble: 13 x 2
# Group ID
# <chr> <chr>
# 1 A 1
# 2 A 2
# 3 A 3
# 4 B 4
# 5 B 6
# 6 B 5
# 7 C 7
# 8 C 8
# 9 D 9
#10 D 10
#11 D 11
#12 D 12
#13 D 13
data
df1 <- structure(list(X1 = 1L, X2 = "[['1','2','3'], ['4','6','5'], ['7','8']]",
X3 = "['9','10','11','12','13']"), class = "data.frame", row.names = c(NA,
-1L))
I am trying to find the best way to iterate through each column of a data frame, group by that column, and produce a summary.
Here is my attempt:
library(tidyverse)
data = data.frame(
a = sample(LETTERS[1:3], 100, replace=TRUE),
b = sample(LETTERS[1:8], 100, replace=TRUE),
c = sample(LETTERS[3:15], 100, replace=TRUE),
d = sample(LETTERS[16:26], 100, replace=TRUE),
value = rnorm(100)
)
myfunction <- function(x) {
groupVars <- select_if(x, is.factor) %>% colnames()
results <- list()
for(i in 1:length(groupVars)) {
results[[i]] <- x %>%
group_by_at(.vars = vars(groupVars[i])) %>%
summarise(
n = n()
)
}
return(results)
}
test <- myfunction(data)
The function returns:
[[1]]
# A tibble: 3 x 2
a n
<fct> <int>
1 A 37
2 B 34
3 C 29
...
...
...
My question is, is this the best way to do this? Is there a way to avoid using a for loop? Can I use purrr and map somehow to do this?
Thank you
An option is to use map
library(tidyverse)
map(data[1:4], ~data.frame(x = {{.x}}) %>% count(x))
#$a
## A tibble: 3 x 2
# x n
# <fct> <int>
#1 A 39
#2 B 32
#3 C 29
#
#$b
## A tibble: 8 x 2
# x n
# <fct> <int>
#1 A 14
#2 B 11
#3 C 16
#4 D 10
#5 E 12
#6 F 10
#7 G 13
#8 H 14
#...
The output is a list. Note that I have ignored the last column of data, as it doesn't seem to be relevant here.
If you want columns in the list data.frames to be named according to the columns from your original data, we can use imap
imap(data[1:4], ~tibble(!!.y := {{.x}}) %>% count(!!sym(.y)))
#$a
## A tibble: 3 x 2
# a n
# <fct> <int>
#1 A 23
#2 B 35
#3 C 42
#
#$b
## A tibble: 8 x 2
# b n
# <fct> <int>
#1 A 15
#2 B 10
#3 C 13
#4 D 5
#5 E 19
#6 F 9
#7 G 13
#8 H 16
#...
Or making use of tibble::enframe (thanks #camille)
imap(data[1:4], ~enframe(.x, value = .y) %>% count(!!sym(.y)))
You could reshape the data and group by both the column and the letter. This gives you one dataframe instead of a list of them, but you could get the list if you really want it with split.
set.seed(123)
library(tidyverse)
data = data.frame(
a = sample(LETTERS[1:3], 100, replace=TRUE),
b = sample(LETTERS[1:8], 100, replace=TRUE),
c = sample(LETTERS[3:15], 100, replace=TRUE),
d = sample(LETTERS[16:26], 100, replace=TRUE),
value = rnorm(100)
)
data %>%
pivot_longer(cols = -value, names_to = "column", values_to = "letter") %>%
group_by(column, letter) %>%
summarise(n = n())
#> # A tibble: 35 x 3
#> # Groups: column [4]
#> column letter n
#> <chr> <fct> <int>
#> 1 a A 33
#> 2 a B 32
#> 3 a C 35
#> 4 b A 8
#> 5 b B 11
#> 6 b C 12
#> 7 b D 14
#> 8 b E 8
#> 9 b F 17
#> 10 b G 16
#> # … with 25 more rows
Created on 2019-10-30 by the reprex package (v0.3.0)
You can simply call:
apply(data, 2,table)
You can drop the last list element if you want.
Here is sample data:
df <- data.frame(t(data.frame(seq(1,10,1)))); rownames(df) <- NULL;
colnames(df) <- letters[1:ncol(df)]
df
I would like to arrange the new data.frame so that it always has 6 columns, the next row (after splinting since ncol>6) would contain the next 6 column names and next row their values. The last row if ncol<6 the values are filled with empty string including the column names.
Here is desired output:
a b c d e f
1 1 2 3 4 5 6
2 g h i j
3 7 8 9 10
Another example:
df <- data.frame(t(data.frame(seq(1,15,1)))); rownames(df) <- NULL;
colnames(df) <- letters[1:ncol(df)]
df
a b c d e f
1 1 2 3 4 5 6
2 g h i j k l
3 7 8 9 10 11 12
4 m n o
5 13 14 15
EDIT:
The way to approach it possibly is to:
n <- 6
ncl <- nrow(df)
s <- split(df, rep(1:ceiling(ncl/n), each=n, length.out=ncl))
s
s1 <- split(rownames(df), rep(1:ceiling(ncl/n), each=n, length.out=ncl))
s1
combine every second split of s and s1
s1[c(TRUE,FALSE)]
Here's a way, not so pretty, but this is an ugly question :D
library(tibble)
library(dplyr)
df1 <- matrix(c(names(df),rep('',6 - ncol(df)%%6)) %>% unlist, ncol=6,byrow=T) %>% as_tibble %>% rowid_to_column()
df2 <- matrix(c(df ,rep('',6 - ncol(df)%%6)) %>% unlist, ncol=6,byrow=T) %>% as_tibble %>% rowid_to_column()
bind_rows(df1,df2) %>% arrange(rowid) %>% select(-1) %>% setNames(.[1,]) %>% slice(-1)
# # A tibble: 3 x 6
# a b c d e f
# <chr> <chr> <chr> <chr> <chr> <chr>
# 1 1 2 3 4 5 6
# 2 g h i j
# 3 7 8 9 10
For the life of me I can't figure out a use-case for this... but for sake of the provided examples...
seq(1, ncol(df), by = 6) %>% {
starts <- .
ends <- c(lead(.,1,NULL)-1, ncol(df))
base_df <- df[,starts[[1]]:ends[[1]]]
rbind(base_df, rbind.pages(Map(function(s, e){
d <- df[,seq(s, e)]
data.frame(rbind(colnames(d), d)) %>% setNames(colnames(base_df)[1:length(.)])
}, s = starts[-1], e = ends[-1]))
) %>%
mutate_all(function(x){
ifelse(!is.na(x), x, "")
})
}
a b c d e f
1 1 2 3 4 5 6
2 g h i j k l
3 7 8 9 10 11 12
4 m n o
5 13 14 15
EDIT to coerce NA to 'empty string'