List to dataframe conversion in R: Keeping List indexes - r

I am transforming a list of character vectors into a dataframe using R. How can I get the list indices also into the dataframe?
# Example input: a list of character vectors; the first element holds two values.
list1 <- list(c("kip", "kroket"), "ei", "koe")
print(list1)
##[[1]]
##[1] "kip" "kroket"
##[[2]]
##[1] "ei"
##[[3]]
##[1] "koe"
# Flatten the list into one column; the list positions are lost at this point.
df <- data.frame(col1 = unlist(list1))
print(df)
## col1
##1 kip
##2 kroket
##3 ei
##4 koe
The preferred output would look like this:
## col1 col2
##1 kip 1
##2 kroket 1
##3 ei 2
##4 koe 3

An idea via base R,
# Repeat each element's position once per value it contains.
# seq_along() is used instead of seq(length(list1)) so that an empty list
# yields an empty index rather than c(1, 0), which would make rep() fail.
data.frame(v1 = unlist(list1), v2 = rep(seq_along(list1), lengths(list1)))
# v1 v2
#1 kip 1
#2 kroket 1
#3 ei 2
#4 koe 3

tidyverse method
# Turn every list element into a one-column tibble named `value`, then stack
# them; bind_rows(.id = ) records which list element each row came from.
list1 %>%
  map(~ tibble(value = .x)) %>%
  bind_rows(.id = "index")
# A tibble: 4 x 2
index value
<chr> <chr>
1 1 kip
2 1 kroket
3 2 ei
4 3 koe

We can name the list elements by their positions (using seq_along) and then use stack
# Name each list element by its position (this modifies list1 in place).
names(list1) <- seq_along(list1)
# stack() flattens the named list; the output below shows columns `values` and `ind`.
stack(list1)
# values ind
#1 kip 1
#2 kroket 1
#3 ei 2
#4 koe 3
Or another option could be using enframe and unnest
# enframe() stores the list in a list-column called `value`; name that column
# explicitly in unnest(), since calling unnest() without `cols` is deprecated.
list1 %>%
  tibble::enframe() %>%
  tidyr::unnest(cols = value)

Related

Rename variables (columns) in datasets that are in a list

I have a list of datasets with different variables. I need to rename them according to the naming convention in the name dataframe below.
df1 <- data.frame(x1= c(1,2,3), x2=c(1,2,3))
df2 <- data.frame(x1= c(1,2,3), x3=c(1,2,3))
df3 <- data.frame(x4= c(1,2,3), x5=c(1,2,3))
mylist <- list(df1,df2,df3)
name <- data.frame(old= c("x1","x2","x3","x4","x5"), new=c("A","B","A","A","C"))
I can do this one by one, but I am wondering how to be more efficient and rename them all at once
newdf <- map_if(mylist, ~ "x1" %in% colnames(.x),
.f = list(. %>% rename("A"="x1")))
I was hoping something like this would work, but it doesn't:
for (i in nrow(name)) {
newdf <- map_if(mylist, ~ name[i,1] %in% colnames(.x),
.f = list(. %>% rename(name[2] = name[1])))
}
You can use setnames from data.table, which can take a list of old and new names.
library(data.table)
library(purrr)
map(mylist, ~ setnames(.x, name$old, name$new, skip_absent=TRUE))
Output
[[1]]
A B
1 1 1
2 2 2
3 3 3
[[2]]
A A
1 1 1
2 2 2
3 3 3
[[3]]
A C
1 1 1
2 2 2
3 3 3
Column names must be unique, so there is a typo (?) in your example (as "x1" and "x3" would both be re-labelled as "A").
If we fix the typo, here is an option using map and rename_with.
name <- data.frame(old= c("x1","x2","x3","x4","x5"), new=c("A","B","C","D","E"))
library(tidyverse)
mylist %>%
map(function(df) df %>% rename_with(~ name$new[match(.x, name$old)]))
#[[1]]
# A B
#1 1 1
#2 2 2
#3 3 3
#
#[[2]]
# A C
#1 1 1
#2 2 2
#3 3 3
#
#[[3]]
# D E
#1 1 1
#2 2 2
#3 3 3
You could use set_names + recode:
library(tidyverse)
map(mylist, set_names, ~ recode(.x, !!!deframe(name)))
[[1]]
A B
1 1 1
2 2 2
3 3 3
[[2]]
A A
1 1 1
2 2 2
3 3 3
[[3]]
A C
1 1 1
2 2 2
3 3 3

how to split a string column in R based on equal length and get them in different rows

library(tidyr)
library(dplyr)
mydf
V1 V2
2 1 abcdef
3 2 abcd
4 3 bghj
5 4 kl
6 5 uilm
I want to split the V2 column into chunks of 2 characters, with each chunk placed on its own row:
V1 V2
1 1 ab
2 1 cd
3 1 ef
4 2 ab
5 2 cd
6 3 bg
7 3 hj
8 4 kl
9 5 ui
10 5 lm
Here is a base R option splitting the string every 2 characters -
# Example data; stringsAsFactors = FALSE keeps V2 as character on R < 4.0,
# which strsplit() requires.
mydf <- data.frame(
  V1 = 1:5,
  V2 = c("abcdef", "abcd", "bghj", "kl", "ulim"),
  stringsAsFactors = FALSE
)
# Split after every two characters (look-behind on two arbitrary characters).
tmp <- strsplit(mydf$V2, "(?<=..)", perl = TRUE)
# Repeat each row once per chunk; seq_len() also copes with a zero-row input,
# where 1:nrow(mydf) would produce c(1, 0).
result <- mydf[rep(seq_len(nrow(mydf)), lengths(tmp)), ]
result$V2 <- unlist(tmp)
rownames(result) <- NULL
result
# V1 V2
#1 1 ab
#2 1 cd
#3 1 ef
#4 2 ab
#5 2 cd
#6 3 bg
#7 3 hj
#8 4 kl
#9 5 ul
#10 5 im
Another tidyverse approach. It inserts a separator character such as # (one that does not occur anywhere else in the data) between the chunks and then splits on it with tidyr::separate_rows
library(tidyverse)
# `mydf` is the data frame defined for this question (a bare `df` would refer
# to the stats::df function). Join the two-character chunks with "#", then let
# separate_rows() split on that separator into one row per chunk.
mydf %>%
  mutate(
    V2 = map_chr(
      strsplit(V2, "(?<=..)", perl = TRUE),
      ~ paste(.x, collapse = "#")
    )
  ) %>%
  separate_rows(V2)
#> # A tibble: 10 x 2
#> V1 V2
#> <int> <chr>
#> 1 1 ab
#> 2 1 cd
#> 3 1 ef
#> 4 2 ab
#> 5 2 cd
#> 6 3 bg
#> 7 3 hj
#> 8 4 kl
#> 9 5 ul
#> 10 5 im
Created on 2021-06-04 by the reprex package (v2.0.0)
You can also use the following solution:
library(dplyr)
library(tidyr)
library(stringr)
# `mydf` is the data frame defined for this question (a bare `df` would refer
# to the stats::df function).
# Chunk starts are 1, 3, 5, ...; each chunk ends one character after its start.
# Deriving the ends from the starts keeps both vectors the same length, so a
# trailing single character of an odd-length string is kept instead of dropped.
mydf %>%
  rowwise() %>%
  mutate(
    V2 = list(str_sub(V2, seq(1, nchar(V2), 2), seq(1, nchar(V2), 2) + 1))
  ) %>%
  ungroup() %>%
  unnest_longer(col = V2)
# A tibble: 10 x 2
V1 V2
<int> <chr>
1 1 ab
2 1 cd
3 1 ef
4 2 ab
5 2 cd
6 3 bg
7 3 hj
8 4 kl
9 5 ui
10 5 lm
You can define a function that splits a string into consecutive two-character substrings and apply it row-wise on V2 to create a nested column of character vectors. Then, unnest the column.
library(tidyverse)
mydf <- read.table(
text = "
V1 V2
1 abcdef
2 abcd
3 bghj
4 kl
5 uilm",
header = TRUE
)
# Split a single string into consecutive two-character chunks.
#
# str: a length-one character string.
# Returns a character vector of chunks in order; when the string has an odd
# number of characters the last chunk holds the single remaining character.
# An empty string yields character(0).
get_string <- function(str) {
  len <- nchar(str)
  # seq(1, 0, by = 2) is an error, so the empty string needs its own branch.
  if (len == 0L) {
    return(character(0))
  }
  starts <- seq(1L, len, by = 2L)
  # substring() is vectorised over start/stop and clips a stop past the end.
  substring(str, starts, starts + 1L)
}
mydf %>%
rowwise() %>%
mutate(V2 = list(get_string(V2))) %>%
ungroup() %>%
unnest(V2)
# # A tibble: 10 x 2
# V1 V2
# <int> <chr>
# 1 1 ab
# 2 1 cd
# 3 1 ef
# 4 2 ab
# 5 2 cd
# 6 3 bg
# 7 3 hj
# 8 4 kl
# 9 5 ui
# 10 5 lm

How to reduce factor levels depending on other attribute?

I have a dataframe of two columns id and result, and I want to assign factor levels to result depending on id. So that for id "1", result c("a","b","c","d") will have factor levels 1,2,3,4.
For id "2", result c("22","23","24") will have factor levels 1,2,3.
id <- c(1,1,1,1,2,2,2)
result <- c("a","b","c","d","22","23","24")
I tried to group them by split, but they will be converted to a list instead of a data frame, which causes a length problem for modeling. Can you help please?
Though the question was closed as a duplicate by user @Ronak Shah, I don't believe it is the same question.
After numbering the row by group the new column must be coerced to class "factor".
library(dplyr)
id <- c(1,1,1,1,2,2,2)
result <- c("a","b","c","d","22","23","24")
df <- data.frame(id, result)
df %>%
group_by(id) %>%
mutate(fac = row_number()) %>%
ungroup() %>%
mutate(fac = factor(fac))
# A tibble: 7 x 3
# id result fac
# <dbl> <fct> <fct>
#1 1 a 1
#2 1 b 2
#3 1 c 3
#4 1 d 4
#5 2 22 1
#6 2 23 2
#7 2 24 3
Edit.
If there are repeated values in result, coerce as.integer/factor to get numbers, then coerce those numbers to factor.
id2 <- c(1,1,1,1,2,2,2,2)
result2 <- c("a","b","c","d","22", "22","23","24")
df2 <- data.frame(id = id2, result = result2)
df2 %>%
group_by(id) %>%
mutate(fac = as.integer(factor(result))) %>%
ungroup() %>%
mutate(fac = factor(fac))
# A tibble: 8 x 3
# id result fac
# <dbl> <fct> <fct>
#1 1 a 1
#2 1 b 2
#3 1 c 3
#4 1 d 4
#5 2 22 1
#6 2 22 1
#7 2 23 2
#8 2 24 3
After grouping by id, we can use match with unique to assign a unique number to each result. Using @Rui Barradas' dataframe df2
library(dplyr)
df2 %>%
group_by(id) %>%
mutate(ans = match(result, unique(result))) %>%
ungroup %>%
mutate(ans = factor(ans))
# id result ans
# <dbl> <fct> <fct>
#1 1 a 1
#2 1 b 2
#3 1 c 3
#4 1 d 4
#5 2 22 1
#6 2 22 1
#7 2 23 2
#8 2 24 3

Count number of rows in another dataframe

I have two dataframes df1 and df2:
df1 <- data.frame(id=1:5, var1=c("c3e", "d3r", "ff2", "dfl", "df4"))
df2 <- data.frame(id=1:10, var1=c("d3r", "d3r", "c3e", "dfl", "dfl", "dfl", "c3e", "df4", "c3e", "c3e"))
How can I best create a new column in df1 giving the number of appearances of each value of df1$var1 in df2? E.g. 'c3e' appears four times, 'd3r' twice etc.
We can loop through each value of df1$var1 and count the number of times that value occurs in df2's var1.
# For each value of df1$var1, count the entries of df2$var1 equal to it.
# vapply() guarantees exactly one integer per value, whereas sapply()'s
# return type depends on its input (e.g. a list for zero-length input).
df1$count <- vapply(
  df1$var1,
  function(x) sum(df2$var1 %in% x),
  integer(1)
)
df1
# id var1 count
#1 1 c3e 4
#2 2 d3r 2
#3 3 ff2 0
#4 4 dfl 3
#5 5 df4 1
# Look the counts up by name. as.character() makes sure a factor column is not
# used as integer positions into the table, and as.vector() drops the table
# class so `count` is a plain integer column.
df1$count <- as.vector(table(df2$var1)[as.character(df1$var1)])
df1$count[is.na(df1$count)] <- 0L # values absent from df2 come back NA; change NA to 0
df1
# id var1 count
# 1 1 c3e 4
# 2 2 d3r 2
# 3 3 ff2 0
# 4 4 dfl 3
# 5 5 df4 1
# @Jaap also had a nice suggestion:
df1$count <- table(factor(df2$var1, levels = df1$var1))
Here is one option with data.table
library(data.table)
setDT(df1)[setDT(df2)[df1, .N, on = .(var1), by = .EACHI],
count := N , on = .(var1)]
df1
# id var1 count
#1: 1 c3e 4
#2: 2 d3r 2
#3: 3 ff2 0
#4: 4 dfl 3
#5: 5 df4 1

lapply aggregate columns in multiple dataframes R

I have several dataframes in a list in R. There are entries in each of those DFs that I would like to summarise. I'm trying to get into lapply, so that would be my preferred way (though if there's a better solution I would be happy to know it and why).
My Sample data:
df1 <- data.frame(Count = c(1,2,3), ID = c("A","A","C"))
df2 <- data.frame(Count = c(1,1,2), ID = c("C","B","C"))
dfList <- list(df1,df2)
> head(dfList)
[[1]]
Count ID
1 1 A
2 2 A
3 3 C
[[2]]
Count ID
1 1 C
2 1 B
3 2 C
I tried to implement this in lapply with
dfList_agg<-lapply(dfList, function(i) {
aggregate(i[[1:length(i)]][1L], by=list(names(i[[1:length(i)]][2L])), FUN=sum)
})
However, this gives me an error: "arguments must have same length". What am I doing wrong?
My desired output would be the sum of Column "Count" by "ID" which looks like this:
>head(dfList_agg)
[[1]]
Count ID
1 3 A
2 3 C
[[2]]
Count ID
1 3 C
2 1 B
I think you've overcomplicated it. Try this...
# For every data frame in the list, sum its first column within each group of
# its second column.
dfList_agg <- lapply(dfList, function(i) {
  aggregate(i[, 1], by = list(i[, 2]), FUN = sum)
})
# R names are case-sensitive: the object created above is dfList_agg.
dfList_agg
[[1]]
Group.1 x
1 A 3
2 C 3
[[2]]
Group.1 x
1 B 1
2 C 3
Here is a third option
lapply(dfList, function(x) aggregate(. ~ ID, data = x, FUN = "sum"))
#[[1]]
# ID Count
#1 A 3
#2 C 3
#
#[[2]]
#ID Count
#1 B 1
#2 C 3
I guess this is what you need
library(dplyr)
# ddply(), .() and this summarize come from plyr, not dplyr. They are called
# through plyr:: so plyr is not attached after dplyr, which would mask dplyr's
# verbs of the same name.
lapply(
  dfList,
  function(x) plyr::ddply(x, plyr::.(ID), plyr::summarize, Count = sum(Count))
)
An option with tidyverse would be
library(tidyverse)
map(dfList, ~ .x %>%
group_by(ID) %>%
summarise(Count = sum(Count)) %>%
select(names(.x)))
#[[1]]
# A tibble: 2 x 2
# Count ID
# <dbl> <fctr>
#1 3.00 A
#2 3.00 C
#[[2]]
# A tibble: 2 x 2
# Count ID
# <dbl> <fctr>
#1 1.00 B
#2 3.00 C

Resources