Extract values from list of lists with R - r

I have list of lists similar to this sample:
z <- list(list(num1=list((list(tab1=list(list(a=1, b=2, c=5), list(a=3, b=4), list(d=4,e=7)))))),list(num2=list((list(tab2=list(list(a=1, b=2), list(a=3, b=4)))))))
I would like to extract the figures out of the last list of lists names:
Desired output list (since 1 list entries are shorter) or as dataframe with columns corresponding to main list:
[1] a b c a b d e
[2] a b a b
dataframe:
column1 column2
a a
b b
c a
a b
b ""
d ""
e ""
I have tried various combinations of sapply(z, "[[", c("a","b"...) but failed, since the sublist names vary.
EDIT: Sorry, I needed the actual values not the last node (letters)! Additionally, each numeric value has column name, not set in the example above; it is like this:
[[1]]$num1[[1]]$tab1[[1]]$a
Name
1
So the desired solution are values:
[1]
1 2 5 3 4 4 7
[2]
1 2 3 4
I would actually need the numeric values instead of the letters. If you could adjust your solution to this I would be grateful. Thanks.

Try
lapply(z, function(x) as.numeric(unlist(x)))
## [[1]]
## [1] 1 2 5 3 4 4 7
##
## [[2]]
## [1] 1 2 3 4

# Flatten each top-level entry and keep only the leaf part of every path name
# (the text after the last dot, e.g. "num1.tab1.a" -> "a").
z1 <- lapply(z, function(entry) gsub(".*\\.", "", names(unlist(entry))))
# Pad the shorter vectors with NA so that all columns have the same length.
n <- max(lengths(z1))
pad_to_n <- function(v) {
  length(v) <- n
  v
}
z1 <- lapply(z1, pad_to_n)
setNames(as.data.frame(z1), paste0("Column", seq_along(z1)))
# Column1 Column2
#1 a a
#2 b b
#3 c a
#4 a b
#5 b <NA>
#6 d <NA>
#7 e <NA>

A bit far-fetched and anything but elegant, but here is a way to get what you want:
lista<-unlist(lapply(strsplit(names(unlist(z)),"\\."),function(vec) vec[3]))
names(lista)<-unlist(lapply(strsplit(names(unlist(z)),"\\."),function(vec) vec[1]))
uninames<-unique(names(lista))
res<-sapply(uninames,function(x,vec){vec[names(vec)==x]},lista)
> res
$num1
num1 num1 num1 num1 num1 num1 num1
"a" "b" "c" "a" "b" "d" "e"
$num2
num2 num2 num2 num2
"a" "b" "a" "b"
UPDATE
To get the numbers :
# Flatten the nested list; every value is named by its path, e.g. "num1.tab1.a".
a <- unlist(z)
# Group label = first path component ("num1", "num2"). `z` itself is an unnamed
# list, so names(unique(z)) is NULL; derive the labels from names(a) instead.
b <- sub("\\..*", "", names(a))
# One element per group, holding the values whose label matches that group.
# simplify = FALSE keeps the result a list even when all groups are equally long.
res <- sapply(unique(b), function(name, vec, l_name) vec[l_name == name], a, b,
              simplify = FALSE)
>res
$num1
num1.tab1.a num1.tab1.b num1.tab1.c num1.tab1.a num1.tab1.b num1.tab1.d num1.tab1.e
1 2 5 3 4 4 7
$num2
num2.tab2.a num2.tab2.b num2.tab2.a num2.tab2.b
1 2 3 4

Related

Removing cells from rows with NA

I have the following data ( with variable number of columns)
> df1<-data.frame(F1=c(1,5,"NA",9),F2=c(2,5,"a","NA"),F3=c(1,"NA","o","NA"))
> df1
F1 F2 F3
1 1 2 1
2 5 5 NA
3 NA a o
4 9 NA NA
and I want to remove the NA cells from the rows and shrink the columns only to the cells with information in it.
> df2
F1 F2 F3
1 1 2 1
2 5 5
3 a o
4 9
Thanks!
Firstly, you can use this function to move all non-NA cells to the left:
df1 <- data.frame(F1=c(1,5,NA,9),F2=c(2,5,"a",NA),F3=c(1,NA,"o",NA))
df1 <- as.data.frame(t(apply(df1,1, function(x) { return(c(x[!is.na(x)],x[is.na(x)]) )} )))
colnames(df1) <- c("F1", "F2", "F3")
Output:
> print(df1)
F1 F2 F3
1 1 2 1
2 5 5 <NA>
3 a o <NA>
4 9 <NA> <NA>
Secondly, in order to apply blank cells instead of NA-observations, you could try:
df1 <- sapply(df1, as.character)
df1[is.na(df1)] <- " "
df1 <- as.data.frame(df1)
Output:
> print(df1)
F1 F2 F3
1 1 2 1
2 5 5
3 a o
4 9
Note: I changed your string "NA" to simply NA in order to detect the observations better. I'm not sure if you actually want the NA values to be observed as strings.
We can try the code below
df1[] <- t(apply(
df1,
1,
function(v) {
v[order(v == "NA")]
}
))
which gives
> df1
F1 F2 F3
1 1 2 1
2 5 5 NA
3 a o NA
4 9 NA NA
Based on your preference I chose to complete each row after the omission of "NA" values, with "" as a sort of blank values. But you could choose to fill them with real NA values:
library(dplyr)
library(purrr)
df1 %>%
pmap_dfr(~ {x <- c(...)[c(...) != "NA"]
setNames(c(x, rep("", ncol(df1) - length(x))),
names(df1))})
# A tibble: 4 x 3
F1 F2 F3
<chr> <chr> <chr>
1 1 "2" "1"
2 5 "5" ""
3 a "o" ""
4 9 "" ""
First convert your data frame to character (otherwise mixing numeric and character columns will be an issue), then apply row by row: shift the values to the left and pad with NAs
# Work on a character matrix so numeric and text cells compare uniformly.
# Assumes missing cells are the string "NA", as in the question's df1.
df2 <- sapply(df1, as.character)
t(
  sapply(seq_len(nrow(df2)), function(i) {
    # Keep the cells of row i that are not "NA", preserving their order.
    # Index df2 (the character matrix), not the data frame df1, so that
    # tmp is a plain character vector rather than a one-row data frame.
    tmp <- df2[i, df2[i, ] != "NA"]
    # Right-pad with "NA" strings so every row has ncol(df2) entries again.
    if (length(tmp) < ncol(df2)) {
      tmp <- c(tmp, rep("NA", ncol(df2) - length(tmp)))
    }
    tmp
  })
)
F1 F2 F3
[1,] "1" "2" "1"
[2,] "5" "5" "NA"
[3,] "a" "o" "NA"
[4,] "9" "NA" "NA"

How to split character value properly

I have a data frame which consists of some composite information. I would like to split the vector a into the vectors "a" and "d", where "a" corresponds only to the numeric ID 898, 3467 ,234 ,222 and vector "d" contains the corresponding character values.
Data:
a<-c("898_Me","3467_You or ", "234_Hi-hi", "222_what")
b<-c(1,8,3,8)
c<-c(2,4,6,2)
df<-data.frame(a,b,c)
What I tried so far:
a<-str(df$a)
a<-strsplit(df$a, split)
But that just doesn't work out with my regular expression skills.
The required output table might have the form:
a d b c
898 Me 1 2
3467 You or 8 3
234 Hi-hi 3 6
222 what 8 2
library(tidyr)
a<-c("898_Me","3467_You or ", "234_Hi-hi", "222_what")
b<-c(1,8,3,8)
c<-c(2,4,6,2)
df <-data.frame(a,b,c)
final_df <- separate(df , a , c("a" , "d") , sep = "_")
# a d b c
#1 898 Me 1 2
#2 3467 You or 8 4
#3 234 Hi-hi 3 6
#4 222 what 8 2
final_df$d
# [1] "Me" "You or " "Hi-hi" "what"
strsplit is right, but you need to pass the character to split with:
do.call(rbind, strsplit(as.character(df$a), "_"))
# [,1] [,2]
# [1,] "898" "Me"
# [2,] "3467" "You or "
# [3,] "234" "Hi-hi"
# [4,] "222" "what"
Or
library(stringi)
stri_split_fixed(df$a, "_", simplify = TRUE)
With your example, Here is my solution in base R:
df$a2 <- gsub("[^0-9]", "", a)
df$d <- gsub("[0-9]", "", a)
That gives:
> df
a b c a2 d
1 898_Me 1 2 898 _Me
2 3467_You or 8 4 3467 _You or
3 234_Hi-hi 3 6 234 _Hi-hi
4 222_what 8 2 222 _what
Not elegant but it preserves original data and easy to apply.

R: change data frame structure using values from one variable as new variable

df1 <- data.frame(
name = c("a", "b", "b", "c"),
score = c(1, 1, 2, 1)
)
How can I get a new data frame with variables/columns from df$name and with each 'corresponding' df$score. I figure that its actually a two-step problem:
First I would need to make a list of (in this example) unequal length vectors like this:
$a
[1] 1
$b
[1] 1 2
$c
[1] 1
Second, NAs need to be padded so one get vectors of equal length before making the desired data frame
that would be like:
a b c
1 1 1 1
2 NA 2 NA
I cannot find any simple means to do this - Im sure there must be!
If the solution can be delivered using dplyr it would be fantastic! Thanks!
To split the data:
(s <- split(df1$score, df1$name))
# $a
# [1] 1
#
# $b
# [1] 1 2
#
# $c
# [1] 1
To create the new data frame:
as.data.frame(sapply(s, `length<-`, max(vapply(s, length, 1L))))
# a b c
# 1 1 1 1
# 2 NA 2 NA
Slightly more efficient would be to use vapply in place of sapply
len <- max(vapply(s, length, 1L))
as.data.frame(vapply(s, `length<-`, double(len), len))
# a b c
# 1 1 1 1
# 2 NA 2 NA

Sorting elements of a tie

Given this table
df <- data.frame(col1 = c(letters[3:5], "b","a"),
col2 = c(2:3, 1,1,1))
How can I tell R to return "a".
That means, from the three characters with value of 1 (a tie for the lowest value), I want to select only the first in alphabetical order
I think you want order
with(df, col1[order(col2, col1)][1])
# [1] a
# Levels: a b c d e
or
as.character(with(df, col1[order(col2, col1)][1]))
# [1] "a"
You can order column 1 by the ordered values in column 2 with
df[with(df, order(col2, col1)),]
# col1 col2
# 5 a 1
# 4 b 1
# 3 e 1
# 1 c 2
# 2 d 3
Try:
> min(as.character(df[df$col2==min(df$col2),1]))
[1] "a"
For explanation:
# first find col1 list in rows with minimum of df$col2
> xx = df[df$col2==min(df$col2),1]
> xx
[1] e b a
Levels: a b c d e
# Now find the minimum amongst these after converting factor to character:
> min(as.character(xx))
[1] "a"
>

Renaming duplicate strings in R

I have an R dataframe that has two columns of strings. In one of the columns (say, Column1) there are duplicate values. I need to relabel that column so that it would have the duplicated strings renamed with ordered suffixes, like in the Column1.new
Column1 Column2 Column1.new
1 A 1_1
1 B 1_2
2 C 2_1
2 D 2_2
3 E 3
4 F 4
Any ideas of how to do this would be appreciated.
Cheers,
Antti
Let's say your data (ordered by Column1) is within an object called tab. First create a run length object
c1.rle <- rle(tab$Column1)
c1.rle
##lengths: int [1:4] 2 2 1 1
##values : int [1:4] 1 2 3 4
That gives you the values of Column1 and the corresponding number of appearances of each element. Then use that information to create the new column with unique identifiers:
tab$Column1.new <- paste0(rep(c1.rle$values, times = c1.rle$lengths), "_",
unlist(lapply(c1.rle$lengths, seq_len)))
Not sure, if this is appropriate in your situation, but you could also just paste together Column1 and Column2, to create an unique identifier...
May be a little more of a workaround, but parts of this may be more useful and simpler for someone with not quite the same needs. make.names with the unique=T attribute adds a dot and numbers names that are repeated:
x <- make.names(tab$Column1,unique=T)
> print(x)
[1] "X1" "X1.1" "X2" "X2.1" "X3" "X4"
This might be enough for some folks. Here you can then grab the first entries of elements that are repeated, but not elements that are not repeated, then add a .0 to the end.
y <- rle(tab$Column1)
tmp <- !duplicated(tab$Column1) & (tab$Column1 %in% y$values[y$lengths>1])
x[tmp] <- str_replace(x[tmp],"$","\\.0")
> print(x)
[1] "X1.0" "X1.1" "X2.0" "X2.1" "X3" "X4"
Replace the dots and remove the X
x <- str_replace(x,"X","")
x <- str_replace(x,"\\.","_")
> print(x)
[1] "1_0" "1_1" "2_0" "2_1" "3" "4"
Might be good enough for you. But if you want the indexing to start at 1, grab the numbers, add one then put them back.
z <- str_match(x,"_([0-9]*)$")[,2]
z <- as.character(as.numeric(z)+1)
x <- str_replace(x,"_([0-9]*)$",paste0("_",z))
> print(x)
[1] "1_1" "1_2" "2_1" "2_2" "3" "4"
Like I said, more of a workaround here, but gives some options.
d <- read.table(text='Column1 Column2
1 A
1 B
2 C
2 D
3 E
4 F', header=TRUE)
transform(d,
Column1.new = ifelse(duplicated(Column1) | duplicated(Column1, fromLast=TRUE),
paste(Column1, ave(Column1, Column1, FUN=seq_along), sep='_'),
Column1))
# Column1 Column2 Column1.new
# 1 1 A 1_1
# 2 1 B 1_2
# 3 2 C 2_1
# 4 2 D 2_2
# 5 3 E 3
# 6 4 F 4
#Cão answer only with base R:
x=read.table(text="
Column1 Column2 #Column1.new
1 A #1_1
1 B #1_2
2 C #2_1
2 D #2_2
3 E #3
4 F #4", stringsAsFactors=F, header=T)
string<-x$Column1
mstring <- make.unique(as.character(string) )
mstring<-sub("(.*)(\\.)([0-9]+)","\\1_\\3",mstring)
y <- rle(string)
tmp <- !duplicated(string) & (string %in% y$values[y$lengths>1])
mstring[tmp]<-gsub("(.*)","\\1_0", mstring[tmp])
end <- sub(".*_([0-9]+)","\\1",grep("_([0-9]*)$",mstring,value=T) )
beg <- sub("(.*_)[0-9]+","\\1",grep("_([0-9]*)$",mstring,value=T) )
newend <- as.numeric(end)+1
mstring[grep("_([0-9]*)$",mstring)]<-paste0(beg,newend)
x$Column1New<-mstring
x
It's a very old post, and I am probably missing something obvious, but what is wrong with(?):
# make.unique() takes the separator as its own `sep` argument and only accepts
# character vectors, so coerce the column first.
tab$Column1 <- make.unique(as.character(tab$Column1), sep = "_")
Albeit I believe this requires character input.

Resources