Auto-generate code in R to create data frame [duplicate]

I miss a way to add data to an SO answer in a transparent manner. My experience is that the structure object from dput() at times unnecessarily confuses inexperienced users. I do, however, not have the patience to copy/paste it into a simple data frame call each time and would like to automate it. Something similar to dput(), but in a simplified version.
Say that, by copy/pasting and some other means, I have data like this,
Df <- data.frame(A = c(2, 2, 2, 6, 7, 8),
                 B = c("A", "G", "N", NA, "L", "L"),
                 C = c(1L, 3L, 5L, NA, NA, NA))
which looks like this,
Df
#>   A    B  C
#> 1 2    A  1
#> 2 2    G  3
#> 3 2    N  5
#> 4 6 <NA> NA
#> 5 7    L NA
#> 6 8    L NA
With one numeric, one factor and one integer vector,
str(Df)
#> 'data.frame': 6 obs. of 3 variables:
#> $ A: num 2 2 2 6 7 8
#> $ B: Factor w/ 4 levels "A","G","L","N": 1 2 4 NA 3 3
#> $ C: int 1 3 5 NA NA NA
Now, I would like to share this on SO, but I do not always have the original data frame it came from. More often than not I pipe() it in from SO, and the only way I know to get it out is dput(). Like,
dput(Df)
#> structure(list(A = c(2, 2, 2, 6, 7, 8), B = structure(c(1L, 2L,
#> 4L, NA, 3L, 3L), .Label = c("A", "G", "L", "N"), class = "factor"),
#> C = c(1L, 3L, 5L, NA, NA, NA)), .Names = c("A", "B", "C"), row.names = c(NA,
#> -6L), class = "data.frame")
but, as I said at the top, these structures can look quite confusing. For that reason I am looking for a way to compress dput()'s output. I imagine an output that looks something like this,
dput_small(Df)
#> data.frame(A = c(2, 2, 2, 6, 7, 8), B = c("A", "G", "N", NA, "L", "L"),
#> C = c(1L, 3L, 5L, NA, NA, NA))
Is that possible? I realize there are other classes, like lists, tbl, tbl_df, etc.

Edit: I'm leaving the older solution at the bottom because it got a bounty and many votes, but here is an improved answer.
You can use the {constructive} package, currently only on GitHub but possibly on CRAN by the time you read this:
# remotes::install_github("cynkra/constructive")
Df <- data.frame(A = c(2, 2, 2, 6, 7, 8),
                 B = c("A", "G", "N", NA, "L", "L"),
                 C = c(1L, 3L, 5L, NA, NA, NA))
constructive::construct(Df)
#> data.frame(
#> A = c(2, 2, 2, 6, 7, 8),
#> B = c("A", "G", "N", NA, "L", "L"),
#> C = c(1L, 3L, 5L, NA, NA, NA)
#> )
It has custom constructors for many common classes, so it should be able to reproduce most objects faithfully in a human-readable way.
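For instance, a factor round-trips through an explicit factor() call rather than an opaque structure() (a small sketch; the exact output may vary between {constructive} versions):
constructive::construct(factor(c("A", "G", "N")))
#> factor(c("A", "G", "N"))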
Old solution:
Three solutions:
a wrapper around dput (handles standard data.frames, tibbles and lists)
a read.table solution (for data.frames)
a tibble::tribble solution (for data.frames, returning a tibble)
All include n and random parameters, which allow one to dput only the head of the data or to sample it on the fly.
dput_small1(Df)
# Df <- data.frame(
# A = c(2, 2, 2, 6, 7, 8),
# B = structure(c(1L, 2L, 4L, NA, 3L, 3L), .Label = c("A", "G", "L",
# "N"), class = "factor"),
# C = c(1L, 3L, 5L, NA, NA, NA) ,
# stringsAsFactors=FALSE)
dput_small2(Df,stringsAsFactors=TRUE)
# Df <- read.table(sep="\t", text="
# A B C
# 2 A 1
# 2 G 3
# 2 N 5
# 6 NA NA
# 7 L NA
# 8 L NA", header=TRUE, stringsAsFactors=TRUE)
dput_small3(Df)
# Df <- tibble::tribble(
# ~A, ~B, ~C,
# 2, "A", 1L,
# 2, "G", 3L,
# 2, "N", 5L,
# 6, NA_character_, NA_integer_,
# 7, "L", NA_integer_,
# 8, "L", NA_integer_
# )
# Df$B <- factor(Df$B)
Wrapper around dput
This option gives an output very close to the one proposed in the question. It's quite general because it's actually a wrapper around dput, applied separately to each column.
multiline means 'keep dput's default output laid out into multiple lines'.
dput_small1 <- function(x,
                        name = as.character(substitute(x)),
                        multiline = TRUE,
                        n = if ('list' %in% class(x)) length(x) else nrow(x),
                        random = FALSE,
                        seed = 1){
  name
  if ('tbl_df' %in% class(x)) create_fun <- "tibble::tibble" else
    if ('list' %in% class(x)) create_fun <- "list" else
      if ('data.table' %in% class(x)) create_fun <- "data.table::data.table" else
        create_fun <- "data.frame"
  if (random) {
    set.seed(seed)
    if (create_fun == "list") x <- x[sample(1:length(x), n)] else
      x <- x[sample(1:nrow(x), n), ]
  } else {
    x <- head(x, n)
  }
  line_sep <- if (multiline) "\n " else ""
  cat(sep = '', name, " <- ", create_fun, "(\n ",
      paste0(unlist(
        Map(function(item, nm) paste0(nm, if (nm == "") "" else " = ",
                                      paste(capture.output(dput(item)), collapse = line_sep)),
            x, if (is.null(names(x))) rep("", length(x)) else names(x))),
        collapse = ",\n "),
      if (create_fun == "data.frame") ",\n stringsAsFactors = FALSE)" else "\n)")
}
dput_small1(list(1,2,c=3,d=4),"my_list",random=TRUE,n=3)
# my_list <- list(
# 2,
# d = 4,
# c = 3
# )
read.table solution
For data.frames, however, I find it more comfortable to have the input in an explicit, tabular format.
This can be achieved using read.table, then automatically fixing the column types that read.table wouldn't get right. This is not as general as the first solution, but it will work smoothly for 95% of the cases found on SO.
dput_small2 <- function(df,
                        name = as.character(substitute(df)),
                        sep = '\t',
                        header = TRUE,
                        stringsAsFactors = FALSE,
                        n = nrow(df),
                        random = FALSE,
                        seed = 1){
  name
  if (random) {
    set.seed(seed)
    df <- df[sample(1:nrow(df), n), ]
  } else {
    df <- head(df, n)
  }
  cat(sep = '', name, ' <- read.table(sep="', sub('\t', '\\\\t', sep), '", text="\n ',
      paste(colnames(df), collapse = sep))
  apply(df, 1, function(x) cat(sep = '', '\n ', paste(x, collapse = sep)))
  cat(sep = '', '", header=', header, ', stringsAsFactors=', stringsAsFactors, ')')
  sapply(names(df), function(x){
    if (is.character(df[[x]]) & suppressWarnings(identical(as.character(as.numeric(df[[x]])), df[[x]]))) {
      # if it's a character column containing numbers
      cat(sep = '', '\n', name, '$', x, ' <- as.character(', name, '$', x, ')')
    } else if (is.factor(df[[x]]) & !stringsAsFactors) {
      # if it's a factor and conversion is not automated
      cat(sep = '', '\n', name, '$', x, ' <- factor(', name, '$', x, ')')
    } else if (inherits(df[[x]], "POSIXct")) {
      cat(sep = '', '\n', name, '$', x, ' <- as.POSIXct(', name, '$', x, ')')
    } else if (inherits(df[[x]], "Date")) {
      cat(sep = '', '\n', name, '$', x, ' <- as.Date(', name, '$', x, ')')
    }
  })
  invisible(NULL)
}
Simplest case
dput_small2(iris,n=6)
will print:
iris <- read.table(sep="\t", text="
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
5.1 3.5 1.4 0.2 setosa
4.9 3.0 1.4 0.2 setosa
4.7 3.2 1.3 0.2 setosa
4.6 3.1 1.5 0.2 setosa
5.0 3.6 1.4 0.2 setosa
5.4 3.9 1.7 0.4 setosa", header=TRUE, stringsAsFactors=FALSE)
which in turn, when executed, will return:
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# 1 5.1 3.5 1.4 0.2 setosa
# 2 4.9 3.0 1.4 0.2 setosa
# 3 4.7 3.2 1.3 0.2 setosa
# 4 4.6 3.1 1.5 0.2 setosa
# 5 5.0 3.6 1.4 0.2 setosa
# 6 5.4 3.9 1.7 0.4 setosa
str(iris)
# 'data.frame': 6 obs. of 5 variables:
# $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4
# $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9
# $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7
# $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4
# $ Species : chr " setosa" " setosa" " setosa" " setosa" ...
More complex
Dummy data:
test <- data.frame(a = 1:5,
                   b = as.character(6:10),
                   c = letters[1:5],
                   d = factor(letters[6:10]),
                   e = Sys.time() + (1:5),
                   stringsAsFactors = FALSE)
This:
dput_small2(test,'df2')
will print:
df2 <- read.table(sep="\t", text="
a b c d e
1 6 a f 2018-02-15 11:53:17
2 7 b g 2018-02-15 11:53:18
3 8 c h 2018-02-15 11:53:19
4 9 d i 2018-02-15 11:53:20
5 10 e j 2018-02-15 11:53:21", header=TRUE, stringsAsFactors=FALSE)
df2$b <- as.character(df2$b)
df2$d <- factor(df2$d)
df2$e <- as.POSIXct(df2$e)
which in turn, when executed, will return:
# a b c d e
# 1 1 6 a f 2018-02-15 11:53:17
# 2 2 7 b g 2018-02-15 11:53:18
# 3 3 8 c h 2018-02-15 11:53:19
# 4 4 9 d i 2018-02-15 11:53:20
# 5 5 10 e j 2018-02-15 11:53:21
str(df2)
# 'data.frame': 5 obs. of 5 variables:
# $ a: int 1 2 3 4 5
# $ b: chr "6" "7" "8" "9" ...
# $ c: chr "a" "b" "c" "d" ...
# $ d: Factor w/ 5 levels "f","g","h","i",..: 1 2 3 4 5
# $ e: POSIXct, format: "2018-02-15 11:53:17" "2018-02-15 11:53:18" "2018-02-15 11:53:19" "2018-02-15 11:53:20" ...
all.equal(df2,test)
# [1] "Component “e”: Mean absolute difference: 0.4574251" # only some rounding error
tribble solution
The read.table option is very readable but not very general. With tribble, pretty much any data type can be handled (though factors need ad hoc fixing).
This solution isn't so useful for the OP's example but is great for list columns (see the example below). To make use of the output, the tibble library is required.
Just as with my first solution, it's a wrapper around dput, but instead of 'dputting' columns, I'm 'dputting' elements.
dput_small3 <- function(df,
                        name = as.character(substitute(df)),
                        n = nrow(df),
                        random = FALSE,
                        seed = 1){
  name
  if (random) {
    set.seed(seed)
    df <- df[sample(1:nrow(df), n), ]
  } else {
    df <- head(df, n)
  }
  df1 <- lapply(df, function(col) if (is.factor(col)) as.character(col) else col)
  dputs <- sapply(df1, function(col){
    col_dputs <- sapply(col, function(elt) paste(capture.output(dput(elt)), collapse = ""))
    max_char <- max(nchar(unlist(col_dputs)))
    sapply(col_dputs, function(elt) paste(c(rep(" ", max_char - nchar(elt)), elt), collapse = ""))
  })
  lines <- paste(apply(dputs, 1, paste, collapse = ", "), collapse = ",\n ")
  output <- paste0(name, " <- tibble::tribble(\n ",
                   paste0("~", names(df), collapse = ", "),
                   ",\n ", lines, "\n)")
  cat(output)
  sapply(names(df), function(x)
    if (is.factor(df[[x]])) cat(sep = '', '\n', name, '$', x, ' <- factor(', name, '$', x, ')'))
  invisible(NULL)
}
dput_small3(dplyr::starwars[c(1:3,11)],"sw",n=6,random=TRUE)
# sw <- tibble::tribble(
# ~name, ~height, ~mass, ~films,
# "Lando Calrissian", 177L, 79, c("Return of the Jedi", "The Empire Strikes Back"),
# "Finis Valorum", 170L, NA_real_, "The Phantom Menace",
# "Ki-Adi-Mundi", 198L, 82, c("Attack of the Clones", "The Phantom Menace", "Revenge of the Sith"),
# "Grievous", 216L, 159, "Revenge of the Sith",
# "Wedge Antilles", 170L, 77, c("Return of the Jedi", "The Empire Strikes Back", "A New Hope"),
# "Wat Tambor", 193L, 48, "Attack of the Clones"
# )

The package datapasta won't always work perfectly, as it currently doesn't support all types, but it is clean and easy to use:
# install.packages(c("datapasta"), dependencies = TRUE)
datapasta::dpasta(Df)
#> data.frame(
#> A = c(2, 2, 2, 6, 7, 8),
#> C = c(1L, 3L, 5L, NA, NA, NA),
#> B = as.factor(c("A", "G", "N", NA, "L", "L"))
#> )

We could set control to NULL to simplify:
dput(Df, control = NULL)
# list(A = c(2, 2, 2, 6, 7, 8), B = c(NA, NA, NA, NA, 7, 9), C = c(1, 3, 5, NA, NA, NA))
Then wrap it with data.frame:
data.frame(dput(Df, control = NULL))
Edit: To avoid factor columns getting converted to numbers, we could convert them to character before calling dput:
dput_small <- function(d){
  ix <- sapply(d, is.factor)
  d[ix] <- lapply(d[ix], as.character)
  dput(d, control = NULL)
}
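On the example data this prints something like the following (a sketch of the expected output; note that control = NULL also drops the L suffix from integers):
dput_small(Df)
#> list(A = c(2, 2, 2, 6, 7, 8), B = c("A", "G", "N", NA, "L", "L"),
#>     C = c(1, 3, 5, NA, NA, NA))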

You could simply write to a compressed connection.
gz <- gzfile("foo.gz", open="wt")
dput(Df, gz)
close(gz)
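To read it back, dget() accepts a connection as well (a minimal sketch; Df2 is an illustrative name):
gz <- gzfile("foo.gz", open = "rt")
Df2 <- dget(gz)  # dget() parses and evaluates the text written by dput()
close(gz)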

Generally a large dput is difficult to cope with, on SO or otherwise. Instead you can just save the structure directly to an Rda file:
save(Df, file='foo.Rda')
And read it back in:
load('foo.Rda')
See this question for a little more info and credit where credit is due: How to save a data.frame in R?
You could also look at the sink function...
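For completeness, a minimal sketch of the sink() route, which diverts console output to a file (the filename is illustrative):
sink("Df.txt")  # divert console output to a file
dput(Df)
sink()          # restore normal output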
If I've missed the purpose of your question, please feel free to expand on the reasons why dput is the only mechanism for you.

It might be worth mentioning memCompress and memDecompress here. For in-memory objects, memCompress can reduce the size of large objects by compressing them as specified, and memDecompress reverses the compression. They're actually quite useful for package objects.
sum(nchar(dput(DF)))
# [1] 64
( mDF <- memCompress(as.character(DF)) )
# [1] 78 9c 4b d6 30 d2 51 80 20 33 1d 05 73 1d 05 0b 4d ae 64 0d 3f 47 1d 05 64 0c 14 b7 04 89 1b ea 28 18 eb 28 98 22 4b 6a 02 00 a8 ba 0c d2
length(mDF)
# [1] 46
cat(mdDF <- memDecompress(mDF, "gzip", TRUE))
# c(2, 2, 2, 6, 7, 8)
# c(NA, NA, NA, NA, 7, 9)
# c(1, 3, 5, NA, NA, NA)
nchar(mdDF)
# [1] 66
I haven't quite determined if the data frame can be reassembled easily, but I'm sure it can be.
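Here is a hedged sketch of one way to reassemble it (my own addition, not part of the answer above): split the decompressed string on newlines, parse each deparsed vector, and rebuild the data frame. Column names were lost by as.character(), so they have to be restored by hand (DF2 is an illustrative name).
cols <- lapply(strsplit(mdDF, "\n")[[1]], function(s) eval(parse(text = s)))
names(cols) <- c("A", "B", "C")  # restore the names dropped by as.character()
DF2 <- as.data.frame(cols)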

There is also the read.so package, which I really like, in particular to read SO data.
It works for tibbles as well.
#devtools::install_github("alistaire47/read.so")
Df <- data.frame(A = c(2, 2, 2, 6, 7, 8),
                 B = c("A", "G", "N", NA, "L", "L"),
                 C = c(1L, 3L, 5L, NA, NA, NA))
read.so::write.so(Df)
#> Df <- data.frame(
#> A = c(2, 2, 2, 6, 7, 8),
#> B = c("A", "G", "N", NA, "L", "L"),
#> C = c(1L, 3L, 5L, NA, NA, NA)
#> )

Routine for non-manual argument of a set of variables in coalesce() dplyr function [duplicate]

I have a list of dfs to be combined into one. These dfs have some matching columns and rows, and some distinct or missing ones.
Here is the minimal structure (for understanding) of the first two dfs.
df1:
df1 <- structure(list(id = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6),
Name = c("LI","NO","WH","MA","BU","SO","FO","AT","CO","IN","SP","CE"),
H_A = c("H", "A", "H", "A", "H", "A", "H", "A", "H", "A", "H", "A"),
W = c(15, 13, 5, 13, 9, 12, 10, 13, 1, 8, 4, 2),
X = c(NA, NA, NA, NA, NA, NA, 12, 7, 5, 13, 1, 3),
Y = c(0, 0, 0, 0, 0,0, NA, NA, NA, NA, NA, NA)),
row.names = c(NA,-12L), class = c("tbl_df","tbl", "data.frame"))
df2:
df2 <- structure(list(id = c(1, 1, 2, 2, 3, 3),
Name = c("LI","NO", "WH", "MA", "BU", "SO"),
H_A = c("H", "A", "H", "A", "H", "A"),
W = c(15, 13, 5, 13, 9, 12),
X = c(10, 12, 11, 15, 6, 14),
Z = c(4, 14, 16, 16, 25, 30)),
row.names = c(NA,-6L),class = c("tbl_df", "tbl", "data.frame"))
This can be solved with this alternative:
df_combined <- full_join(df1, df2, by = c("id", "Name", "H_A")) %>%
mutate(X = coalesce(X.x, X.y),
W = coalesce(W.x, W.y)) %>%
select(-contains("."))
I would like to automate the routine so the variables don't have to be entered manually into the coalesce() calls inside mutate(); after all, there are several more variables than just X and W above. In addition to this, I will repeat the routine for df3, df4 and df5, which have the same minimal overlap with df1.
Joins by their nature don't fill in missing positions, so we have to implement a fix to solve this problem. Although you can use if_else() statements, as shown in another answer, coalesce() is a much cleaner function to use.
See this post here for another example (could potentially be seen as a repeated question).
Using dplyr to fill in missing values (through a join?)
library(tidyverse)
df_test <- full_join(df1, df2, by = c("id", "Name", "H_A")) %>%
mutate(X = coalesce(X.x, X.y),
W = coalesce(W.x, W.y)) %>%
select(id, Name, H_A, W, X, Y, Z)
df_test == df_combined
id Name H_A W X Y Z
[1,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[2,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[3,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[4,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[5,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[6,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[7,] TRUE TRUE TRUE TRUE TRUE NA NA
[8,] TRUE TRUE TRUE TRUE TRUE NA NA
[9,] TRUE TRUE TRUE TRUE TRUE NA NA
[10,] TRUE TRUE TRUE TRUE TRUE NA NA
[11,] TRUE TRUE TRUE TRUE TRUE NA NA
[12,] TRUE TRUE TRUE TRUE TRUE NA NA
Comparing NAs expectedly returns NA, as you can't match two NAs together using a simple == statement.
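To address the automation part of the question, here is a minimal sketch of a helper (coalesce_join is a hypothetical name, not an existing dplyr function): after the full_join, every clashing column shows up as a .x/.y pair, so the pairs can be detected and coalesced programmatically.
library(dplyr)
coalesce_join <- function(x, y, by) {
  joined <- full_join(x, y, by = by)
  # columns present in both inputs get .x/.y suffixes after the join
  clashing <- sub("\\.x$", "", grep("\\.x$", names(joined), value = TRUE))
  for (v in clashing) {
    joined[[v]] <- coalesce(joined[[paste0(v, ".x")]], joined[[paste0(v, ".y")]])
  }
  select(joined, -matches("\\.(x|y)$"))
}
coalesce_join(df1, df2, by = c("id", "Name", "H_A"))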
You can use left_join from dplyr and substitute the NAs like this, where I am guessing that id and H_A together make a key:
library(dplyr)
df1 <- structure(list(id = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6),
Name = c("LI","NO","WH","MA","BU","SO","FO","AT","CO","IN","SP","CE"),
H_A = c("H", "A", "H", "A", "H", "A", "H", "A", "H", "A", "H", "A"),
W = c(15, 13, 5, 13, 9, 12, 10, 13, 1, 8, 4, 2),
X = c(NA, NA, NA, NA, NA, NA, 12, 7, 5, 13, 1, 3),
Y = c(0, 0, 0, 0, 0,0, NA, NA, NA, NA, NA, NA)),
row.names = c(NA,-12L), class = c("tbl_df","tbl", "data.frame"))
df2 <- structure(list(id = c(1, 1, 2, 2, 3, 3),
Name = c("LI","NO", "WH", "MA", "BU", "SO"),
H_A = c("H", "A", "H", "A", "H", "A"),
W = c(15, 13, 5, 13, 9, 12),
X = c(10, 12, 11, 15, 6, 14),
Z = c(4, 14, 16, 16, 25, 30)),
row.names = c(NA,-6L),class = c("tbl_df", "tbl", "data.frame"))
df_combined <- left_join(df1,
df2 %>%
select(id, H_A, "df2_X" = X, Z)) %>%
mutate(X = if_else(is.na(X), df2_X, X)) %>%
select(-df2_X)
#> Joining, by = c("id", "H_A")
df_combined
#> # A tibble: 12 × 7
#> id Name H_A W X Y Z
#> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 1 LI H 15 10 0 4
#> 2 1 NO A 13 12 0 14
#> 3 2 WH H 5 11 0 16
#> 4 2 MA A 13 15 0 16
#> 5 3 BU H 9 6 0 25
#> 6 3 SO A 12 14 0 30
#> 7 4 FO H 10 12 NA NA
#> 8 4 AT A 13 7 NA NA
#> 9 5 CO H 1 5 NA NA
#> 10 5 IN A 8 13 NA NA
#> 11 6 SP H 4 1 NA NA
#> 12 6 CE A 2 3 NA NA
data.table approach
library(data.table)
# set to data.table format
setDT(df1); setDT(df2)
# perform an update join, overwriting NA-values in W, X and Y, and
# adding Z, based on key-columns ID, Name and H_A
df1[df2, `:=`(W = ifelse(is.na(W), i.W, W),
X = ifelse(is.na(X), i.X, X),
Y = ifelse(is.na(Y), i.Y, Y),
Z = i.Z),
on = .(id, Name, H_A)][]
# id Name H_A W X Y Z
# 1: 1 LI H 15 10 0 4
# 2: 1 NO A 13 12 0 14
# 3: 2 WH H 5 11 0 16
# 4: 2 MA A 13 15 0 16
# 5: 3 BU H 9 6 0 25
# 6: 3 SO A 12 14 0 30
# 7: 4 FO H 10 12 NA NA
# 8: 4 AT A 13 7 NA NA
# 9: 5 CO H 1 5 NA NA
#10: 5 IN A 8 13 NA NA
#11: 6 SP H 4 1 NA NA
#12: 6 CE A 2 3 NA NA
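To repeat the routine for df3, df4 and df5, one could fold such a helper over the list of data frames; a sketch using the hypothetical coalesce_join() from above (dfs and combined are illustrative names):
dfs <- list(df1, df2)  # extend with df3, df4, df5, ...
combined <- Reduce(function(x, y) coalesce_join(x, y, by = c("id", "Name", "H_A")), dfs)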

closest value and data frame index of all data frame elements of a list

I have a list containing data frames:
test <- list()
test[[1]] <- data.frame(C1=c(0.2,0.4,0.5), C2=c(2,3.5,3.7), C3=c(0.3,4,5))
test[[2]] <- data.frame(C1=c(0.1,0.3,0.6), C2=c(3.9,4.3,8), C3=c(3,5.2,10))
test[[3]] <- data.frame(C1=c(0.4,0.55,0.8), C2=c(8.9,10.3,14), C3=c(7,8.4,11))
I'd like to get the line, among all the data frames' lines inside this list, whose column (e.g. C2 in this example) has the value closest to each element of a vector (below), as well as the list index (1, 2 or 3 in this example) where it happened.
vector <- c(3, 14.4, 7, 0)
The desired answer would be something like:
list.index line.number.in.df C1 C2 C3
1 2 0.4 3.5 4
3 3 0.8 14 11
2 3 0.6 8 10
1 1 0.2 2 0.3
I could manage to use lapply to get 10% of the problem solved for a single value, but I couldn't do it for a bunch of values (a vector). Besides, I got the line with the closest value from every data frame in the list (not only the single best line among all the data frames), and I could not get the corresponding list index either, i.e.
value <- 3
lapply(test, function(x) x[which.min(abs(value-x$C2)),])
Result I got:
[[1]]
C1 C2 C3
2 0.4 3.5 4
[[2]]
C1 C2 C3
1 0.1 3.9 3
[[3]]
C1 C2 C3
1 0.4 8.9 7
Would anyone be so kind and patient to get me further on this?
Thanks in advance and Happy New Year.
Here is a dplyr approach. We can generate the list.index and line.number.in.df for each dataframe and then bind_rows them together. Next, slice the rows where C2 contains the closest value for each number in that vector.
library(dplyr)
test <- list(structure(list(C1 = c(0.2, 0.4, 0.5), C2 = c(2, 3.5, 3.7
), C3 = c(0.3, 4, 5)), class = "data.frame", row.names = c(NA,
-3L)), structure(list(C1 = c(0.1, 0.3, 0.6), C2 = c(3.9, 4.3,
8), C3 = c(3, 5.2, 10)), class = "data.frame", row.names = c(NA,
-3L)), structure(list(C1 = c(0.4, 0.55, 0.8), C2 = c(8.9, 10.3,
14), C3 = c(7, 8.4, 11)), class = "data.frame", row.names = c(NA,
-3L)))
vector <- c(3, 14.4, 7, 0)
test %>%
lapply(tibble::rowid_to_column, "line.number.in.df") %>%
bind_rows(.id = "list.index") %>%
slice(vapply(vector, \(x) which.min(abs(x - C2)), integer(1L)))
Output is
list.index line.number.in.df C1 C2 C3
1 1 2 0.4 3.5 4.0
2 3 3 0.8 14.0 11.0
3 2 3 0.6 8.0 10.0
4 1 1 0.2 2.0 0.3
You could exploit the substrings of the names.
(w <- sapply(v, \(v)
  names(which.min(abs(unlist(setNames(test, seq_along(test))) - v)))))
# [1] "2.C31" "3.C23" "3.C31" "2.C11"
t(mapply(\(x, y) c(list = x, line = y, test[[x]][y, ]),
         as.numeric(substr(w, 1, 1)), as.numeric(substring(w, 5)))) |>
  as.data.frame()
# list line C1 C2 C3
# 1 2 1 0.1 3.9 3
# 2 3 3 0.8 14 11
# 3 3 1 0.4 8.9 7
# 4 2 1 0.1 3.9 3
Note: R >= 4.1 used.
Data:
test <- list(structure(list(C1 = c(0.2, 0.4, 0.5), C2 = c(2, 3.5, 3.7
), C3 = c(0.3, 4, 5)), class = "data.frame", row.names = c(NA,
-3L)), structure(list(C1 = c(0.1, 0.3, 0.6), C2 = c(3.9, 4.3,
8), C3 = c(3, 5.2, 10)), class = "data.frame", row.names = c(NA,
-3L)), structure(list(C1 = c(0.4, 0.55, 0.8), C2 = c(8.9, 10.3,
14), C3 = c(7, 8.4, 11)), class = "data.frame", row.names = c(NA,
-3L)))
v <- c(3, 14.4, 7, 0)
I hope this is what you are looking for. It finds the value in each of the columns, per element of test, that is closest to the values in vector.
#install.packages('birk')
library(birk) # required for which.closest()
# find which of the values across the columns C1:C3 in each element of test are closest
# to the values of vector and return the corresponding row numbers
x <- sapply(1:length(vector), \(x) sapply(test, \(i) apply(i, 2, \(j) which.closest(j, vector[x]))))
x <- apply(x, 1, \(x) as.data.frame(table(x)))
x <- lapply(x, \(i) i[which.max(i[, 2]), ])
row_numbers_df <- as.numeric(matrix(do.call(rbind, x)[['x']]))
# extract the values in each of the column C1:C3 corresponding to row_numbers_df
vals <- array(0, dim = length(row_numbers_df))
for (i in 1:length(row_numbers_df)) { vals[i] <- do.call(cbind, test)[row_numbers_df[i], i] }
# how many columns does each data.frame embedded in test have?
unique_number_of_cols <- unique(sapply(test, ncol))
# store results in a data.frame
r <- \(x) round(x, 1)
out <- data.frame(
seq_len(length(test)),
r(rowMeans(matrix(row_numbers_df, ncol = unique_number_of_cols, byrow = TRUE))),
matrix(vals, ncol = unique_number_of_cols, byrow = TRUE)
)
names(out) <- c('list.index', 'line.number.in.df', sapply(test, colnames)[, 1])
Result
> out
list.index line.number.in.df C1 C2 C3
1 1 3.0 0.5 3.7 5
2 2 1.7 0.6 3.9 3
3 3 1.7 0.8 8.9 7
Alternatively, if you really want to have each line.number.in.df per unique column, then you can easily store them as separate columns in out.
x <- sapply(1:length(vector), \(x) sapply(test, \(i) apply(i, 2, \(j) which.closest(j, vector[x]))))
x <- apply(x, 1, \(x) as.data.frame(table(x)))
x <- lapply(x, \(i) i[which.max(i[, 2]), ])
row_numbers_df <- as.numeric(matrix(do.call(rbind, x)[['x']]))
names(row_numbers_df) <- do.call(c, lapply(test, names))
row_numbers_df
vals <- array(0, dim = length(row_numbers_df))
for (i in 1:length(row_numbers_df)) { vals[i] <- do.call(cbind, test)[row_numbers_df[i], i] }
unique_number_of_cols <- unique(sapply(test, ncol))
out <- data.frame(
seq_len(length(test)),
split(row_numbers_df, names(row_numbers_df)),
matrix(vals, ncol = unique_number_of_cols, byrow = TRUE)
)
column_names <- sapply(test, colnames)[, 1]
names(out) <- c('list.index',
paste0('line.number.in.df.', column_names),
column_names)
Result
> out
list.index line.number.in.df.C1 line.number.in.df.C2 line.number.in.df.C3 C1 C2 C3
1 1 3 3 3 0.5 3.7 5
2 2 3 1 1 0.6 3.9 3
3 3 3 1 1 0.8 8.9 7

calculate quantile for each group in dataframe and assign NA?

I made up this example to explain my question:
df= structure(list(group = structure(c(1L, 1L, 2L, 2L, 10L, 10L
), .Label = c("Eve", "ba", "De", "De","Mi", "C", "O", "W",
"as", "ras", "Cro", "ics"), class = "factor"), ds = c(8, 8,
1, 4, 4, 6), em = c(1, 3, 8,2, 7, 3)), row.names = c(74567L,
74568L, 74570L, 74576L, 74577L, 74578L), class = "data.frame")
I need, for each group, to set to NA all values of em and ds that are above the 90th quantile or below the 10th quantile.
Here's a way to do it for each group and each numeric variable using dplyr and ifelse.
Having only a couple of samples per group makes it difficult to interpret the whole concept of quantiles, so the result you get very much depends on how you define a quantile. The type parameter allows you to specify the definition you are using. R defaults to type = 7:
library(dplyr)
df %>%
group_by(group) %>%
mutate(ds = ifelse(ds > quantile(ds, .9) | ds < quantile(ds, .1), NA, ds),
em = ifelse(em > quantile(em, .9) | em < quantile(em, .1), NA, em))
#> # A tibble: 6 x 3
#> # Groups: group [3]
#> group ds em
#> <fct> <dbl> <lgl>
#> 1 Eve 8 NA
#> 2 Eve 8 NA
#> 3 ba NA NA
#> 4 ba NA NA
#> 5 ras NA NA
#> 6 ras NA NA
However, you can change this depending on your definition:
df %>%
group_by(group) %>%
mutate(ds = ifelse(ds > quantile(ds, .9, type = 1) |
ds < quantile(ds, .1, type = 1), NA, ds),
em = ifelse(em > quantile(em, .9, type = 1) |
em < quantile(em, .1, type = 1), NA, em))
#> # A tibble: 6 x 3
#> # Groups: group [3]
#> group ds em
#> <fct> <dbl> <dbl>
#> 1 Eve 8 1
#> 2 Eve 8 3
#> 3 ba 1 8
#> 4 ba 4 2
#> 5 ras 4 7
#> 6 ras 6 3
Created on 2020-05-17 by the reprex package (v0.3.0)
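If more columns than ds and em need the same treatment, the rule can be applied to every numeric column at once with across() (a sketch; censor_tails is a hypothetical helper name, and across() requires dplyr >= 1.0):
censor_tails <- function(x, lo = 0.1, hi = 0.9) {
  # replace values outside the chosen quantiles with NA
  ifelse(x > quantile(x, hi) | x < quantile(x, lo), NA, x)
}
df %>%
  group_by(group) %>%
  mutate(across(where(is.numeric), censor_tails))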

Conversion from pairwise matrix to Cytoscape edge table is too slow

My code is similar to this.
Given a matrix like this:
   a  b  c  d
a  1 NA  3  4
b NA  2 NA  4
c NA NA NA NA
d NA NA NA  4
It converts it to this:
a a 1
a c 3
a d 4
b b 2
b d 4
d d 4
The relevant code is as below:
pears <- read.delim("pears.txt", header = TRUE, sep = "\t", dec = ".")
edges <- NULL
for (i in 1:nrow(pears)) {
  for (j in 1:ncol(pears)) {
    if (!is.na(pears[i, j])) {
      edges <- rbind(edges, c(rownames(pears)[i], colnames(pears)[j], pears[i, j]))
    }
  }
  print(i)
}
colnames(edges) <- c("gene1", "gene2", "PCC")
write.table(edges, "edges.txt", row.names = FALSE, quote = FALSE, sep = "\t")
When I run the code from a remote server in the background using screen -S on a 17804x17804 sparse (99% NA) matrix, it initially ran 5 print statements every 13 seconds. However, it has now slowed down to 7 print statements per minute. Why is the algorithm getting slower and slower as it progresses? Is there another way I can convert my matrix into Cytoscape's format more quickly?
The slowdown comes from growing edges with rbind() inside the loop: each call copies the whole accumulated matrix, so the total cost grows quadratically with the number of edges found so far. Instead, we can convert the data.frame to a matrix and use melt from reshape2 to get the dimnames as two columns along with the values as a third column, using na.rm to remove the NA rows:
library(reshape2)
melt(as.matrix(df1), na.rm = TRUE)
data
df1 <- structure(list(a = c(1L, NA, NA, NA), b = c(NA, 2L, NA, NA),
c = c(3L, NA, NA, NA), d = c(4L, 4L, NA, 4L)), class = "data.frame",
row.names = c("a",
"b", "c", "d"))

Concatenate Rows If Value of Another Column In Next Row Is Empty

I have a data set as shown in the Input table below. I want to combine rows (4,5,6), rows (8,9) and rows (11,12) of the Input table so that each group shares the same ID, as shown in rows 4, 8 and 11 of the Output table below.
I tried merge(), but that didn't work as expected. The key here is the ID column which has unique values.
Any suggestions on how I can achieve this efficiently?
Input
Row Name Val1 Val2 Unit ID
1 -0.5 5.5 V UI-001
2 a -0.5 2.5 V UI-002
3 b -0.5 5.5 V UI-003
4 c -0.5 5.5 V UI-004
5 d
6 e
7 -45 125 Ohms UI-005
8 f 2 kV UI-006
9 g
10 h 500 V UI-007
11 i 15 kV UI-008
12 j
13 k UI-009
dput() of Input
structure(list(Name = c(NA, "a", "b", "c", "d", "e", NA, "f",
"g", "h", "i", "j", "k"), Val1 = c(-0.5, -0.5, -0.5, -0.5, NA,
NA, -45, 2, NA, 500, 15, NA, NA), Val2 = c(5.5, 2.5, 5.5, 5.5,
NA, NA, 125, NA, NA, NA, NA, NA, NA), Unit = c("V", "V", "V",
"V", NA, NA, "Ohms", "kV", NA, "V", "kV", NA, NA), ID = c("UI-001",
"UI-002", "UI-003", "UI-004", NA, NA, "UI-005", "UI-006", NA,
"UI-007", "UI-008", NA, "UI-009")), row.names = c(NA, -13L), class =
c("tbl_df", "tbl", "data.frame"))
Output
Row Name Val1 Val2 Unit ID
1 -0.5 5.5 V UI-001
2 a -0.5 2.5 V UI-002
3 b -0.5 5.5 V UI-003
4 cde -0.5 5.5 V UI-004
5 -45 125 Ohms UI-005
6 fg 2 kV UI-006
7 h 500 V UI-007
8 ij 15 kV UI-008
9 k UI-009
dput() of Output
structure(list(Name = c(NA, "a", "b", "cde", NA, "fg", "h", "ij",
"k"), Val1 = c(-0.5, -0.5, -0.5, -0.5, -45, 2, 500, 15, NA),
Val2 = c(5.5, 2.5, 5.5, 5.5, 125, NA, NA, NA, NA), Unit = c("V",
"V", "V", "V", "Ohms", "kV", "V", "kV", NA), ID = c("UI-001",
"UI-002", "UI-003", "UI-004", "UI-005", "UI-006", "UI-007",
"UI-008", "UI-009")), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"))
We may use
out <- df[!is.na(df$ID), ]
out$Name[!is.na(out$Name)] <- tapply(df$Name, cumsum(!is.na(df$ID)), paste, collapse = "")[!is.na(out$Name)]
out
# Name Val1 Val2 Unit ID
# 1 <NA> -0.5 5.5 V UI-001
# 2 a -0.5 2.5 V UI-002
# 3 b -0.5 5.5 V UI-003
# 4 cde -0.5 5.5 V UI-004
# 7 <NA> -45.0 125.0 Ohms UI-005
# 8 fg 2.0 NA kV UI-006
# 10 h 500.0 NA V UI-007
# 11 ij 15.0 NA kV UI-008
# 13 k NA NA <NA> UI-009
The first line gets rid of all the rows where ID is NA. Then
tapply(df$Name, cumsum(!is.na(df$ID)), paste, collapse = "")
# 1 2 3 4 5 6 7 8 9
# "NA" "a" "b" "cde" "NA" "fg" "h" "ij" "k"
constructs the correct values for Name and !is.na(out$Name) gives us which rows of out should be modified (which is needed since "NA" isn't the same as NA).
Also a dplyr possibility:
df %>%
mutate(grp = ifelse((is.na(lead(ID, default = last(ID))) & !is.na(ID)) | is.na(ID), 1, 0),
grp = ifelse(grp != 0, cumsum(grp != lag(grp, 1, default = first(grp))), 0)) %>%
group_by(grp) %>%
mutate(Name = ifelse(grp != 0, paste(Name, collapse = ""), Name)) %>%
filter(!is.na(ID)) %>%
ungroup() %>%
select(-grp)
Name Val1 Val2 Unit ID
<chr> <dbl> <dbl> <chr> <chr>
1 <NA> -0.500 5.50 V UI-001
2 a -0.500 2.50 V UI-002
3 b -0.500 5.50 V UI-003
4 cde -0.500 5.50 V UI-004
5 <NA> -45.0 125. Ohms UI-005
6 fg 2.00 NA kV UI-006
7 h 500. NA V UI-007
8 ij 15.0 NA kV UI-008
9 k NA NA <NA> UI-009
First, it creates a grouping variable for NA cases on "ID" and the last non-NA cases on "ID" before those NA cases. Then, it groups by that grouping variable and combines the values from "Name" into one. Finally, it filters out the cases where "ID" is NA and removes the redundant grouping variable.
Or the same using rleid() from data.table to more conveniently create the grouping variable:
df %>%
mutate(grp = ifelse((is.na(lead(ID, default = last(ID))) & !is.na(ID)) | is.na(ID), 1, 0),
grp = ifelse(grp == 1, rleid(grp), grp)) %>%
group_by(grp) %>%
mutate(Name = ifelse(grp != 0, paste(Name, collapse = ""), Name)) %>%
filter(!is.na(ID)) %>%
ungroup() %>%
select(-grp)
Or a different possibility using tidyr::fill():
df %>%
mutate(ID_temp = ID) %>%
fill(ID, .direction = "down") %>%
group_by(ID) %>%
mutate(Name = paste(Name, collapse = "")) %>%
filter(!is.na(ID_temp)) %>%
select(-ID_temp)
Here, you are filling the missing "ID" values with the previous non-missing value, grouping by it, and then combining the rows per group.
