R - Create new data frame based on variable names

R - Create new data frame based on variable names - r

Given two dataframes:
d1=structure(list(y = c(0.04090403771224, 0.321216286364446, -1.00056198338576,
0.549872767053012, 0.746529361891068, -0.756394989312306, 0.432210041946058,
1.04202671889042, 0.846691694527378, -0.0372890199537169), x = c(-0.626453810742332,
0.183643324222082, -0.835628612410047, 1.59528080213779, 0.329507771815361,
-0.820468384118015, 0.487429052428485, 0.738324705129217, 0.575781351653492,
-0.305388387156356), tx = c(1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L,
1L), x2 = c(-0.41499456329968, -0.394289953710349, -0.0593133967111857,
1.10002537198388, 0.763175748457544, -0.164523596253587, -0.253361680136508,
0.696963375404737, 0.556663198673657, -0.68875569454952)), .Names = c("y",
"x", "tx", "x2"), row.names = c(NA, -10L), class = "data.frame")
d2=dput(reg1.coefs)
structure(c(-0.752515009279226, 2.43055896098665, 0.833724197554561,
-1.79389056223944), .Names = c("(Intercept)", "x", "tx", "x:tx"
))
How can I create a third data frame that selects only the variables in d2 and creates additional variable(s) corresponding to product of the the variables with a ':' in d2. In this example, the code should return a third variable x:tx corresponding to the product of x and tx in d.
out=structure(list(y = c(0.04090403771224, 0.321216286364446, -1.00056198338576,
0.549872767053012, 0.746529361891068, -0.756394989312306, 0.432210041946058,
1.04202671889042, 0.846691694527378, -0.0372890199537169), x = c(-0.626453810742332,
0.183643324222082, -0.835628612410047, 1.59528080213779, 0.329507771815361,
-0.820468384118015, 0.487429052428485, 0.738324705129217, 0.575781351653492,
-0.305388387156356), tx = c(1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L,
1L), x1x2 = c(-0.626453810742332, 0.183643324222082, -0.835628612410047,
1.59528080213779, 0.329507771815361, -0.820468384118015, 0, 0,
0.575781351653492, -0.305388387156356)), .Names = c("y", "x",
"tx", "x1x2"), row.names = c(NA, -10L), class = "data.frame")
This is obviously easy for one case, but I need the code to be general enough so that if d2 contains products of other variables (e.g., x:x2 or x:x2:tx) the correct out is produced.

Related

Combining two lists by multiplying the corresponding values in R

Below is the structure of a chunk which includes two elements of list1 and list2.
list1:
list1 <- list(structure(list(chr22_20230714_G_A_b38 = 0.0000953181301665087,
chr22_20230737_G_A_b38 = -0.00124036704551427, chr22_20231229_T_A_b38 = 0.000808061558738542,
chr22_20231474_G_A_b38 = 0.000387528601423933, chr22_20231667_C_G_b38 = -0.000120624028990859), row.names = c(NA,
-1L), class = c("tbl_df", "tbl", "data.frame")), structure(list(
chr22_47157062_G_A_b38 = 0.00000909931572319958, chr22_47157212_G_A_b38 = -0.000124084106569373,
chr22_47157394_C_G_b38 = -0.0000752774417069946, chr22_47157559_G_A_b38 = 0.0000808446315377557,
chr22_47157607_T_C_b38 = 0.000237979025556899), row.names = c(NA,
-1L), class = c("tbl_df", "tbl", "data.frame")))
list2:
list2 <- list(structure(list(name = c("HG00096", "HG00097", "HG00099",
"HG00100", "HG00101"), ENSG = c("ENSG00000040608", "ENSG00000040608",
"ENSG00000040608", "ENSG00000040608", "ENSG00000040608"), expr = c(-0.5186894,
0.6170779, -0.5786774, 0.07324268, -0.7579184), chr22_20230714_G_A_b38 = c(1L,
1L, 1L, 2L, 1L), chr22_20230737_G_A_b38 = c(0L, 0L, 0L, 0L, 0L
), chr22_20231229_T_A_b38 = c(1L, 0L, 1L, 0L, 1L), chr22_20231474_G_A_b38 = c(0L,
1L, 0L, 0L, 0L), chr22_20231667_C_G_b38 = c(1L, 1L, 1L, 2L, 1L
)), row.names = c(NA, -5L), class = c("tbl_df", "tbl", "data.frame"
)), structure(list(name = c("HG00096", "HG00097", "HG00099",
"HG00100", "HG00101"), ENSG = c("ENSG00000054611", "ENSG00000054611",
"ENSG00000054611", "ENSG00000054611", "ENSG00000054611"), expr = c(-0.5555929,
0.1600335, 0.4027508, -0.6028474, 2.271097), chr22_47157062_G_A_b38 = c(0L,
1L, 0L, 0L, 0L), chr22_47157212_G_A_b38 = c(0L, 0L, 1L, 1L, 2L
), chr22_47157394_C_G_b38 = c(0L, 1L, 1L, 1L, 2L), chr22_47157559_G_A_b38 = c(0L,
1L, 0L, 0L, 0L), chr22_47157607_T_C_b38 = c(0L, 1L, 1L, 1L, 2L
)), row.names = c(NA, -5L), class = c("tbl_df", "tbl", "data.frame"
)))
Both lists contain the same number and names of elements, as well as the same number of columns in each corresponding element. Using this assumption, I want to multiply the value of each column in list1 by the corresponding column in list2.
Desired output:
out <- list(structure(list(name = c("HG00096", "HG00097", "HG00099",
"HG00100", "HG00101"), ENSG = c("ENSG00000040608", "ENSG00000040608",
"ENSG00000040608", "ENSG00000040608", "ENSG00000040608"), expr = c(-0.5186894,
0.6170779, -0.5786774, 0.07324268, -0.7579184), chr22_20230714_G_A_b38 = c(0.0000953,
0.0000953, 0.0000953, 0.0001906, 0.0000953), chr22_20230737_G_A_b38 = c(0,
0, 0, 0, 0), chr22_20231229_T_A_b38 = c(0.000808, 0, 0.000808,
0, 0.000808), chr22_20231474_G_A_b38 = c(0, 0.000388, 0, 0, 0
), chr22_20231667_C_G_b38 = c(-0.000121, -0.000121, -0.000121,
-0.000242, -0.000121)), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(name = c("HG00096", "HG00097",
"HG00099", "HG00100", "HG00101"), ENSG = c("ENSG00000054611",
"ENSG00000054611", "ENSG00000054611", "ENSG00000054611", "ENSG00000054611"
), expr = c(-0.5555929, 0.1600335, 0.4027508, -0.6028474, 2.271097
), chr22_47157062_G_A_b38 = c(0, 0.0000091, 0, 0, 0), chr22_47157212_G_A_b38 = c(0,
0, -0.000124, -0.000124, -0.000248), chr22_47157394_C_G_b38 = c(0,
-0.0000753, -0.0000753, -0.0000753, -0.0001506), chr22_47157559_G_A_b38 = c(0,
0.0000808, 0, 0, 0), chr22_47157607_T_C_b38 = c(0, 0.000238,
0.000238, 0.000238, 0.000476)), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame")))

We could use map2 or Map in base R
library(dplyr)
library(tidyr)
outnew <- map2(list2, list1, ~ {
dat1 <- .y
.x %>% mutate(across(names(dat1), ~ .x * dat1[[cur_column()]] ))
})

Conditionally adding characters to new column based on separate dataset

Hello all and thank you in advance.
I would like to add a new column to my pre-existing data frame where the values sourced from a second data frame based on certain conditions. The dataset I wish to add the new column to ("data_melt") has many different sample IDs (sample.#) under the variable column. Using a second dataset ("metadata") I want to add the pond names to the "data_melt" new column based on the sample-ids. The sample IDs are the same in both datasets.
My gut tells me there's an obvious solution but my head is pretty fried. Here is a toy example of my data_melt df (since its 25,000 observations):
> dput(toy)
structure(list(gene = c("serA", "mdh", "fdhB", "fdhA"), process = structure(c(1L,
1L, 1L, 1L), .Label = "energy", class = "factor"), category = structure(c(1L,
1L, 1L, 1L), .Label = "metabolism", class = "factor"), ko = structure(1:4, .Label = c("K00058",
"K00093", "K00125", "K00148"), class = "factor"), variable = structure(c(1L,
2L, 3L, 3L), .Label = c("sample.10", "sample.19", "sample.72"
), class = "factor"), value = c(0.00116, 2.77e-05, 1.84e-05,
0.0125)), row.names = c(NA, -4L), class = "data.frame")
And here is a toy example of my metadata df:
> dput(toy)
structure(list(sample = c("sample.10", "sample.19", "sample.72",
"sample.13"), pond = structure(c(2L, 2L, 1L, 1L), .Label = c("lower",
"upper"), class = "factor")), row.names = c(NA, -4L), class = "data.frame")
Thank you again!

We can use match from base R to create a numeric index to replace the values
toy$pond <- with(toy, out$pond[match(variable, out$sample)])

I believe merge will work here.
sss <- structure(list(gene = c("serA", "mdh", "fdhB", "fdhA"), process = structure(c(1L,
1L, 1L, 1L), .Label = "energy", class = "factor"), category = structure(c(1L,
1L, 1L, 1L), .Label = "metabolism", class = "factor"), ko = structure(1:4, .Label = c("K00058",
"K00093", "K00125", "K00148"), class = "factor"), variable = structure(c(1L,
2L, 3L, 3L), .Label = c("sample.10", "sample.19", "sample.72"
), class = "factor"), value = c(0.00116, 2.77e-05, 1.84e-05,
0.0125)), row.names = c(NA, -4L), class = "data.frame")
ss <- structure(list(sample = c("sample.10", "sample.19", "sample.72",
"sample.13"), pond = structure(c(2L, 2L, 1L, 1L), .Label = c("lower",
"upper"), class = "factor")), row.names = c(NA, -4L), class = "data.frame")
ssss <- merge(sss, ss, by.x = "variable", by.y = "sample")

You can use left_join() from the dplyr package after renaming sample to variable in the metadata data frame.
library(tidyverse)
data_melt <- structure(list(gene = c("serA", "mdh", "fdhB", "fdhA"),
process = structure(c(1L, 1L, 1L, 1L),
.Label = "energy",
class = "factor"),
category = structure(c(1L, 1L, 1L, 1L),
.Label = "metabolism",
class = "factor"),
ko = structure(1:4,
.Label = c("K00058", "K00093", "K00125", "K00148"),
class = "factor"),
variable = structure(c(1L, 2L, 3L, 3L),
.Label = c("sample.10", "sample.19", "sample.72"),
class = "factor"),
value = c(0.00116, 2.77e-05, 1.84e-05, 0.0125)),
row.names = c(NA, -4L),
class = "data.frame")
metadata <- structure(list(sample = c("sample.10", "sample.19", "sample.72", "sample.13"),
pond = structure(c(2L, 2L, 1L, 1L),
.Label = c("lower", "upper"),
class = "factor")),
row.names = c(NA, -4L),
class = "data.frame") %>%
# Renaming the column, so we can join the two data sets together
rename(variable = sample)
data_melt <- data_melt %>%
left_join(metadata, by = "variable")

Using conditional selection to create a subset of data

I have a dataset called dietox which has missing values (NA) for the Feed variable. I need to use conditional selection to create a subset of the data for which the rows with missing values are deleted.
The code I tried was:
dietox[!is.NA[dietox$Feed, ]
... but am not sure if that is right to create a subset.
dput(head(dietox))
dietox <- structure(list(Weight = c(26.5, 27.59999, 36.5, 40.29999, 49.09998,
55.39999), Feed = c(NA, 5.200005, 17.6, 28.5, 45.200001, 56.900002 ),
Time = 1:6, Pig = c(4601L, 4601L, 4601L, 4601L, 4601L, 4601L ),
Evit = c(1L, 1L, 1L, 1L, 1L, 1L), Cu = c(1L, 1L, 1L, 1L, 1L, 1L),
Litter = c(1L, 1L, 1L, 1L, 1L, 1L)),
.Names = c("Weight", "Feed", "Time", "Pig", "Evit", "Cu", "Litter"),
row.names = c(NA, 6L), class = "data.frame")

You have the right idea, but is.na is a function and so needs to be used with parenthesis.
dietox[!is.na(dietox$Feed), ]

Combine bar plot and stat_smooth() line from different data sets in ggplot2

I'm trying to overlay a stat_smooth() line from one dataset over a bar plot of another. Both csv files draw from the same dataset, but I had to make a new one for the bar plot because I had to add a few columns (including error bars) that wouldn't make sense in the big csv. So, I have code for the bar plot, and code for the line made using stat_smooth, but can't figure out how to combine them. I just want a graph with the line on top of the bars. Here's the code for the bar plot:
`e <- read.csv("Retro Complex.csv", header=T, sep=",")
e <- subset(e, Accuracy != 0)
limits <- aes(ymax = Confidence + SE, ymin = Confidence - SE)
e$Complexity <- factor(e$Complexity)
p <- ggplot(e, aes(e$Complexity, Confidence))
p +
geom_bar(position = "dodge", stat = "identity") +
geom_errorbar(limits, position = "dodge", width = 0.25) +
coord_cartesian(ylim=c(0,1)) +
scale_y_continuous(labels = percent) +
ggtitle("Retro")`
And here's for the line
`ggplot(retroacc, aes(x=Complexity.Sample, y=risk)) +
stat_smooth(aes(x=Complexity.Sample, y=risk), data=retroacc,
method="glm", method.args=list(family="binomial"), se=FALSE) +
ylim(0,1)`
Here's what they both look like:
Stat_smooth() line:
Barplot:
Sample Data
For the bar plot:
structure(list(Complexity = structure(1:5, .Label = c("1", "2",
"3", "4", "5"), class = "factor"), Accuracy = c(1L, 1L, 1L, 1L,
1L), Risk = c(0.69297164, 0.695793434, 0.695891571, 0.746606335,
0.748717949), SE = c(0.003621776, 0.004254081, 0.00669456, 0.008114764,
0.021963804), Proportion = c(0.823475656, 0.809299751, 0.863727821,
0.94724695, 0.882352941), SEAcc = c(0.002716612, 0.003267882,
0.004639995, 0.004059001, 0.015325003)), .Names = c("Complexity",
"Accuracy", "Confidence", "SE", "Proportion", "SEAcc"), row.names = c(1L,
3L, 5L, 7L, 9L), class = "data.frame")
For the line:
structure(list(risk = c(0L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), acc = c(0L, 1L, 1L, 1L,
0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
Uniqueness = c(0.405166959, 0.407414244, 0.285123931, 0.248994487,
0.259019778, 0.334552913, 0.300580793, 0.354632526, 0.309841996,
0.331460876, 0.289981111, 0.362405881, 0.37389863, 0.253672193,
0.342903451, 0.294459829, 0.387447291, 0.519657612, 0.278964406
), Average.Similarity = c(0.406700667, 0.409547355, 0.275663862,
0.240909144, 0.251796956, 0.31827466, 0.240574971, 0.349093002,
0.34253811, 0.348084627, 0.290495997, 0.318312198, 0.404143605,
0.290789337, 0.293259599, 0.320214236, 0.382449298, 0.506295194,
0.335167223), Complexity.Sample = c(8521L, 11407L, 3963L,
2536L, 2327L, 3724L, 4005L, 5845L, 5770L, 5246L, 3629L, 3994L,
4285L, 1503L, 8222L, 3683L, 5639L, 10288L, 3076L)), .Names = c("risk",
"acc", "Uniqueness", "Average.Similarity", "Complexity.Sample"
), class = "data.frame", row.names = c(NA, -19L))
So yeah, if any of you guys know how to combine these into one plot please let me know!!

Reproducible example and dput error

I'm trying to reproduce a data frame and dput is not cooperating.
dput command :
dput(head(data, 10))
dput output :
structure(list(lexptot = c(8.28377505197124, 9.1595012302023,
8.14707583238833, 9.86330744180814, 8.21391453619232, 8.92372556833205,
7.77219149815994, 8.58202430280175, 8.34096828565733, 10.1133857229336
), year = c(0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L), dfmfdyr = c(0,
1, 0, 1, 0, 1, 0, 1, 0, 1), dfmfd98 = c(1, 1, 1, 1, 1, 1, 1,
1, 1, 1), nh = c(11054L, 11054L, 11061L, 11061L, 11081L, 11081L,
11101L, 11101L, 12021L, 12021L)), .Names = c("lexptot", "year",
"dfmfdyr", "dfmfd98", "nh"), vars = list(nh), drop = TRUE, indices = list(
0:1, 2:3, 4:5, 6:7, 8:9), group_sizes = c(2L, 2L, 2L, 2L,
2L), biggest_group_size = 2L, labels = structure(list(nh = c(11054L,
11061L, 11081L, 11101L, 12021L)), class = "data.frame", row.names = c(NA,
-5L), .Names = "nh", vars = list(nh)), row.names = c(NA, 10L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
Error :
Error in structure(list(lexptot = c(8.28377505197124, 9.1595012302023, :
object 'nh' not found
Why is this happening right from a dput command?
Edit :
Relevant posts, but suggestions did not work.
Why does this dplyr dput not work?
Edit 2 :
It appears because one of my variables is a group object, dput cannot reproduce this. The solution is to use ungroup(data) then rerun dput and all works.

The issue was one of the variable objects was a group and therefore, dput() couldn't recognize this. The solution was to ungroup() the data.
ungroup(data)
dput(head(data, 10))
New Data.frame :
structure(list(lexptot = c(8.28377505197124, 9.1595012302023,
8.14707583238833, 9.86330744180814, 8.21391453619232, 8.92372556833205,
7.77219149815994, 8.58202430280175, 8.34096828565733, 10.1133857229336
), year = c(0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L), dfmfd98 = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1), dfmfd = c(0L, 1L, 0L, 1L, 1L, 1L,
1L, 1L, 1L, 1L)), .Names = c("lexptot", "year", "dfmfd98", "dfmfd"
), class = c("tbl_df", "data.frame"), row.names = c(NA, -10L))

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

R - Create new data frame based on variable names - r

Related

Combining two lists by multiplying the corresponding values in R

Conditionally adding characters to new column based on separate dataset

Using conditional selection to create a subset of data

Combine bar plot and stat_smooth() line from different data sets in ggplot2

Reproducible example and dput error

Categories

Resources