I have a data frame called stats. I'd like to group by month_name and item and generate a random variable drawn from a normal distribution in a new column called rv.
This is the code I tried but it repeats the generation of 1 random variable in the rv column:
stats %>%
group_by(month_name, item) %>%
mutate(rv = rnorm(1, mean = mean, sd = sd))
The goal is to eventually replicate the rv output 10,000 times. How can I modify my code to generate a separate random variable for every row, first once and eventually 10,000 times per row?
This is my data:
structure(list(month_name = structure(c(1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L, 5L,
5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L,
8L, 8L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 11L, 11L,
11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L), .Label = c("January",
"February", "March", "April", "May", "June", "July", "August",
"September", "October", "November", "December"), class = c("ordered",
"factor")), item = structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L,
4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L,
5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L,
1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L,
2L, 3L, 4L, 5L), .Label = c("a", "b", "e", "d", "c"), class = "factor"),
min = c(39853.3676768762, 11963.7336771844, 28475.0829411315,
36241.9007031999, 35743.7750504859, 16092.552892924, 12528.9369640133,
28363.8516762228, 29557.1911907891, 20577.9268503088, 26012.6643332399,
43743.1386573406, 33757.0104608081, 24012.3644652027, 29987.8232384625,
26663.1671529956, 50724.1357441692, 33156.7203254077, 36604.0975795671,
32448.5497811945, 47818.2983698804, 25173.5205474241, 29648.7882981325,
39034.0329768052, 15083.5548610647, 41560.8896893507, 40563.2944650284,
48794.4526055819, 35895.1783353774, 30085.4271923688, 39675.7305889162,
33628.9557047603, 36950.5993766457, 30593.5091646214, 28957.5398056329,
37080.7178800747, 45454.3924797489, 28755.6280571895, 34733.1340290652,
37227.9361452194, 21493.809533368, 33292.9944106622, 28137.6372068055,
25582.8046285949, 23073.0637573296, 28846.9082264882, 19454.182866794,
33869.2858697563, 19016.7538627489, 30647.6876387916, 35123.8965500988,
34146.2780735908, 40593.6508043686, 26908.3734089905, 47178.2458120079,
24665.5899193578, 22701.4906439165, 50735.1331088719, 36108.7624278488,
21415.5715318508), lower = c(54524.7101912146, 26928.6804993352,
25119.8847919585, 45942.5372327181, 52100.762800828, 23399.2712234262,
14178.7907654734, 71366.6268933559, 49209.2124037853, 54643.7588467776,
48369.7944054794, 29515.3335011807, 41577.635577101, 25357.3837384686,
43253.4733925982, 43401.4748829102, 37741.3586860236, 52294.4029786582,
58136.6122795486, 43617.5523486807, 46648.1777348884, 47822.6060157009,
37122.0182632065, 65447.4620274838, 29544.1919272749, 54822.3562275875,
64814.4174753617, 65538.2587526896, 39975.4034746898, 59117.6049731313,
49024.4324422717, 25273.7368374795, 56946.7596272533, 50660.5745923196,
37221.8185672126, 30508.2772838287, 47172.6674212663, 52956.1465111511,
45488.8349086128, 52660.1832157037, 37406.8854102724, 25601.012749268,
41414.610113642, 41145.7009104373, 26879.9690641376, 69323.7347440924,
59453.3099916568, 19260.9187209561, 14090.2250971317, 41778.9038974128,
35013.9160392596, 39672.0871995261, 57517.2881078087, 52765.3573599843,
57267.2271717807, 54869.720268229, 58525.9231470629, 44610.285805162,
47317.3995094377, 17599.590085043), mean = c(58549.8098049081,
56374.4327553941, 39864.1715264267, 85333.1530921059, 64454.2358008729,
63343.4098283811, 69838.6859070403, 41935.3881398536, 40239.4399412696,
70073.2291007902, 57535.295477502, 76197.4454180647, 60836.2074195693,
64601.7379215889, 51599.3556004457, 49092.0124309883, 47319.767991988,
63121.0872241636, 43048.0322965586, 77405.4987695189, 64320.8901918307,
53059.7915920758, 63712.4934804165, 37248.933469329, 48285.12302248,
60352.1030623367, 67648.010113929, 52282.8579266665, 63868.4373429784,
71370.1455147326, 59275.2217698193, 74524.7831867724, 62464.1935824186,
50255.8945012446, 31094.1686136834, 75833.6439248775, 32190.7391406323,
77010.5148506178, 69635.0888164364, 65885.8987213858, 54022.7135642953,
35801.3865465657, 60637.9983665307, 90783.7721781328, 57264.0603250172,
59977.2976696403, 71712.656969139, 76705.4011709067, 89462.5059367925,
76714.0458753254, 56859.5782454854, 66820.0053236744, 58243.7435076688,
52843.8704599132, 77247.3384533588, 55515.7748808548, 75004.3165800858,
88370.1869726297, 68628.9281194796, 53895.0496305422), median = c(42352.1610450345,
57330.3183802072, 55273.2047201131, 82351.3852530883, 46370.4898234873,
52386.0432388715, 47943.0683307536, 53897.781347776, 67858.0064600009,
73013.024717384, 83116.7356352266, 44401.5903576421, 69025.6068023045,
81625.3403276092, 43344.4404418446, 49701.9746204065, 44889.5603216509,
86449.7649043697, 52150.9769065634, 58675.8138647348, 55665.7047792249,
44566.4888204713, 50517.7492643733, 73778.9515308994, 60652.1631558926,
87345.0069311662, 68268.9807235179, 41356.3226356087, 41585.1763113502,
75144.8373297139, 81967.7788670882, 66041.6207332688, 55103.8870449834,
77301.4195253735, 54130.4774678618, 65176.7990367632, 46834.9652749994,
65134.3889325556, 76621.5018669346, 89066.7483257445, 79344.8597627239,
50867.4889878177, 51326.3717332736, 74843.6262595514, 66235.6184875188,
98300.5112442494, 51378.9240605971, 61277.8214283028, 48915.1245226839,
52765.9194941648, 47028.8412992194, 74841.2039136489, 70896.5761749783,
67414.0877191645, 60655.1682545525, 42707.2850070942, 51244.6187187212,
70889.9732948709, 82834.1260629236, 56029.4540887989), upper = c(96808.9361470916,
72722.9262056796, 89079.513341868, 84709.1878768955, 87694.368834914,
87860.8548839792, 80996.3827453218, 84247.9259137302, 95585.6388675179,
57338.746606262, 88681.3926853573, 87957.989278465, 87360.6574510974,
92664.4254709955, 73493.0826366849, 84230.5990186054, 81442.2517006442,
87801.9592453634, 107883.319372054, 101919.939543795, 78090.4252899963,
70239.1417329303, 100675.767786787, 99806.9236049608, 71452.5071326737,
73879.3479602876, 106131.22309752, 125238.035074805, 76731.6350473027,
105563.285669622, 98604.105083167, 88657.8428176833, 81133.2031578456,
92495.2957986084, 104836.803460225, 102419.6178137, 86160.3548401189,
87287.9179449312, 72987.3973022452, 73185.0732579627, 90916.179982239,
111282.33982277, 142168.512194455, 100479.774695548, 118375.00968986,
116099.107730658, 105747.461541425, 106715.198136428, 128585.197217447,
87996.5319472346, 67831.1501517932, 109713.080164634, 78535.3157822644,
128602.704986898, 82213.8086826659, 118591.773718681, 66518.2467960131,
91250.5061727746, 117072.914540123, 114524.034290364), max = c(137612.711045413,
142519.370905613, 137456.124250483, 149209.014602568, 158745.717583772,
144886.189765236, 168837.723206789, 148308.890270968, 158590.65413993,
152288.303209753, 154042.306686713, 143922.848061827, 147477.579594905,
147438.066965268, 141502.628117831, 150285.096748915, 148713.594899874,
156656.255445038, 151517.357942321, 146177.731181398, 130056.291991729,
150991.849546995, 150476.190905448, 140149.802748207, 162573.574139209,
124218.878401843, 140313.610415297, 156852.359228369, 147676.550419975,
139922.178103581, 131822.195549853, 143008.968758112, 142237.425864494,
148756.818388612, 123905.560034301, 157126.60664862, 132868.19652461,
137884.902850549, 142164.212835827, 144616.429331364, 154277.663061656,
156870.781144851, 170948.478868233, 154970.297432983, 144661.430142095,
151193.528913062, 136056.623739965, 132695.069145067, 144366.408646971,
154456.483407293, 143518.023088591, 145811.265404348, 139900.024678788,
127547.709882734, 149995.24047052, 145400.958382574, 159524.480570906,
118905.663549293, 161631.72583606, 147524.546274058), sd = c(9989.37951375166,
9906.50689980405, 9903.6852849217, 10008.3321579478, 10075.4653993515,
10063.7122293343, 10053.0016932606, 9826.1129055558, 9855.88655389009,
10028.7176055065, 10070.3833732403, 9941.07465801432, 10094.2667749602,
9910.53181242413, 10104.5889493016, 9851.70104229335, 9972.91821342281,
10080.4485086333, 10044.5102818099, 10037.3707232711, 10025.1107006076,
10022.3659427419, 9941.51637265177, 9873.12826319285, 10027.9036424549,
10033.6518983864, 9970.47127759776, 9937.3319252128, 10013.3439414305,
10030.3125017708, 10168.5115559098, 10213.3568382367, 9990.24289183087,
9968.82189362707, 10048.7504375345, 10015.8411633632, 10037.6851291425,
9925.92765463682, 9835.81447415085, 9782.6505066721, 10033.5360418173,
9991.76186224687, 9924.86818104305, 9970.41809893224, 9980.55197551292,
9886.97032019385, 9925.73912143071, 9971.01687402101, 9858.19281102242,
9969.19466304141, 9955.12658457894, 10139.5950943687, 9967.09479735319,
10168.1650679826, 10023.9501235604, 9821.41776472295, 10064.1149573067,
10134.8532916488, 9943.57024828908, 9833.93164357077)), row.names = c(NA,
-60L), groups = structure(list(month_name = structure(1:12, .Label = c("January",
"February", "March", "April", "May", "June", "July", "August",
"September", "October", "November", "December"), class = c("ordered",
"factor")), .rows = structure(list(1:5, 6:10, 11:15, 16:20, 21:25,
26:30, 31:35, 36:40, 41:45, 46:50, 51:55, 56:60), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
you can try data.table:
library(data.table) # load data.table
setDT(stats) # convert stats to data.table
stats[, rv := rnorm(.N, mean = mean, sd = sd), by = .(month_name, item)]
In your example there's only one record of each combination month_name and item, but I can imagine your real data has more.
The := is an assignment operator. You'll assign the result of rnorm into a new column called rv.
The by = part serves for grouping, see ?data.table.
EDIT TO ADD:
If you want 10,000 random variables, then:
stats[, new_rv := .(list(rnorm(1e4, mean, sd))), by = .(month_name, item)]
You already know the := and the by = parts, so let's dive into the expression in the middle:
The .(list()) bit will assign the resulting list (vector of 10,000 random numbers, in our case) to the variable (because we are using the assignment operator :=).
The very interesting thing is that with this .(list()) "combo" you can store complex things in a variable (column) of a data.table. I use it often to store things such as forecasts, plots or linear models, etc. by group: it is very useful!
Now, if you want to operate on your new variable, please keep in mind that it is a list, so you need to subset it accordingly:
If you want to check that the standard deviation of new_rv of row 1 is close to what you expect, the following code will throw an error:
stats[1, sd(new_rv)]
> Error in var(if (is.vector(x) || is.factor(x)) x else as.double(x), na.rm = na.rm) :
> is.atomic(x) is not TRUE
It is complaining that you are attempting to calculate sd() on a list. The correct code is:
stats[1, sd(new_rv[[1]])]
[1] 9926.439
The [[1]] part correctly subsets the first element of the list.
Is there any way to write the data in a block of a table line by line in R?
I'd be grateful if someone could find me a solution.
Thank you.
This is the input data in tad.
structure(list(gene = structure(c(4L, 7L, 2L, 10L, 1L, 9L, 6L,
3L, 8L, 5L), .Label = c("ENSG00000065243", "ENSG00000084070",
"ENSG00000127423", "ENSG00000135801", "ENSG00000163909", "ENSG00000174950",
"ENSG00000183615", "ENSG00000197056", "ENSG00000203857", "ENSG00000204060"
), class = "factor"), domain = c(9L, 1L, 5L, 1L, 9L, 2L, 1L,
4L, 6L, 6L)), row.names = c(NA, 10L), class = "data.frame")
This is the code I used.
colnames(tad)<-c("gene", "domain")
domain_result= aggregate(gene~domain, tad, paste, collapse = ",")
This is the output
I need this to be line by line. For example in the first row the data is like this.
ENSG00000183615,ENSG00000204060,ENSG00000174950
But I need it to be line by line.
Okay, here is what you can do to get the output you want, as I understand it. Basically, what you want to do is separate the column gene into multiple rows (one gene per row) while maintaining the domain. There is a function in tidyr called separate_rows that can do that.
library(tidyr)
gene_domains_out <- separate_rows(gene_domains, gene,sep=",")
Sample input Data
Here is a dput of the image above
#dput(head(gene_domains))
structure(list(domain = c(1L, 3L, 4L, 5L, 6L, 7L), gene = c("ENSG00000230594,ENSG00000171155,ENSG00000224089,ENSG00000230347,ENSG00000236446,ENSG00000186471,ENSG00000101892,ENSG00000182890,ENSG00000232119,ENSG00000131721,ENSG00000101882,ENSG00000101883,ENSG00000242362,ENSG00000226685,ENSG00000125352,ENSG00000236126,ENSG00000237957,ENSG00000005893,ENSG00000125355,ENSG00000226600,ENSG00000125356,ENSG00000203989,ENSG00000226929,ENSG00000228517,ENSG00000177485,ENSG00000226023,ENSG00000236371,ENSG00000278646,ENSG00000158290",
"ENSG00000176774,ENSG00000176746,ENSG00000232030,ENSG00000188408",
"ENSG00000198205,ENSG00000215174,ENSG00000165591,ENSG00000198455,ENSG00000186787,ENSG00000204271,ENSG00000147059",
"ENSG00000029993,ENSG00000166049,ENSG00000183862,ENSG00000102181,ENSG00000013619,ENSG00000130032,ENSG00000171100,ENSG00000160131,ENSG00000063601,ENSG00000147378,ENSG00000102195",
"ENSG00000147099,ENSG00000067177,ENSG00000184388,ENSG00000225396,ENSG00000268994,ENSG00000198034,ENSG00000125931,ENSG00000269502,ENSG00000184911,ENSG00000275520",
"ENSG00000102081,ENSG00000176988")), row.names = c(NA, 6L), class = "data.frame")
Sample output
Here is a dput of the output data
#dput(head(gene_domains_out))
structure(list(domain = c(1L, 1L, 1L, 1L, 1L, 1L), gene = c("ENSG00000230594",
"ENSG00000171155", "ENSG00000224089", "ENSG00000230347", "ENSG00000236446",
"ENSG00000186471")), row.names = c(NA, 6L), class = "data.frame")
Hope that helps.
For a sample dataframe:
df <- structure(list(area = structure(c(1L, 4L, 3L, 8L, 5L, 7L, 6L,
2L), .Label = c("DE1", "DE3", "DE4", "DE5", "DE9", "DEA", "DEB",
"DEC"), class = "factor"), to.delete = c(1L, 0L, 1L, 0L, 1L,
1L, 1L, 0L)), .Names = c("area", "to.delete"), class = "data.frame", row.names = c(NA,
-8L))
I want to create a list of the areas which have a '1' in the 'to.delete' column. I know how to subset the 1s out of this dataframe; however, I want the list of areas, as eventually I will use this list to extract these areas from the main master data file (df2, listed below).
df2 <- structure(list(id = 1:24, area = structure(c(1L, 1L, 4L, 4L,
4L, 3L, 3L, 3L, 3L, 3L, 8L, 8L, 8L, 8L, 5L, 7L, 7L, 7L, 6L, 6L,
2L, 2L, 2L, 2L), .Label = c("DE1", "DE3", "DE4", "DE5", "DE9",
"DEA", "DEB", "DEC"), class = "factor")), .Names = c("id", "area"
), class = "data.frame", row.names = c(NA, -24L))
I prefer to do this in two steps, so I can easily see which areas I have deleted (thanks to answers below for suggestions of using list).
a <- list(df$area[df$to.delete == 1])
df2.subset <- df2[df2$area %in% a,]
This however doesn't seem to work at the moment, so if anyone has any ideas, then that would be great.
df2 should then be left with only areas DE5, DEC and DE3.
Many thanks.
Here is another method using split to collect the areas into two lists:
# Split the areas into two named groups by the to.delete flag.
# The flag is converted to a factor with explicit labels (0 -> "keep",
# 1 -> "drop") so the group names cannot be attached in the wrong order:
# split() returns its groups in factor-level order, i.e. 0 before 1.
keepDrop <- split(
  df$area,
  factor(df$to.delete, levels = c(0, 1), labels = c("keep", "drop"))
)
# Retain only the rows of df2 whose area is in the "keep" group
# (DE5, DEC and DE3 for the sample data); keepDrop[["drop"]] lists the
# areas that were removed.
df2.smaller <- df2[df2$area %in% keepDrop[["keep"]], ]
We can use subset. Based on the description, the OP wants to subset the rows of a main data ('maindata') based on the 'area' that corresponds to 1 in 'to.delete' column. In that case, we extract the 'area' (df$area[df$to.delete ==1]) and with %in% we subset the 'maindata'.
subset(maindata, area %in% df$area[df$to.delete==1])
It's not too clear what you are asking.
This will create a list where each element is a different Area:
lapply(df$area[df$to.delete == 1], function(x) x)
If you want a list with just one element containing all the areas:
list(df$area[df$to.delete == 1])
Edit:
To answer the second part of your question:
a <- list(df$area[df$to.delete == 1])
df2.subset <- df2[!df2$area %in% a[[1]], ]
Here's what you can try:
a <- as.list(subset(df,df$to.delete == 1))
> a
$area
[1] DE1 DE4 DE9 DEB DEA
Levels: DE1 DE3 DE4 DE5 DE9 DEA DEB DEC
$to.delete
[1] 1 1 1 1 1
My data set:
structure(list(Site = c(2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L,
4L, 4L, 4L, 4L, 5L, 5L, 6L, 6L, 6L), Average.worm.weight..g. = c(0.1934,
0.249, 0.263, 0.262, 0.4186, 0.204, 0.311, 0.481, 0.326, 0.657,
0.347, 0.311, 0.239, 0.4156, 0.31, 0.3136, 0.4033, 0.302, 0.277
), Average.total.immune.cell.count = structure(c(8L, 16L, 11L,
12L, 10L, 1L, 4L, 15L, 4L, 3L, 17L, 13L, 18L, 7L, 5L, 6L, 9L,
14L, 2L), .Label = c("0", "168750", "18650000", "200,000", "21,600,000",
"226666.6", "22683333.33", "2533333.33", "283333.333", "291666.6",
"335833.3", "435800", "474816666.7", "500000", "6450000", "729166.667",
"7433333.3", "9916667"), class = "factor"), Average.eleocyte.number = structure(c(2L,
5L, 14L, 10L, 1L, 1L, 6L, 1L, 6L, 7L, 1L, 9L, 15L, 8L, 12L, 3L,
11L, 13L, 4L), .Label = c("0", "1266666.67", "153333.3", "168740",
"17", "200,000", "2266666.667", "22683333.33", "23116666.67",
"264000", "283333.333", "442", "500000", "7.3", "9916667"), class = "factor")), .Names = c("Site",
"Average.worm.weight..g.", "Average.total.immune.cell.count",
"Average.eleocyte.number"), class = "data.frame", row.names = c(NA,
-19L))
This is my R script so far:
Plotting multiple data series on a graph
y1<-dframe1$"Average.total.immune.cell.count"
y2<-dframe1$"Average.eleocyte.number"
x<-dframe1$"Average.worm.weight..g."
plot.default(y1~x,type="p" )
points(y2~x)
I am trying to add two y series to the same scatterplot and I am struggling to do so. I want to have different symbols for the points so as to tell the two data series apart. I would also like the axes to meet at the bottom left-hand corner, and would appreciate being told how I can do that. Finally, I would like the y axis to be in standard form, but I do not know how to get R to do that.
Best regards.
K.
So this is an object lesson in getting your data in the correct format to begin with. Your numbers have commas, which R does not like. Hence the numbers get converted to character and imported as factors (which your structure(...) clearly shows). You need to fix that, or better yet get rid of the commas prior to exporting.
Something like this will work
colnames(dframe) <- c("Site","x","y1","y2")
dframe$y1 <- as.numeric(as.character(gsub(",","",dframe$y1,fixed=TRUE)))
dframe$y2 <- as.numeric(as.character(gsub(",","",dframe$y2,fixed=TRUE)))
plot(y1~x,dframe, col="red", pch=20)
points(y2~x,dframe, col="blue", pch=20)
But there are additional problems. One of the numbers (in row 12) is a factor of 10 larger than all the others, so the plot above is not very informative. It's hard to know if this is a data input error, or a genuine outlier in your data.
EDIT: Response to OP's comment
dframe <- dframe[-12,] # remove row 12
dframe <- dframe[order(dframe$x),] # order by increasing x
plot(y1~x,dframe, col="red", pch=20, type="b")
points(y2~x,dframe, col="blue", pch=20, type="b")
legend("topleft",legend=c("y1","y2"),col=c("red","blue"),pch=20)