Add relative frequency by factor to a data frame

Add relative frequency by factor to a data frame - r

I want to add a column to a data frame that looks like this with the relative frequency by factor (Var2)
X = structure(list(Var1 = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L
), .Label = c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
"10"), class = "factor"), Var2 = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("No Treatment", "Any Treatment"), class = "factor"),
Freq = c(1L, 3L, 6L, 13L, 30L, 53L, 69L, 123L, 198L, 270L,
1324L, 1L, 0L, 4L, 10L, 16L, 33L, 44L, 75L, 113L, 159L, 630L
)), .Names = c("Var1", "Var2", "Freq"), row.names = c(NA,
-22L), class = "data.frame")
The solution that I have in mind is very complicated, and not very flexible. This is what I'm doing right now:
library(data.table)
DT =data.table(X)
# Divide `freq` by one of two precomputed totals.
#   freq:   numeric vector of counts
#   group:  vector of group labels; ONLY its first element is inspected
#   total1: denominator used when the first label is "No Treatment"
#   total2: denominator used otherwise
# Returns `freq` divided by the selected total.
# NOTE: the condition is scalar, so when this is called on a whole column
# (no `by=`), every element of `freq` is divided by the same total.
myfun <- function(freq, group, total1, total2) {
  denominator <- if (group[[1]] == "No Treatment") total1 else total2
  freq / denominator
}
DT[,relfreq:=myfun(Freq,Var2,sum(DT$Freq[DT$Var2=="No Treatment"]), sum(DT$Freq[DT$Var2=="Any Treatment"]))]
Can somebody show me a better solution that is more flexible and allows Var2 to take more than 2 values?
Thanks!

Here is a data table solution, since you started out that way.
DT[,relfreq:=Freq/sum(Freq),by=Var2]
This will be faster if your dataset is extremely large, mostly because data table adds the new column by reference, rather than copying the whole dataset.

You can get a vector of the sum by factor with ave and divide X$Freq by this vector:
X$relfreq <- X$Freq / ave(X$Freq, X$Var2, FUN=sum)
Or even:
X$relfreq <- ave(X$Freq, X$Var2, FUN=function(x) x/sum(x))
Note that your function is incorrect, and divides each Freq by 2090 in your example, rather than dividing by the sum of the Freq of each factor level.

Related

perform acf plot for each type of group in R

Here is a small part of my data:
transport<- structure(list(date = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
11L, 12L), .Label = c("01.01.2001", "01.02.2001", "01.03.2001",
"01.04.2001", "01.05.2001", "01.06.2001", "01.07.2001", "01.08.2001",
"01.09.2001", "01.10.2001", "01.11.2001", "01.12.2001"), class = "factor"),
Market_82 = c(7000L, 7272L, 7668L, 7869L, 8057L, 8428L, 8587L,
8823L, 8922L, 9178L, 9306L, 9439L, 3725L, 4883L, 8186L, 7525L,
6335L, 4252L, 5642L, 1326L, 8605L, 3501L, 1944L, 7332L),
transport = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("plane", "train"), class = "factor")), .Names = c("date",
"Market_82", "transport"), class = "data.frame", row.names = c(NA,
-24L))
The grouping variable is transport.
For each type of transport I need an ACF plot of the time series,
something like this
How can I produce an ACF plot for each transport type?
I have a lot of groups. How can I save the plots to the folder
C:/Users/admin/Documents/myplot

akrun's answer is spot on. Since you tagged the question with ggplot2 you could also use ggAcf from the forecast package.
The first step is to split your data.
transport_split <- split(transport, transport$transport)
If you want to include the respective element of column transport in the title, subtitle etc. try with Map
# Build one ACF plot per transport type, titled with the group name.
# Map() walks the split data frames and their names in parallel and
# returns a list named after the groups.
out <- Map(
  f = function(x, y) {
    # labs() is a ggplot2 function; this snippet never attaches ggplot2,
    # so it is namespace-qualified just like ggAcf().
    forecast::ggAcf(x$Market_82) + ggplot2::labs(title = y)
  },
  x = transport_split,
  y = names(transport_split)
)
out$train

We can do this with Acf from forecast
library(forecast)
par(mfrow = c(2, 1))
lapply(split(transport['Market_82'], transport$transport), Acf)
If we also want the title, then
lst <- lapply(split(transport['Market_82'], transport$transport), acf, plot = FALSE)
par(mfrow = c(2, 1))
lapply(names(lst), function(x) plot(lst[[x]], main = x))

dplyr - subtract based on condition from two different data frames

I have a data frame which looks like the following:
quant <- structure(list(Name = structure(c(158L, 159L, 160L, 161L, 162L,
163L, 164L, 165L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 98L,
99L, 100L, 101L), .Label = c("abc_02_NEHC_025_100_A", "abc_02_NEHC_025_100_B",
"abc_02_NEHC_025_100_C", "abc_02_NEHC_025_100_D", "abc_02_NEHC_025_100_E",
"abc_02_NEHC_025_100_F", "abc_02_NEHC_025_100_G", "abc_02_NEHC_025_100_H",
"abc_02_NEHC_05_100_A", "abc_02_NEHC_05_100_B", "abc_02_NEHC_05_100_C",
"abc_02_NEHC_05_100_D", "abc_02_NEHC_05_100_E", "abc_02_NEHC_05_100_F",
"abc_02_NEHC_05_100_G", "abc_02_NEHC_05_100_H", "abc_02_NEHC_100_1_A",
"abc_02_NEHC_100_1_B", "abc_02_NEHC_100_1_C", "abc_02_NEHC_100_1_D",
"abc_02_NEHC_100_1_E", "abc_02_NEHC_100_1_F", "abc_02_NEHC_100_1_G",
"abc_02_NEHC_100_1_H", "abc_02_VL_025_100_A", "abc_02_VL_025_100_B",
"abc_02_VL_025_100_C", "abc_02_VL_025_100_D", "abc_02_VL_025_100_E",
"abc_02_VL_025_100_F", "abc_02_VL_025_100_G", "abc_02_VL_025_100_H",
"abc_02_VL_05_100_A", "abc_02_VL_05_100_B", "abc_02_VL_05_100_C",
"abc_02_VL_05_100_D", "abc_02_VL_05_100_E", "abc_02_VL_05_100_F",
"abc_02_VL_05_100_G", "abc_02_VL_05_100_H", "abc_02_VL_1_100_A",
"abc_02_VL_1_100_B", "abc_02_VL_1_100_C", "abc_02_VL_1_100_D",
"abc_02_VL_1_100_E", "abc_02_VL_1_100_F", "abc_02_VL_1_100_G",
"abc_02_VL_1_100_H", "BACKGROUND_NEHC_0125_100_A", "BACKGROUND_NEHC_0125_100_B",
"BACKGROUND_NEHC_0125_100_C", "BACKGROUND_NEHC_0125_100_D", "BACKGROUND_NEHC_0125_100_E",
"BACKGROUND_NEHC_0125_100_F", "BACKGROUND_NEHC_0125_100_G", "BACKGROUND_NEHC_025_100_A",
"BACKGROUND_NEHC_025_100_B", "BACKGROUND_NEHC_025_100_C", "BACKGROUND_NEHC_025_100_D",
"BACKGROUND_NEHC_025_100_F", "BACKGROUND_NEHC_025_100_G", "BACKGROUND_NEHC_05_100_A",
"BACKGROUND_NEHC_05_100_B", "BACKGROUND_NEHC_05_100_C", "BACKGROUND_NEHC_05_100_D",
"BACKGROUND_NEHC_05_100_F", "BACKGROUND_NEHC_05_100_G", "BACKGROUND_NEHC_05_100_H",
"BACKGROUND_NEHC_1_100_A", "BACKGROUND_NEHC_1_100_B", "BACKGROUND_NEHC_1_100_C",
"BACKGROUND_NEHC_1_100_D", "BACKGROUND_NEHC_1_100_E", "BACKGROUND_NEHC_1_100_F",
"BACKGROUND_NEHC_1_100_G", "BACKGROUND_VL_0125_100_A", "BACKGROUND_VL_0125_100_B",
"BACKGROUND_VL_0125_100_C", "BACKGROUND_VL_0125_100_D", "BACKGROUND_VL_0125_100_E",
"BACKGROUND_VL_0125_100_F", "BACKGROUND_VL_025_100_A", "BACKGROUND_VL_025_100_B",
"BACKGROUND_VL_025_100_C", "BACKGROUND_VL_025_100_D", "BACKGROUND_VL_025_100_E",
"BACKGROUND_VL_025_100_F", "BACKGROUND_VL_025_100_G", "BACKGROUND_VL_025_100_H",
"BACKGROUND_VL_05_100_A", "BACKGROUND_VL_05_100_B", "BACKGROUND_VL_05_100_C",
"BACKGROUND_VL_05_100_D", "BACKGROUND_VL_05_100_E", "BACKGROUND_VL_05_100_F",
"BACKGROUND_VL_05_100_G", "BACKGROUND_VL_05_100_H", "BACKGROUND_VL_1_100_A",
"BACKGROUND_VL_1_100_B", "BACKGROUND_VL_1_100_C", "BACKGROUND_VL_1_100_D",
"BACKGROUND_VL_1_100_E", "BACKGROUND_VL_1_100_F", "BACKGROUND_VL_1_100_G",
"BACKGROUND_VL_1_100_H", "Epq_11_NEHC_0125_100_a", "Epq_11_NEHC_0125_100_B",
"Epq_11_NEHC_0125_100_C", "Epq_11_NEHC_0125_100_D", "Epq_11_NEHC_0125_100_E",
"Epq_11_NEHC_0125_100_F", "Epq_11_NEHC_0125_100_G", "Epq_11_NEHC_025_100_a",
"Epq_11_NEHC_025_100_B", "Epq_11_NEHC_025_100_C", "Epq_11_NEHC_025_100_D",
"Epq_11_NEHC_025_100_E", "Epq_11_NEHC_05_100_a", "Epq_11_NEHC_05_100_B",
"Epq_11_NEHC_05_100_C", "Epq_11_NEHC_05_100_D", "Epq_11_NEHC_05_100_E",
"Epq_11_NEHC_05_100_F", "Epq_11_NEHC_05_100_G", "Epq_11_NEHC_05_100_H",
"Epq_11_NEHC_1_100_a", "Epq_11_NEHC_1_100_B", "Epq_11_NEHC_1_100_C",
"Epq_11_NEHC_1_100_D", "Epq_11_NEHC_1_100_E", "Epq_11_NEHC_1_100_F",
"Epq_11_NEHC_1_100_G", "Epq_11_NEHC_1_100_H", "Epq_11_VL_0125_100_A",
"Epq_11_VL_0125_100_B", "Epq_11_VL_0125_100_C", "Epq_11_VL_0125_100_D",
"Epq_11_VL_0125_100_E", "Epq_11_VL_0125_100_F", "Epq_11_VL_0125_100_G",
"Epq_11_VL_0125_100_H", "Epq_11_VL_025_100_A", "Epq_11_VL_025_100_B",
"Epq_11_VL_025_100_C", "Epq_11_VL_025_100_D", "Epq_11_VL_025_100_E",
"Epq_11_VL_025_100_F", "Epq_11_VL_025_100_G", "Epq_11_VL_025_100_H",
"Epq_11_VL_05_100_A", "Epq_11_VL_05_100_B", "Epq_11_VL_05_100_C",
"Epq_11_VL_05_100_D", "Epq_11_VL_05_100_E", "Epq_11_VL_05_100_F",
"Epq_11_VL_05_100_G", "Epq_11_VL_05_100_H", "Epq_11_VL_1_100_A",
"Epq_11_VL_1_100_B", "Epq_11_VL_1_100_C", "Epq_11_VL_1_100_D",
"Epq_11_VL_1_100_E", "Epq_11_VL_1_100_F", "Epq_11_VL_1_100_G",
"Epq_11_VL_1_100_H"), class = "factor"), conc_factor = structure(c(4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L), .Label = c("pep_0.125", "pep_0.25", "pep_0.5", "pep_1.0"
), class = "factor"), peptide_factor = structure(c(3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L), .Label = c("ABC", "Background", "EpQ_11"), class = "factor"),
serum_factor = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NEHC",
"VL"), class = "factor"), mean_fluorescence = c(65535, 65535,
65534.93359, 65535, 65535, 65535, 65535, 65535, 21322.06055,
22704.08594, 22546.32617, 21801.30664, 21668.2168, 22054.40234,
21621.54688, 21516.33984, 17760.80273, 17886.12891, 18382.7832,
17531.80273)), class = "data.frame", row.names = c(NA, -20L
), .Names = c("Name", "conc_factor", "peptide_factor", "serum_factor",
"mean_fluorescence"))
This is actually just a slice (1:20) of my complete data frame. Just to have a better idea of my complete data frame, I am pasting below the levels of the variables conc_factor, peptide_factor and serum_factor:
levels(quant$conc_factor)
[1] "pep_0.125" "pep_0.25" "pep_0.5" "pep_1.0"
levels(quant$peptide_factor)
[1] "ABC" "Background" "EpQ_11"
levels(quant$serum_factor)
[1] "NEHC" "VL"
With the following command:
summary_backgrounds <- quant %>% filter(peptide_factor=="Background") %>% group_by(conc_factor, serum_factor) %>% summarise(avg_fluorescence_grouped = mean(mean_fluorescence))
conc_factor serum_factor avg_fluorescence_grouped
<fctr> <fctr> <dbl>
1 pep_0.125 NEHC 18439.70
2 pep_0.125 VL 16985.60
3 pep_0.25 NEHC 18666.52
4 pep_0.25 VL 17577.98
5 pep_0.5 NEHC 18300.47
6 pep_0.5 VL 18010.99
7 pep_1.0 NEHC 16103.50
8 pep_1.0 VL 17710.50
I obtained the mean_fluorescence values of the Background, for each conc_factor and serum_factor. What I am trying to do now is the following: I want to add a new variable to the data frame quant (named avg_fluorescence_minus_background) in which I will subtract the background values (summary_backgrounds$avg_fluorescence_grouped, matched on conc_factor and serum_factor) from each of the individual values in quant$mean_fluorescence.
For example, for quant[1, ], given that I have conc_factor=="pep_1.0" and serum_factor=="VL", my result would be 65535.00 - 17710.50 = 47824.5, and so on.

Read up on joins and you'll find they make this type of problem very easy to solve:
# Attach the background mean for each (conc_factor, serum_factor) pair to
# every row of quant.
quant <- left_join(quant, summary_backgrounds, by = c("conc_factor", "serum_factor"))
# Subtract the matched background. mutate() returns a new data frame, so the
# result must be assigned back for the column to persist. The column name
# follows the one requested in the question.
quant <- mutate(quant, avg_fluorescence_minus_background = mean_fluorescence - avg_fluorescence_grouped)

ggplot2 loop graph with conditional subsets

Data description:
I have a data set that is in long format with multiple different grouping variables (in data example: StandID and simID)
What I am trying to do:
I need to create simple scatter plots (x=predicted, y=observed) from this dataset for multiple columns based on a unique grouping variable.
An example of what I am trying to do using just standard plot is
obs=subset(example,simID=="OBS_OBS_OBS")
csfnw=example[example$simID== "CS_F_NW",]
plot(obs$X1HR,csfnw$X1HR)
I would need to do this for all simID and columns 9-14. (12 graphs total from data example)
What I have tried:
The problem I am running into is the y axis needs to remain the same, while cycling through the different subsets for the x axis.
I will admit up front, I have no idea what would be the best approach for this... I thought this would be easy for a split second because the data is already in long format and I would just be pointing to a subset of the data.
1) My original approach was to try and just splice up the data so that each simID had its own data frame, and compare it against the observation dataframe but I don't know how I would then pass it to ggplot.
2) My second idea was to make some kind of makeGraph function containing all the aesthetics I wanted essentially and use some kind of apply on it to pass everything through the function, but I could get neither to work.
makePlot=function(dat,x,y) {
ggplot(data=dat,aes(x=x,y=y))+geom_point(shape=Treat)+theme_bw()
}
What I could get to work was just breaking down the dataframe into the vectors of the variables I would then pass to some kind of loop/apply
sims=levels(example$simID)
sims2=sims[sims != "OBS_OBS_OBS"]
fuel_classes=colnames(example)[9:14]
Thank you
Data example:
example=structure(list(Year = structure(c(7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L), .Label = c("2001", "2002", "2003", "2004", "2005",
"2013", "2014", "2015"), class = "factor"), StandID = structure(c(10L,
2L, 6L, 22L, 14L, 18L, 34L, 26L, 30L, 10L, 2L, 6L, 22L, 14L,
18L, 34L, 26L, 30L, 10L, 2L, 6L, 22L, 14L, 18L, 34L, 26L, 30L
), .Label = c("1NB", "1NC", "1NT", "1NTB", "1RB", "1RC", "1RT",
"1RTB", "1SB", "1SC", "1ST", "1STB", "2NB", "2NC", "2NT", "2NTB",
"2RB", "2RC", "2RT", "2RTB", "2SB", "2SC", "2ST", "2STB", "3NB",
"3NC", "3NT", "3NTB", "3RB", "3RC", "3RT", "3RTB", "3SB", "3SC",
"3ST", "3STB"), class = "factor"), Block = structure(c(1L, 1L,
1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L,
1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("1", "2", "3"
), class = "factor"), Aspect = structure(c(3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L), .Label = c("N", "R", "S"), class = "factor"),
Treat = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("B", "C", "T", "TB"), class = "factor"),
Variant = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("CS", "OBS", "SN"), class = "factor"),
Fuels = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("F", "NF", "OBS"), class = "factor"),
Weather = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("NW", "OBS", "W"), class = "factor"),
X1HR = c(0.321666667, 0.177777778, 0.216111111, 0.280555556,
0.255555556, 0.251666667, 0.296666667, 0.231111111, 0.22,
0.27556628, 0.298042506, 0.440185249, 0.36150676, 0.398630172,
0.367523015, 0.345717251, 0.349305987, 0.412227929, 0.242860824,
0.258737177, 0.394024998, 0.287317872, 0.321927488, 0.281322986,
0.313588411, 0.303123146, 0.383658946), X10HR = c(0.440555556,
0.32, 0.266666667, 0.292222222, 0.496666667, 0.334444444,
0.564444444, 0.424444444, 0.432777778, 0.775042951, 0.832148314,
1.08174026, 1.023838878, 0.976997674, 0.844206274, 0.929837704,
1.0527215, 1.089246511, 0.88642776, 0.920596302, 1.209707737,
1.083737493, 1.077612877, 0.92481339, 1.041637182, 1.149550319,
1.229776621), X100HR = c(0.953888889, 1.379444444, 0.881666667,
1.640555556, 2.321666667, 1.122222222, 1.907777778, 1.633888889,
1.208333333, 1.832724094, 2.149356842, 2.364475727, 2.493232965,
2.262988567, 1.903909683, 2.135747433, 2.256677628, 2.288722038,
1.997704744, 2.087135553, 2.524872541, 2.34671092, 2.338253498,
2.06796217, 2.176314831, 2.580271006, 2.857197046), X1000HR = c(4.766666667,
8.342222222, 3.803333333, 8.057777778, 10.11444444, 6.931111111,
6.980555556, 13.20611111, 1.853333333, 3.389177084, 4.915714741,
2.795267582, 2.48227787, 2.218413353, 1.64684248, 2.716156483,
2.913746119, 2.238629341, 3.449863434, 3.432626724, 3.617531776,
3.641639471, 3.453454971, 3.176793337, 3.459602833, 3.871166945,
2.683447838), LITTER = c(2.4, 2.219444444, 2.772222222, 2.596666667,
2.693888889, 2.226111111, 2.552222222, 3.109444444, 2.963333333,
2.882233381, 3.025934696, 3.174396992, 3.291081667, 2.897673607,
2.737119675, 2.987895727, 3.679605484, 2.769756079, 2.882241249,
3.02594161, 3.174404144, 3.291091681, 2.897681713, 2.737129688,
2.987901449, 3.679611444, 2.769766569), DUFF = c(1.483333333,
1.723888889, 0.901666667, 1.520555556, 1.49, 1.366111111,
0.551666667, 1.056111111, 0.786111111, 2.034614563, 2.349547148,
1.685223818, 2.301301956, 2.609308243, 2.21895647, 2.043699026,
2.142618418, 0.953421116, 4.968493462, 4.990526676, 5.012362003,
5.023665905, 4.974074364, 4.947199821, 4.976779461, 5.082509995,
3.55211544), simID = structure(c(5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L), .Label = c("CS_F_NW", "CS_F_W",
"CS_NF_NW", "CS_NF_W", "OBS_OBS_OBS", "SN_F_NW", "SN_F_W",
"SN_NF_NW", "SN_NF_W"), class = "factor")), .Names = c("Year",
"StandID", "Block", "Aspect", "Treat", "Variant", "Fuels", "Weather",
"X1HR", "X10HR", "X100HR", "X1000HR", "LITTER", "DUFF", "simID"
), row.names = c(37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L,
82L, 83L, 84L, 85L, 86L, 87L, 88L, 89L, 90L, 127L, 128L, 129L,
130L, 131L, 132L, 133L, 134L, 135L), class = "data.frame")

You were actually on the right track. If all plots are the same, just make one function and then use loops to loop over the subsets. For your example this can be done like this:
library(ggplot2)
# Scatter-plot helper shared by every group/variable combination.
#   dat:   data frame with columns named x and y
#   title: plot title
# Returns the assembled ggplot object.
plotFun <- function(dat, title) {
  canvas <- ggplot(data = dat)
  point_layer <- geom_point(aes(x = x, y = y), shape = 18)
  canvas + point_layer + ggtitle(title) + theme_bw()
}
# columns of interest
colIdx <- 9:14
# split on all values of simID
dfList <- split(example, example$simID)
# simID has factor levels that never appear in the data; drop those empty
# groups (vapply keeps the row counts a plain integer vector)
dfList <- dfList[vapply(dfList, nrow, integer(1)) != 0]
# empty list-array for saving plots, indexed [x group, y group, variable]
plotList <- array(list(), dim = c(length(dfList), length(dfList), length(colIdx)),
                  dimnames = list(names(dfList), names(dfList), names(example)[colIdx]))
# the first two loops loop over all unique combinations of dfList;
# seq_len() yields empty sequences instead of counting backwards, so nothing
# runs (rather than erroring) when fewer than two groups remain
for (i in seq_len(length(dfList))[-1]) {
  for (j in seq_len(i - 1)) {
    # loop over target variables
    for (k in seq_along(colIdx)) {
      # store variables to plot in a temporary dataframe
      # NOTE(review): rows of the two groups are paired by position, so this
      # assumes both groups list the same stands in the same order - confirm
      tempDf <- data.frame(x = dfList[[i]][, colIdx[k]],
                           y = dfList[[j]][, colIdx[k]])
      # add a title so we can see in the plot what is plotted vs what
      title <- paste0(names(dfList)[i], ":", names(dfList[[i]])[colIdx[k]], " VS ",
                      names(dfList)[j], ":", names(dfList[[j]])[colIdx[k]])
      # make and save plot
      plotList[[i, j, k]] <- plotFun(tempDf, title)
    }
  }
}
# call the plots like this
plotList[[2, 1, 4]]
# Note that we only filled the lower triangle of combinations
# therefore indexing with [[1, 1, 1]] just returns NULL
plotList[, , 1]
This process can probably be more optimized, but when creating graphs I would go for clarity above speed since speed usually isn't an issue.

Reshape a large matrix with missing values and multiple vars of interest [duplicate]

This question already has answers here:
Convert data from long format to wide format with multiple measure columns
(6 answers)
Closed 4 years ago.
I need to reorganize a large dataset into a specific format for further analysis. Right now the data are in long format, with multiple records through time for each point. I need to reshape the data so that each point has a single record, but it will add many new columns of the time-specific data. I’ve looked at previous similar posts but I need to ultimately convert several of the current variables into columns, and I can’t find an example of such. Is there a way to accomplish this in a single reshape, or will I have to do several and then concatenate the new columns back together? Another wrinkle before I post the example is that not all points were sampled at each time-step, so I need those values to show up as NA. For example, (see data below) SitePoint A1 was not sampled at all in 2012, SitePoint A10 was not sampled during the first round in 2012, but K83 was sampled all nine times.
mydatain <- structure(list(SitePoint = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 6L, 6L), .Label = c("A1", "A10", "K145", "K83", "T15",
"T213"), class = "factor"), Year_Rotation = structure(c(1L, 2L,
3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 1L, 2L, 4L, 5L,
6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 1L, 7L), .Label = c("2010_1", "2010_2",
"2010_3", "2011_1", "2011_2", "2011_3", "2012_1", "2012_2", "2012_3"
), class = "factor"), MR_Fire = structure(c(5L, 6L, 6L, 2L, 9L,
9L, 5L, 6L, 6L, 2L, 9L, 9L, 7L, 8L, 16L, 17L, 21L, 22L, 23L,
25L, 3L, 4L, 10L, 11L, 12L, 13L, 14L, 15L, 18L, 19L, 20L, 1L,
2L, 2L, 5L, 6L, 6L, 11L, 11L, 12L, 7L, 24L), .Label = c("0",
"1", "10", "11", "12", "13", "14", "15", "2", "23", "24", "25",
"35", "36", "37", "39", "40", "47", "48", "49", "51", "52", "53",
"8", "9"), class = "factor"), fire_seas = structure(c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 3L), .Label = c("dry", "fire", "wet"
), class = "factor"), OptTSF = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L,
0L, 1L, 1L)), .Names = c("SitePoint", "Year_Rotation", "MR_Fire",
"fire_seas", "OptTSF"), row.names = c(31L, 32L, 33L, 34L, 35L,
36L, 67L, 68L, 69L, 70L, 71L, 72L, 73L, 74L, 10543L, 10544L,
10545L, 10546L, 10547L, 10548L, 10549L, 10550L, 14988L, 14989L,
14990L, 14991L, 14992L, 14993L, 14994L, 14995L, 14996L, 17370L,
17371L, 17372L, 17373L, 17374L, 17375L, 17376L, 17377L, 17378L,
19353L, 19354L), class = "data.frame")
Ultimately I need something like this:
myfinal <- structure(list(SitePoint = structure(1:6, .Label = c("A1", "A10",
"K145", "K83", "T15", "T213"), class = "factor"), MR_Fire_2010_1 = c(12L,
12L, 39L, 23L, 0L, 14L), MR_Fire_2010_2 = c(13L, 13L, 40L, 24L,
1L, NA), MR_Fire_2010_3 = c(13L, 13L, NA, 25L, 1L, NA), MR_Fire_2011_1 = c(1L,
1L, 51L, 35L, 12L, NA), MR_Fire_2011_2 = c(2L, 2L, 52L, 36L,
13L, NA), MR_Fire_2011_3 = c(2L, 2L, 53L, 37L, 13L, NA), MR_Fire_2012_1 = c(NA,
NA, 9L, 47L, 24L, 8L), MR_Fire_2012_2 = c(NA, 14L, 10L, 48L,
24L, NA), MR_Fire_2012_3 = c(NA, 15L, 11L, 49L, 25L, NA), season_2010_1 = structure(c(2L,
2L, 1L, 2L, 2L, 1L), .Label = c("dry", "fire"), class = "factor"),
season_2010_2 = structure(c(2L, 2L, 1L, 2L, 2L, NA), .Label = c("dry",
"fire"), class = "factor"), season_2010_3 = structure(c(1L,
1L, NA, 1L, 1L, NA), .Label = "fire", class = "factor"),
season_2011_1 = structure(c(2L, 2L, 1L, 2L, 2L, NA), .Label = c("dry",
"fire"), class = "factor"), season_2011_2 = structure(c(2L,
2L, 1L, 2L, 2L, NA), .Label = c("dry", "fire"), class = "factor"),
season_2011_3 = structure(c(2L, 2L, 1L, 2L, 2L, NA), .Label = c("dry",
"fire"), class = "factor"), season_2012_1 = structure(c(NA,
NA, 2L, 1L, 1L, 2L), .Label = c("fire", "wet"), class = "factor"),
season_2012_2 = structure(c(NA, 1L, 2L, 1L, 1L, NA), .Label = c("fire",
"wet"), class = "factor"), season_2012_3 = structure(c(NA,
1L, 2L, 1L, 1L, NA), .Label = c("fire", "wet"), class = "factor"),
OptTSF_2010_1 = c(1L, 1L, 0L, 1L, 1L, 1L), OptTSF_2010_2 = c(1L,
1L, 0L, 1L, 1L, NA), OptTSF_2010_3 = c(1L, 1L, NA, 1L, 1L,
NA), OptTSF_2011_1 = c(1L, 1L, 0L, 0L, 1L, NA), OptTSF_2011_2 = c(1L,
1L, 0L, 0L, 1L, NA), OptTSF_2011_3 = c(1L, 1L, 0L, 0L, 1L,
NA), OptTSF_2012_1 = c(NA, NA, 1L, 0L, 0L, 1L), OptTSF_2012_2 = c(NA,
1L, 1L, 0L, 0L, NA), OptTSF_2012_3 = c(NA, 1L, 1L, 0L, 0L,
NA)), .Names = c("SitePoint", "MR_Fire_2010_1", "MR_Fire_2010_2",
"MR_Fire_2010_3", "MR_Fire_2011_1", "MR_Fire_2011_2", "MR_Fire_2011_3",
"MR_Fire_2012_1", "MR_Fire_2012_2", "MR_Fire_2012_3", "season_2010_1",
"season_2010_2", "season_2010_3", "season_2011_1", "season_2011_2",
"season_2011_3", "season_2012_1", "season_2012_2", "season_2012_3",
"OptTSF_2010_1", "OptTSF_2010_2", "OptTSF_2010_3", "OptTSF_2011_1",
"OptTSF_2011_2", "OptTSF_2011_3", "OptTSF_2012_1", "OptTSF_2012_2",
"OptTSF_2012_3"), class = "data.frame", row.names = c(NA, -6L
))
The actual dataset is about 23656 records X 15 variables, so doing it by hand is likely to cause major headaches and potential for mistakes. Any help or suggestions are appreciated. If this has been answered elsewhere, apologies. I couldn’t find anything directly applicable; everything seemed to relate to three columns, with only one of those being extracted as new variables. Thanks.
SP

dcast from the devel version of data.table i.e., v1.9.5 can cast multiple columns simultaneously. It can be installed from here.
library(data.table) ## v1.9.5+
dcast(setDT(mydatain), SitePoint~Year_Rotation,
value.var=c('MR_Fire', 'fire_seas', 'OptTSF'))

You can use reshape to change the structure of your dataframe from long to wide using the following code:
reshape(mydatain,timevar="Year_Rotation",idvar="SitePoint",direction="wide")

R shiny app with rCharts

I'm able to create this graph with rCharts:
library(rCharts)
X <- structure(list(Var1 = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), .Label = c("1", "2", "3", "4",
"5", "6", "7", "8", "9", "10"), class = "factor"), Var2 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("control",
"treatment1", "treatment2"), class = "factor"), Freq = c(0L,
0L, 3L, 2L, 6L, 9L, 13L, 36L, 50L, 497L, 0L, 2L, 1L, 3L, 6L,
4L, 11L, 29L, 50L, 499L, 1L, 2L, 0L, 2L, 5L, 6L, 12L, 22L, 63L,
490L)), .Names = c("Var1", "Var2", "Freq"), row.names = c(NA,
-30L), class = "data.frame")
n1<-nPlot(Freq ~ Var1, group = 'Var2', data = X, type = 'multiBarChart')
print(n1)
Now I'm trying to embed it in a Shiny app. I can do a shiny app with ggplot2, but I'm not sure how to print the rCharts graph.
This is the shiny code that I have right now:
#server.R
library(rCharts)
X <- structure(list(Var1 = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), .Label = c("1", "2", "3", "4",
"5", "6", "7", "8", "9", "10"), class = "factor"), Var2 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("control",
"treatment1", "treatment2"), class = "factor"), Freq = c(0L,
0L, 3L, 2L, 6L, 9L, 13L, 36L, 50L, 497L, 0L, 2L, 1L, 3L, 6L,
4L, 11L, 29L, 50L, 499L, 1L, 2L, 0L, 2L, 5L, 6L, 12L, 22L, 63L,
490L)), .Names = c("Var1", "Var2", "Freq"), row.names = c(NA,
-30L), class = "data.frame")
shinyServer(
function(input, output) {
output$histogram <- renderPlot({
# You can access the value of the widget with input$select, e.g.
output$value <- renderPrint({ input$select })
n2 <- nPlot(Freq ~ Var1, group = 'Var2', data = X, type = 'multiBarChart')
n2$set(dom = "histogram")
return(n2)
})
}
)
#ui.R
shinyUI(fluidPage(
titlePanel("Quiz 3 grades distribution"),
sidebarLayout(
sidebarPanel(
helpText("Quiz 3 grade distribution by treatment group"),
selectInput("select", label = h3("Select box"),
choices = list("All" = 0, "Not Perfect" = 1, "Perfect" = 2),
selected = 0)
),
mainPanel(plotOutput("histogram"))
)
))
What am I doing wrong? Thanks!

Use renderChart2 and showOutput to display nvd3 plots in shiny. Using renderChart2 doesn't require calling $set(dom = ...).
library(rCharts)
library(shiny)
X <- data.frame(Var1 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L,8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L,3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L),
Var2 = structure(c(1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("control","treatment1", "treatment2"), class = "factor"),
Freq = c(0L,0L, 3L, 2L, 6L, 9L, 13L, 36L, 50L, 497L, 0L, 2L, 1L, 3L, 6L, 4L, 11L, 29L, 50L, 499L, 1L, 2L, 0L, 2L, 5L, 6L, 12L, 22L, 63L,490L)
)
runApp(
list(ui = fluidPage(
titlePanel("Quiz 3 grades distribution"),
sidebarLayout(
sidebarPanel(
helpText("Quiz 3 grade distribution by treatment group"),
selectInput("select", label = h3("Select box"),
choices = list("All" = 0, "Not Perfect" = 1, "Perfect" = 2),
selected = 0)
),
mainPanel(
showOutput("histogram","Nvd3")
)
)
),
server = shinyServer(
function(input, output, session) {
output$histogram <- renderChart2({
n2 <- nPlot(Freq ~ Var1, group = 'Var2', data = X, type = 'multiBarChart')
n2
})
}
)
)
)

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Add relative frequency by factor to a data frame - r

Here is a data table solution, since you started out that way. DT[,relfreq:=Freq/sum(Freq),by=Var2] This will be faster if your dataset is extremely large, mostly because data table adds the new column by reference, rather than copying the whole dataset.

Related

perform acf plot for each type of group in R

dplyr - subtract based on condition from two different data frames

ggplot2 loop graph with conditional subsets

Reshape a large matrix with missing values and multiple vars of interest [duplicate]

R shiny app with rCharts

Categories

Resources