I have the following data frame (see dput() below):
Date Time Price Volume VolumeSEK Bid.Price Ask.Price BidAskSpread MidPrice Company
1005 11.09.2018 25204.72 98.500 1153 113570.500 98.58 98.58 0.00 98.580 SEB
1071 11.09.2018 25209.89 233.300 158 36861.400 233.30 233.80 0.50 233.550 Alfa Laval
88995 12.09.2018 25220.83 170.500 101 17220.500 170.50 170.60 0.10 170.550 Skanska
1115 11.09.2018 25224.86 233.300 1 233.300 233.30 233.70 0.40 233.500 Alfa Laval
89001 12.09.2018 25229.77 96.960 937 90851.520 96.96 97.04 0.08 97.000 SEB
259224 14.09.2018 25239.65 213.950 126 26957.700 214.00 214.20 0.20 214.100 Swedbank
329555 17.09.2018 25244.28 178.375 19 3389.125 178.35 178.60 0.25 178.475 Skanska
1177 11.09.2018 25248.27 233.400 127 29641.800 233.30 233.60 0.30 233.450 Alfa Laval
1197 11.09.2018 25256.45 286.600 267 76522.200 286.60 287.10 0.50 286.850 Kinnevik
1200 11.09.2018 25258.17 98.520 32 3152.640 98.30 98.38 0.08 98.340 SEB
and two vectors
Comp<- c("Skanska", "SEB", "Swedbank", "Kinnevik", "Investor", "Alfa Laval")
Day<- c("11.09.2018","12.09.2018", "13.09.2018", "14.09.2018", "15.09.2018", "16.09.2018", "17.09.2018")
I would like to compute the mean VolumeSEK for every single stock on every single day and save the results in a matrix. I am quite new to R, so my first guess was to use a for loop; however, I could not get this to work. This was my approach, but I am a little lost and it might not be the right approach at all.
mat <- matrix(NA, nrow = length(Comp), ncol = length(Day),
              dimnames = list(Comp, Day))
for (i in seq_along(Comp)) {
  for (j in seq_along(Day)) {
    mat[i, j] <- mean(df$VolumeSEK[df$Company == Comp[i] & df$Date == Day[j]])
  }
}
Any feedback and hints are highly appreciated. Thanks in advance!
df<-structure(list(X.RIC = structure(c(8L, 2L, 10L, 2L, 8L, 12L,
10L, 2L, 6L, 8L, 12L, 4L, 6L, 8L, 6L, 2L, 6L, 8L, 12L, 4L, 6L,
8L, 12L, 12L, 4L, 4L, 4L, 12L, 4L, 12L, 12L, 4L, 12L, 4L, 4L,
8L, 6L, 12L, 4L, 4L, 6L, 10L, 4L, 10L, 12L, 12L, 8L, 4L, 6L,
8L), .Label = c("ALFA.ST", "ALFAs.BCO", "INVEb.ST", "INVEBs.BCO",
"KINVb.ST", "KINVBs.BCO", "SEBa.ST", "SEBAs.BCO", "SKAb.ST",
"SKABs.BCO", "SWEDa.ST", "SWEDAs.BCO"), class = "factor"), Date = structure(c(1L,
1L, 2L, 1L, 2L, 4L, 5L, 1L, 1L, 1L, 1L, 4L, 2L, 1L, 3L, 2L, 5L,
1L, 1L, 1L, 4L, 2L, 1L, 3L, 2L, 1L, 1L, 5L, 1L, 1L, 5L, 1L, 5L,
1L, 1L, 5L, 5L, 1L, 1L, 1L, 4L, 5L, 1L, 2L, 4L, 1L, 3L, 4L, 5L,
5L), .Label = c("11.09.2018", "12.09.2018", "13.09.2018", "14.09.2018",
"17.09.2018"), class = "factor"), Time = c(25204.724866253, 25209.891063318,
25220.83, 25224.862743496, 25229.77, 25239.65, 25244.28, 25248.266841503,
25256.450392157, 25258.169598025, 25259.431887444, 25265.42,
25267.73, 25282.608168894, 25297.72, 25300.78, 25304.39, 25312.181336031,
25314.992406965, 25334.129581998, 25337.19, 25337.52, 25338.977745285,
25339.14, 25340.48, 25341.34500136, 25346.804459672, 25347.23,
25351.80572164, 25352.089646376, 25354.56, 25356.805147054, 25359.55,
25361.804327741, 25366.804555871, 25370.11, 25372.53, 25378.384314178,
25378.884337058, 25386.788916974, 25388.64, 25389.67, 25392.033315652,
25401.17, 25403.9, 25421.773090991, 25421.98, 25424.19, 25424.21,
25424.85), Type = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Quote",
"Trade"), class = "factor"), Price = c(98.5, 233.3, 170.5, 233.3,
96.96, 213.95, 178.375, 233.4, 286.6, 98.52, 213.1, 409.75, 290.9,
98.42, 291.5, 235, 288.8, 98.4, 213.2, 407.8, 291.3, 96.78, 213,
212.3, 406.48, 407.9, 407.9, 212.8, 407.9, 213, 212.8, 407.9,
212.8, 407.9, 407.9, 96.86, 288.5, 213, 407.7, 407.9, 291.3,
178.7, 407.8, 170.9, 213.9, 212.9, 96.46, 409.7, 288.3, 96.88
), Volume = c(1153L, 158L, 101L, 1L, 937L, 126L, 19L, 127L, 267L,
32L, 64L, 17L, 31L, 733L, 100L, 130L, 51L, 46L, 214L, 21L, 78L,
155L, 55L, 761L, 295L, 121L, 6L, 113L, 5L, 350L, 4L, 5L, 3L,
6L, 5L, 711L, 567L, 350L, 13L, 8L, 4L, 110L, 587L, 607L, 1272L,
363L, 13L, 419L, 63L, 21L), Venue = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "SINT[GV5_TEXT]", class = "factor"),
VolumeSEK = c(113570.5, 36861.4, 17220.5, 233.3, 90851.52,
26957.7, 3389.125, 29641.8, 76522.2, 3152.64, 13638.4, 6965.75,
9017.9, 72141.86, 29150, 30550, 14728.8, 4526.4, 45624.8,
8563.8, 22721.4, 15000.9, 11715, 161560.3, 119911.6, 49355.9,
2447.4, 24046.4, 2039.5, 74550, 851.2, 2039.5, 638.4, 2447.4,
2039.5, 68867.46, 163579.5, 74550, 5300.1, 3263.2, 1165.2,
19657, 239378.6, 103736.3, 272080.8, 77282.7, 1253.98, 171664.3,
18162.9, 2034.48), Bid.Price = c(98.58, 233.3, 170.5, 233.3,
96.96, 214, 178.35, 233.3, 286.6, 98.3, 212.8, 409.6, 290.9,
98.38, 291.5, 235, 288.6, 98.38, 213.1, 407.5, 291.3, 96.76,
213, 212.1, 406.2, 407.5, 407.5, 212.8, 407.5, 213, 212.8,
407.5, 212.8, 407.5, 407.5, 96.88, 288.5, 213.1, 407.3, 407.6,
291.6, 178.6, 407.5, 170.75, 213.8, 213, 96.46, 409.3, 288.2,
96.88), Ask.Price = c(98.58, 233.8, 170.6, 233.7, 97.04,
214.2, 178.6, 233.6, 287.1, 98.38, 213, 410, 291.1, 98.42,
291.7, 235.4, 289, 98.46, 213.3, 407.8, 291.5, 96.78, 213.2,
212.4, 406.5, 407.9, 407.9, 213.1, 407.9, 213.1, 213.1, 407.9,
213.1, 407.9, 407.9, 96.98, 288.8, 213.2, 407.8, 407.9, 291.9,
178.75, 407.8, 170.95, 213.9, 213.1, 96.54, 409.6, 288.6,
96.96), BidAskSpread = c(0, 0.5, 0.1, 0.399999999999977,
0.08, 0.2, 0.25, 0.299999999999983, 0.5, 0.0799999999999983,
0.199999999999989, 0.4, 0.2, 0.0400000000000063, 0.2, 0.4,
0.4, 0.0799999999999983, 0.200000000000017, 0.300000000000011,
0.2, 0.02, 0.199999999999989, 0.3, 0.3, 0.399999999999977,
0.399999999999977, 0.3, 0.399999999999977, 0.0999999999999943,
0.3, 0.399999999999977, 0.3, 0.399999999999977, 0.399999999999977,
0.1, 0.3, 0.0999999999999943, 0.5, 0.299999999999955, 0.3,
0.15, 0.300000000000011, 0.2, 0.1, 0.0999999999999943, 0.08,
0.3, 0.4, 0.08), MidPrice = c(98.58, 233.55, 170.55, 233.5,
97, 214.1, 178.475, 233.45, 286.85, 98.34, 212.9, 409.8,
291, 98.4, 291.6, 235.2, 288.8, 98.42, 213.2, 407.65, 291.4,
96.77, 213.1, 212.25, 406.35, 407.7, 407.7, 212.95, 407.7,
213.05, 212.95, 407.7, 212.95, 407.7, 407.7, 96.93, 288.65,
213.15, 407.55, 407.75, 291.75, 178.675, 407.65, 170.85,
213.85, 213.05, 96.5, 409.45, 288.4, 96.92), Company = structure(c(4L,
1L, 5L, 1L, 4L, 6L, 5L, 1L, 3L, 4L, 6L, 2L, 3L, 4L, 3L, 1L,
3L, 4L, 6L, 2L, 3L, 4L, 6L, 6L, 2L, 2L, 2L, 6L, 2L, 6L, 6L,
2L, 6L, 2L, 2L, 4L, 3L, 6L, 2L, 2L, 3L, 5L, 2L, 5L, 6L, 6L,
4L, 2L, 3L, 4L), .Label = c("Alfa Laval", "Investor", "Kinnevik",
"SEB", "Skanska", "Swedbank"), class = "factor")), .Names = c("X.RIC",
"Date", "Time", "Type", "Price", "Volume", "Venue", "VolumeSEK",
"Bid.Price", "Ask.Price", "BidAskSpread", "MidPrice", "Company"
), row.names = c(1005L, 1071L, 88995L, 1115L, 89001L, 259224L,
329555L, 1177L, 1197L, 1200L, 1201L, 259266L, 89158L, 1253L,
178546L, 89199L, 329638L, 1312L, 1319L, 1369L, 259339L, 89245L,
1383L, 178643L, 89249L, 1385L, 1388L, 329712L, 1401L, 1404L,
329722L, 1412L, 329729L, 1418L, 1421L, 329762L, 329771L, 1437L,
1443L, 1471L, 259393L, 329810L, 1485L, 89373L, 259439L, 1532L,
178820L, 259511L, 329870L, 329871L), class = "data.frame")
data.table approach, cast to wide format:
library(data.table)
dcast(
  setDT(df)[Company %in% Comp & Date %in% Day][, list(mean = mean(VolumeSEK)), by = .(Company, Date)],
  Company ~ Date
)
# Company 11.09.2018 12.09.2018 13.09.2018 14.09.2018 17.09.2018
# 1: Alfa Laval 22245.50 30550.00 NA NA NA
# 2: Investor 31687.49 119911.60 NA 89315.02 NA
# 3: Kinnevik 76522.20 9017.90 29150.00 11943.30 65490.40
# 4: SEB 48347.85 52926.21 1253.98 NA 35450.97
# 5: Skanska NA 60478.40 NA NA 11523.06
# 6: Swedbank 49560.15 NA 161560.30 149519.25 8512.00
Switch Company ~ Date to Date ~ Company to cast the other way around (i.e. companies as columns), as sketched below.
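For example, a sketch of the flipped cast (same pipeline, only the formula changes):
dcast(
  setDT(df)[Company %in% Comp & Date %in% Day][, list(mean = mean(VolumeSEK)), by = .(Company, Date)],
  Date ~ Company
)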
Using base R:
aggregate(VolumeSEK ~ Date + Company, data=df, FUN = mean)
Date Company VolumeSEK
1 11.09.2018 Alfa Laval 22245.50
2 12.09.2018 Alfa Laval 30550.00
3 11.09.2018 Investor 31687.49
4 12.09.2018 Investor 119911.60
5 14.09.2018 Investor 89315.02
6 11.09.2018 Kinnevik 76522.20
7 12.09.2018 Kinnevik 9017.90
8 13.09.2018 Kinnevik 29150.00
9 14.09.2018 Kinnevik 11943.30
10 17.09.2018 Kinnevik 65490.40
11 11.09.2018 SEB 48347.85
12 12.09.2018 SEB 52926.21
13 13.09.2018 SEB 1253.98
14 17.09.2018 SEB 35450.97
15 12.09.2018 Skanska 60478.40
16 17.09.2018 Skanska 11523.06
17 11.09.2018 Swedbank 49560.15
18 13.09.2018 Swedbank 161560.30
19 14.09.2018 Swedbank 149519.25
20 17.09.2018 Swedbank 8512.00
This calculates the mean of VolumeSEK for each Company and Date pair. The result is in long format; if you need it in wide format, do:
reshape(df2, idvar = "Company", timevar = "Date", direction = "wide")
Company VolumeSEK.11.09.2018 VolumeSEK.12.09.2018 VolumeSEK.14.09.2018 VolumeSEK.13.09.2018 VolumeSEK.17.09.2018
1 Alfa Laval 22245.50 30550.00 NA NA NA
3 Investor 31687.49 119911.60 89315.02 NA NA
6 Kinnevik 76522.20 9017.90 11943.30 29150.00 65490.40
11 SEB 48347.85 52926.21 NA 1253.98 35450.97
15 Skanska NA 60478.40 NA NA 11523.06
17 Swedbank 49560.15 NA 149519.25 161560.30 8512.00
where df2 is the result from aggregate above.
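For completeness, here are the two base-R steps together (a sketch; df2 is just the name of the intermediate result):
# long format: one mean per Company/Date pair
df2 <- aggregate(VolumeSEK ~ Date + Company, data = df, FUN = mean)

# wide format: one row per Company, one column per Date
reshape(df2, idvar = "Company", timevar = "Date", direction = "wide")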
For this last step you can also use the reshape2 library and do:
library(reshape2)
dcast(df2, Company ~ Date, value.var = "VolumeSEK")
Company 11.09.2018 12.09.2018 13.09.2018 14.09.2018 17.09.2018
1 Alfa Laval 22245.50 30550.00 NA NA NA
2 Investor 31687.49 119911.60 NA 89315.02 NA
3 Kinnevik 76522.20 9017.90 29150.00 11943.30 65490.40
4 SEB 48347.85 52926.21 1253.98 NA 35450.97
5 Skanska NA 60478.40 NA NA 11523.06
6 Swedbank 49560.15 NA 161560.30 149519.25 8512.00
This is a much simpler command and gives a cleaner result.
Here is a solution with the tidyverse package.
Note that no loops are needed:
library(tidyverse)
df %>% as_tibble %>% group_by(Date, Company) %>%
summarise(x = mean(VolumeSEK)) %>%
ungroup() %>%
spread(Company, x)
# A tibble: 5 x 7
Date `Alfa Laval` Investor Kinnevik SEB Skanska Swedbank
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 11.09.2018 22246. 31687. 76522. 48348. NA 49560.
2 12.09.2018 30550 119912. 9017.9 52926. 60478. NA
3 13.09.2018 NA NA 29150 1254.0 NA 161560.
4 14.09.2018 NA 89315. 11943. NA NA 149519.
5 17.09.2018 NA NA 65490. 35451. 11523. 8512
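Note that spread() has since been superseded by pivot_wider() in newer tidyr releases; an equivalent sketch (assuming a tidyr/dplyr version that provides pivot_wider() and the .groups argument):
df %>%
  as_tibble() %>%
  group_by(Date, Company) %>%
  summarise(x = mean(VolumeSEK), .groups = "drop") %>%
  pivot_wider(names_from = Company, values_from = x)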
I am trying to show the distribution of data between three different methods (FAP, One PIT (onetrans), and Two PIT (twotrans); shown in facets below) for measuring forest fuels. The count on the y-axis is the number of sample points that estimate the grouped value on the x-axis (Total.kg.m2). Total.kg.m2 is a continuous variable. I don't particularly care how big the binwidth on the x-axis is, but I want only values that are exactly zero to be above the "0" label. My current graph [1] is misrepresentative because there are no sample points that estimate "0" for the FAP method. Below are some example data and my code. How can I do this more effectively? My dataframe is called "cwd", but I have included a subset at the bottom.
My current graph:
The code for my current graph:
method_names <- c(`FAP` = "FAP", `onetrans` = "PIT - One Transect", `twotrans` = "PIT - Two Transects")

ggplot(sampleData, aes(Total.kg.m2)) +
  geom_histogram(bins = 40, color = "black", fill = "white") +
  theme_bw() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black"),
        legend.position = "none",
        axis.text = element_text(size = 10),
        axis.title = element_text(size = 12)) +
  scale_x_continuous(name = expression("kg m"^"-2"), breaks = seq(0, 16, 1)) +
  scale_y_continuous(name = "Count", breaks = seq(0, 80, 10), limits = c(0, 70)) +
  facet_wrap(~method, ncol = 1, labeller = as_labeller(method_names)) +
  theme(strip.text.x = element_text(size = 14),
        strip.background = element_rect(color = "black", fill = "gray"))
I don't think geom_bar gets me what I want, and I tried changing the binwidth to 0.05 in geom_histogram, but then the bins are too small. Essentially, I think I need to change my data from continuous numeric to a factor, but I'm not sure how to make that work (see the sketch after the sample data below).
Here is some sample data:
sampleData
Site Treatment Unit Plot Total.Tons.ac Total.kg.m2 method
130 Thinning CO 10 7 0.4500000 0.1008000 twotrans
351 Shelterwood CO 12 1 7.2211615 1.6175402 twotrans
88 Thinning NB 3 7 1.1400000 0.2553600 twotrans
224 Shelterwood NB 2 3 2.1136105 0.4734487 onetrans
54 Thinning SB 9 11 1.8857743 0.4224134 onetrans
74 Thinning SB 1 3 0.8500000 0.1904000 twotrans
328 Shelterwood DB 7 11 0.8740906 0.1957963 twotrans
341 Shelterwood CO 10 5 2.4210886 0.5423239 twotrans
266 Shelterwood WB 9 7 1.0092961 0.2260823 onetrans
405 Shelterwood WB 9 5 7.0029263 1.5686555 FAP
332 Shelterwood NB 8 7 2.8059152 0.6285250 twotrans
126 Thinning SB 9 11 1.4900000 0.3337600 twotrans
295 Shelterwood NB 2 5 7.6567281 1.7151071 twotrans
406 Shelterwood WB 9 7 3.0703135 0.6877502 FAP
179 Thinning FB 6 9 13.2916773 2.9773357 FAP
185 Thinning FB 7 9 5.3594318 1.2005127 FAP
39 Thinning FB 7 5 0.0000000 0.0000000 onetrans
187 Thinning NB 8 1 0.9477477 0.2122955 FAP
10 Thinning FB 2 7 0.0000000 0.0000000 onetrans
102 Thinning SB 5 11 0.0000000 0.0000000 twotrans
dput(sampleData)
structure(list(Site = structure(c(2L, 1L, 2L, 1L, 2L, 2L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label =
c("Shelterwood",
"Thinning"), class = "factor"), Treatment = structure(c(1L, 1L,
4L, 4L, 5L, 5L, 2L, 1L, 6L, 6L, 4L, 5L, 4L, 6L, 3L, 3L, 3L, 4L,
3L, 5L), .Label = c("CO", "DB", "FB", "NB", "SB", "WB"), class = "factor"),
Unit = c(10L, 12L, 3L, 2L, 9L, 1L, 7L, 10L, 9L, 9L, 8L, 9L,
2L, 9L, 6L, 7L, 7L, 8L, 2L, 5L), Plot = c(7L, 1L, 7L, 3L,
11L, 3L, 11L, 5L, 7L, 5L, 7L, 11L, 5L, 7L, 9L, 9L, 5L, 1L,
7L, 11L), Total.Tons.ac = c(0.45, 7.221161504, 1.14, 2.113610483,
1.885774282, 0.85, 0.874090569, 2.421088641, 1.009296069,
7.002926269, 2.805915201, 1.49, 7.656728085, 3.07031351,
13.29167729, 5.359431807, 0, 0.947747726, 0, 0), Total.kg.m2 = c(0.1008,
1.617540177, 0.25536, 0.473448748, 0.422413439, 0.1904, 0.195796287,
0.542323856, 0.22608232, 1.568655484, 0.628525005, 0.33376,
1.715107091, 0.687750226, 2.977335712, 1.200512725, 0, 0.212295491,
0, 0), method = structure(c(3L, 3L, 3L, 2L, 2L, 3L, 3L, 3L,
2L, 1L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 1L, 2L, 3L), .Label = c("FAP",
"onetrans", "twotrans"), class = "factor")), .Names = c("Site",
"Treatment", "Unit", "Plot", "Total.Tons.ac", "Total.kg.m2",
"method"), row.names = c(130L, 351L, 88L, 224L, 54L, 74L, 328L,
341L, 266L, 405L, 332L, 126L, 295L, 406L, 179L, 185L, 39L, 187L,
10L, 102L), class = "data.frame")
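As mentioned above, one possible direction (a sketch under arbitrary choices, not a definitive answer): cut Total.kg.m2 into 0.5-wide bins, pull the exact zeros into their own level, and plot with geom_bar() instead of geom_histogram().
library(ggplot2)

# 0.5-wide bins (arbitrary width), plus a dedicated "0" level for exact zeros
brks <- seq(0, ceiling(max(sampleData$Total.kg.m2)), by = 0.5)
bin_fac <- cut(sampleData$Total.kg.m2, breaks = brks, include.lowest = TRUE)

sampleData$bin <- as.character(bin_fac)
sampleData$bin[sampleData$Total.kg.m2 == 0] <- "0"
sampleData$bin <- factor(sampleData$bin, levels = c("0", levels(bin_fac)))

ggplot(sampleData, aes(bin)) +
  geom_bar(color = "black", fill = "white") +
  facet_wrap(~method, ncol = 1) +
  theme_bw() +
  scale_x_discrete(drop = FALSE)  # keep empty bins so all facets share the same x categories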
How can I construct a heatmap-like matrix from 3 variables, 2 categorical and 1 numeric, in which certain events do not occur? My dplyr code overlooks those events and misses about 20 cavities in the surface plot that I'd like to make; for that I need an accurate matrix, but this is rather complicated.
What I consider an NA event is a maximum time for a pair of categorical events (Modeling and Discourse) that never occur simultaneously: for that pair there are no time observations at all (NA), not even zero.
I have the following dataframe:
df <- structure(list(`Modeling Code` = structure(c(4L, 4L, 4L, 4L,
4L, 4L, 4L, 6L, 4L, 5L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L,
2L, 6L, 6L, 6L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 5L, 5L, 5L, 5L, 5L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L,
6L, 6L, 6L, 6L, 6L, 4L, 5L, 5L, 5L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 4L, 4L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 3L, 3L, 5L, 4L, 4L, 4L,
4L, 5L, 6L, 6L, 6L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L,
4L, 5L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 6L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 5L, 2L, 2L,
2L, 5L, 4L, 4L, 2L, 2L, 5L, 2L, 2L, 3L, 5L, 5L, 5L, 4L, 4L, 1L,
1L, 4L, 4L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 6L, 5L, 5L, 2L, 5L, 5L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 6L, 5L, 5L,
5L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 6L, 6L, 6L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 5L, 5L, 5L, 5L, 3L, 2L, 2L, 2L, 2L, 2L,
5L, 5L, 5L, 3L, 3L, 3L, 3L, 6L, 6L, 3L, 3L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 5L, 5L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 2L, 2L, 2L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 3L, 3L, 3L, 6L, 6L, 6L, 2L, 2L, 2L, 2L, 6L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 6L, 2L, 6L, 2L, 6L, 6L, 6L, 6L, 2L, 2L, 2L,
2L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 5L, 5L, 5L, 3L, 3L, 3L, 3L, 3L, 4L, 5L, 3L,
3L, 3L, 3L, 6L, 6L, 6L, 6L, 6L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 6L, 6L, 6L, 6L, 6L,
6L, 1L, 1L, 1L, 1L, 1L, 6L, 6L, 1L, 1L, 1L, 3L, 3L, 1L), .Label = c("A",
"MA", "OFF", "P", "SM", "V"), class = "factor"), `Discourse Code` = structure(c(8L,
5L, 8L, 1L, 9L, 2L, 8L, 6L, 5L, 6L, 5L, 8L, 3L, 3L, 6L, 2L, 2L,
9L, 3L, 3L, 6L, 6L, 3L, 3L, 8L, 6L, 9L, 3L, 3L, 9L, 8L, 6L, 8L,
6L, 9L, 3L, 3L, 6L, 6L, 4L, 9L, 1L, 6L, 9L, 6L, 3L, 3L, 6L, 8L,
2L, 6L, 2L, 8L, 2L, 2L, 2L, 2L, 8L, 2L, 1L, 6L, 8L, 9L, 2L, 6L,
8L, 2L, 2L, 3L, 3L, 2L, 2L, 2L, 2L, 9L, 1L, 6L, 8L, 7L, 7L, 6L,
8L, 6L, 9L, 9L, 6L, 1L, 1L, 6L, 6L, 9L, 9L, 1L, 1L, 9L, 6L, 6L,
6L, 1L, 1L, 9L, 6L, 9L, 1L, 6L, 1L, 9L, 9L, 1L, 6L, 1L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 6L, 9L, 6L, 9L, 8L, 2L, 8L, 2L, 1L, 2L, 6L,
4L, 1L, 1L, 1L, 9L, 5L, 1L, 9L, 8L, 2L, 9L, 2L, 7L, 6L, 1L, 6L,
1L, 2L, 6L, 6L, 6L, 9L, 2L, 2L, 9L, 7L, 7L, 7L, 7L, 9L, 2L, 1L,
1L, 4L, 8L, 4L, 6L, 1L, 6L, 9L, 2L, 1L, 9L, 6L, 6L, 9L, 1L, 6L,
2L, 4L, 4L, 4L, 4L, 8L, 6L, 2L, 1L, 1L, 1L, 2L, 6L, 6L, 8L, 2L,
4L, 6L, 9L, 1L, 6L, 1L, 1L, 3L, 2L, 2L, 2L, 9L, 9L, 9L, 8L, 2L,
6L, 1L, 2L, 1L, 2L, 2L, 1L, 8L, 2L, 6L, 6L, 8L, 2L, 7L, 2L, 2L,
6L, 2L, 2L, 6L, 4L, 8L, 7L, 7L, 7L, 7L, 6L, 8L, 7L, 7L, 9L, 1L,
9L, 2L, 9L, 1L, 6L, 9L, 2L, 6L, 2L, 7L, 9L, 8L, 9L, 9L, 2L, 8L,
9L, 4L, 2L, 4L, 6L, 2L, 6L, 1L, 1L, 3L, 9L, 1L, 8L, 9L, 9L, 9L,
6L, 2L, 6L, 2L, 2L, 7L, 7L, 7L, 8L, 1L, 2L, 2L, 2L, 2L, 6L, 8L,
6L, 1L, 6L, 8L, 2L, 1L, 2L, 6L, 9L, 2L, 9L, 2L, 6L, 2L, 1L, 1L,
9L, 9L, 9L, 8L, 4L, 9L, 6L, 1L, 2L, 9L, 8L, 2L, 1L, 6L, 1L, 6L,
2L, 8L, 2L, 2L, 8L, 4L, 4L, 9L, 6L, 1L, 9L, 7L, 7L, 7L, 7L, 7L,
9L, 6L, 7L, 7L, 7L, 7L, 8L, 6L, 2L, 2L, 6L, 8L, 8L, 4L, 2L, 6L,
1L, 6L, 9L, 6L, 9L, 9L, 2L, 8L, 6L, 6L, 2L, 2L, 9L, 9L, 6L, 2L,
2L, 3L, 3L, 3L, 2L, 9L, 2L, 9L, 2L, 9L, 1L, 9L, 8L, 6L, 7L, 7L,
6L), .Label = c("AG", "C", "D", "DA", "G", "J", "OFF", "Q", "S"
), class = "factor"), Time_Processed = c(1.3833, 1.4333, 1.4667,
1.5333, 1.6167, 1.65, 1.6833, 1.7333, 1.8, 1.8667, 1.9833, 2.05,
2.1333, 2.1667, 2.2167, 2.3, 2.3167, 2.3667, 2.5667, 2.5833,
2.6, 2.7833, 2.8, 2.8167, 2.8667, 3.0167, 3.0333, 3.05, 3.05,
3.1, 3.1833, 3.2667, 3.3, 3.3333, 3.4167, 3.45, 3.4833, 3.5667,
3.6, 3.7, 3.7167, 3.8, 3.95, 4, 4.05, 4.15, 4.1667, 4.15, 4.2167,
4.3, 4.3833, 4.4, 4.4833, 4.5833, 4.6, 4.7, 4.8, 4.8333, 4.8833,
5, 5.05, 5.1, 5.2167, 5.4333, 5.45, 5.6, 5.7, 5.9167, 6.25, 6.2667,
6.2833, 6.4667, 6.5167, 6.5333, 6.55, 6.6667, 6.7167, 6.9, 6.95,
7.05, 7.05, 7.45, 7.6167, 7.7667, 7.7833, 7.8333, 8, 8.0167,
8.05, 8.1, 8.2833, 8.3167, 8.4333, 8.4667, 8.5, 8.55, 8.8833,
9.2667, 9.3167, 9.3333, 9.35, 9.5167, 9.6833, 9.7167, 9.7667,
9.7833, 9.8333, 9.9, 9.9667, 10.0667, 10.0833, 10.15, 10.2, 10.2667,
10.2667, 10.3, 10.35, 10.3667, 10.4, 10.7, 10.7833, 10.9, 11.1333,
11.1833, 11.2167, 11.2333, 11.25, 11.3, 11.35, 11.4167, 11.4667,
11.5333, 11.5667, 11.6667, 11.85, 11.8667, 11.8833, 12.25, 12.3167,
12.7167, 12.7333, 12.8, 12.85, 12.9333, 12.9667, 13.2667, 13.3167,
13.4, 13.4167, 13.5, 13.55, 13.6333, 13.9, 13.95, 13.9667, 14.05,
14.0833, 14.3167, 14.35, 14.3667, 14.4333, 14.4667, 14.5, 14.5333,
14.5833, 14.5833, 14.6167, 14.6667, 14.7167, 14.75, 14.7667,
15.05, 15.0833, 15.25, 15.4333, 15.4833, 15.5167, 15.6, 15.6333,
15.7167, 15.7333, 15.7667, 15.8667, 16.0167, 16.2, 16.2833, 16.3333,
16.3833, 16.45, 16.6, 16.6667, 16.9333, 16.9667, 17, 17.0333,
17.0833, 17.1167, 17.2167, 17.35, 17.4333, 17.55, 17.6, 17.6167,
17.65, 17.7, 17.7167, 17.75, 17.7833, 17.8833, 17.9333, 17.9833,
18.0167, 18.0333, 18.05, 18.0667, 18.1, 18.1667, 18.2, 18.3667,
18.45, 18.5333, 18.6333, 18.6667, 18.7333, 18.85, 18.8833, 18.9833,
19.0333, 19.0667, 19.3833, 19.5333, 19.6333, 19.6667, 19.7167,
19.9333, 19.9667, 20.05, 20.2333, 20.3667, 20.4333, 20.5, 20.5167,
20.5167, 20.55, 20.6167, 20.7167, 20.7667, 20.8167, 20.8667,
21.1333, 21.1833, 21.2, 21.2167, 21.2333, 21.2833, 21.3, 21.5,
21.5833, 21.6333, 21.6667, 21.6833, 21.6833, 21.8167, 21.8833,
22.1333, 22.1667, 22.35, 22.4333, 22.5, 22.5333, 22.5833, 22.6,
22.6, 22.65, 22.6667, 22.7167, 22.75, 22.8833, 23.0667, 23.0833,
23.1167, 23.3167, 23.35, 23.3667, 23.45, 23.5, 23.7667, 23.9833,
24.1833, 24.2167, 24.25, 24.2833, 24.5167, 24.5333, 24.6833,
24.7833, 24.7833, 24.8, 24.8, 24.8667, 25.3833, 25.4333, 25.4833,
25.5, 25.5167, 25.55, 25.5667, 25.5833, 25.6667, 25.7, 26, 26.1333,
26.1667, 26.2, 26.2333, 26.2667, 26.4, 26.4333, 26.4667, 26.5,
26.5167, 26.6667, 26.7, 26.8, 27.0833, 27.1833, 27.2, 27.2, 27.45,
27.5667, 27.6667, 27.7, 27.75, 27.7667, 27.7667, 27.8, 27.8333,
28.0333, 28.35, 28.6333, 28.6333, 28.7833, 28.8, 28.85, 29, 29.1833,
29.3333, 29.6667, 29.7333, 29.8, 29.8833, 29.9, 29.9333, 30.0667,
30.1, 30.1833, 30.2167, 30.25, 30.3, 30.3833, 30.5, 30.55, 30.7167,
31.0167, 31.45, 31.6, 31.8, 31.8333, 32.0167, 32.15, 32.15, 32.1667,
32.2167, 32.2167, 32.2333, 32.3833, 32.6167, 32.6667, 32.7, 32.7167,
32.7333, 32.75, 32.9, 33.0833, 33.1333, 33.1833)), row.names = c(NA,
-386L), class = c("tbl_df", "tbl", "data.frame"), .Names = c("Modeling Code",
"Discourse Code", "Time_Processed"))
Looks a little bit like this:
df[1:10,]
# A tibble: 10 x 3
`Modeling Code` `Discourse Code` Time_Processed
<fct> <fct> <dbl>
1 P Q 1.38
2 P G 1.43
3 P Q 1.47
4 P AG 1.53
5 P S 1.62
6 P C 1.65
7 P Q 1.68
8 V J 1.73
9 P G 1.80
10 SM J 1.87
If I construct a matrix for my heatmap from the two categorical variables Modeling Code and Discourse Code, it looks a little bit like this:
with(df, table(`Discourse Code`, `Modeling Code`)) %>% prop.table() %>% as.data.frame() -> z
ggplot(data = z, aes(x = `Modeling.Code`, y = `Discourse.Code`, fill = Freq)) + theme_bw() + geom_tile() + geom_text(size = 3, aes(label = Freq))
This is a heatmap of the frequency of occurrence of each pair of categorical values, so (C & MA) occur simultaneously about 10.6% of the time, while many pairs of categorical factors never occur simultaneously at all; these are the ones with 0. All those frequencies add up to 1, accounting for 100% of all pairs of Modeling and Discourse Codes.
If you count the number of zeroes (non-occurring pairs) in this data set, you will see that there are twenty, and this is important.
I was interested in the times at which these pairs occur so I decided to make a contour plot with plot_ly from my original dataset.
plot_ly(data = df, x = ~ `Modeling Code`, y = ~ `Discourse Code`, z = ~ `Time_Processed`, type = "contour")
Inspecting this contour plot interactively (hovering with the mouse) shows that the plotted "Time_Processed" values are the maximum times for each pair of "Modeling Code" and "Discourse Code".
So I generate those points with dplyr:
df %>%
  group_by(`Modeling Code`, `Discourse Code`) %>%
  summarise(max_time = max(Time_Processed))
# A tibble: 34 x 3
# Groups: Modeling Code [?]
`Modeling Code` `Discourse Code` max_time
<fct> <fct> <dbl>
1 A AG 9.97
2 A C 32.7
3 A D 4.17
4 A J 33.2
5 A Q 32.8
6 A S 32.7
7 MA AG 24.7
8 MA C 31.4
9 MA D 22.4
10 MA DA 27.2
# ... with 24 more rows
Hold up: there are only 34 entries of maximum times, but the size of my heatmap is 6 x 9 = 54 cells. The 20 missing entries are the categorical pairs with zero frequency above, so I'm finding it very difficult to construct my matrix.
A MA OFF P SM V
S 32.733 31.800 NA 30.3000 30.250 32.700
Q 32.750 27.1833 NA 30.5000 29.800 28.85
OFF NA NA 33.133 NA NA NA
J 33.1833 26.5167 NA 30.7167 30.2167 31.8333
G NA NA NA 11.8500 NA NA
DA NA 20.72 NA NA 29.8833 25.700
D 4.1667 22.235 NA 6.2667 NA 32.2167
C 32.6667 31.4500 NA 30.3833 29.9000 32.1500
AG 9.967 24.6833 NA 13.2667 30.0667 32.7167
This is the matrix (assuming I didn't make any manual mistakes) that I'd like to create based on my observations. The NAs are the values for the Modeling and Discourse Code pairs that do not occur, i.e. the 20 entries that my dplyr summarise of maximum time could not capture but my heatmap did. If I had to, I could tediously fill out this matrix by hand.
My question is how can I construct this matrix?
In addition, I would prefer that the missing values show up as NA or as -1, but not zero, because my goal is to construct this matrix and then create a 3D surface plot that complements my contour plot, so that I can accurately see the types of procedures that my subjects are implementing over an event that lasts about 30 minutes. If those missing cells are interpreted as zero, the surface plot will be wrong, because at the beginning of the event (time 0) the subjects did not use those procedures.
Complex problems sometimes have simple solutions, and it wasn't clear to me until I experimented with the existing functions that dcast accomplishes my goal. All the detail above was me trying to explain the complexity of the problem.
library(reshape2)  # for dcast(); data.table::dcast works the same way

dcast(data = df, formula = `Discourse Code` ~ `Modeling Code`,
      value.var = "Time_Processed", fun.aggregate = max, fill = -1)
Discourse Code A MA OFF P SM V
1 AG 9.9667 24.6833 -1.0000 13.2667 30.0667 32.7167
2 C 32.6667 31.4500 -1.0000 30.3833 29.9000 32.1500
3 D 4.1667 22.3500 -1.0000 6.2667 -1.0000 32.2167
4 DA -1.0000 27.2000 -1.0000 -1.0000 29.8833 25.7000
5 G -1.0000 -1.0000 -1.0000 11.8500 -1.0000 -1.0000
6 J 33.1833 26.5167 -1.0000 30.7167 30.2167 31.8333
7 OFF -1.0000 -1.0000 33.1333 -1.0000 -1.0000 -1.0000
8 Q 32.7500 27.1833 -1.0000 30.5000 29.8000 28.8500
9 S 32.7333 31.8000 -1.0000 30.3000 30.2500 32.7000
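To go from that result to the numeric matrix a surface plot needs, one possible sketch (the -1 cells are turned back into NA so the surface has holes where pairs never occur):
library(reshape2)
library(plotly)

wide <- dcast(df, `Discourse Code` ~ `Modeling Code`,
              value.var = "Time_Processed", fun.aggregate = max, fill = -1)

# keep the Discourse Code labels as row names and coerce the rest to a numeric matrix
m <- as.matrix(wide[, -1])
rownames(m) <- as.character(wide[["Discourse Code"]])

# pairs that never occur were filled with -1; make them NA so the surface has holes there
m[m == -1] <- NA

plot_ly(z = ~m, type = "surface")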
It appears my comment answered the question:
If you have an object that supports the is.na and [<- functions, then reassigning a numeric value of -1 to entries that are currently NA is as simple as obj[ is.na(obj) ] <- -1. (I cannot really tell if this is the request, since I got lost in the long presentation.) If, on the other hand, the need is to first generate such a matrix from a long-format data object named df2, that might be addressed by:
obj <- xtabs(max_time ~ `Modeling Code` + `Discourse Code`, data = df2)
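Putting that suggestion together as a sketch (df2 is the summarised long data from above; note that xtabs returns 0, not NA, for pairs that never occur, which is safe to recode here because no observed max_time is 0):
library(dplyr)

# long summary: one row per observed Modeling/Discourse pair
df2 <- df %>%
  group_by(`Modeling Code`, `Discourse Code`) %>%
  summarise(max_time = max(Time_Processed))

# cross-tabulate into a 6 x 9 table; pairs that never occur come out as 0
obj <- xtabs(max_time ~ `Modeling Code` + `Discourse Code`, data = df2)

# recode the never-observed pairs as NA (or -1) instead of 0
obj[obj == 0] <- NA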
The frequency plot that I'm trying to make has:
Barplot with counts above each bar
Relative frequency on the left side
Cumulative frequency on the right side
The dataset is
dput(x2)
c(1L, 5L, 3L, 3L, 5L, 3L, 4L, 1L, 2L, 2L, 7L, 3L, 2L, 2L, 3L,
3L, 2L, 1L, 5L, 4L, 4L, 3L, 5L, 2L, 6L, 2L, 1L, 2L, 5L, 5L, 5L,
3L, 6L, 4L, 5L, 4L, 6L, 7L)
The distribution of frequencies are
table(x2)
x2
1 2 3 4 5 6 7
4 8 8 5 8 3 2
The relative frequencies are
prop.table(table(x2))
x2
1 2 3 4 5 6 7
0.10526316 0.21052632 0.21052632 0.13157895 0.21052632 0.07894737 0.05263158
EDIT: Like in the image below, but with cumulative frequency on the right side, relative frequency on the left, and the bars labelled with counts.
library(tidyverse)
library(broom)

table(x2) %>%
  tidy() %>%                                   # counts as a data frame with columns x2 and Freq
  mutate(rel_freq = Freq / sum(Freq)) %>%      # relative frequencies
  ggplot(aes(reorder(x2, Freq), rel_freq)) +
  geom_col() +
  geom_text(aes(label = Freq), vjust = -.5) +  # counts above each bar
  scale_y_continuous(sec.axis = sec_axis(~ . * length(x2)))  # secondary axis rescaled to counts
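The question also asks for the cumulative frequency on the right side; a sketch of one way to extend the same idea, overlaying the cumulative relative frequency as a line and relabelling the axes:
library(tidyverse)

freq_df <- as.data.frame(table(x2)) %>%
  mutate(rel_freq = Freq / sum(Freq),
         cum_freq = cumsum(rel_freq))  # cumulative relative frequency in value order

ggplot(freq_df, aes(x2, rel_freq)) +
  geom_col() +
  geom_text(aes(label = Freq), vjust = -0.5) +   # counts above the bars
  geom_line(aes(y = cum_freq, group = 1)) +      # cumulative curve
  geom_point(aes(y = cum_freq)) +
  scale_y_continuous(name = "Relative frequency",
                     sec.axis = sec_axis(~ ., name = "Cumulative frequency"))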
I've read through the ggplot2 docs website and other questions, but I couldn't find a solution. I'm trying to visualize some data for varying age groups. I have sort of managed to do it, but it does not look the way I intended.
Here is the code for my plot
p <- ggplot(suggestion, aes(interaction(Age, variable), value, color = Age, fill = factor(variable), group = Age))
p + geom_bar(stat = "identity") +
  facet_grid(. ~ Age)
![The facetting separates the age variables][1]
My ultimate goal is to create a stacked bar graph, which is why I used the fill, but it does not put the TDX values in their corresponding Age group and Year. (Sometimes TDX values == DX values, but I want to visualize when they don't.)
Here's the dput(suggestion)
structure(list(Age = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L,
5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L,
7L), .Label = c("0-2", "3-9", "10-19", "20-39", "40-59", "60-64",
"65+", "UNSP", "(all)"), class = "factor"), variable = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
8L, 8L, 8L, 8L, 8L, 8L, 8L), .Label = c("Year.10.DX", "Year.11.DX",
"Year.12.DX", "Year.13.DX", "Year.10.TDX", "Year.11.TDX", "Year.12.TDX",
"Year.13.TDX"), class = "factor"), value = c(26.8648932910636,
30.487741796656, 31.9938838749782, 62.8189679326958, 72.8480838120064,
69.3044125928752, 36.9789457527416, 21.808001825378, 24.1073451428435,
40.3305134762935, 70.4486116545885, 68.8342676191755, 63.9227718107745,
34.6086468618636, 8.84033719571875, 13.2807072303835, 28.4781516422802,
55.139497471546, 59.7230544500003, 67.9448927372699, 37.7293286937066,
6.9507024051526, 17.4393054963572, 33.1485743479821, 61.198647580693,
58.6845873573852, 48.0073013177248, 28.4455801248562, 26.8648932910636,
19.8044453272475, 23.0189084635948, 53.7037832071889, 60.6516550126422,
58.1573725886767, 27.0791868812255, 21.808001825378, 19.8146296425633,
35.0587750051557, 62.3308555053346, 59.3299998610862, 56.5341245769817,
27.7229319271878, 8.84033719571875, 13.2807072303835, 22.4081606349585,
48.0252683906252, 52.7560684009579, 65.2890977685045, 32.4142337849399,
6.9507024051526, 15.2833655677215, 24.5268503180754, 52.536784326675,
51.4100599515986, 40.9609231655724, 18.1306673637441)), row.names = c(NA,
-56L), .Names = c("Age", "variable", "value"), class = "data.frame")
It's unclear what you need, but perhaps this:
ggplot(a, aes(x = variable, y = value, fill = Age)) +
  geom_bar(stat = 'identity') +
  facet_wrap(~Age)
If you want to visualize separately the TDX and the DX entries, we'll need to change the dataframe a bit.
> head(a)
Age variable value
1 0-2 Year.10.DX 26.86489
2 3-9 Year.10.DX 30.48774
3 10-19 Year.10.DX 31.99388
4 20-39 Year.10.DX 62.81897
5 40-59 Year.10.DX 72.84808
6 60-64 Year.10.DX 69.30441
The column of interest variable is a combination of year and of TDX/DX value. We'll use the tidyr package to separate this into two columns.
library(tidyr)
library(dplyr)
tidy_a<- a %>% separate(variable, into = c( 'nothing',"year",'label'), sep = "\\.")
This actually splits the levels of column variable into three components, since we split on . and the character . appears twice in each entry.
> head(tidy_a)
Age nothing year label value
1 0-2 Year 10 DX 26.86489
2 3-9 Year 10 DX 30.48774
3 10-19 Year 10 DX 31.99388
4 20-39 Year 10 DX 62.81897
5 40-59 Year 10 DX 72.84808
6 60-64 Year 10 DX 69.30441
So the nothing column is rather useless, just a by-product of using separate and splitting on ".". Now this will allow us to visualize TDX/DX separately.
ggplot(tidy_a,aes(x=year,y=value,fill=label)) + geom_bar(stat='identity') + facet_wrap(~Age)
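Since the goal is to see where TDX and DX differ, a dodged version of the same plot may be clearer (a sketch using the same tidy_a):
ggplot(tidy_a, aes(x = year, y = value, fill = label)) +
  geom_col(position = "dodge") +   # DX and TDX side by side instead of stacked
  facet_wrap(~Age)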