Related
I have a folder which serves as a container for a standardized report from a system. This report is run on a daily basis. However, the report may require re-run for a certain date or range of dates depending on user preferences and asks. Thus file content may change significantly.
I would like to create a script that would group the unique dates together in one dataframe based on the latest run time, and another dataframe for the dates that are being revised.
Here is a simplified version of the table:
structure(list(Source = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L), Date = structure(c(1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L), .Label = c("11-Feb-20", "12-Feb-20"
), class = "factor"), FarmType = structure(c(3L, 4L, 5L, 1L,
2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L), .Label = c("AJSKJA",
"ASKJKA", "GHDGH", "KLKIUK", "KLSAKJ"), class = "factor"), FarmName = structure(c(1L,
2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L), .Label = c("",
"JJHGH", "JKJKK", "JUISO", "SDLLS"), class = "factor"), Perform = c(13.04144378,
1.230474165, 1.230474165, 13.9407486, 13.9407486, 13.04144378,
1.230474165, 1.230474165, 13.9407486, 13.9407486, 13.04144378,
15.26566, 1.230474165, 13.9407486), RunDate = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("02/14/2020",
"02/15/2020"), class = "factor")), class = "data.frame", row.names = c(NA,
-14L))
Please note that the number of columns does not change, however, after each re-run the number of rows may increase/decrease.
The idea is -- the first group of data that is based on the most recent run would represent the up-to-date information (corrections, revisions, etc.), while the second group essentially looks at what is being revised and how the numbers and data are changing.
Expected output for the first group:
structure(list(Source = c(3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L),
Date = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("11-Feb-20",
"12-Feb-20"), class = "factor"), FarmType = structure(c(3L,
4L, 5L, 1L, 3L, 4L, 5L, 1L, 2L), .Label = c("AJSKJA", "ASKJKA",
"GHDGH", "KLKIUK", "KLSAKJ"), class = "factor"), FarmName = structure(c(1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 5L), .Label = c("", "JJHGH",
"JKJKK", "JUISO", "SDLLS"), class = "factor"), Perform = c(13.04144378,
15.26566, 1.230474165, 13.9407486, 13.04144378, 1.230474165,
1.230474165, 13.9407486, 13.9407486), RunDate = structure(c(2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L), .Label = c("02/14/2020",
"02/15/2020"), class = "factor")), class = "data.frame", row.names = c(NA,
-9L))
Expected output for the second group:
structure(list(Source = c(1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L),
Date = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "11-Feb-20", class = "factor"),
FarmType = structure(c(3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L
), .Label = c("AJSKJA", "ASKJKA", "GHDGH", "KLKIUK", "KLSAKJ"
), class = "factor"), FarmName = structure(c(1L, 2L, 3L,
4L, 5L, 1L, 2L, 3L, 4L), .Label = c("", "JJHGH", "JKJKK",
"JUISO", "SDLLS"), class = "factor"), Perform = c(13.04144378,
1.230474165, 1.230474165, 13.9407486, 13.9407486, 13.04144378,
15.26566, 1.230474165, 13.9407486), RunDate = structure(c(1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("02/14/2020",
"02/15/2020"), class = "factor")), class = "data.frame", row.names = c(NA,
-9L))
Thank you for your time. Please let me know if you have questions.
We could group by 'Date' and filter those groups where the 'RunDate' is the latest after converting to Date class
library(lubridate)
library(dplyr)
new1 <- df1 %>%
group_by(Date) %>%
filter(mdy(RunDate) == max(mdy(RunDate)))
and for the second set, we can check if the number of distinct elements of 'RunDate' is more than 1
new2 <- df1 %>%
group_by(Date) %>%
filter(n_distinct(RunDate) > 1)
I want to make transition plot with three columns. I use Gmisc package but not the transitionPlot function since it does not enable me include third column. Therefore, I used the code below. My problem is that my result transition table is dark green and there is box shadow. Could you please help me how I can change the color and get rid of the shadow? Thank you. This is my first inquiry, if there is something wrong, sorry.
Here a dataframe sample (I took this from stackoverflow, since I do not have the data):
x <- structure(list(Obs = 1:13, Seq.1 = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("a", "b", "c" ), class = "factor"), Seq.2 = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("c", "d"), class = "factor"), Seq.3 = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("", "d", "e"), class = "factor"), Seq.4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("", "f"), class = "factor")), .Names = c("Obs", "Seq.1", "Seq.2", "Seq.3", "Seq.4"), class = "data.frame", row.names = c(NA, -13L))
library(Gmisc)
library(dplyr)
transitions <- table(x$Seq.1,x$Seq.2) %>%
getRefClass("Transition")$new(label=c("1st Iteration", "2nd Iteration"))
transitions$box_width = 0.25;
transitions$box_label_cex = 0.7;
transitions$arrow_type = "simple";
transitions$arrow_rez = 300;
table(x$Seq.2,x$Seq.3) %>% transitions$addTransitions(label = '3rd Iteration')
transitions$render()
I am given a big data set with several columns. As an example
set.seed(1)
x <- 1:15
y <- letters[1:3][sample(1:3, 15, replace = T)]
z <- letters[10:13][sample(1:3, 15, replace = T)]
r <- letters[20:24][sample(1:3, 15, replace = T)]
df <- data.frame("Number"=x, "Section"=y,"Chapter"=z,"Rating"=r)
dput(df)
structure(list(Number = 1:15, Area = structure(c(1L, 2L, 2L, 3L, 1L, 3L, 3L, 2L, 2L, 1L, 1L, 1L, 3L, 2L, 3L), .Label = c("a", "b", "c"), class = "factor"), Section = structure(c(2L, 3L, 3L, 2L, 3L, 3L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 3L, 2L), .Label = c("j", "k", "l"), class = "factor"), Rating = structure(c(2L, 2L, 2L, 1L, 3L, 3L, 3L, 1L, 3L, 2L, 3L, 2L, 3L, 2L, 2L), .Label = c("A", "B", "C"), class = "factor")), class = "data.frame", row.names = c(NA,-15L))
I would like now to create frequency tables and graphs split by rating and a a chosen category, e.g. via a string:
Category<-"Section"
data_count <- ddply(df, .(get(Category),Rating), 'count')
data_rel_freq <- ddply(data_count, .(Rating), transform, rel_freq = freq/sum(freq))
dput(data_rel_freq)
structure(list(get.Category. = structure(c(2L, 2L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 2L, 2L, 3L, 3L), .Label = c("j", "k","l"), class = "factor"), Number = c(4L, 8L, 10L, 12L, 1L, 15L, 2L, 3L, 14L, 7L, 9L, 11L, 13L, 5L, 6L), Area = structure(c(3L, 2L, 1L, 1L, 1L, 3L, 2L, 2L, 2L, 3L, 2L, 1L, 3L, 1L, 3L), .Label = c("a", b", "c"), class = "factor"), Section = structure(c(2L, 2L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 2L, 2L, 3L, 3L), .Label = c("j", "k", "l"), class = "factor"), Rating = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"), freq = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), rel_freq = c(0.5, 0.5, 0.142857142857143, 0.142857142857143, 0.142857142857143, 0.142857142857143, 0.142857142857143, 0.142857142857143, 0.142857142857143, 0.166666666666667, 0.166666666666667, 0.166666666666667, 0.166666666666667, 0.166666666666667, 0.166666666666667)), class = "data.frame", row.names = c(NA, -15L))
Using ggplot
ggplot(data_rel_freq,aes(x = Rating, y = rel_freq,fill = get(Category)))+
geom_bar(position = "fill",stat = "identity",color="black") +
scale_y_continuous(labels = percent_format())+
labs(x = "Rating", y="Relative Frequency")
The issue is now that "get(Category)" is now treated as a new column
get.Category. Number Area Section Rating freq rel_freq
1 k 4 c k A 1 0.5000000
2 k 8 b k A 1 0.5000000
3 j 10 a j B 1 0.1428571
4 j 12 a j B 1 0.1428571
5 k 1 a k B 1 0.1428571
6 k 15 c k B 1 0.1428571
7 l 2 b l B 1 0.1428571
Moreover, the Number column should be summed, e.g. the other categories (here: Area) should be dropped and it we should have just one line with for Section "k" with Rating "A".
We can use count to get the frequency of the column 'Section' by evaluating the object identifier 'Category' after converting to symbol (sym) and evaluate (!!) it. Within the ggplot syntax, the aes can also take a symbol and can be evaluated as earlier
library(tidyverse)
library(scales)
library(ggplot2)
df %>%
count(!! rlang::sym(Category), Rating) %>%
group_by(Rating) %>%
mutate(rel_freq = n/sum(n)) %>%
ggplot(., aes(x =Rating, y = rel_freq, fill = !! rlang::sym(Category))) +
geom_bar(position = "fill",stat = "identity",color="black") +
scale_y_continuous(labels = percent_format())+
labs(x = "Rating", y="Relative Frequency")
-output
I am severly struggling with a data-rearrangement problem. The data below contains agreements (rows) which collapsed or were stable (column "collapse") and feature provisions which were reduced, kept, added or absent (columns "diff.pps_leadership","diff.pps_cabinet", etc.)
I want to rearrange the data so that I get an overview of how many % of those agreements which reduced, kept, or added a specific provision collapsed. The rows should be the provisions (diff.pps_leadership...), the columns should be "reduced, "kept", and "added". And the content of the cells should be the % of those collapsed (only in relation to those which reduced, kept, or added the provision; not the total).
In Excle I would do this in pivot table, but I haven't been able to get there with R. I tried the cast, aggregate, melt, and transpose commands, but haven't succeeded.
Eventually, the result should look similar to this
https://docs.google.com/spreadsheets/d/1yhIbvTQTYkkwSFVxWEnPwvSvwTc0vuTYZxa15Eh1lT8/edit?usp=sharing
Hope my question is not too specific. Grateful for any hint/advice.
example <- structure(list(Agreement = structure(c(8L, 4L, 6L, 9L, 2L, 3L,
7L, 10L, 5L, 1L), .Label = c("Abuja Agreement", "Accra Peace Agreement",
"Arusha Agreement", "Arusha/Global Ceasefire Agreement", "Comprehensive Peace Agreement",
"InterabsentCongolese Dialogue", "Lome Agreement", "Lusaka Protocol",
"Ouagadougou Agreement", "Tansitional Constituion"), class = "factor"),
diff.pps_cabinet = structure(c(2L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L), .Label = c("kept", "reduced"), class = "factor"),
diff.pps_leadership = structure(c(1L, 2L, 3L, 3L, 3L, 3L,
3L, 3L, 2L, 3L), .Label = c("absent", "kept", "reduced"), class = "factor"),
diff.mps_milcmd = structure(c(3L, 2L, 3L, 3L, 3L, 3L, 1L,
3L, 2L, 3L), .Label = c("absent", "kept", "reduced"), class = "factor"),
diff.mps_armyint = structure(c(3L, 2L, 2L, 3L, 3L, 3L, 1L,
3L, 2L, 3L), .Label = c("absent", "kept", "reduced"), class = "factor"),
diff.eps_commission = structure(c(1L, 1L, 1L, 1L, 3L, 1L,
3L, 1L, 2L, 3L), .Label = c("absent", "kept", "reduced"), class = "factor"),
diff.eps_company = structure(c(1L, 2L, 1L, 1L, 3L, 1L, 1L,
1L, 2L, 3L), .Label = c("absent", "kept", "reduced"), class = "factor"),
diff.veto_leg = structure(c(1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = c("absent", "added"), class = "factor"),
diff.tps_devolution = structure(c(2L, 1L, 2L, 3L, 1L, 1L,
1L, 2L, 2L, 1L), .Label = c("absent", "kept", "reduced"), class = "factor"),
diff.ca.psh = structure(c(3L, 2L, 1L, 1L, 4L, 1L, 1L, 1L,
4L, 1L), .Label = c("absent", "added", "kept", "reduced"), class = "factor"),
collapse = structure(c(1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L,
1L), .Label = c("collapse", "stable"), class = "factor")), .Names = c("Agreement",
"diff.pps_cabinet", "diff.pps_leadership", "diff.mps_milcmd",
"diff.mps_armyint", "diff.eps_commission", "diff.eps_company",
"diff.veto_leg", "diff.tps_devolution", "diff.ca.psh", "collapse"
), class = "data.frame", row.names = c(NA, -10L))
The following gets the job done.
library(data.table)
setDT(example)
mvs <- c("diff.pps_cabinet", "diff.pps_leadership",
"diff.mps_milcmd", "diff.mps_armyint")
vls <- c("reduced", "kept", "added", "absent")
melt(example, c("Agreement", "collapse"), mvs
)[ , setNames(vapply(
vls, function(vv) list(paste0(
s <- sum(collapse[idx <- value == vv] == "collapse"),
" out of ", sum(idx), " = ", floor(100 * s / sum(idx)), "% collapsed"),
paste(Agreement[idx], collapse = "\n")),
vector("list", 2)),
paste0(rep(vls, each = 2),
c(".percent", ".names"))), by = variable]
Current prints NaN when there's nothing; to fix this, replace sum(idx) in the denominator by (if (!any(idx)) 1 else sum(idx)).
I am trying to print a-priori contrasts with type III sums of squares results. (Please don't speak about type I vs. type III. That's not the point of my question.) I can print the contrasts like I need using summary.aov(), however that uses type I SS. When I use the Anova() function from library(car) to get type III SS, it doesn't print the contrasts. I have also tried using drop1() with the lm() model, but this just prints the same results as Anova() (without the contrasts).
Please advise on a way to print the results of the contrasts with type III SS. An example follows.
Sample data:
DF <- structure(list(Code = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L,
3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 9L,
9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L, 12L, 12L), .Label = c("A",
"B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L"), class =
"factor"), GzrTreat = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), contrasts = structure(c(1,
-2, 1, 1, 0, -1), .Dim = c(3L, 2L), .Dimnames = list(c("I",
"N", "R"), NULL)), .Label = c("I", "N", "R"), class = "factor"),
BugTreat = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label =
c("Immigration", "Initial", "None"), class = "factor"), TempTreat =
structure(c(2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = c("Not Warm", "Warmed"), class =
"factor"), ShadeTreat = structure(c(2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L,
2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L), .Label = c("Light",
"Shaded"), class = "factor"), EpiChla = c(0.268482353, 0.423119608,
0.579507843, 0.738839216, 0.727856863, 0.523960784, 0.405801961,
0.335964706, 0.584441176, 0.557543137, 0.436456863, 0.563909804,
0.432398039, 0.344956863, 0.340309804, 0.992884314, 0.938390196,
0.663270588, 0.239833333, 0.62875098, 0.466011765, 0.536182353,
0.340309804, 0.721172549, 0.752082353, 0.269372549, 0.198180392,
1.298882353, 0.298354902, 0.913139216, 0.846129412, 0.922317647,
0.727033333, 1.187662745, 0.35622549, 0.073547059), log_EpiChla =
c(0.10328443, 0.153241402, 0.198521787, 0.240259426, 0.237507762,
0.182973791, 0.147924145, 0.125794985, 0.19987612, 0.192440084,
0.157292589, 0.194211702, 0.156063718, 0.128708355, 0.127205194,
0.299482089, 0.287441205, 0.220962908, 0.093363308, 0.21185469,
0.166137456, 0.186442772, 0.127205194, 0.235824411, 0.243554515,
0.103589102, 0.078522208, 0.361516746, 0.113393422, 0.281746574,
0.266262141, 0.283825153, 0.23730072, 0.339980371, 0.132331903,
0.030821087), MeanZGrowthAFDM_g = c(0.00665, 0.003966667, 0.004466667,
0.01705, 0.0139, 0.0129, 0.0081, 0.003833333, 0.00575, 0.011266667,
0.0103, 0.009, 0.0052, 0.00595, 0.0105, 0.0091, 0.00905, 0.0045, 0.0031,
0.006466667, 0.0053, 0.009766667, 0.0181, 0.00725, 0, 0.0012, 5e-04,
0.0076, 0.00615, 0.0814, NA, 0.0038, 0.00165, 0.0046, 0, 0.0015)),
.Names = c("Code", "GzrTreat", "BugTreat", "TempTreat", "ShadeTreat",
"EpiChla", "log_EpiChla", "MeanZGrowthAFDM_g"), class = "data.frame",
row.names = c(NA, -36L))
Code:
## a-priori contrasts
library(stats)
contrasts(DF$GzrTreat) <- cbind(c(1,-2,1), c(1,0,-1))
round(crossprod(contrasts(DF$GzrTreat)))
c_labels <- list(GzrTreat=list('presence'=1, 'immigration'=2))
## model
library(car)
EpiLM <- lm(log_EpiChla~TempTreat*GzrTreat*ShadeTreat, DF)
summary.aov(EpiLM, split=c_labels) ### MUST USE summary.aov(), to get
#contrast results, but sadly this uses Type I SS
Anova(EpiLM, split=c_labels, type="III") # Uses Type III SS, but NO
#CONTRASTS!!!!!
drop1(EpiLM, ~., test="F") # again, this does not print contrasts
# I need contrast results like from summary.aov(), AND Type III SS
# like from Anova()