Plotting subset os subset in R - r

I have a data.frame like following:
files
Total 1000
Subset1 587
Subset2 123
I would like to represent the above data frame in a such way that of 123 files is a subset of 587 which itself subset of 1000. When I use pie or bar graphs, it is misleading.
My sincere apologies if my question is very amateurish. Kindly guide me how can represent the above data in R plots.

Perhaps something like this:
df = data.frame(files=c(1000,587,123),row.names = c('total','subset1','subset2'))
library(VennDiagram)
draw.triple.venn(area1 = df$files[1], area2 = df$files[2], area3 = df$files[3],
n12 = 587, n23 = 123, n13 = 123, n123 = 123,
category = c("Total", "Subset1", "Subset2"),
lty = "blank", fill = c("skyblue", "pink1", "mediumorchid"),
cat.pos = 0,cat.dist = c(-0.02,-0.05,-0.02))
Result:

You could do it as follows:
df$files[1] <- df$files[1] - sum(df$files[-1])
pie(df$files, df$sets)
The result:
Data:
df <- read.table(text=" sets files
Total 1000
Subset1 587
Subset2 123 ", header=TRUE)

Related

Plotting every three rows from data frame

I would like to make some plots from my data. Unfortunately, it is hard to predict how many plots I will generate because it depends on data and may be different. It is a reason why I would like to make it easy adjustable. However, it will be most often a plot from group of 3 rows each time.
So, I would like to plot from rows 1:3, 4-6,7-9, etc.
This is data:
> dput(DF_final)
structure(list(AC = c(0.0031682160632777, 0.00228591145206846,
0.00142094444568728, 0.000661218113472149, 0.0010078157353918,
0.000400289437089513, 40.4634784175177, 40.5055070858594, 0.0183737773741582
), SD = c(0.00250647379467532, 0.0013244185401148, 0.000469332241199189,
0.000294558308707343, 0.000385553400676202, 0.000104447914881357,
11.0693842400794, 8.78768774254084, 0.00696532251341454), ln_AC = c(-5.75458660556339,
-6.08099044923792, -6.556433525855, -7.32142679754668, -6.89996992823399,
-7.8233226797995, 3.70039979980691, 3.70143794229703, -3.99683077355773
), ln_SD = c(-5.98887837626238, -6.62678175351058, -7.66419963690747,
-8.13003358225542, -7.86083085139947, -9.16682203300101, 2.40418312097106,
2.17335162163583, -4.96681136795312), Percent_AC = c(126.401324043689,
172.597361244303, 302.758754023937, 224.477834753288, 261.394591157605,
383.243109777925, 365.544076706723, 460.934756361151, 263.789326894369
), Percent_SD = c(100, 100, 100, 100, 100, 100, 100, 100, 100
), TP = c(0, 40, 80, 0, 40, 80, 0, 40, 80)), row.names = c("Tim_0",
"Tim_40", "Tim_80", "Jack_0", "Jack_40", "Jack_80", "Tom_0",
"Tom_40", "Tom_80"), class = "data.frame")
Column ln_AC should be set as an Y axis and column TP as X axis. First of all I would like to have all of them on separate graphs next to each other (remember about issue that the number of plots may be igh at some point) and if possible everything at the same graph. It should be a point plot with trend line.
Is it also possible to get a slope, SD slope, R^2 on a plot from linear regression ?
I manage to do it a for a single plot but regression line looks strange...
The code below was used to generate this plot and regression line.
fit <- lm(DF_final$ln_AC~DF_final$TP, data=DF_final)
plot(DF_final[1:3,7], DF_final[1:3,3], type = "p", ylim = c(-10,0), xlim=c(0,100), col = "red")
lines(DF_final$TP, fitted(fit), col="blue")
In base R (without so many packages), you can do:
# splits every 3 rows
DF = split(DF_final,gsub("_[^ ]*","",rownames(DF_final) ))
# you can also do
# DF = split(DF_final,(1:nrow(DF_final) - 1) %/%3 ))
To store your values:
slopes = vector("numeric",3)
names(slopes) = names(DF)
rsq = vector("numeric",3)
names(rsq) = names(DF)
To plot:
par(mfrow=c(1,3))
for(i in names(DF)){
fit <- lm(ln_AC~TP, data=DF[[i]])
plot(DF[[i]]$TP, DF[[i]]$ln_AC, type = "p", col = "red",main=i)
abline(fit, col="blue")
slopes[i]=round(fit$coefficients[2],digits=2)
rsq[i]=round(summary(fit)$r.squared,digits=2)
mtext(side=1,paste("slope=",slopes[i],"\nrsq=",rsq[i]),
padj=-2,cex=0.7)
}
And your values:
slopes
Jack Tim Tom
-0.01 -0.01 -0.10
rsq
Jack Tim Tom
0.29 0.99 0.75
If I understand correctly, the reason you want 3 observation per graph is because you have different individuals (Jack,Tim,Tom) . Is that so?
If you don't want to worry about that number, you can do this
# move rownames to column
data$person <- rownames(data)
data$person <- gsub("\\_.*","",data$person) # remove TP from names
# better to use library(data.table) for this step
data <- melt(data,id.vars=c("person","TP","ln_AC"))
ggplot(data,aes(x=TP, y=ln_AC)) + geom_point() +
geom_smooth(method = "lm") + facet_grid(~person)
This results in a plot like #giocomai, but it will work also if you have 4,5,6 or whatever persons in your data.
---- Edit
If you want to add R2 values, you can do something like this. Note, that it may not be the best and elegant solution, but it works.
data <- data.frame(...)
data$person <- rownames(data)
data$person <- gsub("\\_.*","",data$person)
# run lm for all persons and save them in a data.frame
nomi <- unique(data$person)
#lmStats <- data.frame()
lmStats <- sapply(nomi,
function(ita){
model <- lm(ln_AC~TP,data= data[which(data$person == ita),])
lmStat <- summary(model)
# I only save r2, but you can get all the statistics you need
lmRow <- data.frame("r2" = lmStat$r.squared )
#lmStats <- rbind(lmStats,lmRow)
}
)
lmStats <- do.call(rbind,lmStats)
# format the output,and create a dataframe we will use to annotate facet_grid
lmStats <- as.data.frame(lmStats)
rownames(lmStats) <- gsub("\\..*","",rownames(lmStats))
lmStats$person <- rownames(lmStats)
colnames(lmStats)[1] <- "r2"
lmStats$r2 <- round(lmStats$r2,2)
lmStats$TP <- 40
lmStats$ln_AC <- 0
lmStats$lab <- paste0("r2= ",lmStats$r2)
# melt and add r2 column to the data (not necessary, but I like to have everything I plot in teh data)
data <- melt(data,id.vars=c("person","TP","ln_AC"))
data$r2 <- lmStats[match(data$person,rownames(lmStats)),1]
ggplot(data,aes(x=TP, y=ln_AC)) + geom_point() +
geom_smooth(method = "lm") + facet_grid(~person) +
geom_text(data=lmStats,label=lmStats$lab)
An easier way (less steps) would be to use facet_grid(~r2), so that you have the R.square value in the title.
If I understand correctly what you mean, assuming you will always have three observation per graph, your main issue would be creating a categorical variable to separate them. Here's one way to accomplish it. Depending on the layout you prefer, you may want to check facet_wrap instead of facet_grid.
library("dplyr")
library("ggplot2")
DF_final <- structure(list(AC = c(0.0031682160632777, 0.00228591145206846,
0.00142094444568728, 0.000661218113472149, 0.0010078157353918,
0.000400289437089513, 40.4634784175177, 40.5055070858594, 0.0183737773741582
), SD = c(0.00250647379467532, 0.0013244185401148, 0.000469332241199189,
0.000294558308707343, 0.000385553400676202, 0.000104447914881357,
11.0693842400794, 8.78768774254084, 0.00696532251341454), ln_AC = c(-5.75458660556339,
-6.08099044923792, -6.556433525855, -7.32142679754668, -6.89996992823399,
-7.8233226797995, 3.70039979980691, 3.70143794229703, -3.99683077355773
), ln_SD = c(-5.98887837626238, -6.62678175351058, -7.66419963690747,
-8.13003358225542, -7.86083085139947, -9.16682203300101, 2.40418312097106,
2.17335162163583, -4.96681136795312), Percent_AC = c(126.401324043689,
172.597361244303, 302.758754023937, 224.477834753288, 261.394591157605,
383.243109777925, 365.544076706723, 460.934756361151, 263.789326894369
), Percent_SD = c(100, 100, 100, 100, 100, 100, 100, 100, 100
), TP = c(0, 40, 80, 0, 40, 80, 0, 40, 80)), row.names = c("Tim_0",
"Tim_40", "Tim_80", "Jack_0", "Jack_40", "Jack_80", "Tom_0",
"Tom_40", "Tom_80"), class = "data.frame")
DF_final %>%
mutate(id = as.character(sapply(1:(nrow(DF_final)/3), rep, 3))) %>%
ggplot(aes(x=TP, y=ln_AC)) +
geom_point() +
geom_smooth(method = "lm") +
facet_grid(~id)
Created on 2020-02-06 by the reprex package (v0.3.0)

Rstudio Venn Diagram with several numbers

I Have two dataset, where im counting the number of each sample within that dataset. And i want to combine the numbers of each sample for both dataset and then create a VennDiagram using R, with the main numbers of the largest dataset and in parenthesis the smaller dataset numbers.
The R code i'm currently using now
grid.newpage()
temp <- draw.triple.venn(area1 = 428,
area2 = 145,
area3 = 303,
n12 = 26,
n23 = 16,
n13 = 14,
n123 = 9,
category = c("text1", "text2", "text3"),
scaled=FALSE,
ext.text=FALSE)
grid.draw(temp)
pdf(file="figure1.pdf")
grid.draw(temp)
dev.off()
The result i get
What i would like to create
I have looked at the eulerr package and the Venn diagram package but i cant seem to figure out if there is a way to make this possible.
All help is much appreciated. Many thanks.
Not sure what the 'small' dataset looks like, but this does the trick
Here, you add traces to the grid with custom labels, and y-coordinates proportional to the y-coordinates of you 'large' dataset
library(VennDiagram)
temp <- draw.triple.venn(area1 = 428,
area2 = 145,
area3 = 303,
n12 = 26,
n23 = 16,
n13 = 14,
n123 = 9,
category = c("text1", "text2", "text3"),
scaled=FALSE,
ext.text=FALSE)
temp[[17]] <- temp[[13]]
temp[[17]]$y <- temp[[13]]$y * 0.8
temp[[17]]$label <- '(3)'
temp[[18]] <- temp[[9]]
temp[[18]]$y <- temp[[9]]$y * 0.95
temp[[18]]$label <- '(4)'
temp[[19]] <- temp[[7]]
temp[[19]]$y <- temp[[7]]$y * 0.95
temp[[19]]$label <- '(5)'
grid.draw(temp)

R: plot multiple curves vs one var but for 4 factors

I have a DF that looks like:
id app vac dac
1: 1 1000802 579 455
2: 1 1000803 1284 918
3: 1 1000807 68 66
4: 1 1000809 1470 903
5: 2 1000802 407 188
6: 2 1000803 365 364
7: 2 1000807 938 116
8: 2 1000809 699 570
I need to plot vac and dac for each app on same canvas as a function of id. I know how to do it for only one app by using melt and bulk-plot with ggplot. But I'm stuck how to do it for arbitrary number of factors/levels.
In this example there will be total 8 curves for 4 app. Any thoughts?
Here's the data frame for tests. Thank you!!
df = structure(list(id = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), app = c(1000802,
1000803, 1000807, 1000809, 1000802, 1000803, 1000807, 1000809
), vac = c(579, 1284, 68, 1470, 407, 365, 938, 699), dac = c(455,
918, 66, 903, 188, 364, 116, 570)), .Names = c("id", "app", "vac",
"dac"), class = c("data.table", "data.frame"), row.names = c(NA,
-8L))
Edit: some clarification on axes,
x axis = id, y axis = values of vac and dac for each of 4 app factors
It is a bit unclear what you are looking for, but if you are looking for a line connecting the values of vac and dac, here is a solution using dplyr and tidyr.
First, gather the vac and dac columns (this is similar to reshape2::melt but with a syntax I find easier to follow). Then, set the variable (which has "vac" and "dac") as your x-locations, the value (from the old vac and dac columns) as your y and then map app and id to aesthetics (here, color and linetype). Set the group to ensure that it connects the right pairs of points, and add geom_line:
df %>%
gather(variable, value, vac, dac) %>%
ggplot(aes(x = variable
, y = value
, color = factor(app)
, linetype = factor(id)
, group = paste(app, id))) +
geom_line()
gives
Given the question edit, you can change axes like so:
df %>%
gather(variable, value, vac, dac) %>%
ggplot(aes(x = id
, y = value
, color = factor(app)
, linetype = variable
, group = paste(app, variable))) +
geom_line()
gives
I not sure, I understood your question but I would do something like
ggplot(df,aes(vac,app,group=app)) + geom_point(aes(color=factor(app)))

Draw a graph in R with header elaborate on two columns

I have a table with header expanded on two columns. How to draw a 3D graph on this table OR what would be a way to draw a graph on tables having elaborated headers. Kindly suggest me alternate ways to achieve this (if any)
Crime Table:
year
2014 2015 2016
Reported Detected Reported Detected Reported Detected
Murder 221 208 178 172 26 20
Murder(Gain) 20 16 11 9 1 1
Dacoity 51 45 44 36 5 1
Robbery 538 316 351 201 23 10
Chain Snatching 528 394 342 229 23 0
Code:
library(tables)
#CLASS 1 CRIMES 2014
c14 <- structure(list(`Reported` = c(221, 20, 51,
538, 528), `Detected` = c(208, 16, 45, 316, 394)), .Names = c("Reported",
"Detected"), row.names = c("Murder", "Murder(Gain)", "Dacoity", "Robbery", "Chain Snatching"), class = "data.frame")
c14
#CLASS 1 CRIMES 2015
c15 <- structure(list(`Reported` = c(178, 11, 44,
351, 342), `Detected` = c(172, 9,
36, 201, 229)), .Names = c("Reported",
"Detected"), row.names = c("Murder", "Murder(Gain)", "Dacoity",
"Robbery", "Chain Snatching"), class = "data.frame")
c15
#CLASS 1 CRIMES 31-01-2016
c16 <- structure(list(`Reported` = c(26, 1, 5,
23, 23), `Detected` = c(20, 1,
1, 10, 0)), .Names = c("Reported",
"Detected"), row.names = c("Murder", "Murder(Gain)", "Dacoity",
"Robbery", "Chain Snatching"), class = "data.frame")
c16
# rbind with rownames as a column
st <- rbind(
data.frame(c14, year = '2014', what = factor(rownames(c14), levels = rownames(c14)),
row.names= NULL, check.names = FALSE),
data.frame(c15,year = '2015',what = factor(rownames(c15), levels = rownames(c15)),
row.names = NULL,check.names = FALSE),
data.frame(c16,year = '2016',what = factor(rownames(c16), levels = rownames(c16)),
row.names = NULL,check.names = FALSE)
)
crimetable <- tabular(Heading()*what ~ year*(`Reported` +`Detected`)*Heading()*(identity),data=st)
crimetable
As I hate 3D plots for 3-way tables and I like ggplot2, I suggest this:
Gather your data into "long" format:
library(tidyr)
st_long = gather(st, type, count, -c(year, what))
head(st_long, 3)
# year what type count
# 1 2014 Murder Reported 221
# 2 2014 Murder(Gain) Reported 20
# 3 2014 Dacoity Reported 51
As you can see, both Detected and Reported columns are now included in the same column called type. This is useful for ggplot2, as it can easily create facets. Facets are separate elements within the plot that share the same aesthetic components but work with on different groups of data:
library(ggplot2)
ggplot(st_long, aes(year, count, group = what, color = what)) +
geom_line() +
facet_wrap(~ type)
(I am not saying that line plot is the only/best plot here, but it is often used when comparing frequencies across different time-points.)

Plot concentric pie charts in r

I'm trying to do a concentric pie chart. The internal pie represent three classes of subjects and each class has to be splitted in 3 sub-classes (of course the slices for the sub-classes have to be in line with the corresponding internal slice).
this is what I tried:
layout(matrix(c(1,1,1,1,2,1,1,1,1), nrow=3)); pie(x=c(14,22,15,3,15,33,0,6,45),labels="",col=c("#f21c39","#dba814","#7309de")); pie(x=c(51,51,51),labels=c("O","VG","V"),col=c("#c64719","#0600f5","#089c1f"))
This worked, but the internal pie is too small. I tried to play with the radius option, but then the external slices are not correspondent to the internal ones. how can I adjust them?
Use par(new=TRUE) to overplot the pies rather than layout() in this case
pie(x=c(14,22,15,3,15,33,0,6,45),labels="",
col=c("#f21c39","#dba814","#7309de"))
par(new=TRUE)
pie(x=c(51,51,51),labels=c("O","VG","V"),radius=.5,
col=c("#c64719","#0600f5","#089c1f"))
Three years later. this can be achieved using sunburstR package. http://timelyportfolio.github.io/sunburstR/example_baseball.html
Example:
DF <- data.frame(LOGRECNO = c(60, 61, 62, 63, 64, 65),
STATE = c(1, 1, 1, 1, 1, 1),
COUNTY = c(1, 1, 1, 1, 1, 1),
TRACT = c(21100, 21100, 21100, 21100, 21100, 21100),
BLOCK = c(1053, 1054, 1055, 1056, 1057, 1058))
DF$BLOCKID <-
paste(DF$LOGRECNO, DF$STATE, DF$COUNTY,
DF$TRACT, DF$BLOCK, sep = "-")
DF %>%
select(BLOCKID) %>%
group_by(BLOCKID) %>%
summarise(Tots=n())->dftest
sunburst(dftest)
I'm sure you are able to adapt this to suit your needs!
you could also use the ggsunburst package
# install ggsunburst
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("rPython")) install.packages("rPython")
install.packages("http://genome.crg.es/~didac/ggsunburst/ggsunburst_0.0.9.tar.gz", repos=NULL, type="source")
library(ggsunburst)
df <- read.table(header=T, text = "
parent node size
O 1 14
O 2 22
O 3 15
V 1 3
V 2 15
V 3 33
VG 1 1
VG 2 6
VG 3 45")
write.table(df, file = 'df.txt', sep = ',', row.names = F)
sb <- sunburst_data('df.txt', type = "node_parent", sep = ",")
p <- sunburst(sb, node_labels = T, leaf_labels = F, rects.fill.aes = "name")
cols <- c("O" = "#c64719", "V" = "#0600f5", "VG" = "#089c1f", "1" = "#f21c39", "2" = "#dba814", "3" = "#7309de")
p + scale_fill_manual(values = cols)

Resources