Related
I have this plot
With
> str(a)
'data.frame': 150 obs. of 2 variables:
$ study: Factor w/ 7 levels "A","S","H","D",..: 7 2 4 5 3 1 7 2 2 4 ...
$ n : Factor w/ 6 levels "N0","N1","N2a",..: 1 1 2 4 1 1 2 1 1 1 ...
I would like the x-axis to arrange by sample size, i.e. level = c("all", "S", "H", "B", "C", "A", "K", "D")
As you can see, the order is printed alphabetically.
I have tried specifying the order as ... aes(x = factor(nystudie, level = c(...))), but that does not work. What am I doing wrong? I followed this post.
library(tidyverse)
# Stage palette: used as-is for the bar outlines and, made semi-transparent,
# for the bar fills.
colsze <- c(
  "#E1B930", "#2C77BF", "#E38072", "#6DBCC3",
  "grey40", "black", "#8B3A62"
)

# Count stages per study, append a pooled "all" group, and draw dodged bars
# with each count printed above its bar.
a %>%
  as_tibble() %>%
  mutate(
    nystudie = as.factor(study),
    n.seven = as.factor(n)
  ) %>%
  # Stack a second copy of every row, relabelled "all", to form the pooled group.
  bind_rows(., mutate(., nystudie = "all")) %>%
  count(nystudie, n.seven, .drop = FALSE) %>%
  ggplot(
    aes(
      x = factor(
        nystudie,
        levels = c("all", "S", "H", "B", "C", "A", "K", "D")
      ),
      y = n,
      color = n.seven,
      fill = n.seven,
      label = n
    )
  ) +
  geom_col(position = position_dodge2(preserve = "single", padding = 0.1)) +
  geom_text(
    aes(label = n),
    position = position_dodge2(0.9),
    vjust = -0.25,
    fontface = 2,
    cex = 4.5,
    show.legend = FALSE
  ) +
  scale_fill_manual(
    values = alpha(colsze, 0.2),
    name = "Stage",
    labels = c("N0", "N1", "N2a", "N2b", "N2c", "N3")
  ) +
  scale_color_manual(
    values = colsze,
    name = "Stage",
    labels = c("N0", "N1", "N2a", "N2b", "N2c", "N3")
  ) +
  # NOTE: these labels are listed alphabetically, whereas the x factor above
  # is ordered by sample size.
  scale_x_discrete(
    name = "",
    labels = c(
      "All\n(n=1,905)",
      "A\n(n=221)",
      "B\n(n=234)",
      "C\n(n=232)",
      "D\n(n=108)",
      "H\n(n=427)",
      "K\n(n=221)",
      "S\n(n=462)"
    )
  ) +
  scale_y_continuous(name = "", breaks = seq(0, 950, 100)) +
  coord_cartesian(ylim = c(0, 950)) +
  guides(fill = guide_legend(nrow = 1)) +
  theme(
    axis.text.x = element_text(color = "grey20", size = 15),
    legend.text = element_text(size = 16),
    legend.title = element_text(size = 16, face = "bold"),
    legend.position = "top"
  )
Data sample
a <- structure(list(study = structure(c(7L, 2L, 4L, 5L, 3L, 1L, 7L,
2L, 2L, 4L, 4L, 6L, 2L, 5L, 3L, 7L, 1L, 1L, 2L, 6L, 1L, 3L, 2L,
7L, 2L, 2L, 6L, 6L, 6L, 2L, 1L, 2L, 6L, 1L, 2L, 2L, 3L, 4L, 2L,
3L, 2L, 5L, 2L, 3L, 6L, 5L, 3L, 2L, 4L, 3L, 5L, 6L, 2L, 7L, 2L,
3L, 3L, 3L, 7L, 7L, 3L, 4L, 1L, 1L, 2L, 2L, 6L, 2L, 3L, 2L, 3L,
2L, 1L, 2L, 3L, 5L, 3L, 1L, 1L, 1L, 7L, 4L, 3L, 2L, 4L, 3L, 3L,
3L, 2L, 6L, 7L, 3L, 2L, 2L, 6L, 2L, 2L, 6L, 7L, 3L, 3L, 3L, 6L,
2L, 2L, 7L, 7L, 1L, 1L, 6L, 3L, 3L, 7L, 1L, 2L, 7L, 1L, 1L, 7L,
4L, 4L, 4L, 2L, 3L, 3L, 6L, 1L, 4L, 6L, 3L, 5L, 5L, 3L, 3L, 7L,
5L, 3L, 6L, 3L, 5L, 2L, 3L, 7L, 6L, 2L, 1L, 6L, 5L, 1L, 6L), .Label = c("A",
"S", "H", "D", "K", "C", "B"), class = "factor"), n = structure(c(1L,
1L, 2L, 4L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 4L, 2L, 1L, 2L,
3L, 2L, 2L, 4L, 4L, 4L, 2L, 4L, 1L, 2L, 4L, 1L, 1L, 4L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 4L, 1L, 1L, 4L, 2L, 1L, 1L, 4L, 1L, 1L, 2L,
1L, 5L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 4L, 1L, 2L, 1L,
4L, 1L, 1L, 1L, 1L, 6L, 1L, 2L, 5L, 4L, 2L, 6L, 1L, 4L, 2L, 4L,
2L, 1L, 1L, 4L, 1L, 2L, 1L, 1L, 4L, 4L, 4L, 1L, 4L, 2L, 1L, 1L,
4L, 2L, 1L, 2L, 1L, 5L, 5L, 1L, 4L, 1L, 2L, 2L, 4L, 1L, 1L, 1L,
2L, 4L, 4L, 1L, 5L, 2L, 1L, 5L, 2L, 4L, 1L, 1L, 1L, 4L, 4L, 1L,
1L, 4L, 4L, 4L, 1L, 4L, 4L, 1L, 4L, 5L, 4L, 5L, 1L, 5L, 1L, 1L,
4L, 2L, 1L, 2L, 4L), .Label = c("N0", "N1", "N2a", "N2b", "N2c",
"N3"), class = "factor")), row.names = c(NA, -150L), class = "data.frame")
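The factor() call does reorder the bars, but the unnamed labels passed to scale_x_discrete() are applied to the levels by position, so the alphabetically ordered labels end up on the wrong bars. Set the labels together with the levels instead. Try: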
library(dplyr)
library(ggplot2)
# Count stages per study, append a pooled "all" group, and draw dodged bars
# ordered by sample size. Relies on `a` and the `colsze` palette being defined
# beforehand.
a %>%
  mutate(
    nystudie = as.factor(study),
    n.seven = as.factor(n)
  ) %>%
  # Stack a second copy of every row, relabelled "all", to form the pooled group.
  bind_rows(., mutate(., nystudie = "all")) %>%
  count(nystudie, n.seven, .drop = FALSE) %>%
  # Set the display order and the axis text in one factor() call so that each
  # label stays attached to its own level instead of being matched by position
  # in the scale.
  mutate(
    nystudie = factor(
      nystudie,
      levels = c("all", "S", "H", "B", "C", "A", "K", "D"),
      labels = c(
        "All\n(n=1,905)", "S\n(n=462)", "H\n(n=427)", "B\n(n=234)",
        "C\n(n=232)", "A\n(n=221)", "K\n(n=221)", "D\n(n=108)"
      )
    )
  ) %>%
  ggplot(
    aes(x = nystudie, y = n, color = n.seven, fill = n.seven, label = n)
  ) +
  geom_col(position = position_dodge2(preserve = "single", padding = 0.1)) +
  geom_text(
    aes(label = n),
    position = position_dodge2(0.9),
    vjust = -0.25,
    fontface = 2,
    size = 4.5, # `cex` is the base-graphics alias ggplot2 maps to `size`
    show.legend = FALSE
  ) +
  scale_fill_manual(
    values = alpha(colsze, 0.2),
    name = "Stage",
    labels = c("N0", "N1", "N2a", "N2b", "N2c", "N3")
  ) +
  scale_color_manual(
    values = colsze,
    name = "Stage",
    labels = c("N0", "N1", "N2a", "N2b", "N2c", "N3")
  ) +
  scale_x_discrete(name = "") +
  scale_y_continuous(name = "", breaks = seq(0, 950, 100)) +
  coord_cartesian(ylim = c(0, 950)) +
  guides(fill = guide_legend(nrow = 1)) +
  theme(
    axis.text.x = element_text(color = "grey20", size = 15),
    legend.text = element_text(size = 16),
    legend.title = element_text(size = 16, face = "bold"),
    legend.position = "top"
  )
I would like to calculate column D based on the date column A. Column D should hold the running number of observations so far within each group defined by column B.
Edit: fake data below
data <- structure(list(date = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 9L,
10L, 11L, 12L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("1/1/2015",
"1/2/2015", "1/3/2015", "1/4/2015", "1/5/2015", "1/6/2015", "5/10/2015",
"5/11/2015", "5/6/2015", "5/7/2015", "5/8/2015", "5/9/2015"), class = "factor"),
Country = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", "B",
"C"), class = "factor"), Value = c(215630672L, 1650864L,
124017368L, 128073224L, 97393448L, 128832128L, 14533968L,
46202296L, 214383720L, 243346080L, 85127128L, 115676688L,
79694024L, 109398680L, 235562856L, 235473648L, 158246712L,
185424928L), Number.of.Observations.So.Far = c(1L, 2L, 3L,
4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L
)), class = "data.frame", row.names = c(NA, -18L))
What function in R will create a column D like so?
We can group by 'Country' and create a sequence column with row_number():
library(dplyr)
# Number the rows 1, 2, ... within each Country.
df1 %>%
  group_by(Country) %>%
  mutate(NumberOfObs = row_number()) %>%
  # Drop the grouping so later verbs do not silently operate per Country.
  ungroup()
Or with base R
# ave() splits the row indices by Country and replaces each group's indices
# with 1..k, leaving the rows in their original positions.
df1$NumberOfObs <- ave(
  seq_len(nrow(df1)),
  df1$Country,
  FUN = seq_along
)
Or with table
# sequence(table(...)) yields the counters grouped by Country in level order,
# so they must be written back through a matching ordering; assigning them
# directly is only correct when df1 is already sorted by Country.
# order() keeps tied rows in their original order, so the counters follow the
# row order within each Country. Rows with a missing Country stay NA.
country_order <- order(df1$Country, na.last = NA)
df1$NumberOfObs <- rep(NA_integer_, nrow(df1))
df1$NumberOfObs[country_order] <- sequence(as.vector(table(df1$Country)))
Or in data.table
library(data.table)
# Convert df1 to a data.table by reference, then add the per-Country counter
# in place.
setDT(df1)
df1[, NumberOfObs := rowid(Country)]
# The empty [] prints the updated table.
df1[]
data
# Read the example data from a CSV file in the working directory.
df1 <- read.csv(file = "file.csv")
Can someone help me count matching rows from another data frame?
df1(out)
structure(list(Item = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), class = "factor", .Label = "0S1576"), LC = structure(c(1L,
1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L), class = "factor", .Label = c("MW92",
"OY01", "RM11")), Fiscal.Month = c("2019-M06", "2019-M07", "2019-M06",
"2019-M07", "2019-M08", "2019-M09", "2019-M06", "2019-M07", "2019-M08"
)), row.names = c(NA, -9L), class = "data.frame")
df2(tempdf)
structure(list(Item = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "0S1576", class = "factor"),
LC = structure(c(1L, 1L, 1L, 1L, 2L, 3L, 4L, 6L, 5L, 1L,
2L, 2L, 3L, 3L), .Label = c("MW92", "OY01", "RM11", "RS11",
"WK14", "WK15"), class = "factor"), Fiscal.Month = structure(c(1L,
2L, 3L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("2019-M06",
"2019-M07", "2019-M08", "2019-M09"), class = "factor"), fcst = c(22L,
21L, 20L, 19L, 12L, 10L, 10L, 12L, 10L, 12L, 10L, 10L, 10L,
10L)), row.names = c(NA, -14L), class = "data.frame")
For each Item, LC, and Fiscal.Month combination in df1, I want to count how many rows of df2 have the same combination.
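You can count with table() by converting the df2 keys to a factor whose levels are the df1 keys, so that only the df1 combinations are counted (including those with no match). Because the key spans more than one column, combine the columns with interaction() first.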
# Name the key columns explicitly so the counts do not change when df1 gains
# extra columns (such as a `freq` column added later).
key_cols <- c("Item", "LC", "Fiscal.Month")
# Using df1's key combinations as the factor levels restricts the table to
# those combinations and keeps the ones with zero matches.
table(factor(
  interaction(df2[key_cols]),
  levels = interaction(df1[key_cols])
))
#0S1576.MW92.2019-M06 0S1576.MW92.2019-M07 0S1576.OY01.2019-M06
# 2 1 3
#0S1576.OY01.2019-M07 0S1576.OY01.2019-M08 0S1576.OY01.2019-M09
# 0 0 0
#0S1576.RM11.2019-M06 0S1576.RM11.2019-M07 0S1576.RM11.2019-M08
# 3 0 0
Or a faster version using match and tabulate:
# Name the key columns explicitly: once `freq` has been added to df1,
# interaction(df1) would include it and re-running would match nothing.
key_cols <- c("Item", "LC", "Fiscal.Month")
# match() gives, for every df2 row, the df1 row with the same key (NA if none);
# tabulate() counts the hits per df1 row. The outer parentheses print the
# result.
(df1$freq <- tabulate(
  match(interaction(df2[key_cols]), interaction(df1[key_cols])),
  nbins = nrow(df1)
))
#[1] 2 1 3 0 0 0 3 0 0
Or sometimes even faster using fastmatch:
library(fastmatch)
# Name the key columns explicitly: if `freq` already exists in df1 (for
# example from the match() version), interaction(df1) would include it and
# nothing would match.
key_cols <- c("Item", "LC", "Fiscal.Month")
df1$freq <- tabulate(
  fmatch(interaction(df2[key_cols]), interaction(df1[key_cols])),
  nbins = nrow(df1)
)
I have two data.frames df.1 and df.2 that I would merge or otherwise select data from to create a new data.frame. df.1 contains information about each individual (ID), sampling event (Event), Site and sample number (Sample). The tricky part for me is that Site and the corresponding Sample for each ID-Event pairing is different. For example, F3-3 has Site "plum" for Sample "1" and M6-3 has Site "pear" for Sample "1".
df.2 has Sample1 and Sample2 which corresponds to the Sample information in df.1 by way of the ID-Event pairing.
I'd like to match/merge the information between these two data.frames. Essentially, get the "word" from Site in df.1 that matches the Sample number. An example (df.3) is below.
Each ID-Event pairing will only have one Site and corresponding Sample (e.g. "Apple" will correspond to "1", not to "1" and "4"). I know I could use merge if I was only matching, for example, Sample1 or Sample2, but I am not sure how to do this with both to populate Site1 and Site2 with the correctly matched word.
df.1 <- structure(list(ID = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("F1",
"F3", "M6"), class = "factor"), Sex = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("F", "M"), class = "factor"), Event = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L,
4L, 4L, 4L, 4L), Site = structure(c(1L, 3L, 9L, 7L, 8L, 10L,
2L, 6L, 4L, 5L, 1L, 9L, 7L, 8L, 10L, 5L, 10L, 2L, 6L, 4L, 5L,
1L, 9L, 2L, 6L, 4L, 5L, 1L, 8L, 3L, 10L, 4L, 2L, 6L, 4L, 5L,
1L), .Label = c("Apple", "Banana", "Grape", "Guava", "Kiwi",
"Mango", "Orange", "Peach", "Pear", "Plum"), class = "factor"),
Sample = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L,
3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L)), .Names = c("ID",
"Sex", "Event", "Site", "Sample"), class = "data.frame", row.names = c(NA,
-37L))
#
df.2 <- structure(list(Sample1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L), Sample2 = c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
3L, 4L, 5L), V1 = c(0.12, 0.497, 0.715, 0, 0.001, 0, 0.829, 0,
0, 0.001, 0, 0.829), V2 = c(0.107, 0.273, 0.595, 0, 0.004, 0,
0.547, 0.001, 0.001, 0.107, 0.273, 0.595), ID = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("F1",
"M6"), class = "factor"), Sex = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("F", "M"), class = "factor"),
Event = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L)), .Names = c("Sample1",
"Sample2", "V1", "V2", "ID", "Sex", "Event"), class = "data.frame", row.names = c(NA,
-12L))
#
df.3 <- structure(list(Sample1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L), Sample2 = c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
3L, 4L, 5L), V1 = c(0.12, 0.497, 0.715, 0, 0.001, 0, 0.829, 0,
0, 0.001, 0, 0.829), V2 = c(0.107, 0.273, 0.595, 0, 0.004, 0,
0.547, 0.001, 0.001, 0.107, 0.273, 0.595), Site1 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("Apple",
"Banana"), class = "factor"), Site2 = structure(c(2L, 8L, 6L,
7L, 9L, 1L, 5L, 3L, 4L, 5L, 3L, 4L), .Label = c("Banana", "Grape",
"Guava", "Kiwi", "Mango", "Orange", "Peach", "Pear", "Plum"), class = "factor"),
ID = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L), .Label = c("F1", "M6"), class = "factor"), Sex = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("F",
"M"), class = "factor"), Event = c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 3L, 3L, 3L)), .Names = c("Sample1", "Sample2",
"V1", "V2", "Site1", "Site2", "ID", "Sex", "Event"), class = "data.frame", row.names = c(NA, -12L))
Two merges should do it:
# Site lookup keyed by ID, Event and Sample. ID must be part of the key:
# the same Event number occurs for more than one ID (e.g. Event 3 for both
# F3 and M6) with different Sites for the same Sample.
site_lookup <- unique(df.1[c("ID", "Event", "Sample", "Site")])

# First merge attaches the Site for Sample1; all.x = TRUE keeps df.2 rows
# that have no match.
first <- merge(
  df.2, site_lookup,
  by.x = c("Sample1", "Event", "ID"),
  by.y = c("Sample", "Event", "ID"),
  all.x = TRUE
)
# Second merge attaches the Site for Sample2; the two Site columns are
# disambiguated by merge() as Site.x (Sample1) and Site.y (Sample2).
second <- merge(
  first, site_lookup,
  by.x = c("Sample2", "Event", "ID"),
  by.y = c("Sample", "Event", "ID"),
  all.x = TRUE
)
# merge() puts the key columns first; restore a fixed column layout.
second <- second[c(
  "Sample2", "Event", "Sample1", "V1", "V2", "ID", "Sex", "Site.x", "Site.y"
)]
print(second)
Sample2 Event Sample1 V1 V2 ID Sex Site.x Site.y
1 10 1 1 0.000 0.001 F1 F Apple Kiwi
2 2 1 1 0.120 0.107 F1 F Apple Grape
3 3 1 1 0.497 0.273 F1 F Apple Pear
4 3 3 2 0.001 0.107 M6 M Banana Mango
5 4 1 1 0.715 0.595 F1 F Apple Orange
6 4 3 2 0.000 0.273 M6 M Banana Guava
7 5 1 1 0.000 0.000 F1 F Apple Peach
8 5 3 2 0.829 0.595 M6 M Banana Kiwi
9 6 1 1 0.001 0.004 F1 F Apple Plum
10 7 1 1 0.000 0.000 F1 F Apple Banana
11 8 1 1 0.829 0.547 F1 F Apple Mango
12 9 1 1 0.000 0.001 F1 F Apple Guava
structure(list(Team = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = "Union", class = "factor"), Date = structure(c(1L,
1L, 1L, 2L, 2L, 2L, 4L, 3L, 3L, 4L, 3L, 3L, 5L, 3L, 3L, 6L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 6L, 3L, 3L, 3L, 3L, 3L, 3L, 6L, 6L, 6L,
6L, 3L, 7L, 8L, 9L, 10L, 10L), .Label = c("2012-01-06", "2012-02-06",
"2012-03-06", "2012-04-06", "2012-05-06", "2012-07-06", "2012-09-06",
"2012-10-06", "2012-11-06", "2012-12-06"), class = "factor"),
STime = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = "07:03", class = "factor"), ETime = structure(c(6L,
7L, 8L, 5L, 5L, 1L, 2L, 3L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 11L,
10L, 9L, 8L, 10L, 7L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 4L, 5L, 5L, 5L, 5L), .Label = c("01:13",
"03:13", "06:13", "09:13", "10:13", "11:13", "12:13", "13:13",
"15:13", "16:13", "18:13"), class = "factor")), .Names = c("Team",
"Date", "STime", "ETime"), class = "data.frame", row.names = c(NA,
-40L))
I am doing this:
# Scatter of end time against date, one panel per team.
# Note: `group` is set to the literal string "Team", not the Team column.
ggplot(
  data = df,
  mapping = aes(x = Date, y = ETime, group = "Team")
) +
  geom_point(size = 0.3) +
  facet_wrap(~Team)
I would like the y-axis to run from 00:00 to 23:29 in 2-hour increments. I tried scale_y_continuous, which does not work. Any suggestions?
I suggest converting your date and time columns to POSIXt date-time data. Changing the axis breaks and labels then becomes easier. Currently, your dates and times are stored as factors.
library(ggplot2)
# Change relevant columns from 'factor' to 'POSIXct'.
# Clock times are pinned to one fixed date in UTC: parsing "%H:%M" on its own
# attaches the current date in the local time zone, which makes the axis depend
# on the day the script runs and on daylight-saving transitions.
clock_time <- function(hhmm) {
  as.POSIXct(
    paste("1970-01-01", hhmm),
    format = "%Y-%m-%d %H:%M",
    tz = "UTC"
  )
}

# POSIXct (not the list-based POSIXlt returned by strptime) is the date-time
# class suited to data frame columns.
df$ETime <- clock_time(as.character(df$ETime))
df$Date <- as.POSIXct(as.character(df$Date), format = "%Y-%m-%d", tz = "UTC")

plot_1 <- ggplot(df, aes(x = Date, y = ETime)) +
  geom_point() +
  labs(title = "Plot 1")

# Manually set datetime limits and breaks.
y_limits <- clock_time(c("00:00", "23:29"))
y_breaks <- seq(from = y_limits[1], to = y_limits[2], by = "2 hours")
# Format in the same zone the breaks were built in so labels read 00:00, 02:00, ...
y_labels <- format(y_breaks, "%H:%M", tz = "UTC")

plot_2 <- ggplot(df, aes(x = Date, y = ETime)) +
  geom_point() +
  scale_y_datetime(limits = y_limits, breaks = y_breaks, labels = y_labels) +
  labs(title = "Plot 2")

# Write both plots side by side to a PNG file.
library(gridExtra)
png("plots.png", width = 8, height = 4, units = "in", res = 120)
grid.arrange(plot_1, plot_2, nrow = 1)
dev.off()