Combine 4 bar-plots into one graph in R - r

I want to create a bar-plot using R, which will describe V gene frequency in 4 compartments in the body.
I have this table:
head(my_data)
# A tibble: 6 x 8
Tumor ...2 BM ...4 DLN ...6 Blood ...8
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 V hit frequency V hit frequency V hit frequency V hit frequency
2 IGHV3-1~ 0.54386205~ IGHV10-~ 0.22723742~ IGHV1-5~ 0.1132060~ IGHV5-~ 0.1417894~
3 IGHV5-1~ 0.16148068~ IGHV5-6~ 7.62620114~ IGHV5-1~ 0.1010986~ IGHV7-~ 0.1133675~
4 IGHV10-~ 3.18440869~ IGHV1-6~ 5.68199208~ IGHV1-6~ 8.1465889~ IGHV2-~ 6.4763474~
5 IGHV5-6~ 2.71468704~ IGHV3-1~ 5.24995831~ IGHV5-6~ 7.1625980~ IGHV5-~ 6.3008918~
6 IGHV6-3~ 2.71460485~ IGHV1-9~ 4.19517008~ IGHV1-7~ 4.7428361~ IGHV1-~ 5.0785188~
>
dput(my_data)
structure(list(Tumor = c("V hit", "IGHV3-1*00", "IGHV5-17*00",
"IGHV10-1*00", "IGHV5-6*00", "IGHV6-3*00", "IGHV2-9*00", "IGHV5-4*00",
"IGHV1-9*00"), ...2 = c("frequency", "0.54386205717535796", "0.161480687577157",
"3.1844086931792998E-2", "2.7146870412713998E-2", "2.7146048502561901E-2",
"2.4098405658687001E-2", "2.1746920713615302E-2", "1.6909157558532301E-2"
), BM = c("V hit", "IGHV10-3*00", "IGHV5-6*00", "IGHV1-62-3*00",
"IGHV3-1*00", "IGHV1-9*00", "IGHV10-1*00", "IGHV2-9*00", "IGHV4-2*00"
), ...4 = c("frequency", "0.22723742785161699", "7.62620114066965E-2",
"5.6819920833780603E-2", "5.2499583155365397E-2", "4.1951700840313098E-2",
"3.5214806321420301E-2", "3.2695465872415799E-2", "3.0610100659414E-2"
), DLN = c("V hit", "IGHV1-50*00", "IGHV5-17*00", "IGHV1-62-3*00",
"IGHV5-6*00", "IGHV1-7*00", "IGHV1-4*00", "IGHV6-3*00", "IGHV10-1*00"
), ...6 = c("frequency", "0.113206013467841", "0.101098647226429",
"8.1465889741680994E-2", "7.1625980782229995E-2", "4.7428361184553902E-2",
"4.4690299561054497E-2", "4.3051740808241597E-2", "3.9509373582839201E-2"
), Blood = c("V hit", "IGHV5-6*00", "IGHV7-3*00", "IGHV2-9*00",
"IGHV5-17*00", "IGHV1-67*00", "IGHV1-62-3*00", "IGHV1-7*00",
"IGHV1-9*00"), ...8 = c("frequency", "0.141789453276464", "0.113367584335014",
"6.4763474214811906E-2", "6.3008918185343196E-2", "5.0785188057386597E-2",
"5.0504071345482703E-2", "4.52113222179139E-2", "3.8183404420318E-2"
)), row.names = c(NA, -9L), class = c("tbl_df", "tbl", "data.frame"
))
(^this is just a little part of my entire data.)
Tumor, Blood, BM, and DLN are my compartments, and each of them has its own V genes and their frequencies.
I want 1 bar-plot: X-axis will be V genes and for each v gene, I want 4 bars, 1 for each compartment.
Y-axis will be the frequencies.
I'm assuming I need to create a data frame that has a single column of all V genes and one frequency column for each compartment, but I don't know how to do that.
Any help will be appreciated !!
Thanks, Ligal.

Clean up data
# Start from the question's `my_data` and drop its first row, which holds the
# embedded "V hit" / "frequency" sub-headers rather than data.
df <- my_data[-1, ]
# Columns alternate: odd positions are named after a compartment and hold the
# gene names, even positions hold the matching frequencies.
compart <- names(df)[seq(1, ncol(df), by = 2)] # compartment names
not_compart <- names(df)[seq(2, ncol(df), by = 2)] # not compartment names
# melt data from wide to long
library('data.table')
setDT(df)[, id := seq_len(.N)] # assign id
df <- melt(df, id.vars = 'id',
           measure.vars = list(compart, not_compart),
           variable.name = "compartments",
           value.name = c("genes", "frequency"))
# melt() numbers the measure groups 1, 2, ...; map them back to the
# compartment names
df[, compartments := factor(compartments, levels = seq_along(compart), labels = compart)]
# change frequency values from character to numeric
df[, frequency := as.numeric(frequency)]
Data - output
head(df)
# id compartments genes frequency
# 1: 1 Tumor IGHV3-1*00 0.54386206
# 2: 2 Tumor IGHV5-17*00 0.16148069
# 3: 3 Tumor IGHV10-1*00 0.03184409
# 4: 4 Tumor IGHV5-6*00 0.02714687
# 5: 5 Tumor IGHV6-3*00 0.02714605
# 6: 6 Tumor IGHV2-9*00 0.02409841
Plot
library('ggplot2')
# One panel per compartment; genes run along the flipped axis and the bar
# length is the frequency value itself.
ggplot(df, aes(x = genes, y = frequency)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ compartments, scales = "free_y") +
  theme_bw()
Graph
Plot2
# One panel per gene; within a panel there is one bar per compartment.
ggplot(df, aes(x = compartments, y = frequency)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ genes, scales = "free_y") +
  theme_bw()
Graph2
Plot-3
# Single panel: for every gene, the compartments are drawn as side-by-side
# (dodged) bars distinguished by colour.
ggplot(
  df,
  aes(
    x = genes, y = frequency,
    color = compartments, fill = compartments, group = compartments
  )
) +
  geom_col(position = position_dodge(width = 0.9)) +
  theme_bw() +
  # tilt the gene names so they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Graph-3

Try this:
library(dplyr)
library(tidyr)
library(purrr)
library(ggplot2)
my_data <- structure(list(Tumor = c(
"V hit", "IGHV3-1*00", "IGHV5-17*00",
"IGHV10-1*00", "IGHV5-6*00", "IGHV6-3*00", "IGHV2-9*00", "IGHV5-4*00",
"IGHV1-9*00"
), ...2 = c(
"frequency", "0.54386205717535796", "0.161480687577157",
"3.1844086931792998E-2", "2.7146870412713998E-2", "2.7146048502561901E-2",
"2.4098405658687001E-2", "2.1746920713615302E-2", "1.6909157558532301E-2"
), BM = c(
"V hit", "IGHV10-3*00", "IGHV5-6*00", "IGHV1-62-3*00",
"IGHV3-1*00", "IGHV1-9*00", "IGHV10-1*00", "IGHV2-9*00", "IGHV4-2*00"
), ...4 = c(
"frequency", "0.22723742785161699", "7.62620114066965E-2",
"5.6819920833780603E-2", "5.2499583155365397E-2", "4.1951700840313098E-2",
"3.5214806321420301E-2", "3.2695465872415799E-2", "3.0610100659414E-2"
), DLN = c(
"V hit", "IGHV1-50*00", "IGHV5-17*00", "IGHV1-62-3*00",
"IGHV5-6*00", "IGHV1-7*00", "IGHV1-4*00", "IGHV6-3*00", "IGHV10-1*00"
), ...6 = c(
"frequency", "0.113206013467841", "0.101098647226429",
"8.1465889741680994E-2", "7.1625980782229995E-2", "4.7428361184553902E-2",
"4.4690299561054497E-2", "4.3051740808241597E-2", "3.9509373582839201E-2"
), Blood = c(
"V hit", "IGHV5-6*00", "IGHV7-3*00", "IGHV2-9*00",
"IGHV5-17*00", "IGHV1-67*00", "IGHV1-62-3*00", "IGHV1-7*00",
"IGHV1-9*00"
), ...8 = c(
"frequency", "0.141789453276464", "0.113367584335014",
"6.4763474214811906E-2", "6.3008918185343196E-2", "5.0785188057386597E-2",
"5.0504071345482703E-2", "4.52113222179139E-2", "3.8183404420318E-2"
)), row.names = c(NA, -9L), class = c("tbl_df", "tbl", "data.frame"))
# Tidy the dataset
# Drop the first row: it only repeats the "V hit" / "frequency" sub-headers
my_data <- my_data[-1, ]
df_tidy <-
  ## One two-column data frame (gene, frequency) per compartment, in a named list
  list(
    tumor = my_data[1:2],
    bm = my_data[3:4],
    dln = my_data[5:6],
    blood = my_data[7:8]
  ) %>%
  ## Give every piece the same two column names so they can be stacked
  lapply(rename, v_hit = 1, freq = 2) %>%
  ## Stack the pieces; the list names end up in the `compartment` column
  bind_rows(.id = "compartment") %>%
  ## The frequencies were read as text, so convert them to numbers
  mutate(freq = as.numeric(freq))
head(df_tidy)
#> # A tibble: 6 x 3
#> compartment v_hit freq
#> <chr> <chr> <dbl>
#> 1 tumor IGHV3-1*00 0.544
#> 2 tumor IGHV5-17*00 0.161
#> 3 tumor IGHV10-1*00 0.0318
#> 4 tumor IGHV5-6*00 0.0271
#> 5 tumor IGHV6-3*00 0.0271
#> 6 tumor IGHV2-9*00 0.0241
# Barplot
# One facet per compartment, with the genes on the flipped axis.
ggplot(df_tidy, aes(v_hit, freq, fill = compartment)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~compartment, scales = "free_y") +
  # The facet strips already name each compartment, so hide the fill legend.
  # `guides(fill = FALSE)` is deprecated in current ggplot2; "none" is the
  # supported spelling.
  guides(fill = "none")
Created on 2020-03-29 by the reprex package (v0.3.0)

Related

pearson correlation for genes in gene expression data

I have two datasets:
one is actual count and other one is predicted counts. I want to do a pearson correlation between them.
My actual count data look like this:
My predicted counts data look like this:
I want to do pearson correlation for these two datasets for each geneID.
I have written this code:
install.packages("Rcpp")
library(Rcpp)
library("reshape2")
library("ggplot2")
# import in the actual expression values and the gene predicted values
act_cts <- read.delim("GVDS_normalized_counts_2021v1.txt", header = TRUE, sep = "\t")
## fix the column names
colnames(act_cts)[1] <- "gene"
colnames(act_cts) <- substr(colnames(act_cts), 1, 7)
pred_cts <- read.delim("GVDS_PrediXcan_Test_2021v1.txt", header = TRUE, sep = "\t")
colnames(pred_cts) <- substr(colnames(pred_cts), 1, 15)
## Truncating the predicted column names to 15 characters removes the version
## suffix of the gene ids (e.g. "ENSG00000182902.8" -> "ENSG00000182902").
## Strip the same suffix from the actual gene ids so both tables share a key.
## NOTE(review): assumes the `gene` column carries ".<version>" suffixes, as in
## the posted sample; the substitution is a no-op otherwise.
act_cts$gene <- sub("\\.[0-9]+$", "", act_cts$gene)
## melt the predict counts, so the columns change to row entries FID, IID, gene
melt_pred_cts <- melt(pred_cts, id.vars = c("FID", "IID"),
                      variable.name = "gene", value.name = "gene_exp")
## melts the actual counts, so it can be easily joined to the final prediction.
## Every annotation column must be an id variable: melting "Gene_Sy", "Chr" or
## "Coord" together with the sample columns would turn them into fake samples
## and coerce the expression values to character.
act_id_cols <- intersect(c("gene", "Gene_Sy", "Chr", "Coord"), colnames(act_cts))
melt_act_cts <- melt(act_cts, id.vars = act_id_cols,
                     variable.name = "IID", value.name = "act_gene_exp")
## join on the two shared keys explicitly
final_cts <- merge(melt_pred_cts, melt_act_cts, by = c("gene", "IID"))
## this takes a minute/ several minutes to run because it is joining on both gene and IID
# runs the Pearson correlation for each gene
all_genes <- unique(as.character(final_cts$gene))
## Runs the Pearson correlation between gene_exp and act_gene_exp for each gene.
## vapply() keeps the coefficients numeric; building rows with c(gene, coef)
## and rbind()-ing them onto an empty data frame turns the coefficients into
## character strings and mangles the column names.
pear_coeff_all <- vapply(all_genes, function(g) {
  # which() drops rows whose gene is NA instead of selecting NA rows
  rows <- which(final_cts$gene == g)
  cor(final_cts$gene_exp[rows], final_cts$act_gene_exp[rows], method = "pearson")
}, numeric(1))
## one row per gene: its id and its correlation coefficient
pear_cor_all_df <- data.frame(gene = all_genes,
                              pear_coeff = unname(pear_coeff_all))
But it's not giving me the correct results.
This is data for act_count:
dput(act_counts[1:10, 1:10])
structure(list(gene = c("ENSG00000152931.6", "ENSG00000183696.9",
"ENSG00000139269.2", "ENSG00000169129.8", "ENSG00000134602.11",
"ENSG00000136237.12", "ENSG00000259425.1", "ENSG00000242284.2",
"ENSG00000235027.1", "ENSG00000228169.3"), Gene_Sy = c("ENSG00000152931.6",
"ENSG00000183696.9", "ENSG00000139269.2", "ENSG00000169129.8",
"ENSG00000134602.11", "ENSG00000136237.12", "ENSG00000259425.1",
"ENSG00000242284.2", "ENSG00000235027.1", "ENSG00000228169.3"
), Chr = c("5", "7", "12", "10", "X", "7", "15", "X", "11", "10"
), Coord = c(59783540, 48128225, 57846106, 116164515, 131157293,
22396763, 23096869, 134953994, 1781578, 116450393), HG00096 = c(0.101857770468582,
8.1838049456063, 1.19991028786682, 0.831939826228749, 27.6464223725999,
3.78850273139249, 0.0540590649819536, 0.351716382898523, 0.200791414339667,
96.1821778045089), HG00097 = c(0.0781095249582053, 5.68691050653862,
1.57357169691446, 0.0697777450667378, 24.3955715036476, 2.05096276937706,
0.112185357489692, 0.444540251941709, 0.190137938062251, 101.17926156721
), HG00099 = c(0.0489806714207954, 2.43465332606958, 0.521615781673147,
0.93108575037257, 16.4453735152148, 4.00031300285966, 0.00359181983091798,
0.227707651999832, 0.0929246302159905, 58.7830634918037), HG00100 = c(0.118597118618172,
3.83089421985197, 1.44722544015787, 0.620940765480242, 24.8066495438254,
3.27161920134705, 0.00049968321150251, 0.714112406249513, 0.108789749488722,
105.483527339859), HG00101 = c(0.00403496367614745, 6.61228835251498,
3.56579072437701, 1.66066836204679, 25.1133488775017, 1.79821591847768,
0.0293976115522442, 0.450911709524112, 0.23244822901371, 105.818192023699
), HG00102 = c(0.0109253485646219, 4.70964559086586, 1.98268073472144,
0.570481056180073, 19.2339882617972, 1.51668840574531, 0.0312661751488703,
0.491437808951175, 0.250905117203001, 136.140843495464)), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
This is prd_counts:
dput(prd_counts[1:10, 1:10])
structure(list(FID = c("HG00096", "HG00097", "HG00099", "HG00100",
"HG00101", "HG00102", "HG00103", "HG00105", "HG00106", "HG00107"
), IID = c("HG00096", "HG00097", "HG00099", "HG00100", "HG00101",
"HG00102", "HG00103", "HG00105", "HG00106", "HG00107"), ENSG00000182902.8 = c(0.0223611610092831,
0.0385031316687293, -0.0682504384265577, 0.00018098416274239,
-0.045492721345375, -0.10473163051734, -0.0215970711860838, 0.060455638944161,
-0.00889260689717109, -0.102096211855105), ENSG00000183307.3 = c(0.129041336028238,
-0.13226906002202, 0.005409246530295, -0.0539556427088601, -0.00699884042001628,
-0.204743560777908, -0.0534359750800079, -0.235648260835705,
-0.10230402771496, -0.0914043464852205), ENSG00000237438.1 = c(-0.758838434524167,
-0.579236418964912, -0.695762357174973, -0.368416879945024, -0.339555280234214,
-0.809438763600528, -0.359798980325098, -0.417769387016999, -0.724636782037491,
-0.309671271758401), ENSG00000243156.2 = c(-0.58456094489168,
0.105851861253113, -0.275061563982305, -0.0406543077034047, -0.522672785138957,
-0.126100301787985, -0.288382571274346, -0.354309857822533, -0.314842662063296,
-0.141401921597711), ENSG00000099968.13 = c(0.135357355615122,
0.157616292043257, 0.180059097593111, 0.250009792099489, 0.170653230854707,
0.316157576642492, 0.314671674077333, 0.224102148083679, 0.232969333848649,
0.14963210689311), ENSG00000069998.8 = c(-0.0346986034383362,
-0.0173493017191681, 0, -0.0173493017191681, -0.645266014640116,
-0.0346986034383362, -0.0173493017191681, -0.0173493017191681,
-0.0346986034383362, 0), ENSG00000184979.8 = c(-0.160573318589815,
0.54683218159596, 0.3503062647549, 0.653899917577768, 0.321280544783323,
0.653727041876318, 0.822864620159811, 1.03780221621802, -0.195295753744408,
-0.228590172992798), ENSG00000070413.12 = c(0.775225873145799,
0.602092262450708, 1.0198591935485, 0.65587457098494, 0.306445027670957,
0.581202299884586, 0.836112660742631, 0.559373823767867, 0.46977171007116,
0.84426113999649)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
The provided test samples will not work because there are no genes in common between act_counts and prd_counts. I took the liberty of fixing that by reassigning column names:
library(dplyr)
library(tidyr)
## the line below fixes the problem with test samples
colnames(prd_counts)[3:10] <- act_counts$gene[1:8]
# Long format: one row per (gene, sample) with the measured expression
acts <- act_counts |>
  pivot_longer(starts_with("HG"), names_to = "FID", values_to = "Actual")
# Long format: one row per (sample, gene) with the predicted expression
prds <- prd_counts |>
  pivot_longer(starts_with("ENSG"), names_to = "gene", values_to = "Predicted")
# Pair actual and predicted values by gene and sample, then correlate them
# within each gene
acts |>
  inner_join(prds, by = c("gene", "FID")) |>
  select(gene, FID, Actual, Predicted) |>
  group_by(gene) |>
  summarise(rho = cor(Actual, Predicted))
##> # A tibble: 8 × 2
##> gene rho
##> <chr> <dbl>
##> 1 ENSG00000134602.11 -0.445
##> 2 ENSG00000136237.12 0.446
##> 3 ENSG00000139269.2 0.543
##> 4 ENSG00000152931.6 0.770
##> 5 ENSG00000169129.8 -0.802
##> 6 ENSG00000183696.9 0.405
##> 7 ENSG00000242284.2 -0.503
##> 8 ENSG00000259425.1 -0.110

Add percentage (%) for a PieChart

Hello guys, I am trying to make a pie chart like the one in the next picture:
But I am running into a lot of problems.
This is the plot that I am getting:
ggpie(s, x="costes", label ="prop", lab.pos = "in", fill = "Implementation",
lab.font = list(size= 5)) + scale_fill_manual(values = c("dodgerblue2","blue"))
How could I add percentages to my labels, like 98.9%, etc.?
data:
structure(list(Implementation = c("2", "1"), costes = c(6204670582.33, 70561379.07), prop = c(98.9, 1.1), lab.ypos = c(49.45, 99.45)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"))
Thanks so much for your time
# Slice sizes (as fractions of the whole) and the name of each slice
per <- c(0.1, 0.2, 0.3, 0.4)
labels <- c("a", "b", "c", "d")
# Label text combines the slice name with its share written as a percentage
slice_labels <- paste0(labels, " (", round(per, digits = 3) * 100, "%)")
pie(
  per,
  labels = slice_labels,
  border = "white",
  col = c("red", "green", "blue", "yellow")
)
You can add them with the `labels =` argument. I named your data `dummy`.
> dummy
# A tibble: 2 x 4
Implementation costes prop lab.ypos
<chr> <dbl> <dbl> <dbl>
1 2 6204670582. 98.9 49.4
2 1 70561379. 1.1 99.4
# Legend text for each fill level, e.g. "2 (98.9%)". The vector is named by
# level so that every label stays attached to its own level; unnamed labels
# are matched by position to the scale's breaks, whose order need not follow
# the row order of `dummy`.
legend_labels <- setNames(
  paste0(dummy$Implementation, " (", dummy$prop, "%)"),
  dummy$Implementation
)
dummy %>%
  ggpie(x = "costes", label = "prop", lab.pos = "in", fill = "Implementation",
        lab.font = list(size = 5)) +
  scale_fill_manual(values = c("dodgerblue2", "blue"),
                    labels = legend_labels)

Plot multiple geom_line and geom_smooth objects in one plot

I have somewhat messy looking dataframes, like this one:
df0
# A tibble: 3 x 9
# Groups: Sequ [1]
Sequ Speaker Utterance A_intpl A_dur B_intpl B_dur C_intpl C_dur
<int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 2 ID16.A cool >wha… 31.44786152… 10.5,17,1… 32.86993284… 9.5,16,17… 58.3368399… 14,17,17…
2 2 NA (0.228) 32.75735987… 15.5,17,1… 30.83469006… 14.5,16.9… 26.0386462… 3,17,16,…
3 2 ID16.B u:m Tenne… 32.05752604… 4.5,17,16… 29.95825107… 3.5,16,17… 55.9298614… 8,17,17,…
I want to plot the *_intpl values for each speaker (A, B, or C) for each of the three Utterances in a single chart both as line charts and as trend lines.
I'm just half successful doing this:
library(tidyr)
library(ggplot2)
library(dplyr)
df0 %>%
pivot_longer(cols = contains("_"),
names_to = c("Event_by", ".value"),
names_pattern = "^(.*)_([^_]+$)") %>%
separate_rows(c(intpl, dur), sep = ",", convert = TRUE) %>%
mutate(Time = cumsum(dur)) %>%
mutate(Utterance = paste0(sub(".*(.)$", "\\1",Speaker), ": ", Utterance),
Utterance = factor(Utterance, levels = unique(Utterance))) %>%
ggplot(aes(x = Time, y = log2(intpl),
group = Event_by,
colour = Event_by)) +
geom_line()+
geom_smooth(method = 'lm', color = "red", formula = y~x)+
facet_wrap(~ Utterance, ncol = 1, scales= "free_x")
Half successful because the line plots and trend lines are side-by-side, as if in three columns, whereas they should be in rows, one below the other - how can that be achieved?
Reproducible data:
structure(list(Sequ = c(2L, 2L, 2L), Speaker = c("ID16.A", NA,
"ID16.B"), Utterance = c("cool >what part?<", "(0.228)", "u:m Tennessee="
), A_intpl = c("31.4478615210995,31.5797510648522,31.7143985369445,31.651083739602,31.5806035086034,36.8956763912703,36.2882129597292,35.2124499461012,34.1366869324732,34.1366869324732,32.1927035724058,30.2487202123383,28.3047368522709,26.3607534922035,30.5278334848495,30.5919390424853,30.8898529369568,31.578968913188,31.9011198738002,32.1543265113196,31.9708002079533,31.966536408565,31.8762658607759,31.8994741472105,31.4215913971938,32.1510578328563,31.7863350712876,32.4685052625667,31.7422271490296,32.3286054977263,31.9998974949481,32.5177992323864,32.4727499785435,32.9310888953766,32.7592010033585,33.2231711877427,33.1593949301066,33.2432973964816,33.2569729073414,33.492144800249,33.317650964723,33.4835787832119,33.2377190454279,32.9200836384356,32.9684568771567,32.6400987016883,27.5447101464944,29.3948945479171,35.3449171857603,33.5932932239592,31.8416692621581,30.0900453003569,32.7850431084597,32.7589003618266,32.8365550655013,32.386716057622,32.8420792704881,32.6909995562489,32.6269434402016,32.7370944106334,32.7529759209752,32.6528826975113,32.3663573764448,32.7326853004792,32.6930038462418,32.8975978772676,33.1752899475416,33.2034433355001,33.0667431432803,32.6322933080614,33.2503168843178,32.7573598713719",
"32.7573598713719,32.7531704791313,32.7366130631104,32.918942216354,32.8309939530596,32.3856893430525,32.5368873543441,32.5628510484821,32.5628510484821,32.5628510484821,32.5506564332008,32.7477119716583,32.3458470743288,32.0575260428013",
"32.0575260428013,32.1628824338111,32.0093334061923,32.1461460586991,31.9080762250966,31.9469105074833,31.7431187667232,31.7194255656503,31.7394296413187,31.8594986292975,31.7498243274746,31.9069142374258,32.0835520942767,31.6257067057109,31.757232379438,31.9036689124911,32.1319749301918,31.7203280774998,31.7877137245706,32.3030946636177,32.2800139298454,32.164646135728,32.3636504940227,32.5657818936495,32.3859453482697,32.4797898358193,32.5319835105237,32.92233491509,32.8240561109448,32.664496027779,33.1835064752029,33.0366413969703,33.0406288190821,33.3232964677672,33.2206260057731,33.1537134269402,33.2783471506207,33.2933281566788,33.5322350394609,33.3815736723684,33.7905544185063,33.6143820666896,33.7490659591585,33.7260102344634,34.0721931066557,34.0455026427054,34.3735788774521,34.2888420421073,34.3913721165542,34.5982135545306,34.4417202731001,34.6586347152449,31.1590521215434,31.3276405983897,28.2379253186548,31.133030931336,34.0715906921349,35.8967950760285,35.9334551147377,35.8565504335515,35.7446081905229,35.6300325834155,35.8390086948751,35.9711743270411,36.0029493274176,35.8891056768339"
), A_dur = c("10.5,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,0.5",
"15.5,17,17,16,17,17,16,17,17,16,17,17,16,12.5", "4.5,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,5.5"
), B_intpl = c("32.8699328424689,32.8154348109057,32.5454364786882,32.408257038977,32.5304564519672,32.3270203236281,31.9233218634346,32.0166346064182,31.7360745988363,31.7546527359571,31.8603220354065,31.6520061326962,31.5603191463274,31.3357561466519,31.0976090032219,31.1405090978825,31.1697180784961,31.0863999545386,31.3126984044729,30.580776446803,30.7137016246273,31.0801914571091,31.2343922096768,31.2749857511594,31.3488604642844,30.9327390960718,31.0750482778561,31.1849119826023,31.4180114886183,31.5284273181104,31.147361398529,31.1128597713973,31.5551385744611,31.7479939892741,31.5890352680344,31.5470790538009,31.5427330200078,31.3901913024084,31.5423214446953,31.4814325586741,31.4937336232021,31.3483738841556,31.2516462059018,31.2233881922543,31.2572951780583,31.0087226975291,31.1197589042273,31.053748381687,30.8202174718598,30.845143129195,30.8727194789634,30.4231467151428,30.7254093759809,30.2757746547116,30.6047530953025,29.6835591414008,28.257421076205,29.4634886416064,29.183064807185,28.6935506287734,29.3989017421637,30.8936090542518,30.6884831327852,30.805770713392,30.6938909098627,30.8317757801268,30.8509115577427,30.6836198471168,30.7979978629801,31.0260101704105,30.6248844591805,30.8346900656087",
"30.8346900656087,30.9826158466835,29.814086001996,29.7839590794955,30.7928804535206,31.1589874726521,31.0547403039501,31.2268131145794,31.155503802286,31.3036925274762,31.4782621660348,31.0928322383151,31.589958621025,29.9582510795225",
"29.9582510795225,29.9796434055214,29.9405638729798,30.2602098442174,30.5011865525849,30.6753859842987,28.9331380886365,30.7736467776919,30.8457967803438,30.843630408183,30.8767570425033,30.9178344980247,30.734598946287,30.8877440413271,30.9225051837881,30.9534076039184,31.0172861192043,30.9371712793451,30.9806052132295,31.0593603717961,31.1156928565737,30.4713263393479,26.028518302418,28.1426546887905,29.4308434671559,30.7190322455213,31.2289674937063,31.7389027418913,32.2488379900763,32.7587732382613,33.2687084864463,33.7786437346312,34.2885789828162,34.7985142310012,35.3084494791862,35.8183847273712,36.3283199755562,36.8382552237412,37.3481904719262,37.8581257201112,38.3680609682962,25.5986933949893,29.7968031963901,30.5336819967028,30.1876589408847,30.4260367500101,30.2997107671214,30.3429716412578,30.3537316791924,30.4111899964144,30.7293520851914,30.7778983966343,30.9712137067708,30.9072589183658,31.0696990205164,30.5713926084448,31.3458855877875,31.4169903025083,31.5148974986093,31.5972499257413,31.2293401943969,31.2033325602348,31.1657434266985,30.6784877073261,30.6991365599664,30.6763195188897"
), B_dur = c("9.5,16,17,17,16,17,17,16,17.0000000000146,16.9999999999854,16,17,16.9999999999854,16.0000000000146,17,17,16,17,17,16,17,17,16,17.0000000000146,16.9999999999854,16,17,16.9999999999854,16.0000000000146,17,17,16,17,17,16,17,17,16,17.0000000000146,16.9999999999854,16,17,16.9999999999854,16.0000000000146,17,17,16,17,17,16,17,17,16,17.0000000000146,16.9999999999854,16,17,16.9999999999854,16.0000000000146,17,17,16,17,17,16,17,17,16,17.0000000000146,16.9999999999854,16,2.5",
"14.5,16.9999999999854,16.0000000000146,17,17,16,17,17,16,17,17,16,17.0000000000146,13.4999999999854",
"3.5,16,17,16.9999999999854,16.0000000000146,17,17,16,17,17,16,17,17,16,17.0000000000146,16.9999999999854,16,17,16.9999999999854,16.0000000000146,17,17,16,17,17,16,17,17,16,17.0000000000146,16.9999999999854,16,17,16.9999999999854,16.0000000000146,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,7.5"
), C_intpl = c("58.3368399069697,58.249224089011,59.5198368051218,58.8722012497097,58.4418996252205,58.5849059154389,59.2752163985494,52.8407480422202,51.6276603912397,48.0255346632529,44.753541512539,41.4815483618252,38.2095552111114,34.9375620603975,31.6655689096837,28.3935757589698,25.121582608256,19.4712933827274,22.0108873782783,24.5504813738291,24.8441573376901,24.6902151101703,24.4029572181118,24.9753161974674,24.8664406826514,24.8486668451201,25.1137001504163,25.1142578332509,25.4902077628339,25.4075561268027,25.6622548410237,61.2421678149908,25.1600975771354,25.6667198263373,25.442560744158,25.8736383423437,25.5859074180431,24.7860400673889,24.4337707697216,24.3214953242744,23.915753514736,23.7363185577661,23.7186569801299,23.4313514771952,23.5730151254578,62.5124513171595,23.3260531660862,23.4498217326665,23.2145314844252,57.5586745434594,63.4646233226955,23.0706406704345,23.3318690599491,62.044649715831,62.2720656330432,22.2532276715887,62.7059140614625,22.9511208849958,22.5603175709988,23.3456453893988,63.2523901625561,60.6655429980934,60.2358824325868,59.957910796633,57.3999702562457,54.8277282980263,43.0269305132552,31.2261327284841,19.425334943713,22.7319906068577,26.0386462700023",
"26.0386462700023,29.345301933147,32.6519575962917,35.9586132594364,48.3773995023798,60.7961857453232,49.4980424442242,55.9907960862667,57.2956837917999,58.1409925994177,59.025022056064,60.0098263540792,60.4028460580062,61.2629030450653,55.9298614021542",
"55.9298614021542,55.3877180252389,61.3547152702855,61.7847919095391,56.2457623439544,62.5477315546977,62.3078007189967,62.4272469013149,57.6479672147315,62.9844338801191,58.0081708266629,63.3872796098875,59.0138830718112,58.0612924481098,58.38680047729,58.687179350318,63.8724230039733,63.4126777597892,63.6865154626743,63.5670658627636,63.4496590540706,63.7595297692908,58.9069708176601,63.4547681163061,64.3198376700797,63.415319961042,64.0985879957056,64.1201809531605,63.677902665454,64.1934303628317,64.4682003346273,64.2868853545462,24.8444135816353,64.1579626357752,63.8897139146875,58.5472675827292,64.5784992977498,64.0848591719068,63.8841268679761,64.2901359712354,64.395692486112,64.5425896391638,64.8060565909917,64.3618830026368,64.7088481705444,64.5005944199885,64.5540289192148,64.7408010459365,63.378880767685,63.3415589069662,63.5362700331647,63.5924807719723,63.575801461932,63.6799360982113,64.0041021410894,64.3144923757986,63.8692943755376,63.8594574363473,64.2731841085802,63.3314657812309,64.2758880216293,64.1011768977101,64.0261661917799,64.2865302330478,63.724697791255,64.1202175712152"
), C_dur = c("14,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,14",
"3,17,16,17,17,16,17,17,16,17,17,16,17,17,8", "8,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,2"
)), row.names = c(NA, -3L), groups = structure(list(Sequ = 2L,
.rows = structure(list(1:3), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
One possible solution uses the grid.arrange() function from the gridExtra package (together with grid).
I've drawn each subset of your data as its own chart and then combined the charts into one arranged figure.
# Reshape to long format: the prefix of each "<X>_intpl" / "<X>_dur" column
# goes to `Event_by`, the suffix picks the value column; the comma-separated
# strings are then split into one row per value.
df1 <- df0 %>%
  pivot_longer(
    cols = contains("_"),
    names_to = c("Event_by", ".value"),
    names_pattern = "^(.*)_([^_]+$)"
  ) %>%
  separate_rows(c(intpl, dur), sep = ",", convert = TRUE) %>%
  mutate(
    # cumulative sum of the durations
    Time = cumsum(dur),
    # prefix each utterance with the last character of the speaker id
    Utterance = paste0(sub(".*(.)$", "\\1", Speaker), ": ", Utterance),
    # keep the utterances in order of first appearance
    Utterance = factor(Utterance, levels = unique(Utterance))
  )
Store the chart objects in the environment:
# Build one line + linear-trend chart for every (Event_by, Utterance) pair
# and store it under the name paste0(Event_by, Utterance).
for (event in unique(df1$Event_by)) {
  for (utterance in levels(df1$Utterance)) {
    chart_rows <- df1[df1$Event_by == event & df1$Utterance == utterance, ]
    chart <- ggplot(data = chart_rows, aes(x = Time, y = log2(intpl))) +
      geom_line() +
      geom_smooth(method = 'lm', color = "red", formula = y ~ x)
    assign(x = paste0(event, utterance), value = chart)
  }
}
Create the gridded chart:
# Two statements on one line without a separator do not parse, so each
# package is loaded on its own line.
library(gridExtra)
library(grid)
# The chart objects have names containing spaces and punctuation, hence the
# backticks. With nrow = 3 the nine charts are laid out three per row.
grid.arrange(
  `AA: cool >what part?<`,
  `AB: u:m Tennessee=`,
  `ANA: (0.228)`,
  `BA: cool >what part?<`,
  `BB: u:m Tennessee=`,
  `BNA: (0.228)`,
  `CA: cool >what part?<`,
  `CB: u:m Tennessee=`,
  `CNA: (0.228)`,
  nrow = 3)
Although I think there should be a better solution for that.
You can also explore the articles below on arranging plots:
http://www.sthda.com/english/articles/24-ggpubr-publication-ready-plots/81-ggplot2-easy-way-to-mix-multiple-graphs-on-the-same-page/
https://ggplot2-book.org/facet.html
Moreover, no theming has been added to my solution.

Converting coordinates from degree with unconventional format to decimal degree

I am trying to convert my data so that it can be plotted on a map. For example the data looks like:
# A tibble: 2 x 2
Latitud Longitud
<chr> <chr>
1 10º 35' 28.98'' N 3º 41' 33.91'' O
2 10º 35' 12.63'' N 3º 45' 46.22'' O
I am trying to mutate it using the following:
df %>%
mutate(
Latitud = str_replace_all(Latitud, "''", ""),
lat_edit = sp::char2dms(Latitud), "°")
Which returns an error:
Error in if (any(abs(object@deg) > 90)) return("abs(degree) > 90") :
missing value where TRUE/FALSE needed
In addition: Warning message:
In asMethod(object) : NAs introduced by coercion
I would like to plot these two points on a map in ggplot (or another spatial package)
Data:
structure(list(Latitud = c("40º 25' 25.98'' N", "40º 25' 17.63'' N"
), Longitud = c("3º 42' 43.91'' O", "3º 40' 56.22'' O")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -2L))
You can use the following custom function (I am assuming N, S, W, E. Not sure what O means in longitude):
# Convert degree/minute/second strings such as "40º 25' 25.98'' N" to signed
# decimal degrees.
#
# angle:    character vector (or anything coercible with as.character()) of
#           coordinates written as degrees, minutes and seconds; any symbols
#           may separate the three numbers. Missing minutes and/or seconds are
#           treated as 0.
# negative: literal markers whose presence anywhere in a string makes the
#           result negative. Defaults to south and west; pass e.g.
#           c("S", "W", "O") for Spanish data where "O" (oeste) means west.
#           A leading "-" also makes the result negative.
#
# Returns a numeric vector of the same length as `angle`; entries that do not
# contain one to three parseable numbers are NA.
angle2dec <- function(angle, negative = c("S", "W")) {
  angle <- as.character(angle)

  # Decide the sign up front instead of relying on sign(degrees), which is 0
  # for coordinates with 0 degrees and would wipe out minutes and seconds.
  is_negative <- grepl("^\\s*-", angle)
  for (marker in negative) {
    is_negative <- is_negative | grepl(marker, angle, fixed = TRUE)
  }

  # Keep only the numbers, separated by single spaces
  cleaned <- trimws(gsub("[^.0-9]+", " ", angle))
  pieces <- strsplit(cleaned, " ", fixed = TRUE)

  # Handle each element on its own so that ragged or empty input cannot be
  # recycled silently or raise an error.
  vapply(seq_along(pieces), function(i) {
    dms <- as.numeric(pieces[[i]])
    if (length(dms) == 0L || length(dms) > 3L || anyNA(dms)) {
      return(NA_real_)
    }
    dms <- c(dms, 0, 0)[1:3] # pad missing minutes / seconds with 0
    magnitude <- dms[1] + dms[2] / 60 + dms[3] / 3600
    if (is_negative[i]) -magnitude else magnitude
  }, numeric(1))
}
Applying on the data:
df1[] <- lapply(df1, angle2dec)
df1
#> Latitud Longitud
#> 1 -40.42388 3.712197
#> 2 40.42156 -3.682283
Plotting:
library(ggplot2)
ggplot(df1, aes(x = Longitud, y = Latitud)) +
geom_point()
Slightly Modified Data to Show for Different Hemispheres:
# Same coordinates as in the question, with the hemispheres changed to show
# both signs. The degree symbol is written as "º", as in the question's data;
# the "<U+...>" escape text it had been turned into contains digits and "+"
# characters that would be parsed as part of the coordinate.
df1 <- structure(list(Latitud = c("40º 25' 25.98'' S",
                                  "40º 25' 17.63'' N"),
                      Longitud = c("3º 42' 43.91'' E",
                                   "3º 40' 56.22'' W")),
                 class = c("tbl_df", "tbl", "data.frame"),
                 row.names = c(NA, -2L))
In reference to Converting geo coordinates from degree to decimal .
I'll preface this by saying I hadn't used char2dms until right now, so there may be intricacies I missed (such as my question above about "O" as a direction). Looking at the docs and examples, you need to give the characters used to demarcate degrees, minutes, and seconds. In your case, these are "º", "'", and "''", respectively. I skipped the step of removing the third of these, because it's necessary to see where the seconds are written. (Update: added a step to replace the regex "O$" (oeste) with "W" (west)). That gets you what's below:
library(dplyr)
library(ggplot2)
library(sp)
# The question's data, with a trailing "O" (oeste) rewritten as "W" (west) in
# both coordinate columns so that char2dms() recognises the direction.
# across() is the current replacement for the superseded mutate_at().
dat <- structure(list(Latitud = c("40º 25' 25.98'' N", "40º 25' 17.63'' N"
), Longitud = c("3º 42' 43.91'' O", "3º 40' 56.22'' O")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -2L)) %>%
  mutate(across(
    c(Latitud, Longitud),
    ~ stringr::str_replace_all(.x, "O$", "W")
  ))
char2dms(dat$Latitud, chd = "º", chm = "'", chs = "''")
#> [1] 40d25'25.98"N 40d25'17.63"N
This is a DMS S3 object, not a vector (here's where my knowledge of this ends), so you can't put it directly into the data frame columns. Instead, convert to a numeric vector, and you've got numeric coordinates in your data frame.
# Parse both coordinate columns as degrees/minutes/seconds and store them as
# plain numbers. The columns are referenced through mutate()'s data mask
# rather than `dat$...`, so the code keeps working if the piped data is
# filtered, grouped, or comes from a differently named object.
dat_numeric <- dat %>%
  mutate(
    lat_edit = as.numeric(char2dms(Latitud, chd = "º", chm = "'", chs = "''")),
    lon_edit = as.numeric(char2dms(Longitud, chd = "º", chm = "'", chs = "''"))
  )
dat_numeric
#> # A tibble: 2 x 4
#> Latitud Longitud lat_edit lon_edit
#> <chr> <chr> <dbl> <dbl>
#> 1 40º 25' 25.98'' N 3º 42' 43.91'' W 40.4 -3.71
#> 2 40º 25' 17.63'' N 3º 40' 56.22'' W 40.4 -3.68
Plot like normal numbers:
ggplot(dat_numeric, aes(x = lon_edit, y = lat_edit)) +
geom_point()
Or convert to an sf object and plot with the appropriate aspect ratio, projection, etc.
# Build point geometries from the numeric longitude/latitude columns.
# crs = 4326 declares them as WGS84 degrees; without a CRS the sf object has
# no coordinate reference system for geom_sf() to work with.
# NOTE(review): assumes the source coordinates are WGS84 - confirm.
sf::st_as_sf(dat_numeric, coords = c("lon_edit", "lat_edit"), crs = 4326) %>%
  ggplot() +
  geom_sf()

Building legends with time series data, in ggplot

Aloha all,
I've struggled to build a legend for a mix/match of time series data I'm making. Here is some code:
My understanding is that I need to somehow clean my data and put it all in the same data frame, but all of the time series don't line up very well. Some is at 15 minutes, other one hour. Is there any way to force a legend for these datasets? I don't know what else to post here - since the 5 datasets are quite large.
Plot I'm working on:
# Overlay several point series from different data frames in one plot.
# Every layer is restricted to the open time window (startd, endd).
# Each colour is passed as a constant outside aes(), so no colour scale is
# created and ggplot2 draws no legend for these layers.
q<- ggplot(subset(cr200_Auwai1, timedate>startd & timedate<endd), aes(timedate, Turb_SS)) +
geom_point(color="coral4")+
geom_point(data=subset(dsloi_wl, timedate>startd & timedate<endd), aes(timedate, level), color="blue")+
#geom_point(data=subset(flow_data, mdate>startd & mdate<endd), aes(as.POSIXct(mdate), flow_cfs*1000), color="red")+
# Lvl_m is multiplied by 1000 before plotting
# (presumably metres -> mm to match the y-axis label; confirm).
geom_point(data=subset(cr300_Wai1, timedate>startd & timedate<endd), aes(timedate, Lvl_m*1000), color="forestgreen", size=1)+ #aquamarine3
geom_point(data=subset(cr300_Wai1, timedate>startd & timedate<endd), aes(timedate, Turb_SS), color="orange")+
#geom_point(data=subset(hihimanu_wl, timedate>startd & timedate<endd), aes(timedate, level), color="azure4", size=0.1)+
#geom_point(data=subset(rain_data, timedate>startd & timedate<endd), aes(timedate, rainmm), color="red",size=5)+
# This data frame names its time column `datetime` rather than `timedate`.
geom_point(data=subset(haptuk_ysi, datetime>startd & datetime<endd), aes(datetime, Turb), color="pink")+
#scale_x_date(breaks=date_breaks("month"), labels = date_format("%b-%y"))+
xlab("Date")+
ylab("Turbidity (NTU) and Water Level (mm)")+
# Zoom the y axis to 0-1500 without dropping points outside that range.
coord_cartesian(ylim=c(0, 1500))+
theme_bw()+
# Text sizes plus legend placement (top-right corner inside the panel); the
# legend settings only take effect once a legend is actually drawn.
theme(axis.text=element_text(size=14),
axis.title=element_text(size=16,face="bold"),
legend.justification = c(1, 1),
legend.position = c(1, 1),
legend.title=element_text(size=14),
legend.text=element_text(size=12))
Here is a sample of two of the datasets: Note that the times don't line up at all... since I'm mixing sources.
dsloi_wl:
structure(list(ReceptionTime = c(1533895414.1134, 1533895414.1733,
1533895414.19397, 1533895414.20708, 1533895414.22283, 1533895414.23634,
1533895414.25135, 1533895414.26387, 1533895414.27653, 1533895414.29126,
1533896013.68755, 1533896013.7638, 1533896013.79232, 1533896013.80917,
1533896013.82312, 1533896013.83648, 1533896013.84988, 1533896013.8648,
1533896013.87724, 1533896013.8894), d2w = c(776.7, 789.7, 790.2,
777.1, 777.2, 777.7, 778.4, 793.4, 779.6, 794.1, 819.9, 780.7,
794.1, 806.9, 781.9, 781.9, 782.7, 782.8, 783.1, 783.4), timedate = structure(c(1533895414.1134,
1533895414.1733, 1533895414.19397, 1533895414.20708, 1533895414.22283,
1533895414.23634, 1533895414.25135, 1533895414.26387, 1533895414.27653,
1533895414.29126, 1533896013.68755, 1533896013.7638, 1533896013.79232,
1533896013.80917, 1533896013.82312, 1533896013.83648, 1533896013.84988,
1533896013.8648, 1533896013.87724, 1533896013.8894), class = c("POSIXct",
"POSIXt"), tzone = ""), level = c(723.3, 710.3, 709.8, 722.9,
722.8, 722.3, 721.6, 706.6, 720.4, 705.9, 680.1, 719.3, 705.9,
693.1, 718.1, 718.1, 717.3, 717.2, 716.9, 716.6)), .Names = c("ReceptionTime",
"d2w", "timedate", "level"), row.names = c(NA, 20L), class = "data.frame")
cr300_Wai1:
structure(list(RECORD = 73027:73046, Temp_C = c(24.62861, 24.62332,
24.61533, 24.60857, 24.60189, 24.59733, 24.59068, 24.58404, 24.57869,
24.57327, 24.56781, 24.5606, 24.55551, 24.55218, 24.54648, 24.5416,
24.5358, 24.5319, 24.52781, 24.52294), Turb_BS = c(94.50522,
88.65939, 109.354, 57.71527, 134.1903, 46.37191, 78.17719, 52.22319,
58.07111, 96.95719, 51.47488, 44.65616, 70.43825, 99.58217, 93.68374,
87.4787, 175.5395, 167.6757, 110.8119, 132.5971), Turb_SS = c(36.63349,
34.31228, 37.02223, 32.97258, 36.68553, 33.82083, 37.43391, 33.43639,
31.17306, 33.6327, 34.69954, 30.99891, 34.69988, 33.64369, 32.54948,
32.1177, 32.86558, 48.97706, 30.65004, 33.71646), Temp_C_2 = c(24.9014,
24.89474, 24.88837, 24.88279, 24.87574, 24.86852, 24.86357, 24.85751,
24.85236, 24.84759, 24.84091, 24.83577, 24.83192, 24.82713, 24.8229,
24.81832, 24.81237, 24.80821, 24.8051, 24.80015), WD_OBS = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), Lvl_m = c(0.6907353, 0.6905226, 0.6896195, 0.6890779,
0.6881586, 0.6878724, 0.6862501, 0.6848835, 0.6844589, 0.6837503,
0.6836612, 0.6831629, 0.6821692, 0.6812283, 0.6799452, 0.6791196,
0.6782504, 0.6772775, 0.6763596, 0.6755115), timedate = structure(c(1533895500,
1533895800, 1533896100, 1533896400, 1533896700, 1533897000, 1533897300,
1533897600, 1533897900, 1533898200, 1533898500, 1533898800, 1533899100,
1533899400, 1533899700, 1533900000, 1533900300, 1533900600, 1533900900,
1533901200), class = c("POSIXct", "POSIXt"), tzone = "")), .Names = c("RECORD",
"Temp_C", "Turb_BS", "Turb_SS", "Temp_C_2", "WD_OBS", "Lvl_m",
"timedate"), row.names = c(NA, 20L), class = "data.frame")
Here is a solution using mock data (next time, please provide a sample of your data):
library(tidyverse)
library(lubridate)
#>
#> Attachement du package : 'lubridate'
#> The following object is masked from 'package:base':
#>
#> date
# mock data: three series covering the same day, sampled every 15, 30 and
# 60 minutes respectively
time_15m <- seq(
  from = as.POSIXct("2018-08-30 00:00:00"),
  to = as.POSIXct("2018-08-31 00:00:00"),
  by = "15 min"
)
time_30m <- seq(
  from = as.POSIXct("2018-08-30 00:00:00"),
  to = as.POSIXct("2018-08-31 00:00:00"),
  by = "30 min"
)
time_60m <- seq(
  from = as.POSIXct("2018-08-30 00:00:00"),
  to = as.POSIXct("2018-08-31 00:00:00"),
  by = "60 min"
)
# one data frame per series, each with its own value column
data_1 <- data.frame(
  time = time_15m,
  var_1 = cos(hour(time_15m) + minute(time_15m))
)
data_2 <- data.frame(
  time = time_30m,
  var_2 = sin(hour(time_30m) + minute(time_30m))
)
data_3 <- data.frame(
  time = time_60m,
  var_3 = cos(1 - hour(time_60m) + minute(time_60m))
)
# the kind of plot you have (prefer the 2nd version): one layer per data
# frame, each with a fixed colour, so no legend is produced
ggplot(data = data_1, mapping = aes(x = time, y = var_1)) +
  geom_point(color = "red") +
  geom_point(
    data = data_2,
    mapping = aes(x = time, y = var_2),
    color = "green"
  ) +
  geom_point(
    data = data_3,
    mapping = aes(x = time, y = var_3),
    color = "blue"
  ) +
  theme_bw()
# a version with long format data, reshaped with pivot_longer()
# Mapping the series name to colour inside aes() is what creates the legend.
data_1 %>%
  # join data_2 (timestep = 30m); timestamps without a match become NA
  left_join(data_2, by = "time") %>%
  # join data_3 (timestep = 60m); timestamps without a match become NA
  left_join(data_3, by = "time") %>%
  # stack var_1, var_2 and var_3 into a name column and a value column
  pivot_longer(
    cols = c(var_1, var_2, var_3),
    names_to = "variable_name",
    values_to = "variable_value"
  ) %>%
  ggplot(aes(x = time, y = variable_value, color = variable_name)) +
  theme_bw() +
  geom_point(size = 2)
#> Warning: Removed 120 rows containing missing values (geom_point).
Created on 2018-08-22 by the reprex package (v0.2.0).
EDIT 1 (include provided datasets)
library(tidyverse)
# Combine the two provided datasets and plot them with a legend.
dsloi_wl %>%
  # The timestamps of the two sources do not coincide, so a full join keeps
  # every row from both and fills the other source's columns with NA.
  full_join(cr300_Wai1, by = "timedate") %>%
  # Scale Lvl_m by 1000, as the original plot does (aes(timedate, Lvl_m*1000)
  # on a millimetre axis), so it is comparable with `level`.
  mutate(Lvl_m = 1000 * Lvl_m) %>%
  # Stack the three plotted series into a name column and a value column.
  pivot_longer(
    cols = c(level, Lvl_m, Turb_SS),
    names_to = "variable_name",
    values_to = "variable_value"
  ) %>%
  ggplot(aes(x = timedate, y = variable_value, color = variable_name)) +
  geom_point() +
  # Reuse the colours from the original plot, keyed by series name.
  scale_color_manual(
    "Legend title",
    values = c(
      "level" = "blue",
      "Lvl_m" = "forestgreen",
      "Turb_SS" = "orange"
    )
  )
#> Warning: Removed 60 rows containing missing values (geom_point).
Created on 2018-08-23 by the reprex package (v0.2.0).

Resources