Related
I have two datasets:
one contains the actual counts and the other contains the predicted counts. I want to compute a Pearson correlation between them.
My actual count data look like this:
My predicted counts data look like this:
I want to do pearson correlation for these two datasets for each geneID.
I have written this code:
# Install Rcpp only when it is missing, instead of re-installing on every run.
if (!requireNamespace("Rcpp", quietly = TRUE)) {
  install.packages("Rcpp")
}
library(Rcpp)
library("reshape2")
library("ggplot2")

# Import the actual expression values and the predicted expression values.
act_cts <- read.delim("GVDS_normalized_counts_2021v1.txt", header = TRUE, sep = "\t")
## Fix the column names: the first column becomes "gene", then every column
## name is cut to its first 7 characters.
colnames(act_cts)[1] <- "gene"
colnames(act_cts) <- substr(colnames(act_cts), 1, 7)

pred_cts <- read.delim("GVDS_PrediXcan_Test_2021v1.txt", header = TRUE, sep = "\t")
## Column names are cut to their first 15 characters.
## NOTE(review): a versioned ID such as "ENSG00000182902.8" is 17 characters, so
## it would be shortened to "ENSG00000182902" and might no longer equal the
## values stored in act_cts$gene -- TODO confirm against the real files.
colnames(pred_cts) <- substr(colnames(pred_cts), 1, 15)

## Melt the predicted counts so the gene columns become rows: FID, IID, gene.
melt_pred_cts <- melt(pred_cts, id.vars = c("FID", "IID"),
                      variable.name = "gene", value.name = "gene_exp")
## Melt the actual counts so they can be joined to the predictions.
## NOTE(review): every column other than `gene` is melted as if it were a
## sample; if the file also carries annotation columns they would be stacked
## together with the expression values -- TODO confirm.
melt_act_cts <- melt(act_cts, id.vars = "gene",
                     variable.name = "IID", value.name = "act_gene_exp")

## No `by` is given, so merge() joins on the columns the two tables share
## (gene and IID). This can take several minutes.
final_cts <- merge(melt_pred_cts, melt_act_cts)

# Pearson correlation between predicted and actual expression, one per gene.
all_genes <- as.character(unique(final_cts$gene))
pear_coeff <- vapply(
  all_genes,
  function(g) {
    # rows of the joined table that belong to this gene
    wrk_cts_all <- final_cts[which(final_cts$gene == g), ]
    cor(wrk_cts_all$gene_exp, wrk_cts_all$act_gene_exp, method = "pearson")
  },
  numeric(1),
  USE.NAMES = FALSE
)
# Build the result once with typed columns. Appending c(g, coefficient) with
# rbind() inside a loop turns the coefficient into text and loses the
# gene / pear_coeff column names.
pear_cor_all_df <- data.frame(
  gene = all_genes,
  pear_coeff = pear_coeff,
  stringsAsFactors = FALSE
)
But it's not giving me the correct results.
This is data for act_count:
dput(act_counts[1:10, 1:10])
structure(list(gene = c("ENSG00000152931.6", "ENSG00000183696.9",
"ENSG00000139269.2", "ENSG00000169129.8", "ENSG00000134602.11",
"ENSG00000136237.12", "ENSG00000259425.1", "ENSG00000242284.2",
"ENSG00000235027.1", "ENSG00000228169.3"), Gene_Sy = c("ENSG00000152931.6",
"ENSG00000183696.9", "ENSG00000139269.2", "ENSG00000169129.8",
"ENSG00000134602.11", "ENSG00000136237.12", "ENSG00000259425.1",
"ENSG00000242284.2", "ENSG00000235027.1", "ENSG00000228169.3"
), Chr = c("5", "7", "12", "10", "X", "7", "15", "X", "11", "10"
), Coord = c(59783540, 48128225, 57846106, 116164515, 131157293,
22396763, 23096869, 134953994, 1781578, 116450393), HG00096 = c(0.101857770468582,
8.1838049456063, 1.19991028786682, 0.831939826228749, 27.6464223725999,
3.78850273139249, 0.0540590649819536, 0.351716382898523, 0.200791414339667,
96.1821778045089), HG00097 = c(0.0781095249582053, 5.68691050653862,
1.57357169691446, 0.0697777450667378, 24.3955715036476, 2.05096276937706,
0.112185357489692, 0.444540251941709, 0.190137938062251, 101.17926156721
), HG00099 = c(0.0489806714207954, 2.43465332606958, 0.521615781673147,
0.93108575037257, 16.4453735152148, 4.00031300285966, 0.00359181983091798,
0.227707651999832, 0.0929246302159905, 58.7830634918037), HG00100 = c(0.118597118618172,
3.83089421985197, 1.44722544015787, 0.620940765480242, 24.8066495438254,
3.27161920134705, 0.00049968321150251, 0.714112406249513, 0.108789749488722,
105.483527339859), HG00101 = c(0.00403496367614745, 6.61228835251498,
3.56579072437701, 1.66066836204679, 25.1133488775017, 1.79821591847768,
0.0293976115522442, 0.450911709524112, 0.23244822901371, 105.818192023699
), HG00102 = c(0.0109253485646219, 4.70964559086586, 1.98268073472144,
0.570481056180073, 19.2339882617972, 1.51668840574531, 0.0312661751488703,
0.491437808951175, 0.250905117203001, 136.140843495464)), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
This is prd_counts:
dput(prd_counts[1:10, 1:10])
structure(list(FID = c("HG00096", "HG00097", "HG00099", "HG00100",
"HG00101", "HG00102", "HG00103", "HG00105", "HG00106", "HG00107"
), IID = c("HG00096", "HG00097", "HG00099", "HG00100", "HG00101",
"HG00102", "HG00103", "HG00105", "HG00106", "HG00107"), ENSG00000182902.8 = c(0.0223611610092831,
0.0385031316687293, -0.0682504384265577, 0.00018098416274239,
-0.045492721345375, -0.10473163051734, -0.0215970711860838, 0.060455638944161,
-0.00889260689717109, -0.102096211855105), ENSG00000183307.3 = c(0.129041336028238,
-0.13226906002202, 0.005409246530295, -0.0539556427088601, -0.00699884042001628,
-0.204743560777908, -0.0534359750800079, -0.235648260835705,
-0.10230402771496, -0.0914043464852205), ENSG00000237438.1 = c(-0.758838434524167,
-0.579236418964912, -0.695762357174973, -0.368416879945024, -0.339555280234214,
-0.809438763600528, -0.359798980325098, -0.417769387016999, -0.724636782037491,
-0.309671271758401), ENSG00000243156.2 = c(-0.58456094489168,
0.105851861253113, -0.275061563982305, -0.0406543077034047, -0.522672785138957,
-0.126100301787985, -0.288382571274346, -0.354309857822533, -0.314842662063296,
-0.141401921597711), ENSG00000099968.13 = c(0.135357355615122,
0.157616292043257, 0.180059097593111, 0.250009792099489, 0.170653230854707,
0.316157576642492, 0.314671674077333, 0.224102148083679, 0.232969333848649,
0.14963210689311), ENSG00000069998.8 = c(-0.0346986034383362,
-0.0173493017191681, 0, -0.0173493017191681, -0.645266014640116,
-0.0346986034383362, -0.0173493017191681, -0.0173493017191681,
-0.0346986034383362, 0), ENSG00000184979.8 = c(-0.160573318589815,
0.54683218159596, 0.3503062647549, 0.653899917577768, 0.321280544783323,
0.653727041876318, 0.822864620159811, 1.03780221621802, -0.195295753744408,
-0.228590172992798), ENSG00000070413.12 = c(0.775225873145799,
0.602092262450708, 1.0198591935485, 0.65587457098494, 0.306445027670957,
0.581202299884586, 0.836112660742631, 0.559373823767867, 0.46977171007116,
0.84426113999649)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
The provided test samples will not work because there are no genes in common between act_counts and prd_counts. I took the liberty of fixing that by reassigning column names:
library(dplyr)
library(tidyr)

## Test-sample fix: give the eight predicted-gene columns the IDs of the first
## eight actual genes so the two tables have genes in common.
names(prd_counts)[3:10] <- act_counts$gene[1:8]

## Actual counts in long form: one row per gene and sample.
acts <- act_counts |>
  pivot_longer(cols = starts_with("HG"), names_to = "FID", values_to = "Actual")

## Predicted counts in long form: one row per sample and gene.
prds <- prd_counts |>
  pivot_longer(cols = starts_with("ENSG"), names_to = "gene", values_to = "Predicted")

## Pair the values by gene and sample, then correlate within each gene.
acts |>
  inner_join(prds, by = c("gene", "FID")) |>
  select(gene, FID, Actual, Predicted) |>
  group_by(gene) |>
  summarize(rho = cor(Actual, Predicted))
##> # A tibble: 8 × 2
##> gene rho
##> <chr> <dbl>
##> 1 ENSG00000134602.11 -0.445
##> 2 ENSG00000136237.12 0.446
##> 3 ENSG00000139269.2 0.543
##> 4 ENSG00000152931.6 0.770
##> 5 ENSG00000169129.8 -0.802
##> 6 ENSG00000183696.9 0.405
##> 7 ENSG00000242284.2 -0.503
##> 8 ENSG00000259425.1 -0.110
I want to create a bar-plot using R, which will describe V gene frequency in 4 compartments in the body.
I have this table:
head(my_data)
# A tibble: 6 x 8
Tumor ...2 BM ...4 DLN ...6 Blood ...8
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 V hit frequency V hit frequency V hit frequency V hit frequency
2 IGHV3-1~ 0.54386205~ IGHV10-~ 0.22723742~ IGHV1-5~ 0.1132060~ IGHV5-~ 0.1417894~
3 IGHV5-1~ 0.16148068~ IGHV5-6~ 7.62620114~ IGHV5-1~ 0.1010986~ IGHV7-~ 0.1133675~
4 IGHV10-~ 3.18440869~ IGHV1-6~ 5.68199208~ IGHV1-6~ 8.1465889~ IGHV2-~ 6.4763474~
5 IGHV5-6~ 2.71468704~ IGHV3-1~ 5.24995831~ IGHV5-6~ 7.1625980~ IGHV5-~ 6.3008918~
6 IGHV6-3~ 2.71460485~ IGHV1-9~ 4.19517008~ IGHV1-7~ 4.7428361~ IGHV1-~ 5.0785188~
>
dput(my_data)
structure(list(Tumor = c("V hit", "IGHV3-1*00", "IGHV5-17*00",
"IGHV10-1*00", "IGHV5-6*00", "IGHV6-3*00", "IGHV2-9*00", "IGHV5-4*00",
"IGHV1-9*00"), ...2 = c("frequency", "0.54386205717535796", "0.161480687577157",
"3.1844086931792998E-2", "2.7146870412713998E-2", "2.7146048502561901E-2",
"2.4098405658687001E-2", "2.1746920713615302E-2", "1.6909157558532301E-2"
), BM = c("V hit", "IGHV10-3*00", "IGHV5-6*00", "IGHV1-62-3*00",
"IGHV3-1*00", "IGHV1-9*00", "IGHV10-1*00", "IGHV2-9*00", "IGHV4-2*00"
), ...4 = c("frequency", "0.22723742785161699", "7.62620114066965E-2",
"5.6819920833780603E-2", "5.2499583155365397E-2", "4.1951700840313098E-2",
"3.5214806321420301E-2", "3.2695465872415799E-2", "3.0610100659414E-2"
), DLN = c("V hit", "IGHV1-50*00", "IGHV5-17*00", "IGHV1-62-3*00",
"IGHV5-6*00", "IGHV1-7*00", "IGHV1-4*00", "IGHV6-3*00", "IGHV10-1*00"
), ...6 = c("frequency", "0.113206013467841", "0.101098647226429",
"8.1465889741680994E-2", "7.1625980782229995E-2", "4.7428361184553902E-2",
"4.4690299561054497E-2", "4.3051740808241597E-2", "3.9509373582839201E-2"
), Blood = c("V hit", "IGHV5-6*00", "IGHV7-3*00", "IGHV2-9*00",
"IGHV5-17*00", "IGHV1-67*00", "IGHV1-62-3*00", "IGHV1-7*00",
"IGHV1-9*00"), ...8 = c("frequency", "0.141789453276464", "0.113367584335014",
"6.4763474214811906E-2", "6.3008918185343196E-2", "5.0785188057386597E-2",
"5.0504071345482703E-2", "4.52113222179139E-2", "3.8183404420318E-2"
)), row.names = c(NA, -9L), class = c("tbl_df", "tbl", "data.frame"
))
(^this is just a little part of my entire data.)
Tumor, Blood, BM, and DLN are my compartments, and each of them has its V genes and their frequencies.
I want 1 bar-plot: X-axis will be V genes and for each v gene, I want 4 bars, 1 for each compartment.
Y-axis will be the frequencies.
I'm assuming I need to create df that has only one column of all v genes and multiple columns of frequencies in each of the compartments, but I don't know how to do that.
Any help will be appreciated !!
Thanks, Ligal.
Clean up data
library('data.table')

# The first row repeats the header text rather than holding data, so drop it.
df <- df[-1,]

# Columns alternate: odd positions carry the compartment names (gene columns),
# even positions carry the matching frequency columns. Derive the positions
# from the actual width so any number of compartments works.
n_cols <- ncol(df)
stopifnot(n_cols >= 2, n_cols %% 2 == 0)
compart <- names(df)[seq(1, n_cols, 2)]      # compartment names
not_compart <- names(df)[seq(2, n_cols, 2)]  # not compartment names

# melt data from wide to long
setDT(df)[, id := seq_len(.N)]  # assign id
df <- melt(df, id.vars = 'id',
           measure.vars = list(compart, not_compart),
           variable.name = "compartments",
           value.name = c("genes", "frequency"))

# melt() numbers the measure groups 1, 2, ...; map them back to compartment names
df[, compartments := factor(compartments, levels = seq_along(compart), labels = compart)]

# change frequency values from character to numeric
df[, frequency := as.numeric(frequency)]
Data - output
head(df)
# id compartments genes frequency
# 1: 1 Tumor IGHV3-1*00 0.54386206
# 2: 2 Tumor IGHV5-17*00 0.16148069
# 3: 3 Tumor IGHV10-1*00 0.03184409
# 4: 4 Tumor IGHV5-6*00 0.02714687
# 5: 5 Tumor IGHV6-3*00 0.02714605
# 6: 6 Tumor IGHV2-9*00 0.02409841
Plot
library('ggplot2')
# One panel per compartment, horizontal bars of frequency for each gene.
ggplot(df, aes(x = genes, y = frequency)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ compartments, scales = "free_y") +
  theme_bw()
Graph
Plot2
# One panel per gene, horizontal bars of frequency for each compartment.
ggplot(df, aes(x = compartments, y = frequency)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ genes, scales = "free_y") +
  theme_bw()
Graph2
Plot-3
# Single panel: genes on the x axis, one dodged bar per compartment.
ggplot(
  df,
  aes(
    x = genes, y = frequency,
    color = compartments, fill = compartments, group = compartments
  )
) +
  geom_col(position = position_dodge(width = 0.9)) +
  theme_bw() +
  # slanted labels so the gene names stay readable
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Graph-3
Try this:
library(dplyr)
library(tidyr)
library(purrr)
library(ggplot2)
my_data <- structure(list(Tumor = c(
"V hit", "IGHV3-1*00", "IGHV5-17*00",
"IGHV10-1*00", "IGHV5-6*00", "IGHV6-3*00", "IGHV2-9*00", "IGHV5-4*00",
"IGHV1-9*00"
), ...2 = c(
"frequency", "0.54386205717535796", "0.161480687577157",
"3.1844086931792998E-2", "2.7146870412713998E-2", "2.7146048502561901E-2",
"2.4098405658687001E-2", "2.1746920713615302E-2", "1.6909157558532301E-2"
), BM = c(
"V hit", "IGHV10-3*00", "IGHV5-6*00", "IGHV1-62-3*00",
"IGHV3-1*00", "IGHV1-9*00", "IGHV10-1*00", "IGHV2-9*00", "IGHV4-2*00"
), ...4 = c(
"frequency", "0.22723742785161699", "7.62620114066965E-2",
"5.6819920833780603E-2", "5.2499583155365397E-2", "4.1951700840313098E-2",
"3.5214806321420301E-2", "3.2695465872415799E-2", "3.0610100659414E-2"
), DLN = c(
"V hit", "IGHV1-50*00", "IGHV5-17*00", "IGHV1-62-3*00",
"IGHV5-6*00", "IGHV1-7*00", "IGHV1-4*00", "IGHV6-3*00", "IGHV10-1*00"
), ...6 = c(
"frequency", "0.113206013467841", "0.101098647226429",
"8.1465889741680994E-2", "7.1625980782229995E-2", "4.7428361184553902E-2",
"4.4690299561054497E-2", "4.3051740808241597E-2", "3.9509373582839201E-2"
), Blood = c(
"V hit", "IGHV5-6*00", "IGHV7-3*00", "IGHV2-9*00",
"IGHV5-17*00", "IGHV1-67*00", "IGHV1-62-3*00", "IGHV1-7*00",
"IGHV1-9*00"
), ...8 = c(
"frequency", "0.141789453276464", "0.113367584335014",
"6.4763474214811906E-2", "6.3008918185343196E-2", "5.0785188057386597E-2",
"5.0504071345482703E-2", "4.52113222179139E-2", "3.8183404420318E-2"
)), row.names = c(NA, -9L), class = c("tbl_df", "tbl", "data.frame"))
# Tidy the dataset: the first row only repeats the header text.
my_data <- slice(my_data, -1)

## One two-column data frame (gene, frequency) per compartment.
compartment_cols <- list(
  tumor = my_data[1:2],
  bm = my_data[3:4],
  dln = my_data[5:6],
  blood = my_data[7:8]
)

## Give every piece the same column names (by position).
renamed_cols <- map(compartment_cols, function(part) rename(part, v_hit = 1, freq = 2))

## Stack the pieces, recording the list name in `compartment`.
stacked <- bind_rows(renamed_cols, .id = "compartment")

## Frequencies arrive as text; convert them to numbers.
df_tidy <- mutate(stacked, freq = as.numeric(freq))
head(df_tidy)
#> # A tibble: 6 x 3
#> compartment v_hit freq
#> <chr> <chr> <dbl>
#> 1 tumor IGHV3-1*00 0.544
#> 2 tumor IGHV5-17*00 0.161
#> 3 tumor IGHV10-1*00 0.0318
#> 4 tumor IGHV5-6*00 0.0271
#> 5 tumor IGHV6-3*00 0.0271
#> 6 tumor IGHV2-9*00 0.0241
# Barplot: one facet per compartment, horizontal bars of frequency per V gene.
ggplot(df_tidy, aes(v_hit, freq, fill = compartment)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~compartment, scales = "free_y") +
  # The facets already name the compartment, so hide the fill legend.
  # "none" is the supported spelling; passing FALSE is deprecated in ggplot2.
  guides(fill = "none")
Created on 2020-03-29 by the reprex package (v0.3.0)
I have some data which looks like:
long_bnk lat_bnk
[1,] "3<U+00B0> 52' 30.1\" W" "40<U+00B0> 44' 3.7\" N"
[2,] "2<U+00B0> 44' 54.4\" E" "42<U+00B0> 7' 18.1\" N"
[3,] NA NA
[4,] "2<U+00B0> 7' 54.2\" E" "41<U+00B0> 31' 21.9\" N"
[5,] "0<U+00B0> 1' 54.5\" W" "39<U+00B0> 58' 59.3\" N"
[6,] "3<U+00B0> 41' 15.5\" W" "40<U+00B0> 27' 47.2\" N"
I am trying to put the data into the correct lat/long format. I am running the following:
# Treat empty strings as missing, then keep only rows with no NA.
pts_bnk[pts_bnk==""] <- NA
pts_bnk <- pts_bnk[complete.cases(pts_bnk),]
# Flatten the matrix, parse each string as degrees/minutes/seconds using "°" as
# the degree terminator, convert to numeric and rebuild a two-column matrix.
# NOTE(review): the strings in pts_bnk contain the literal text "<U+00B0>"
# rather than a "°" character, so the terminator is presumably never found,
# which would explain the NAs in the reported error -- TODO confirm.
pts_bnk <- matrix(as.numeric(sp::char2dms(as.vector(pts_bnk), "°")), ncol=2)
However, I keep getting:
Error in if (any(abs(object@deg) > 90)) return("abs(degree) > 90") :
missing value where TRUE/FALSE needed In addition: Warning message: In
asMethod(object) : NAs introduced by coercion
Where am I going wrong in the conversion to the correct lat/long format?
Data:
pts_bnk <- structure(c("3<U+00B0> 52' 30.1\" W", "2<U+00B0> 44' 54.4\" E",
NA, "2<U+00B0> 7' 54.2\" E", "0<U+00B0> 1' 54.5\" W", "3<U+00B0> 41' 15.5\" W",
"40<U+00B0> 44' 3.7\" N", "42<U+00B0> 7' 18.1\" N", NA, "41<U+00B0> 31' 21.9\" N",
"39<U+00B0> 58' 59.3\" N", "40<U+00B0> 27' 47.2\" N"), .Dim = c(6L,
2L), .Dimnames = list(NULL, c("long_bnk", "lat_bnk")))
EDIT:
Essentially I would like to plot the data using:
# Sketch of the intended plot: a Google base map of Spain with points on top.
library(ggrepel)
library(ggmap)
# "MyKey" is presumably a placeholder for a real Google API key.
register_google(key = "MyKey")
spain <- get_map("Spain", zoom = 6)
# NOTE(review): geom_point() is given no data or aes here, so the converted
# coordinates are not yet passed to the layer.
ggmap(spain, extent = "normal") +
geom_point()
EDIT 2:
The original data I had (which works) was the following:
dms_lat <- readLines(n=5)
1 40° 25' 35.8" N
2 40° 26' 28.4" N
3 40° 28' 39.8" N
4
5 38° 59' 15.0" N
dms_long <-readLines(n=5)
1 3° 41' 19.9" W
2 3° 47' 42.2" W
3 3° 41' 11.7" W
4
5 3° 55' 29.6" W
library(rworldmap)

# Longitude strings in column 1, latitude strings in column 2.
pts <- cbind(dms_long, dms_lat)
# Strip the leading line number and the whitespace after it.
pts <- sub("^\\d+\\s+", "", pts)
# Empty entries count as missing; keep only rows where both values are present.
pts[pts == ""] <- NA
has_both <- complete.cases(pts)
pts <- pts[has_both, ]
# Parse the DMS strings ("°" ends the degrees part) and rebuild two numeric columns.
dms_values <- sp::char2dms(as.vector(pts), "°")
pts <- matrix(as.numeric(dms_values), ncol = 2)

# Outline of Spain with the points drawn on top.
plot(subset(getMap(resolution = "low"), NAME == "Spain"))
points(pts[, 1], pts[, 2], col = "red", pch = 3, cex = 0.6)
The current data I have (which does not work) is:
x <- structure(c("3<U+00B0> 52' 30.1\" W", "2<U+00B0> 44' 54.4\" E",
NA, "2<U+00B0> 7' 54.2\" E", "0<U+00B0> 1' 54.5\" W", "3<U+00B0> 41' 15.5\" W",
"40<U+00B0> 44' 3.7\" N", "42<U+00B0> 7' 18.1\" N", NA, "41<U+00B0> 31' 21.9\" N",
"39<U+00B0> 58' 59.3\" N", "40<U+00B0> 27' 47.2\" N"), .Dim = c(6L,
2L), .Dimnames = list(NULL, c("long_bnk", "lat_bnk")))
# Replace the literal "<U+00B0>" text with a real degree sign, keep only the
# two converted columns, and drop rows with missing coordinates.
drop_na(
  select(
    mutate(
      data.frame(x),
      lat = sub("<U\\+00B0>", "\u00B0", lat_bnk),
      long = sub("<U\\+00B0>", "\u00B0", long_bnk)
    ),
    lat, long
  )
)
I am trying to make this second data be equal to the first so it can be plotted using ggmap().
Let me know if this works for you based on our comments:
library(rworldmap)
library(sp)
library(dplyr)
library(tidyr) # drop_na() comes from tidyr, not dplyr

# Replace the literal "<U+00B0>" text with "d", the default degree terminator
# of char2dms(), keep the converted columns and drop rows with missing values.
pts <- x %>%
  data.frame() %>%
  mutate(
    lat = sub("<U\\+00B0>", "d", lat_bnk),
    long = sub("<U\\+00B0>", "d", long_bnk)
  ) %>%
  select(lat, long) %>%
  drop_na()

# Parse each column as degrees/minutes/seconds and convert to decimal degrees.
pts_long <- as.numeric(char2dms(pts[["long"]]))
pts_lat <- as.numeric(char2dms(pts[["lat"]]))

# Outline of Spain with the points drawn on top.
plot(subset(getMap(resolution = "low"), NAME == "Spain"))
points(pts_long, pts_lat, col = "red", pch = 3, cex = 0.6)
Note that the char2dms is used as follows:
char2dms(from, chd = "d", chm = "'", chs = "\"")
where the default for degree character terminator is the letter d (an alternative to the degree symbol, if chd not specified).
Using ggmap you can pass longitude and latitude in geom_point:
library(ggrepel)
library(ggmap)
library(ggthemes)

# Base map centred on Madrid (needs a registered Google key).
spain <- ggmap::get_map("Madrid, Spain", zoom = 6)

# Decimal-degree coordinates as a data frame so they can feed a ggplot layer.
pts_data <- data.frame(pts_long = pts_long, pts_lat = pts_lat)
point_layer <- geom_point(data = pts_data, mapping = aes(x = pts_long, y = pts_lat))

ggmap(spain, extent = "normal") + point_layer + theme_map()
We can add mutate_at at the end
library(dplyr)
library(tidyr) # drop_na() comes from tidyr, not dplyr

# Replace the literal "<U+00B0>" text with a real degree sign, keep the two
# converted columns, drop incomplete rows, then parse both columns as
# degrees/minutes/seconds and convert them to decimal degrees.
x %>%
  data.frame() %>%
  mutate(
    lat = sub("<U\\+00B0>", "\u00B0", lat_bnk),
    long = sub("<U\\+00B0>", "\u00B0", long_bnk)
  ) %>%
  select(lat, long) %>%
  drop_na() %>%
  # across() is the current replacement for the superseded mutate_at()
  mutate(across(matches("^(lat|long)"), ~ as.numeric(sp::char2dms(.x, "°"))))
I am trying to use kmeans to show what states have similar statistics with one another from the Lahman database, my code is as follows:
# Batting records from 1999 onwards, joined to the player table.
battingInfo <- filter(Batting, yearID >= 1999)
total <- merge(battingInfo, People, by = "playerID")
# Drop columns 24, 25 and 28 to 47 by position.
totalN <- total[, -c(24, 25, 28:47)]

# Number of rows per birth state, attached back onto every row.
filterByState <- totalN %>%
  group_by(birthState) %>%
  summarise(players = n())
newMerge <- merge(totalN, filterByState, by = "birthState")

# Per-state totals with missing values ignored.
newTest <- newMerge %>%
  group_by(birthState) %>%
  summarise_at(
    vars(G, AB, R, H, X2B, X3B, HR, RBI, SB, CS, BB, SO, IBB, HBP, SH, SF, GIDP),
    sum,
    na.rm = TRUE
  )

# Per-state totals plus the number of distinct players.
updateTest <- newMerge %>%
  group_by(birthState) %>%
  summarise(
    Players = n_distinct(playerID),
    G = sum(G), AB = sum(AB), R = sum(R), H = sum(H),
    X2B = sum(X2B), X3B = sum(X3B), HR = sum(HR), RBI = sum(RBI),
    SB = sum(SB), CS = sum(CS), BB = sum(BB), SO = sum(SO),
    IBB = sum(IBB), HBP = sum(HBP), SH = sum(SH), SF = sum(SF),
    GIDP = sum(GIDP)
  )

# Same totals, with every birth state that is not a US state abbreviation
# pooled under "Other".
finalUpdate <- newMerge %>%
  mutate(
    birthState = case_when(
      birthState %in% state.abb ~ birthState,
      TRUE ~ "Other"
    )
  ) %>%
  group_by(birthState) %>%
  summarise(
    Players = n_distinct(playerID),
    G = sum(G), AB = sum(AB), R = sum(R), H = sum(H),
    X2B = sum(X2B), X3B = sum(X3B), HR = sum(HR), RBI = sum(RBI),
    SB = sum(SB), CS = sum(CS), BB = sum(BB), SO = sum(SO),
    IBB = sum(IBB), HBP = sum(HBP), SH = sum(SH), SF = sum(SF),
    GIDP = sum(GIDP)
  )
This gives me the data frame I want. Now my code for kmeans is:
# Cluster the per-state totals and draw them in two dimensions.
# Columns 2 to 19 are the numeric totals; column 1 is birthState.
subDat5 <- finalUpdate[, c(2:19)]
# Standardise each column, then compute pairwise distances between the rows.
subDatSc5 <- scale(subDat5)
distDat5 <- dist(subDatSc5)
k2<-5
# k-means with k2 clusters and 40 random starts.
km3new<-kmeans(subDatSc5, k2, nstart = 40)
fitNew <-cmdscale(distDat5) # k is the number of dim to PLOT
# Scatter of the two MDS coordinates, coloured by cluster.
plot(fitNew, xlab="Coordinate 1",ylab="Coordinate 2", pch=16, col=km3new$cluster)
# NOTE(review): finalUpdate comes from summarise(), so `[, 1]` presumably
# returns a one-column tibble rather than a vector; as.character() on that
# would give one long string holding the text of a c(...) call instead of one
# label per state -- TODO confirm. Extracting the column as a vector, e.g.
# finalUpdate[[1]], would avoid this.
birthState=as.character(finalUpdate[,1])
View(birthState)
# Labels are drawn at the point coordinates shifted by 0.1; a single label
# string would be reused for every point.
text(fitNew+.1, labels = birthState, cex=.5)
Everything seems to work perfectly up until the last line, when I label all the points and it outputs a graph with each point being labeled 50 times.
Is there any fix to this?
dput(fitNew) =
structure(c(-1.65773726259238, -0.534080004429963, -1.25224081559503,
-0.77600324658737, 13.7591986092784, -1.48285027332317, -1.0685046710528,
-1.40697098882713, 4.45857203274176, 1.31053002832658, -1.35540549966184,
-1.29910272287957, -1.68908570162927, 0.480144496416969, -0.592812161743823,
-1.23667901504586, -0.844421560951474, -0.827147650450116, -1.22861495063773,
-1.09472770146309, -1.68944621276222, -1.04378183282088, -1.34915033496973,
-0.951660697104605, -0.45483103293441, -1.70655513856763, -0.0616193106609581,
-1.48510165062592, -1.46251714293967, -1.66524625215651, -0.302561452071198,
-1.56675666458699, -1.28344728331308, 0.864956587539308, 0.16173394975142,
-0.850595975621662, -0.756783746315003, 24.7256817273653, -0.427398940139082,
-1.39925870808987, -0.755785801532488, -1.51858748511865, -0.944152303255372,
2.99465893267538, -1.67729960185572, -0.428860890332761, -1.66997803522651,
-0.392867003697617, -1.30257694125332, -1.66036447381944, -1.6019072254532,
-0.0137738939595427, -0.296070047308066, -0.00473553953140588,
0.0641385777789144, 1.13842140049119, -0.0268651281540734, -0.128806499497676,
-0.00491611456401126, 0.364126276181306, -0.143046769591177,
-0.0283493696039194, -0.0485069239634975, -0.0287370449451863,
0.095714493198601, -0.124528071666917, -0.0332600735692987, 0.0352695212129851,
-0.119261467201306, -0.0381525968696119, 0.0551469698282207,
-0.0115458694920637, -0.0250933419027217, 0.0406395856647227,
0.12482265126378, -0.17954163594865, -0.0113245644618699, -0.0894498877336694,
0.0305207676977073, 0.0323710265810206, -0.0491296972494748,
-0.121635810491615, 0.0175346179372083, 0.0127983868546243, 0.21663582448027,
0.0803333481747664, -0.0309611163272855, 0.0201356804088859,
-0.696293053438086, 0.133550765173667, 0.108119095159391, -0.136003613852937,
0.00557290379285935, 0.0602630898597761, -0.196004062948666,
-0.0161895096280255, -0.178283625530885, -0.0170000868214074,
0.107232630021258, 0.0375464632562086, -0.00276496483054615,
0.0193363060673037), .Dim = c(51L, 2L), .Dimnames = list(NULL,
NULL))
and dput(birthState) =
"c(\"AK\", \"AL\", \"AR\", \"AZ\", \"CA\", \"CO\", \"CT\", \"DE\", \"FL\", \"GA\", \"HI\", \"IA\", \"ID\", \"IL\", \"IN\", \"KS\", \"KY\", \"LA\", \"MA\", \"MD\", \"ME\", \"MI\", \"MN\", \"MO\", \"MS\", \"MT\", \"NC\", \"ND\", \"NE\", \"NH\", \"NJ\", \"NM\", \"NV\", \"NY\", \"OH\", \"OK\", \"OR\", \"Other\", \"PA\", \"RI\", \"SC\", \"SD\", \"TN\", \"TX\", \"UT\", \"VA\", \"VT\", \"WA\", \"WI\", \"WV\", \"WY\")"
As I mentioned in my comment, your problem is probably due to the fact that birthState is a string of an R character vector and not the actual vector.
The following code
# birthState arrives as one string holding the text of a c("AK", "AL", ...)
# call. Pull the quoted labels out with a regular expression rather than
# eval(parse())-ing the text, which would execute whatever the string contains.
# A vector that is already split into labels is left untouched.
if (length(birthState) == 1L && grepl("^c\\(", birthState)) {
  quoted_labels <- regmatches(birthState, gregexpr('"[^"]*"', birthState))[[1]]
  birthState <- gsub('"', "", quoted_labels, fixed = TRUE)
}
plot(fitNew, xlab = "Coordinate 1", ylab = "Coordinate 2", pch = 16)
# pos = 4 places each label to the right of its point
text(fitNew, labels = birthState, cex = .5, pos = 4)
Yielded this for me
I have a dataframe (test) in R. Inside one of the columns contains coordinates in this list structure:
> dput(test$coordinates)
list(structure(list(x = c(-1.294832, -1.294883, -1.294262,
-1.249478), y = c(54.61024, 54.61008, 54.610016, 54.610006
)), .Names = c("x", "y"), row.names = c(NA, -284L), class = c("tbl_df",
"tbl", "data.frame")))
I've reduced the number of coordinates for clarity.
Ultimately I wish to convert the dataframe into a spatial lines dataframe, but to do that I need test$coordinates in a lines form. However, I get the following error:
> lines(test$coordinates)
Error in xy.coords(x, y) :
'x' is a list, but does not have components 'x' and 'y'
I have tried to convert the test$coordinates to other forms but it usually results in some error. How do I transform this list into a line?
Extra info this is a follow up question to
Convert data frame to spatial lines data frame in R with x,y x,y coordintates
UPDATE as requested dput(head(test)):
> dput(head(test))
structure(list(rid = 1, start_id = 1L, start_code = "E02002536",
end_id = 106L, end_code = "E02006909", strategy = "fastest",
distance = 12655L, time_seconds = 2921L, calories = 211L,
document.id = 1L, array.index = 1L, start = "Geranium Close",
finish = "Hylton Road", startBearing = 0, startSpeed = 0,
start_longitude = -1.294832, start_latitude = 54.610241,
finish_longitude = -1.249478, finish_latitude = 54.680691,
crow_fly_distance = 8362, event = "depart", whence = 1473171787,
speed = 20, itinerary = 419956, clientRouteId = 0, plan = "fastest",
note = "", length = 12655, time = 2921, busynance = 42172,
quietness = 30, signalledJunctions = 3, signalledCrossings = 2,
west = -1.300074, south = 54.610006, east = -1.232447, north = 54.683814,
name = "Geranium Close to Hylton Road", walk = 0, leaving = "2016-09-06 15:23:07",
arriving = "2016-09-06 16:11:48", grammesCO2saved = 2359,
calories2 = 211, type = "route", coordinates = list(structure(list(
x = c(-1.294832, -1.294883, -1.294262, -1.294141, -1.29371,
-1.293726, -1.293742, -1.29351, -1.293368, -1.292816,
-1.248019, -1.249478), y = c(54.61024, 54.61008, 54.610016,
54.610006, 54.610038, 54.610142, 54.610247, 54.610262,
54.681238, 54.680975, 54.680601, 54.680404
)), .Names = c("x", "y"), row.names = c(NA, -284L), class = c("tbl_df",
"tbl", "data.frame")))), .Names = c("rid", "start_id", "start_code",
"end_id", "end_code", "strategy", "distance", "time_seconds",
"calories", "document.id", "array.index", "start", "finish",
"startBearing", "startSpeed", "start_longitude", "start_latitude",
"finish_longitude", "finish_latitude", "crow_fly_distance", "event",
"whence", "speed", "itinerary", "clientRouteId", "plan", "note",
"length", "time", "busynance", "quietness", "signalledJunctions",
"signalledCrossings", "west", "south", "east", "north", "name",
"walk", "leaving", "arriving", "grammesCO2saved", "calories2",
"type", "coordinates"), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"))
lines is a plotting function. I'm assuming you want sp::SpatialLines. See ?"SpatialLines-class" for how to construct such an object.
Here's for your case, provided you don't have a "corrupt" data.frame (see at the bottom of this post).
library(sp)
# Take the first (only) coordinate table and keep its first 12 rows.
route_xy <- as.data.frame(xy$coordinates[[1]])
coords <- route_xy[1:12, ]
# Wrap the points as a Line, group it into a Lines object with ID 1, and
# collect that into a SpatialLines object.
route_line <- Line(coords)
route_lines <- Lines(list(route_line), ID = 1)
out <- SpatialLines(list(route_lines))
An object of class "SpatialLines"
Slot "lines":
[[1]]
An object of class "Lines"
Slot "Lines":
[[1]]
An object of class "Line"
Slot "coords":
x y
1 -1.294832 54.61024
2 -1.294883 54.61008
3 -1.294262 54.61002
4 -1.294141 54.61001
5 -1.293710 54.61004
6 -1.293726 54.61014
7 -1.293742 54.61025
8 -1.293510 54.61026
9 -1.293368 54.68124
10 -1.292816 54.68097
11 -1.248019 54.68060
12 -1.249478 54.68040
Slot "ID":
[1] "1"
Slot "bbox":
min max
x -1.294883 -1.248019
y 54.610006 54.681238
Slot "proj4string":
CRS arguments: NA
To add data to this object, you should use
SpatialLinesDataFrame(out, data = yourdata)
but see this example for more info.
There was a warning when I tried to coerce your coordinates to a data.frame. Hopefully this isn't the case for your dataset.
> as.data.frame(xy$coordinates[[1]])
x y
1 -1.294832 54.61024
2 -1.294883 54.61008
3 -1.294262 54.61002
...
281 <NA> <NA>
282 <NA> <NA>
283 <NA> <NA>
284 <NA> <NA>
Warning message:
In format.data.frame(x, digits = digits, na.encode = FALSE) :
corrupt data frame: columns will be truncated or padded with NAs