I'm making world maps with the rworldmap package. I'm using a function to access trade data from UN Comtrade.
I edited my original question so I can show a real example of what I'm doing. Here is an example of the map I can make:
Function
library(rjson)
library(rworldmap)
get.Comtrade <- function(url="http://comtrade.un.org/api/get?", maxrec=50000,
                         type="C", freq="A", px="HS", ps="now", r, p, rg="all",
                         cc="TOTAL", fmt="json") {
  string <- paste(url
                  , "max=", maxrec, "&"  # maximum no. of records returned
                  , "type=", type, "&"   # type of trade (c = commodities)
                  , "freq=", freq, "&"   # frequency
                  , "px=", px, "&"       # classification
                  , "ps=", ps, "&"       # time period
                  , "r=", r, "&"         # reporting area
                  , "p=", p, "&"         # partner country
                  , "rg=", rg, "&"       # trade flow
                  , "cc=", cc, "&"       # classification code
                  , "fmt=", fmt          # format
                  , sep="")
  if (fmt == "csv") {
    raw.data <- read.csv(string, header=TRUE)
    return(list(validation=NULL, data=raw.data))
  } else if (fmt == "json") {
    raw.data <- fromJSON(file=string)
    data <- raw.data$dataset
    validation <- unlist(raw.data$validation, recursive=TRUE)
    ndata <- NULL
    if (length(data) > 0) {
      var.names <- names(data[[1]])
      data <- as.data.frame(t(sapply(data, rbind)))
      ndata <- NULL
      for (i in 1:ncol(data)) {
        data[sapply(data[, i], is.null), i] <- NA
        ndata <- cbind(ndata, unlist(data[, i]))
      }
      ndata <- as.data.frame(ndata)
      colnames(ndata) <- var.names
    }
    return(list(validation=validation, data=ndata))
  }
}
Usage
dt2 <- get.Comtrade(r=32, p="all", rg=1, fmt="csv")
dt2df <- as.data.frame(do.call(rbind, dt2))

# share of total trade value per partner, in percent
total <- sum(dt2df$Trade.Value..US..)
dt2df$p <- 100*dt2df$Trade.Value..US../total
dt2df <- dt2df[order(-dt2df[, "p"]), ]

# category breaks taken from the sorted shares: top-3 and top-10 cut-offs plus quartile positions
top3 <- dt2df[4, "p"]
top10 <- dt2df[11, "p"]
q3 <- dt2df[as.integer(1*nrow(dt2df)/4), "p"]
q2 <- dt2df[as.integer(2*nrow(dt2df)/4), "p"]
q1 <- dt2df[as.integer(3*nrow(dt2df)/4), "p"]

mapped_data <- joinCountryData2Map(dt2df, joinCode="ISO3",
                                   nameJoinColumn="Partner.ISO")
mapCountryData(mapped_data, nameColumnToPlot="p", numCats=6,
               catMethod=c(0, q1, q2, q3, top10, top3, 100),
               colourPalette=c('cornsilk', 'cornsilk2', 'palegreen1',
                               'palegreen2', 'palegreen4', 'darkgreen'),
               mapTitle="", addLegend=FALSE)
The result is the map I'm looking for, except that I don't need to see Antarctica. How can I remove it?
I tried with xlim and ylim, but it didn't work.
Try
new_world <- subset(mapped_data, continent != "Antarctica")
after
mapped_data <- joinCountryData2Map(dt2df, joinCode = "ISO3", nameJoinColumn = "Partner.ISO")
then continue
mapCountryData(new_world, nameColumnToPlot="p", numCats=6,
               catMethod=c(0, q1, q2, q3, top10, top3, 100),
               colourPalette=c('cornsilk', 'cornsilk2', 'palegreen1',
                               'palegreen2', 'palegreen4', 'darkgreen'),
               mapTitle="", addLegend=FALSE)
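If the joined map object happens not to carry a continent column in your rworldmap version, an alternative sketch (my assumption, not part of the original answer) is to drop Antarctica by its ISO3 code:
# Alternative: remove Antarctica via its ISO3 code
# (assumes the rworldmap polygons expose an ISO3 column)
new_world <- subset(mapped_data, ISO3 != "ATA")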
Related
"I have one data frame in which bank related information of each applicant id is present. suppose applicants has multiple account and data frame reflects this information in multiple rows. Now I want to create a data frame in which each applicant all information is in one record"
I have tried it with for and if loop. Now I want to optimised code
com_data <- function(X) {
  data_set <- data.frame(table(X$id))
  a <- 3
  n <- 3
  count <- 0   # initialise the counter before it is incremented below
  for (i in 1:nrow(data_set)) {
    for (j in 1:nrow(X[1:4])) {
      if (data_set$Var1[i] == X$id[j]) {
        count <- count + 1
        #k <- j
      }
      if (count == 1) {
        for (k in 3:ncol(X))
          data_set[i, n] <- X[j, k]
        n <- n + 1
      } else {
        for (k in 3:ncol(X))
          data_set[i, n] <- X[j, k]
        n <- n + 1
      }
    }
    count <- 0
    n <- 3
  }
  return(data_set)
}
This gets a little messy and assumes your data frame isn't comprised of list columns. "Var" stands in for the applicant ID:
# Sample data used:
df <- data.frame(
  Date = as.Date(c("27/9/2019", "28/9/2019", "1/10/2019", "2/10/2019"), "%d/%m/%Y"),
  dateTime = as.POSIXct(c("27/9/2019", "28/9/2019", "1/10/2019", "2/10/2019"),
                        format = "%d/%m/%Y"),   # sample strings carry no time part
  Var = as.factor(c("A", "A", "B", "B")),
  Value = c(56, 50, 90, 100),
  stringsAsFactors = FALSE
)
# Convert factors & dates to strings:
convert_descriptors_to_char <- function(df){
as.data.frame(lapply(df,
function(x){
if(is.factor(x) | inherits(x, "Date") | inherits(x, "POSIXct") | inherits(x, "POSIXlt")) {
as.character(trimws(x, which = "both"))
} else{
x
}
}
),
stringsAsFactors = FALSE)
}
# Convert data types:
df <- convert_descriptors_to_char(df)
# Merge the separate lists into one:
df_aggd <- lapply(df, function(x){
if(is.character(x)){
aggregate(x~df$Var, df, paste0, collapse = ", ")
}else if(is.numeric(x)){
aggregate(x~df$Var, df, sum)
}else{
x
}
}
)
# Vector to rename "x" to:
x_vect_names <- names(sapply(df_aggd, function(x){deparse(substitute(x))}))
# Iterate through list to rename:
for (i in seq_along(df_aggd)){
colnames(df_aggd[[i]]) <- c("Var", x_vect_names[i])
}
# Remove Var df:
df_aggd <- df_aggd[names(df_aggd) != "Var"]
# Merge the separate dataframes into one:
Reduce(function(x, y){merge(x, y, all = TRUE, by = intersect(colnames(x), colnames(y)))}, df_aggd)
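For completeness, here is a loop-free sketch that stays closer to the structure of the original com_data function: one row per id, with the fields from column 3 onward laid side by side. It assumes a hypothetical data frame X whose applicant column is named id and, like the loop above, that every id has the same number of account rows; collapse_by_id is my name, not from either post.
collapse_by_id <- function(X) {
  # split the account columns by applicant id, then flatten each group into a single row
  groups <- split(X[, 3:ncol(X)], X$id)
  wide <- lapply(groups, function(rows) as.data.frame(t(unlist(rows))))
  out <- do.call(rbind, wide)
  rownames(out) <- NULL
  cbind(id = names(groups), out)
}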
I have an output of RNA-seq reads from CLC Genomics Workbench, for Arabidopsis thaliana. The list of genes contains a mix of gene names (e.g. "TRY", "TMM", "SVP", "FLC") and IDs (e.g. "AT1G01390", "AT1G01310", "AT1G01240"). I would like to convert them all to gene names, so I can run the list through a GO terms R package (the package seemingly does not read IDs like AT1G01390).
When I use biomaRt's getBM() function, it returns far fewer genes than the list I'm reading into it. The original list from CLC has all Arabidopsis genes (27,655), and the outputs from getBM() generally have 12,085 gene names or fewer.
Anybody done this type of conversion before with success?
Thanks in advance!
I've tried various types of attributes, but none of them have worked.
#data load in and conversions, meta matrix/design creation:
#reads file was created in CLC Genomics Workbench, then the reads column copied and pasted for
#each sample
reads <- as.matrix(read.csv("genereads_ONLY4.txt", sep = '\t', row.names = 1, header = TRUE))
meta <- read.table("metatest4.txt", header = TRUE, fileEncoding= "UTF-16LE")
library(biomaRt)
mart = useMart(biomart="plants_mart", host="plants.ensembl.org")
listDatasets(useMart(biomart="plants_mart",host="plants.ensembl.org"))
ensembl = useDataset("athaliana_eg_gene",mart= mart)
genes <- row.names(reads)
test1 <- getBM(attributes='external_gene_name',
values = genes,
mart = ensembl)
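(Note: getBM() generally needs a filters= argument paired with values=; without one, the query is not restricted to the gene list being passed in. Below is a hedged sketch of a filtered call, assuming the standard ensembl_gene_id filter is available in the plants mart; listFilters(ensembl) shows the exact filter names.)
# Sketch: return both the ID and the name, so unmatched IDs stay visible
test2 <- getBM(attributes = c("ensembl_gene_id", "external_gene_name"),
               filters = "ensembl_gene_id",
               values = genes,
               mart = ensembl)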
Okay, I found a roundabout way to solve this, at least for my scenario.
The gmt and fgsea information I'm using can only read gene symbols (e.g. "TRY") or Entrez IDs. So I wrote a function to convert all of the information I had to either symbols or Entrez IDs. The code is:
library(annotate)        # getSYMBOL(), getEG()
library(org.At.tair.db)  # Arabidopsis TAIR annotation used by annotate
library(fgsea)
reads <- as.matrix(read.csv("genereads_ONLY4.txt", sep = '\t', row.names = 1, header = TRUE))
genes <- row.names(reads)
sum(lengths(regmatches(genes, gregexpr("\\AT[0-9]", genes, ignore.case = TRUE))))
#genes <- c("TRY", "AT2G46410", "AT5G41315", "AT2G42200", "AT1G10280")
IDconvert <- function(genes) {
for (i in genes){
if (grepl("AT[0-9]", i) == TRUE) {
if (is.na(getSYMBOL(i, data='org.At.tair.db')) == TRUE) {
if (is.na(getEG(i, data='org.At.tair')) == TRUE) {
i <- i
} else{
name <- getEG(i, data='org.At.tair')
name.l <- as.list(name)
newname <- as.character(name.l[[1]])
genes <- sub(i, newname, genes)
}
} else{
name <- getSYMBOL(i, data='org.At.tair')
name.l <- as.list(name)
newname <- as.character(name.l[[1]])
genes <- sub(i, newname, genes)
}
} else{
NULL
}
}
return(genes)
}
genes2 <- IDconvert(genes)
sum(lengths(regmatches(genes2, gregexpr("\\AT[0-9]", genes2, ignore.case = TRUE))))
row.names(reads) <- genes2
gmt <- read.gmt("GSEA_BIO.gmt")  # read.gmt() from e.g. the qusage or clusterProfiler package
gmt.ids <- read.gmt("IB_BIO_GMT.gmt")
gmt.combo <- c(gmt, gmt.ids)
#Stage 3 GO terms
names3 <- row.names(sub.break3)
sub.break3$names=names3
ranks <- sub.break3$stat
names(ranks) <- sub.break3$names
sub.break3.rank <- sort(ranks, decreasing = T)
fgseaRes3 <- fgsea(pathways = gmt.combo,
stats = sub.break3.rank,
minSize=5,
maxSize=500,
nperm=100000)
fgsea3.sig <- fgseaRes3[pval < 0.05]
pathways.stg3 <- fgsea3.sig$pathway
#Stage 1 GO terms
names1 <- row.names(sub.break1)
sub.break1$names=names1
ranks <- sub.break1$stat
names(ranks) <- sub.break1$names
sub.break1.rank <- sort(ranks, decreasing = T)
fgseaRes1 <- fgsea(pathways = gmt.combo,
stats = sub.break1.rank,
minSize=5,
maxSize=500,
nperm=100000)
fgsea1.sig <- fgseaRes1[pval < 0.05]
pathways.stg1 <- fgsea1.sig$pathway
#Stage 2 GO terms
names2 <- row.names(sub.break2)
sub.break2$names=names2
ranks <- sub.break2$stat
names(ranks) <- sub.break2$names
sub.break2.rank <- sort(ranks, decreasing = T)
fgseaRes2 <- fgsea(pathways = gmt.combo,
stats = sub.break2.rank,
minSize=5,
maxSize=500,
nperm=100000)
fgsea2.sig <- fgseaRes2[pval < 0.05]
pathways.stg2 <- fgsea2.sig$pathway
#Stage 4 GO terms
names4 <- row.names(sub.break4)
sub.break4$names=names4
ranks <- sub.break4$stat
names(ranks) <- sub.break4$names
sub.break4.rank <- sort(ranks, decreasing = T)
fgseaRes4 <- fgsea(pathways = gmt.combo,
stats = sub.break4.rank,
minSize=5,
maxSize=500,
nperm=100000)
fgsea4.sig <- fgseaRes4[pval < 0.05]
pathways.stg4 <- fgsea4.sig$pathway
#openxlsx::write.xlsx(fgsea4.sig, "fgsea_stg4_t1.xlsx")
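# (Aside, not part of the original workflow: the four stage blocks above are identical apart
# from their input, so they could be wrapped in a small helper. A hedged sketch, assuming each
# sub.break* data frame carries a "stat" column as above; run_fgsea is a hypothetical name.)
run_fgsea <- function(sub.break, pathways = gmt.combo) {
  ranks <- sort(setNames(sub.break$stat, row.names(sub.break)), decreasing = TRUE)
  fgsea(pathways = pathways, stats = ranks,
        minSize = 5, maxSize = 500, nperm = 100000)
}
# e.g. fgseaRes4 <- run_fgsea(sub.break4)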
#GO Venn (group.venn() from the RAM package)-----------------------------------
group.venn(list(One = pathways.stg1,
Two = pathways.stg2,
Three = pathways.stg3,
Four = pathways.stg4),
fill = c("orange", "green", "red", "blue"))
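As an aside, here is a vectorized sketch of the same ID-to-symbol conversion using AnnotationDbi::mapIds rather than looping over the genes one by one. This is my assumption, not part of the workflow above; any ID without a symbol is kept unchanged.
library(AnnotationDbi)
library(org.At.tair.db)
at_ids <- grepl("^AT[0-9]", genes)   # TAIR-style IDs only; named genes are kept as-is
sym <- mapIds(org.At.tair.db, keys = genes[at_ids],
              keytype = "TAIR", column = "SYMBOL", multiVals = "first")
genes_alt <- genes
genes_alt[at_ids] <- ifelse(is.na(sym), genes[at_ids], sym)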
I'm trying to calculate a specific quartile (Q2 in this example) of a column containing the positive gap-opening values for a list of stocks.
Let me explain my approach:
Load my tickers list from a .csv file and create a list with all of them (this part works):
library(quantmod)
Tickers <- read.csv("nasdaq_tickers_list.csv", stringsAsFactors = FALSE)
getSymbols(Tickers$Tickers,from="2018-08-01", src="yahoo" )
stock_data = sapply(.GlobalEnv, is.xts)
all_stocks <- do.call(list, mget(names(stock_data)[stock_data]))
I have the following function, which works fine to calculate the quartiles of a column (Stock_name.Positivegap) and split them into their corresponding quartile rank:
Posgapqrank <- function(x) {
stock_name <- stringi::stri_extract(names(x)[1], regex = "^[A-Z]+")
stock_name <- paste0(stock_name, ".Volqrank")
column_names <- c(names(x), stock_name)
x$posgapqrank <- as.integer(cut(x[, grep(".Positivegap", colnames(x))],quantile(x[,grep(".Positivegap",colnames(x))],probs=0:4/4),include.lowest=TRUE))
x <- setNames(x, column_names)
return(x)
}
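Applied to the whole list, this looks something like the sketch below (ranked_stocks is a hypothetical name; it assumes every xts object in all_stocks already has its .Positivegap column):
# Usage sketch: add the quartile-rank column to every stock in the list
ranked_stocks <- lapply(all_stocks, Posgapqrank)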
Now I'd like a function to calculate a specific quantile of the same original data column, ".Positivegap", i.e. Q2.
For that purpose I introduced 0.25 into the quantile function, but I'm getting an error... any help here?
Q2 <- function(x) {
stock_name <- stringi::stri_extract(names(x)[1], regex = "^[A-Z]+")
stock_name <- paste0(stock_name, ".Q2")
column_names <- c(names(x), stock_name)
x$gapq2 <- as.integer(quantile(x[,grep(".Positivegap",colnames(x))],0.25)))
x <- setNames(x, column_names)
return(x)
}
Thank you very much for any comment.
Let's make it simpler and start with a less complex example
getSymbols("SQ", from="2018-01-01", src="yahoo")
quantile(SQ$SQ.Volume, 0.25)
# How do I add a new column to the SQ dataset with the Q2 volume data for each day?
I created two functions, Q2 and rolling_Q2.
I tested them by grepping the column name ".Volume", and both work. The Q2 function calculates Q2 from the whole dataset you give it. The rolling_Q2 function calculates Q2 over a rolling window (default 22).
Q2 <- function(x) {
stock_name <- stringi::stri_extract(names(x)[1], regex = "^[A-Z]+")
stock_name <- paste0(stock_name, ".Q2")
column_names <- c(names(x), stock_name)
x$gapq2 <- as.integer(quantile(x[,grep(".Positivegap",colnames(x))], 0.25))
x <- setNames(x, column_names)
return(x)
}
rolling_Q2 <- function(x, width = 22) {
stock_name <- stringi::stri_extract(names(x)[1], regex = "^[A-Z]+")
stock_name <- paste0(stock_name, ".Q2")
column_names <- c(names(x), stock_name)
x$gapq2 <- rollapply(x[,grep(".Positivegap",colnames(x))], width = width, FUN = function(x) as.integer(quantile(x, 0.25)))
x <- setNames(x, column_names)
return(x)
}
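To try these on the simpler SQ example above, only the grep pattern needs to change, since SQ has a .Volume column rather than .Positivegap. A parameterized sketch (Q2_col and column_pattern are my additions, not part of the answer):
Q2_col <- function(x, column_pattern = ".Volume") {
  stock_name <- stringi::stri_extract(names(x)[1], regex = "^[A-Z]+")
  stock_name <- paste0(stock_name, ".Q2")
  column_names <- c(names(x), stock_name)
  # 0.25 quantile of whichever column matches the pattern
  x$q2 <- as.integer(quantile(x[, grep(column_pattern, colnames(x))], 0.25))
  setNames(x, column_names)
}
SQ_with_q2 <- Q2_col(SQ)   # adds an SQ.Q2 column holding the 0.25 quantile of SQ.Volume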
I have a list of 701 given csv files. Each one has the same number of columns (7) but a different number of rows (between 25,000 and 28,000).
Here is an extract of the first file:
Date,Week,Week Day,Hour,Price,Volume,Sale/Purchase
18/03/2011,11,5,1,-3000.00,17416,Sell
18/03/2011,11,5,1,-1001.10,17427,Sell
18/03/2011,11,5,1,-1000.00,18055,Sell
18/03/2011,11,5,1,-500.10,18057,Sell
18/03/2011,11,5,1,-500.00,18064,Sell
18/03/2011,11,5,1,-400.10,18066,Sell
18/03/2011,11,5,1,-400.00,18066,Sell
18/03/2011,11,5,1,-300.10,18068,Sell
18/03/2011,11,5,1,-300.00,18118,Sell
Now I am trying to plot the coefficients of the following regression (in the price interval between -50 and 150) of the supply curve for the ninth hour over one year.
First I made the regression:
library(minpack.lm)  # for nlsLM()
allenamen <- dir(pattern="*.csv")
alledat <- lapply(allenamen, read.csv, header = TRUE, sep = ",", stringsAsFactors = FALSE)
h <- list()
for(i in 1:length(alledat)){
g <- function(a, b, c, d, p) {a*atan(b*p+c)+d}
f <- nlsLM(Volume ~ g(a,b,c,d,Price), data=subset(alledat[[i]], (Hour==9) & (Sale.Purchase == "Sell") & (!Price %in% as.character(-50:150))), start = list(a=4000, b=0.1, c=-5, d=32000))
h[[i]] <- coef(f)
}
h.df <- setNames(do.call(rbind.data.frame, h), names(h[[1]]))
Then I just took the data for the supply curve and the ninth hour and changed the format of the date:
files <- list.files(pattern="*.csv")
df <- data.frame()
for(i in 1:length(files)) {
xx <- read.csv(as.character(files[i]))
xx <- subset(xx, Sale.Purchase == "Sell" & Hour == 9)
df <- rbind(df, xx)
}
df$Date <- as.Date(as.character(df$Date), format="%d/%m/%Y")
And then I tried to plot the coefficient a:
plot(h.df$a ~ Date, df, xlim = as.Date(c("2012-01-01", "2012-12-31")))
But I get this error:
Error in (function (formula, data = NULL, subset = NULL, na.action = na.fail, :
variable lengths differ (found for 'Date')
Thank you for your help!
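(For reference, the error arises because the formula mixes h.df$a, which has one value per file, with Date taken from df, which has one row per bid, so the variable lengths differ. A minimal sketch of one way to align them, assuming each CSV covers a single trading day; file_dates is a hypothetical helper, not from the original code:)
# one date per file, in the same order as the coefficient rows in h.df
file_dates <- as.Date(sapply(alledat, function(d) d$Date[1]), format = "%d/%m/%Y")
plot(file_dates, h.df$a, xlim = as.Date(c("2012-01-01", "2012-12-31")),
     xlab = "Date", ylab = "Coefficient a")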
I want to visualize a mosaic plot in the form of a tree. For example:
mosaicplot(~ Sex + Age + Survived, data = Titanic, color = TRUE)
Now what I want is to represent this in a tree form where the first node
for example be sex the second node be age and at the terminal node be number of people survived. May be it should something like http://addictedtor.free.fr/graphiques/RGraphGallery.php?graph=84 where instead of p giving the number of counts.
Is there an function in R to do this or should I write it on my own by taking at a look
at the party:::plot.BinaryTree function
Here is how I managed to get what I wanted with the lovely igraph package. The code is an ugly hack. It would be great to have your suggestions.
library(igraph)
rm(list=ls())
req.data <- as.data.frame(Titanic)
lookup <- c("M","F","C","A","Y","N")
names(lookup) <- c("Male","Female","Child","Adult","Yes","No")
req.data$board <- "board"
req.data$Class.m <- paste(req.data$board,req.data$Class,sep="_")
req.data$Sex.m <- paste(req.data$board,req.data$Class,req.data$Sex,
sep="_")
req.data$Age.m <- paste(req.data$board,req.data$Class,req.data$Sex,
req.data$Age,sep="_")
req.data$Survived.m <- paste(req.data$board,req.data$Class,req.data$Sex,
req.data$Age,req.data$Survived,sep="_")
tmp <- data.frame(from=
do.call("c",lapply(req.data[,c("board",
"Class.m",
"Sex.m",
"Age.m")],as.character)),
to=do.call("c",lapply(req.data[,c("Class.m",
"Sex.m",
"Age.m",
"Survived.m")],as.character)),
stringsAsFactors=FALSE)
tmp <- tmp[!duplicated(tmp), ]; rownames(tmp) <- NULL
tmp$num <- unlist(lapply(strsplit(tmp$to,"_"),
FUN=function(x){
check1 <- req.data$Class==x[2]
check2 <- req.data$Sex == x[3]
check3 <- req.data$Age == x[4]
check4 <- req.data$Survived == x[5]
sum(req.data$Freq[ifelse(is.na(check1),TRUE,check1) &
ifelse(is.na(check2),TRUE,check2) &
ifelse(is.na(check3),TRUE,check3) &
ifelse(is.na(check4),TRUE,check4)])}))
g <- graph.data.frame(tmp, directed=TRUE)
V(g)$label <- unlist(lapply(strsplit(V(g)$name,"_"),
FUN=function(y){ifelse(y[length(y)] %in% names(lookup),
lookup[y[length(y)]],y[length(y)])}))
E(g)$label <- tmp$num
plot(g,layout=layout.reingold.tilford,ylim=c(1,-1),edge.arrow.size=0.5,vertex.size=7)
legend("topleft", paste(lookup ,names(lookup),sep=" : "),ncol=2,bty="n",cex=0.7)
### To find the case for crew members
tmp1 <- tmp[grepl("Crew", tmp$from), ]; rownames(tmp1) <- NULL
g <- graph.data.frame(tmp1, directed=TRUE)
V(g)$label <- unlist(lapply(strsplit(V(g)$name,"_"),
FUN=function(y){ifelse(y[length(y)] %in% names(lookup),
lookup[y[length(y)]],y[length(y)])}))
E(g)$label <- tmp1$num
plot(g,layout=layout.reingold.tilford,ylim=c(1,-1),edge.arrow.size=0.5)
legend("topleft", paste(lookup ,names(lookup),sep=" : "),ncol=2,bty="n",cex=0.7)
Here is the plot I generate. You can modify the vertex/edge colors and sizes as you want.
This is pretty close and looks a lot easier to me. I post it here in case it may be of use. First I convert the ftable to a more traditional long data frame using expand.dft (https://stat.ethz.ch/pipermail/r-help/2009-January/185561.html). Then I just use the plot.dendrite function from the plotrix package.
expand.dft <- function(x, var.names = NULL, freq = "Freq", ...)
{
# allow: a table object, or a data frame in frequency form
if(inherits(x, "table"))
x <- as.data.frame.table(x, responseName = freq)
freq.col <- which(colnames(x) == freq)
if (length(freq.col) == 0)
stop(paste(sQuote("freq"), "not found in column names"))
DF <- sapply(1:nrow(x),
function(i) x[rep(i, each = x[i, freq.col]), ],
simplify = FALSE)
DF <- do.call("rbind", DF)[, -freq.col]
for (i in 1:ncol(DF))
{
DF[[i]] <- type.convert(as.character(DF[[i]]), ...)
}
rownames(DF) <- NULL
if (!is.null(var.names))
{
if (length(var.names) < dim(DF)[2])
{
stop(paste("Too few", sQuote("var.names"), "given."))
} else if (length(var.names) > dim(DF)[2]) {
stop(paste("Too many", sQuote("var.names"), "given."))
} else {
names(DF) <- var.names
}
}
DF
}
library(plotrix)
r = ftable(Titanic)
plot.dendrite(makeDendrite(expand.dft(data.frame(r))))