R: Plotting data using qplot

R: Plotting data using qplot - r

movies_df is a data frame with 100 records and a structure:
The genre with highest Runtime is plotted using qplot
qplot(data = movies_df, Runtime, fill = Genre, bins = 30)
From the graph above, 4 Action movies have the highest runtime (160)
How to get plot the Title of the movies that have genre = Action (the genre with the highest runtime) ?
Code to reproduce the data:
library("rvest")
url = "https://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"
webpage = read_html(url)
## ---- PRE-PROCESSING ---- ##
# rank scraping
rank_data_html = html_nodes(webpage, ".text-primary")
rank_data = html_text(rank_data_html)
rank_data = as.numeric(rank_data)
#title scraping
title_data_html = html_nodes (webpage, ".lister-item-header a")
title_data = html_text(title_data_html)
#description scraping
desc_nodes = html_nodes(webpage, ".ratings-bar+.text-muted")
desc_data = html_text(desc_nodes)
desc_data = gsub("\n","",desc_data)
runtime_data_html = html_nodes (webpage, ".text-muted .runtime")
runtime_data = html_text(runtime_data_html)
runtime_data = gsub(" min", "", runtime_data)
runtime_data = as.numeric(runtime_data)
genre_data_html = html_nodes (webpage, ".genre")
genre_data = html_text (genre_data_html)
genre_data = gsub("\n", "", genre_data)
genre_data = gsub (" ","", genre_data)
genre_data = gsub(",.*", "", genre_data)
genre_data = as.factor(genre_data)
rating_data_html = html_nodes(webpage, ".ratings-imdb-rating strong")
rating_data = html_text(rating_data_html)
rating_data = as.numeric(rating_data)
votes_data_html = html_nodes(webpage, ".sort-num_votes-visible span:nth-child(2)")
votes_data = html_text(votes_data_html)
votes_data = gsub(",", "", votes_data)
votes_data = as.numeric(votes_data)
directors_data_html = html_nodes(webpage, ".text-muted+ p a:nth-child(1)")
directors_data = html_text(directors_data_html)
directors_data = as.factor(directors_data)
actors_data_html = html_nodes(webpage, ".lister-item-content .ghost+ a")
actors_data = html_text(actors_data_html)
actors_data = as.factor(actors_data)
metascore_data_html = html_nodes(webpage, ".metascore")
metascore_data = html_text(metascore_data_html)
metascore_data = gsub(" ", "", metascore_data)
for (i in c(39, 73, 80)){
a = metascore_data[1:(i-1)]
b = metascore_data[i:length(metascore_data)]
metascore_data = append(a, list("NA"))
metascore_data = append(metascore_data, b)
metascore_data = as.numeric(metascore_data)
}
gross_data_html = html_nodes(webpage, ".ghost~ .text-muted+ span")
gross_data = html_text(gross_data_html)
gross_data = gsub("M","",gross_data)
gross_data = substring(gross_data, 2, 6)
for (i in c(1,2,3,4,5,6,7,8,9,10)){
a = gross_data[1:(i-1)]
b = gross_data[i:length(gross_data)]
gross_data = append(a,list("NA"))
gross_data = append(gross_data,b)
}
gross_data = as.numeric(gross_data)
movies_df = data.frame(Rank = rank_data, Title = title_data,
Description = desc_data, Runtime = runtime_data,
Genre = genre_data, Rating = rating_data,
Metascore = metascore_data, Votes = votes_data,
Gross_Earning_in_Mil = gross_data, Director = directors_data,
Actor = actors_data)

At first you need to install the package dplyr AND load the library to filter youre data.
Then you need to filter youre data to all movies that have genre = Action. Thats it
library(dplyr)
newDataset <- movies_df %>% filter( Genre == "Action" )
qplot(data = newDataset, Runtime, fill = Title , bins = 30)
please rate my answer well if it helped you. And i hope, its all clear now

Related

Implement call retries with httr::RETRY() function in API call (R)

I use the UN Comtrade data API with R.
library(rjson)
get.Comtrade <- function(url="http://comtrade.un.org/api/get?"
,maxrec=50000
,type="C"
,freq="A"
,px="HS"
,ps="now"
,r
,p
,rg="all"
,cc="TOTAL"
,fmt="json"
)
{
string<- paste(url
,"max=",maxrec,"&" #maximum no. of records returned
,"type=",type,"&" #type of trade (c=commodities)
,"freq=",freq,"&" #frequency
,"px=",px,"&" #classification
,"ps=",ps,"&" #time period
,"r=",r,"&" #reporting area
,"p=",p,"&" #partner country
,"rg=",rg,"&" #trade flow
,"cc=",cc,"&" #classification code
,"fmt=",fmt #Format
,sep = ""
)
if(fmt == "csv") {
raw.data<- read.csv(string,header=TRUE)
return(list(validation=NULL, data=raw.data))
} else {
if(fmt == "json" ) {
raw.data<- fromJSON(file=string)
data<- raw.data$dataset
validation<- unlist(raw.data$validation, recursive=TRUE)
ndata<- NULL
if(length(data)> 0) {
var.names<- names(data[[1]])
data<- as.data.frame(t( sapply(data,rbind)))
ndata<- NULL
for(i in 1:ncol(data)){
data[sapply(data[,i],is.null),i]<- NA
ndata<- cbind(ndata, unlist(data[,i]))
}
ndata<- as.data.frame(ndata)
colnames(ndata)<- var.names
}
return(list(validation=validation,data =ndata))
}
}
}
However, sometimes it fails to connect server and I need to run the code several times to start working. Solution given here, to use Retry() function, which retries a request until it succeeds, seems attractive.
However, I have some difficulties implementing this function in the code given above. has anybody used it before and knows how to recode it?

An API call using httr::RETRY could look like the following:
library(httr)
library(jsonlite)
res <- RETRY(
verb = "GET",
url = "http://comtrade.un.org/",
path = "api/get",
encode = "json",
times = 3,
query = list(
max = 50000,
type = "C",
freq = "A",
px = "HS",
ps = "now",
r = 842,
p = "124,484",
rg = "all",
cc = "TOTAL",
fmt = "json"
)
)
# alternativ: returns dataset as a `list`:
# parsed_content <- content(res, as = "parsed")
# returns dataset as a `data.frame`:
json_content <- content(res, as = "text")
parsed_content <- parse_json(json_content, simplifyVector = TRUE)
parsed_content$validation
parsed_content$dataset
I'd suggest rewriting the get.Comtrade function using httr:
get.Comtrade <- function(verb = "GET",
url = "http://comtrade.un.org/",
path = "api/get",
encode = "json",
times = 3,
max = 50000,
type = "C",
freq = "A",
px = "HS",
ps = "now",
r,
p,
rg = "all",
cc = "TOTAL",
fmt = "json") {
res <- httr::RETRY(
verb = verb,
url = url,
path = path,
encode = encode,
times = times,
query = list(
max = max,
type = type,
freq = freq,
px = px,
ps = ps,
r = r,
p = p,
rg = rg,
cc = cc,
fmt = fmt
)
)
jsonlite::parse_json(content(res, as = "text"), simplifyVector = TRUE)
}
s1 <- get.Comtrade(r = "842", p = "124,484", times = 5)
print(s1)
Please see this and this for more information on library(httr).

Highcharter Unlimited Drill down using R

I looked at this SO question, that SO question. I followed this github method check out ISSUES to help get ideas on how to solve this.
Step 1: Copy the following into text editor and save as test.csv file:
My datasource is NVD JSON FEEDS which I processed and cleaned and created this dataset with the first 25 rows shown here.
Year,Vendor name,Product name,CVE,Major,Minor,Build,Revision
1988,eric_allman,sendmail,CVE-1999-0095,5,5.58,,
1988,ftp,ftp,CVE-1999-0082,*,,,
1988,ftpcd,ftpcd,CVE-1999-0082,*,,,
1989,bsd,bsd,CVE-1999-1471,4,4.2,,
1989,bsd,bsd,CVE-1999-1471,4,4.3,,
1989,sun,sunos,CVE-1999-1122,4,4.0,,
1989,sun,sunos,CVE-1999-1122,4,4.0,4.0.1,
1989,sun,sunos,CVE-1999-1122,4,4.0,4.0.3,
1989,sun,sunos,CVE-1999-1467,4,4.0,,
1989,sun,sunos,CVE-1999-1467,4,4.0,4.0.1,
1989,sun,sunos,CVE-1999-1467,4,4.0,4.0.2,
1989,sun,sunos,CVE-1999-1467,4,4.0,4.0.3,
1989,sun,sunos,CVE-1999-1467,4,4.0,4.0.3c,
1990,digital,vms,CVE-1999-1057,5,5.3,,
1990,freebsd,freebsd,CVE-2000-0388,3,3.0,,
1990,freebsd,freebsd,CVE-2000-0388,3,3.1,,
1990,freebsd,freebsd,CVE-2000-0388,3,3.2,,
1990,freebsd,freebsd,CVE-2000-0388,3,3.3,,
1990,freebsd,freebsd,CVE-2000-0388,3,3.4,,
1990,hp,apollo_domain_os,CVE-1999-1115,sr10,sr10.2,,
1990,hp,apollo_domain_os,CVE-1999-1115,sr10,sr10.3,,
1990,next,nex,CVE-1999-1392,1,1.0a,,
1990,next,next,CVE-1999-1198,2,2.0,,
1990,next,next,CVE-1999-1391,1,1.0,,
1990,next,next,CVE-1999-1391,1,1.0a,,
Step 2: Copy and paste following code into R and run it:
My codes follow this SO question especially the discussion by #NinjaElvis. I think it is possible to create 3 levels or more. Just doing more research and figuring out.
############################
suppressPackageStartupMessages(library("highcharter"))
library("dplyr")
library("purrr")
library("data.table")
second_el_to_numeric <- function(ls){
map(ls, function(x){
x[[2]] <- as.numeric(x[[2]])
x
})
}
cve_affected_product <- fread("test.csv")
# LAYER ONE YEAR DRILLDOWN VIEW #########################
Year <- cve_affected_product[,c(1:2)]
Year <- unique(Year[,list(Year,`Vendor name`)])
Year <- Year[,c(1)][,count:=1]
Year <- setDT(aggregate(.~ Year ,data=Year,FUN=sum))
#setorder(Year, Year, `Vendor name`)
years_df <- tibble(
name = c(Year$Year),
y = c(Year$count),
drilldown = tolower(paste(name,'id'))
)
ds <- list_parse(years_df)
names(ds) <- NULL
# Vendor View HC ###########
hc <- highchart() %>%
hc_chart(type = "column") %>%
hc_title(text = "Basic drilldown") %>%
hc_xAxis(type = "category") %>%
hc_yAxis(visible = FALSE,reversed = FALSE) %>%
hc_legend(enabled = FALSE) %>%
hc_plotOptions(
series = list(
boderWidth = 0,
dataLabels = list(enabled = TRUE)
)
) %>%
hc_add_series(
name = "Vendors",
colorByPoint = TRUE,
data = ds
)
# LAYER TWO VENDOR DRILLDOWN VIEW #########################
Vendor <- cve_affected_product[,c(1:3)]
Vendor <- unique(Vendor[,list(Year,`Vendor name`,`Product name`)])
Vendor <- Vendor[,c(1:2)][,count:=1]
Vendor <- setDT(aggregate(.~ Year+`Vendor name` ,data=Vendor,FUN=sum))
setorder(Vendor, Year, `Vendor name`)
years <- as.character(unique(Vendor$Year))
for(i in 1:length(years)){
tempdf <- Vendor[Vendor$Year==years[i],]
dfname <- paste("df",years[i],sep="")
dsname <- paste("ds",years[i],sep="")
X <- tibble(
name = c(tempdf$`Vendor name`),
y = c(tempdf$count),
drilldown = tolower(paste(name,'id'))
)
Y <- second_el_to_numeric(list_parse2(assign(dfname,X)))
assign(dsname,Y)
}
# Vendor View HC ###########
hc <- hc %>%
hc_drilldown(
allowPointDrilldown = TRUE,
series = list(
list(
id = "1988 id",
data = ds1988,
colorByPoint = TRUE,
keys = list('name','y','drilldown')
),
list(
id = "1989 id",
data = ds1989,
keys = list('name','y','drilldown')
),
list(
id = "1990 id",
data = ds1990,
keys = list('name','y','drilldown')
)#,
)
)
# LAYER THREE PRODUCT DRILLDOWN VIEW #########################
Product <- cve_affected_product[,c(1:4)]
Product <- unique(Product[,list(Year,`Vendor name`,`Product name`, CVE)])
Product <- Product[,c(1:3)][,count:=1]
Product <- setDT(aggregate(.~ Year+`Vendor name`+`Product name` ,data=Product,FUN=sum))
setorder(Product, Year, `Vendor name`,`Product name`)
vendors <- as.character(unique(Product$`Vendor name`))
for(i in 1:length(vendors)){
tempdf <- Product[Product$`Vendor name`==vendors[i],]
dfname <- paste("df",vendors[i],sep="")
dsname <- paste("ds",vendors[i],sep="")
X <- tibble(
name = c(tempdf$`Product name`),
y = c(tempdf$count),
drilldown = tolower(paste(name,'id'))
)
Y <- second_el_to_numeric(list_parse2(assign(dfname,X)))
assign(dsname,Y)
}
# Product View HC ###########
hc <- hc %>%
hc_drilldown(
allowPointDrilldown = TRUE,
series = list(
list(
id = "eric_allman id",
data = dseric_allman,
keys = list('name','y','drilldown')
),
list(
id = "ftp id",
data = dsftp,
keys = list('name','y','drilldown')
),
list(
id = "ftpcd id",
data = dsftpcd,
keys = list('name','y','drilldown')
),
list(
id = "bsd id",
data = dsbsd,
keys = list('name','y','drilldown')
),
list(
id = "sun id",
data = dssun,
keys = list('name','y','drilldown')
),
list(
id = "digital id",
data = dsdigital,
keys = list('name','y','drilldown')
),
list(
id = "freebsd id",
data = dsfreebsd,
keys = list('name','y','drilldown')
),
list(
id = "hp id",
data = dshp,
keys = list('name','y','drilldown')
),
list(
id = "next id",
data = dsnext,
keys = list('name','y','drilldown')
)#,
)
)
# LAYER FOUR CVE DRILLDOWN VIEW #########################
Product_CVE <- cve_affected_product[,c(1:5)]
Product_CVE <- unique(Product_CVE[,list(Year,`Vendor name`,`Product name`, CVE, Major)])
Product_CVE <- Product_CVE[,c(1:4)][,count:=1]
Product_CVE <- setDT(aggregate(.~ Year+`Vendor name`+`Product name`+CVE ,data=Product_CVE,FUN=sum))
setorder(Product_CVE, Year, `Vendor name`,`Product name`, CVE)
products <- as.character(unique(Product_CVE$`Product name`))
for(i in 1:length(products)){
tempdf <- Product_CVE[Product_CVE$`Product name`==products[i],]
ifelse(tempdf$`Vendor name`==tempdf$`Product name`,
dfname <- paste("df_",products[i],sep=""),
dfname <- paste("df",products[i],sep=""))
ifelse(tempdf$`Vendor name`==tempdf$`Product name`,
dsname <- paste("ds_",products[i],sep=""),
dsname <- paste("ds",products[i],sep=""))
X <- tibble(
name = gsub("-", "", c(tempdf$CVE)),
y = c(tempdf$count),
drilldown = tolower(paste(name,'id'))
)
Y <- second_el_to_numeric(list_parse2(assign(dfname,X)))
assign(dsname,Y)
}
# CVE View HC ###########
hc <- hc %>%
hc_drilldown(
allowPointDrilldown = TRUE,
series = list(
list(
id = "sendmail id",
data = dssendmail,
keys = list('name','y','drilldown')
),
list(
id = "ftp id",
data = ds_ftp,
keys = list('name','y','drilldown')
),
list(
id = "ftpcd id",
data = ds_ftpcd,
keys = list('name','y','drilldown')
),
list(
id = "bsd id",
data = ds_bsd,
keys = list('name','y','drilldown')
),
list(
id = "sunos id",
data = dssunos,
keys = list('name','y','drilldown')
),
list(
id = "vms id",
data = dsvms,
keys = list('name','y','drilldown')
),
list(
id = "freebsd id",
data = ds_freebsd,
keys = list('name','y','drilldown')
),
list(
id = "apollo_domain_os id",
data = dsapollo_domain_os,
keys = list('name','y','drilldown')
),
list(
id = "nex id",
data = dsnex,
keys = list('name','y','drilldown')
),
list(
id = "next id",
data = ds_next,
keys = list('name','y','drilldown')
)
)
)
Output First Level:
Output Second Level Clicking on 1988:
ISSUES:
I should be able to click on vendor eric_allman and get a drill down, however I am not. I want to be able to drill down all the way to Revision if it exists. This is just prototypying new functionality for my app to get it working. However, highcharter does not make this easy or efficient. My dataset has almost 4 million observations. That will be the next struggle how to handle that.
I am even considering using D3 by creating a JSON file in python if I cannot do it using R. However, creating the JSON file in python is not trivial but doable. I am currently working on python code as a back up.
Thank you for the help and any suggestions

Interestingly enough after posting this question, i decided to do more research, since most of the research was pointing me to example of how to do it in javascript. Google search took me to this SO response by #K. Rohde
This post is from 2015 and I really appreciate how he explained the two different approaches. I ended up using hybrid approach borrowing from both.
For those interested in seeing how the drill down is working, go to my shiny app. Once you on the page Click on Visualizations, then "Drill Down For Vendor CVE Affected Products Versions and Revisions"and try it out. Again I would not have done it without #K. Rohde write up.

How to insert greek letter delta (∆) into header of flextable object?

I am using RMarkdown to create a word document (I need the output to be in .docx format).
I'd like to use flextable (or any other package) to format my headers properly.
I'm trying to get the greek symbol delta (∆) to display properly... it seems possible because in the help pages here (https://davidgohel.github.io/flextable/articles/format.html#display-function) the author successfully uses \u03BC to insert the "μ" symbol (and I can too if I use his code, below), but I can't get it to work for delta using \u2206 or \u0394, if I replace \u03BC with either code below. The code I'm using produces this table, but I want to replace the highlighted bit with delta.
This is what I get when I try, for example, \u2206.
Any suggestions?
library(flextable)
if( require("xtable") ){
mat <- round(matrix(c(0.9, 0.89, 200, 0.045, 2.0), c(1, 5)), 4)
mat <- xtable(mat)
ft <- xtable_to_flextable(x = mat, NA.string = "-")
print(ft$col_keys)
ft <- flextable::display(ft, i = 1, col_key = "X1",
pattern = "{{val}}{{pow}}", part = "header",
formatters = list(val ~ as.character("R"), pow ~ as.character("2") ),
fprops = list(pow = fp_text(vertical.align = "superscript", font.size = 8))
)
ft <- flextable::display(ft, i = 1, col_key = "X2",
pattern = "{{val}}{{pow}}", part = "header",
formatters = list(val ~ as.character("\u03BC"), pow ~ as.character("x") ),
fprops = list(pow = fp_text(vertical.align = "superscript", font.size = 8))
)
ft <- flextable::display(ft, i = 1, col_key = "rowname",
pattern = "{{val}}{{pow}}", part = "body",
formatters = list(val ~ as.character("y"), pow ~ as.character("t-1") ),
fprops = list(pow = fp_text(vertical.align = "subscript", font.size = 8))
)
ft <- set_header_labels(ft, X3 = "F-stat", X4 = "S.E.E", X5 = "DW", rowname = "")
ft <- autofit(ft)
ft
}
Update
I am getting closer thanks to a helpful suggestion from David, but (not being very familiar with flextable) I am getting strange behaviour when I try to modify the header in the way suggested:
library(magrittr)
library(flextable)
library(officer)
AICtable <- data.frame(Model = "test", Parameters = 9, AICc = 4000, dAICc = 0, w = 1)
v.epi.aic <- flextable(AICtable) %>%
font(fontname = "Times New Roman", part = "all") %>%
flextable::display(col_key = "dAICc", part = "header",
pattern = "{{D}}{{A}}{{cbit}}",
formatters = list(D ~ as.character("D"),
A ~ as.character("AIC"),
cbit ~ as.character("c") ),
fprops = list(D = fp_text(font.family = "Symbol"),
A = fp_text(font.family = "Times New Roman"),
cbit = fp_text(vertical.align = "subscript")))
v.epi.aic
Notice that column headers are now duplicated, and "AIC" appears before the "∆". The column names should be:
Model, Parameters, AICc, ∆AICc, w (and the "c" in the ∆AICc should be a subscript).

Please use "\u394" instead of "\u0394" to generate the capital delta symbol

How to prepare input data for a sankey diagrams in R?

I am trying to produce a sankey diagram in R, which is also referred as a river plot. I've seen this question Sankey Diagrams in R? where a broad variaty of packages producing sankey diagrams are listed. Since I have input data and know different tools/packages I can produce such diagram BUT my euqestion is: how can I prepare input data for such?
Let's assume we would like to present how users have migrated between various states over 10 days and have start data set like the one below:
data.frame(userID = 1:100,
day1_state = sample(letters[1:8], replace = TRUE, size = 100),
day2_state = sample(letters[1:8], replace = TRUE, size = 100),
day3_state = sample(letters[1:8], replace = TRUE, size = 100),
day4_state = sample(letters[1:8], replace = TRUE, size = 100),
day5_state = sample(letters[1:8], replace = TRUE, size = 100),
day6_state = sample(letters[1:8], replace = TRUE, size = 100),
day7_state = sample(letters[1:8], replace = TRUE, size = 100),
day8_state = sample(letters[1:8], replace = TRUE, size = 100),
day9_state = sample(letters[1:8], replace = TRUE, size = 100),
day10_state = sample(letters[1:8], replace = TRUE, size = 100)
) -> dt
Now if one would like to create a sankey diagram with networkD3 package how should one tranform this dt data.frame into required input
so that we would have input like from this example
library(networkD3)
URL <- paste0(
"https://cdn.rawgit.com/christophergandrud/networkD3/",
"master/JSONdata/energy.json")
Energy <- jsonlite::fromJSON(URL)
# Plot
sankeyNetwork(Links = Energy$links, Nodes = Energy$nodes, Source = "source",
Target = "target", Value = "value", NodeID = "name",
units = "TWh", fontSize = 12, nodeWidth = 30)
EDIT
I have found such script which prepares data in other situation and reproduced it so I assume it might be closed now:
https://github.com/mi2-warsaw/JakOniGlosowali/blob/master/sankey/sankey.R

I have found such script which prepares data in other situation and reproduced it so I assume it might be closed now:
https://github.com/mi2-warsaw/JakOniGlosowali/blob/master/sankey/sankey.R
Then this code generates such sankey diagram for mentioned in question data.frame
fixtable <- function(...) {
tab <- table(...)
if (substr(colnames(tab)[1],1,1) == "_" &
substr(rownames(tab)[1],1,1) == "_") {
tab2 <- tab
colnames(tab2) <- sapply(strsplit(colnames(tab2), split=" "), `[`, 1)
rownames(tab2) <- sapply(strsplit(rownames(tab2), split=" "), `[`, 1)
tab2[1,1] <- 0
# mandat w klubie
for (par in names(which(tab2[1,] > 0))) {
delta = min(tab2[par, 1], tab2[1, par])
tab2[par, par] = tab2[par, par] + delta
tab2[1, par] = tab2[1, par] - delta
tab2[par, 1] = tab2[par, 1] - delta
}
# przechodzi przez niezalezy
for (par in names(which(tab2[1,] > 0))) {
tab2["niez.", par] = tab2["niez.", par] + tab2[1, par]
tab2[1, par] = 0
}
for (par in names(which(tab2[,1] > 0))) {
tab2[par, "niez."] = tab2[par, "niez."] + tab2[par, 1]
tab2[par, 1] = 0
}
tab[] <- tab2[]
}
tab
}
flow2 <- rbind(
data.frame(fixtable(z = paste0(dat$day1_state, " day1"), do = paste0(dat$day2_state, " day2"))),
data.frame(fixtable(z = paste0(dat$day2_state, " day2"), do = paste0(dat$day3_state, " day3"))),
data.frame(fixtable(z = paste0(dat$day3_state, " day3"), do = paste0(dat$day4_state, " day4"))),
data.frame(fixtable(z = paste0(dat$day4_state, " day4"), do = paste0(dat$day5_state, " day5"))),
data.frame(fixtable(z = paste0(dat$day5_state, " day5"), do = paste0(dat$day6_state, " day6"))),
data.frame(fixtable(z = paste0(dat$day6_state, " day6"), do = paste0(dat$day7_state, " day7"))),
data.frame(fixtable(z = paste0(dat$day7_state, " day7"), do = paste0(dat$day8_state, " day8"))),
data.frame(fixtable(z = paste0(dat$day8_state, " day8"), do = paste0(dat$day9_state, " day9"))),
data.frame(fixtable(z = paste0(dat$day9_state, " day9"), do = paste0(dat$day10_state, " day10"))))
flow2 <- flow2[flow2[,3] > 0,]
nodes2 <- data.frame(name=unique(c(levels(factor(flow2[,1])), levels(factor(flow2[,2])))))
nam2 <- seq_along(nodes2[,1])-1
names(nam2) <- nodes2[,1]
links2 <- data.frame(source = nam2[as.character(flow2[,1])],
target = nam2[as.character(flow2[,2])],
value = flow2[,3])
sankeyNetwork(Links = links, Nodes = nodes,
Source = "source", Target = "target",
Value = "value", NodeID = "name",
fontFamily = "Arial", fontSize = 12, nodeWidth = 40,
colourScale = "d3.scale.category20()")

I asked a similar question while ago. And I guess I better post it here how it can be done with the tidyverse magic.
library(ggplot2)
library(ggalluvial)
library(tidyr)
library(dplyr)
library(stringr)
# The actual data preperation happens here
dt_new <- dt %>%
gather(day, state, -userID) %>% # Long format
mutate(day = str_match(day, "[0-9]+")[,1]) %>% # Get the numbers
mutate(day = as.integer(day), # Convert to proper data types
state = as.factor(state))
Here is how the data dt_new looks like
userID day state
1 1 1 d
2 2 1 d
3 3 1 g
4 4 1 a
5 5 1 a
6 6 1 d
7 7 1 d
8 8 1 b
9 9 1 d
10 10 1 e
...
Now plotting the Sankey plot:
ggplot(dt_new,
aes(x = day, stratum = state, alluvium = userID, fill = state, label = state)) +
geom_stratum() +
geom_text(stat = "stratum") +
geom_flow()
Here is the output

R - web scraping dynamic forms skipping missing data

I am using RSelenium to scrape data off of a [website][1] that has a dynamic form where the multiple dropdown menus change depending on what is chosen. I am trying to pull the variable 'Number & Area of Operational Holdings' for every district in every state.
I am able to get the code working, but have an issue when the district does not have a table (The websites database has a few districts with no data). When my code runs into a district with no data, it finishes and I am left with an incomplete dataset.
How would I create a code that can skip over these districts that lack a table? My code is pasted below. A special shout out goes to the previous stack exchange thread on this, [link here][2], as I adapted their code. Also, if anyone can clean up my final output to avoid repeating the variable headers with every new district, it would be appreciated.
rm(list=ls(all=TRUE))
library(RSelenium)
library(XML)
library(dplyr)
library(magrittr)
library(devtools)
library(rvest)
# Start Selenium Server --------------------------------------------------------
checkForServer()
startServer()
remDrv <- remoteDriver()
remDrv$open()
# Simulate browser session and fill out form -----------------------------------
remDrv$navigate('http://agcensus.dacnet.nic.in/districtsummarytype.aspx')
# Select year
remDrv$findElement(using = "xpath",
"//select[#name = '_ctl0:ContentPlaceHolder1:DropDownList2']/option[#value = '2010']")$clickElement()
# Select 1 == Number & Area of Operational Holdings
remDrv$findElement(using = "xpath",
"//select[#name = '_ctl0:ContentPlaceHolder1:DropDownList3']/option[#value = '1']")$clickElement()
# Select 4 == All Social Group
remDrv$findElement(using = "xpath",
"//select[#name = '_ctl0:ContentPlaceHolder1:DropDownList4']/option[#value = '4']")$clickElement()
# Select 3 == All Gender (Total)
remDrv$findElement(using = "xpath",
"//select[#name = '_ctl0:ContentPlaceHolder1:DropDownList8']/option[#value = '3']")$clickElement()
# Get all state IDs and the respective names
state_IDs <- remDrv$findElements(using = "xpath",
"//select[#name = '_ctl0:ContentPlaceHolder1:DropDownList1']/option") %>%
lapply(function(x){x$getElementAttribute('value')}) %>%
unlist
state_names <- remDrv$findElements(using = "xpath",
"//select[#name = '_ctl0:ContentPlaceHolder1:DropDownList1']/option") %>%
lapply(function(x){x$getElementText()}) %>%
unlist
# Retrieve and download results ------------------------------------------------
result <- data.frame(state = character(), district = character(),
V1 = character(), V2 = character(), V3 = character(),
V4 = character(), V5 = character(), V6 = character(),
V7 = character(), V8 = character(), V9 = character(),
V10 = character(), V11 = character(), V12 = character())
for (i in seq_along(state_IDs)) {
remDrv$findElement(using = "xpath",
paste0("//select[#name = '_ctl0:ContentPlaceHolder1:DropDownList1']/option[#value = ",
"'", state_IDs[i], "']"))$clickElement()
Sys.sleep(2)
# Get all district IDs and names from the currently selected states
district_IDs <- remDrv$findElements(using = "xpath",
"//div[#id = '_ctl0_ContentPlaceHolder1_Panel14']/select/option") %>%
lapply(function(x){x$getElementAttribute('value')}) %>%
unlist
district_names <- remDrv$findElements(using = "xpath",
"//div[#id = '_ctl0_ContentPlaceHolder1_Panel14']/select/option") %>%
lapply(function(x){x$getElementText()}) %>%
unlist
for (j in seq_along(district_IDs)) {
remDrv$findElement(using = "xpath",
paste0("//div[#id = '_ctl0_ContentPlaceHolder1_Panel14']/select/option[#value = ",
"'", district_IDs[j], "']"))$clickElement()
Sys.sleep(2)
# Click submit and download data of the selected district
remDrv$findElement(using = "xpath",
"//input[#value = 'Submit']")$clickElement()
Sys.sleep(2)
######### if ##########
if (remDrv$findElement("xpath", "//input[#value ='No Records found'")) { #this isnt input value, but rather a "No Records found" lookup
remDrv$goBack()
Sys.sleep(2)
}
else {
# Download data for current district
district_data <- remDrv$getPageSource()[[1]] %>%
htmlParse %>%
readHTMLTable %>%
extract2(4) %>%
extract(c(-1, -2), )
result <- data.frame(state = state_names[i], district = district_names[j],
district_data) %>% rbind(result, .)
remDrv$goBack()
Sys.sleep(2)
}
}
}
remDrv$quit()
remDrv$closeServer()
result %<>% as_data_frame %>%
rename(
si_no = V1,
holding_size = V2,
Individual_Number = V3,
Individual_Area = V4,
Joint_Number = V5,
Joint_Area = V6,
Subtotal_Number = V7,
Subtotal_Area = V8,
Institutional_Number = V9,
Institutional_Area = V10,
Total_Number = V11,
Total_Area = V12
) %>%
mutate(
si_no = as.numeric(as.character(si_no))
)
str(result)
levels(result$state)
levels(result$district)

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

R: Plotting data using qplot - r

Related

Implement call retries with httr::RETRY() function in API call (R)

Highcharter Unlimited Drill down using R

How to insert greek letter delta (∆) into header of flextable object?

How to prepare input data for a sankey diagrams in R?

R - web scraping dynamic forms skipping missing data

Categories

Resources