ggplot histogram split on a boolean - r

using the public dataset (note it takes a few minutes to pull):
library(data.table)
dl <- tempfile()
download.file("http://files.grouplens.org/datasets/movielens/ml-10m.zip", dl)
ratings <- fread(text = gsub("::", "\t", readLines(unzip(dl, "ml-10M100K/ratings.dat"))),
col.names = c("userId", "movieId", "rating", "timestamp"))
movies <- str_split_fixed(readLines(unzip(dl, "ml-10M100K/movies.dat")), "\\::", 3)
colnames(movies) <- c("movieId", "title", "genres")
if (as.numeric(version$year) < 2020 | (version$year=="2020" & as.numeric(version$month) < 3)){
# if using R 3.6 or earlier
movies <- as.data.frame(movies) %>% mutate(movieId = as.numeric(levels(movieId))[movieId],
title = as.character(title),
genres = as.character(genres))
} else {
# if using R 4.0 or later
movies <- as.data.frame(movies) %>% mutate(movieId = as.numeric(movieId),
title = as.character(title),
genres = as.character(genres))}
movielens <- left_join(ratings, movies, by = "movieId")
as in:
> head(movielens)
userId movieId rating timestamp title genres
1: 1 122 5 838985046 Boomerang (1992) Comedy|Romance
2: 1 185 5 838983525 Net, The (1995) Action|Crime|Thriller
3: 1 231 5 838983392 Dumb & Dumber (1994) Comedy
4: 1 292 5 838983421 Outbreak (1995) Action|Drama|Sci-Fi|Thriller
5: 1 316 5 838983392 Stargate (1994) Action|Adventure|Sci-Fi
6: 1 329 5 838983392 Star Trek: Generations (1994) Action|Adventure|Drama|Sci-Fi
>
i'm trying to split a ggplot histogram with fill to show difference between whole and half ratings per below:
movielens %>%
mutate(whole = rating == round(rating)) %>%
ggplot(mapping=aes(x=rating), fill=whole) +
geom_histogram()
as the half ratings are a lot less common but fill does not work for some reason...

You need to include fill in your aesthetic (aka mapping), not in your ggplot() call:
library(data.table)
library(stringr)
library(dplyr)
library(ggplot2)
dl <- tempfile()
download.file("http://files.grouplens.org/datasets/movielens/ml-10m.zip", dl)
ratings <- fread(text = gsub("::", "\t", readLines(unzip(dl, "ml-10M100K/ratings.dat"))),
col.names = c("userId", "movieId", "rating", "timestamp"))
movies <- str_split_fixed(readLines(unzip(dl, "ml-10M100K/movies.dat")), "\\::", 3)
colnames(movies) <- c("movieId", "title", "genres")
if (as.numeric(version$year) < 2020 | (version$year=="2020" & as.numeric(version$month) < 3)){
# if using R 3.6 or earlier
movies <- as.data.frame(movies) %>% mutate(movieId = as.numeric(levels(movieId))[movieId],
title = as.character(title),
genres = as.character(genres))
} else {
# if using R 4.0 or later
movies <- as.data.frame(movies) %>% mutate(movieId = as.numeric(movieId),
title = as.character(title),
genres = as.character(genres))}
movielens <- left_join(ratings, movies, by = "movieId")
movielens %>%
mutate(whole = rating == round(rating)) %>%
ggplot(mapping=aes(x=rating, fill=whole)) +
geom_histogram()

Related

How to draw bar chart in R, based on Levels?

I will put my data first, to better understand the question:
amount city agent address
1 Madras Vinod 45/BA
2 Kalkta Bola 56/AS
3 Mumbai Pavan 44/AA
4 Tasha Barez 58/SD
5 Tasha Khan 22/AW
6 Madras Baaz 56/QE
7 Mumbai Neer 99/CC
8 Mumbai Bazan 97/DF
I am learning R. In a scenario, I want to calculate the total numbers of amount in a specific city and then draw a bar chart for that, showing all cities. Considering the data above, I want something like this:
amount city
7 Madras
2 Kalkta
18 Mumbai
9 Tasha
After some searching I found that aggregate function can help, but I faced a problem that says the length is not the same.
Would you please tell me, how can I achieve this?
base R
res <- do.call(rbind,
by(dat, dat$city, FUN = function(z) data.frame(city = z$city[1], amount = sum(z$amount)))
)
barplot(res$amount, names.arg=res$city)
tidyverse
library(dplyr)
res <- dat %>%
group_by(city) %>%
summarize(amount = sum(amount))
barplot(res$amount, names.arg=res$city)
Data
dat <- structure(list(amount = 1:8, city = c("Madras", "Kalkta", "Mumbai", "Tasha", "Tasha", "Madras", "Mumbai", "Mumbai"), agent = c("Vinod", "Bola", "Pavan", "Barez", "Khan", "Baaz", "Neer", "Bazan"), address = c("45/BA", "56/AS", "44/AA", "58/SD", "22/AW", "56/QE", "99/CC", "97/DF")), class = "data.frame", row.names = c(NA, -8L))
Another way to do it using the tidyverse
amount <- c(1,2,3,4,5,6,7,8)
city <- c("Madras", "Kalkta", "Mumbai", "Tasha", "Tasha", "Madras", "Mumbai",
"Mumbai")
df <- tibble(amount = amount, city = city)
df %>%
group_by(city) %>%
summarise(amount = sum(amount, na.rm = T)) %>%
ggplot(aes(x = city, y = amount)) +
geom_col() +
geom_label(aes(label = amount)) +
theme_bw()

How to render multiple values in ggplot title in a Shiny app?

I'm having some problems with my ggplot title in this Shinyapp. I'm comparing countries and I want the countries the plot shows (colour = input$stat) to also be visible in the ggplot title. With the current code I'm only getting the first one. Is there perhaps an elegant solution to this problem?
If I'm comparing France, The United Kingdom and Spain, I want the title to be:
"Coronalandskampen, France, The United Kingdom, Spain"
library(tidyverse)
library(readxl)
library(httr)
library(zoo)
library(caTools)
library(shiny)
library(data.table)
#Get data
url <- paste("https://www.ecdc.europa.eu/sites/default/files/documents/COVID-19-geographic-disbtribution-worldwide-",format(Sys.time(), "%Y-%m-%d"), ".xlsx", sep = "")
GET(url, authenticate(":", ":", type="ntlm"), write_disk(tf <- tempfile(fileext = ".xlsx")))
df <- read_excel(tf)
df <- df %>%
rename(land = countriesAndTerritories,
`Antal fall` = cases,
`Antal döda` = deaths) %>%
arrange(land, dateRep) %>%
group_by(land) %>%
mutate(`Antal döda, kumulativt` = cumsum(`Antal döda`),
`Antal fall, kumulativt` = cumsum(`Antal fall`)) %>%
ungroup() %>%
filter(`Antal döda, kumulativt` > 10) %>%
group_by(land) %>%
mutate(antal_dagar = row_number(),
start_datum = min(dateRep),
`Antal Fall, rullande medeltal över sju dagar` = rollmean(`Antal fall`, 7, fill = NA),
`Antal döda, rullande medeltal över sju dagar` = rollmean(`Antal döda`, 7, fill = NA)) %>%
ungroup() %>%
mutate(`Döda per 100 000 invånare` = `Antal döda, kumulativt` * 100000 / popData2019) %>%
select(land, antal_dagar, `Antal fall`, `Antal fall, kumulativt`, `Antal döda`, `Antal döda, kumulativt`, `Döda per 100 000 invånare`,
`Antal Fall, rullande medeltal över sju dagar`, `Antal döda, rullande medeltal över sju dagar`, start_datum, geoId)
ui <- fluidPage(
navbarPage("Statistik Covid-19",
sidebarLayout(
sidebarPanel(
selectInput("stat", "Välj länder:", choices = unique(df$land), selected = "Sweden", multiple = TRUE),
varSelectInput("var", "Variabel:", df[c(3,4,5,6,7, 8, 9)])),
mainPanel(plotOutput("covid"))
)))
server <- function(input, output, session) {
df_graf <- reactive({df %>%
req(input$stat) %>%
filter(land %in% input$stat)
})
output$covid <- renderPlot({
ggplot(df_graf(), aes(antal_dagar, df_graf()[[input$var]], colour = land)) +
geom_line(size = 1.25) +
theme_Skane() +
labs(title = paste0("Coronalandskampen, ", input$stat),
x = "Antal dagar sedan 10:e dödsfallet",
y = as.name(input$var),
colour = NULL,
caption = "Source: European Centre for Disease Prevention and Control")
})
}
shinyApp(ui, server)
So basically, your question boils down to making sure that:
title = paste0("Coronalandskampen, ", input$stat)
returns the string "Coronalandskampen, France, The United Kingdom, Spain".
When running:
> paste0("Coronalandskampen, ", c("A", "B", "C", "D"))
[1] "Coronalandskampen, A" "Coronalandskampen, B"
[3] "Coronalandskampen, C" "Coronalandskampen, D"
We see that the result is a character vector with more than one element. The labsfunction uses only the first element of this vector. Therefore you need to build a single string.
Lets try this:
> paste0("Coronalandskampen, ", paste0(c("A", "B", "C", "D"), collapse = ", "))
[1] "Coronalandskampen, A, B, C, D"
So, in your code you can write:
title = paste0("Coronalandskampen, ", paste0(input$stat, collapse = ", "))

Formatting an ftable in R

I have the following 3 way table I created in R.
with(dataset, ftable(xtabs(count ~ dos + sex + edu)))
The output looks like
edu high low medium unknown
dos sex
five-to-ten-years female 247776 44916 127133 23793
male 225403 37858 147821 20383
five-years-or-less female 304851 58018 182152 33649
male 253977 55720 193621 28972
more-than-ten-years female 709303 452605 539403 165675
male 629162 309193 689299 121336
native-born female 1988476 1456792 2094297 502153
male 1411509 1197395 2790522 395953
unknown female 57974 75480 73204 593141
male 40176 57786 93108 605542
I want to rename the variables and format the table so that I can include it in a report. I know that I can use dnn to rename the variables, but are there any other recommendations to rename the variables? And to format the table (similar to using kable)?
You could convert the output to a text matrix using the following function, after which you can style with kable however you choose:
ftab_to_matrix <- function(ft)
{
row_vars <- attr(ft, "row.vars")
for(i in seq_along(row_vars)){
row_vars[[i]] <- c(names(row_vars[i]), row_vars[[i]])}
rowvar_widths <- sapply(row_vars, function(x) max(nchar(x))) + 1
col_vars <- attr(ft, "col.vars")
rowvar_widths <- c(1, cumsum(c(rowvar_widths, max(nchar(names(col_vars))))))
ft_text <- capture.output(print(ft))
row_cols <- sapply(seq_along(rowvar_widths)[-1], function(x)
substr(ft_text, rowvar_widths[x - 1], rowvar_widths[x]))
ft_text <- substr(ft_text, rowvar_widths[length(rowvar_widths)] + 2, 100)
ft_breaks <- c(1, cumsum(lapply(strsplit(ft_text[length(ft_text)], "\\d "),
function(x) nchar(x) + 2)[[1]]))
col_cols <- sapply(seq_along(ft_breaks)[-1], function(x)
substr(ft_text, ft_breaks[x - 1], ft_breaks[x]))
trimws(cbind(row_cols, col_cols))
}
So, for example, using my example data from your last question, you could do something like:
my_tab <- with(`3waydata`, ftable(xtabs(count ~ duration + sex + education)))
as_image(kable_styling(kable(ftab_to_df(my_tab))), file = "kable.png")
Might have been easier had you given the full picture when you asked your first question... You could use gt to make fancy tables for reports. This is an edited version more fully demonstrating some capabilities.
library(dplyr)
library(gt)
way3data <- data %>%
group_by(duration, education, sex) %>%
summarise(count = sum(number)) %>%
ungroup
# Reorder with select and Titlecase with stringr
longer <- tidyr::pivot_wider(way3data,
values_from = count,
names_from = "education") %>%
select(duration, sex, high, medium, low, unknown) %>%
rename_with(stringr::str_to_title)
# Demonstrating some of the features of gt
# obviously could have done some of this
# to the original dataframe
myresults <- longer %>%
group_by(Duration) %>%
gt(rowname_col = "Sex") %>%
row_group_order(
groups = c("native-born",
"more-than-ten-years",
"five-to-ten-years",
"five-years-or-less",
"unknown")
) %>%
tab_spanner(label = "Education",
columns = matches("High|Low|Medium|Unknown")) %>%
tab_stubhead(label = "Duration or something") %>%
tab_style(
style = cell_text(style = "oblique", weight = "bold"),
locations = cells_row_groups()) %>%
tab_style(
style = cell_text(align = "right", style = "italic", weight = "bold"),
locations = cells_column_labels(
columns = vars(High, Low, Medium, Unknown)
)) %>%
tab_style(
style = cell_text(align = "right", weight = "bold"),
locations = cells_stub()) %>%
tab_header(
title = "Fancy table of counts with Duration, Education and Gender") %>%
tab_source_note(md("More information is available at https://stackoverflow.com/questions/62284264."))
# myresults
# Can save in other formats including .rtf
myresults %>%
gtsave(
"tab_1.png", expand = 10
)
You can read about all the formatting choices here
Data compliments of Allan
set.seed(69)
data <- data.frame(education = sample(c("high","low","medium","unknown"), 600, T),
sex = rep(c("Male", "Female"), 300),
duration = sample(c("unknown", "native-born",
"five-years-or-less", "five-to-ten-years",
"more-than-ten-years"), 600, T),
number = rpois(600, 10))

how to plot networks over a map with the least overlap

I have some authors with their city or country of affiliation. I would like to know if it is possible to plot the coauthors' networks (figure 1), on the map, having the coordinates of the countries. Please consider multiple authors from the same country. [EDIT: Several networks could be generated as in the example and should not show avoidable overlaps]. This is intended for dozens of authors. A zooming option is desirable. Bounty promise +100 for future better answer.
refs5 <- read.table(text="
row bibtype year volume number pages title journal author
Bennett_1995 article 1995 76 <NA> 113--176 angiosperms. \"Annals of Botany\" \"Bennett Md, Leitch Ij\"
Bennett_1997 article 1997 80 2 169--196 estimates. \"Annals of Botany\" \"Bennett MD, Leitch IJ\"
Bennett_1998 article 1998 82 SUPPL.A 121--134 weeds. \"Annals of Botany\" \"Bennett MD, Leitch IJ, Hanson L\"
Bennett_2000 article 2000 82 SUPPL.A 121--134 weeds. \"Annals of Botany\" \"Bennett MD, Someone IJ\"
Leitch_2001 article 2001 83 SUPPL.A 121--134 weeds. \"Annals of Botany\" \"Leitch IJ, Someone IJ\"
New_2002 article 2002 84 SUPPL.A 121--134 weeds. \"Annals of Botany\" \"New IJ, Else IJ\"" , header=TRUE,stringsAsFactors=FALSE)
rownames(refs5) <- refs5[,1]
refs5<-refs5[,2:9]
citations <- as.BibEntry(refs5)
authorsl <- lapply(citations, function(x) as.character(toupper(x$author)))
unique.authorsl<-unique(unlist(authorsl))
coauth.table <- matrix(nrow=length(unique.authorsl),
ncol = length(unique.authorsl),
dimnames = list(unique.authorsl, unique.authorsl), 0)
for(i in 1:length(citations)){
paper.auth <- unlist(authorsl[[i]])
coauth.table[paper.auth,paper.auth] <- coauth.table[paper.auth,paper.auth] + 1
}
coauth.table <- coauth.table[rowSums(coauth.table)>0, colSums(coauth.table)>0]
diag(coauth.table) <- 0
coauthors<-coauth.table
bip = network(coauthors,
matrix.type = "adjacency",
ignore.eval = FALSE,
names.eval = "weights")
authorcountry <- read.table(text="
author country
1 \"LEITCH IJ\" Argentina
2 \"HANSON L\" USA
3 \"BENNETT MD\" Brazil
4 \"SOMEONE IJ\" Brazil
5 \"NEW IJ\" Brazil
6 \"ELSE IJ\" Brazil",header=TRUE,fill=TRUE,stringsAsFactors=FALSE)
matched<- authorcountry$country[match(unique.authorsl, authorcountry$author)]
bip %v% "Country" = matched
colorsmanual<-c("red","darkgray","gainsboro")
names(colorsmanual) <- unique(matched)
gdata<- ggnet2(bip, color = "Country", palette = colorsmanual, legend.position = "right",label = TRUE,
alpha = 0.9, label.size = 3, edge.size="weights",
size="degree", size.legend="Degree Centrality") + theme(legend.box = "horizontal")
gdata
In other words, adding the names of authors, lines and bubbles to the map. Note, several authors maybe from the same city, or country and should not overlap.
Figure 1 Network
EDIT: The current JanLauGe answer overlaps two non-related networks. authors "ELSE" and "NEW" need to be apart from others as in figure 1.
Are you looking for a solution using exactly the packages you used, or would you be happy to use suite of other packages? Below is my approach, in which I extract the graph properties from the network object and plot them on a map using the ggplot2 and map package.
First I recreate the example data you gave.
library(tidyverse)
library(sna)
library(maps)
library(ggrepel)
set.seed(1)
coauthors <- matrix(
c(0,3,1,1,3,0,1,0,1,1,0,0,1,0,0,0),
nrow = 4, ncol = 4,
dimnames = list(c('BENNETT MD', 'LEITCH IJ', 'HANSON L', 'SOMEONE ELSE'),
c('BENNETT MD', 'LEITCH IJ', 'HANSON L', 'SOMEONE ELSE')))
coords <- data_frame(
country = c('Argentina', 'Brazil', 'USA'),
coord_lon = c(-63.61667, -51.92528, -95.71289),
coord_lat = c(-38.41610, -14.23500, 37.09024))
authorcountry <- data_frame(
author = c('LEITCH IJ', 'HANSON L', 'BENNETT MD', 'SOMEONE ELSE'),
country = c('Argentina', 'USA', 'Brazil', 'Brazil'))
Now I generate the graph object using the snp function network
# Generate network
bip <- network(coauthors,
matrix.type = "adjacency",
ignore.eval = FALSE,
names.eval = "weights")
# Graph with ggnet2 for centrality
gdata <- ggnet2(bip, color = "Country", legend.position = "right",label = TRUE,
alpha = 0.9, label.size = 3, edge.size="weights",
size="degree", size.legend="Degree Centrality") + theme(legend.box = "horizontal")
From the network object we can extract the values of each edge, and from the ggnet2 object we can get degree of centrality for nodes as below:
# Combine data
authors <-
# Get author numbers
data_frame(
id = seq(1, nrow(coauthors)),
author = sapply(bip$val, function(x) x$vertex.names)) %>%
left_join(
authorcountry,
by = 'author') %>%
left_join(
coords,
by = 'country') %>%
# Jittering points to avoid overlap between two authors
mutate(
coord_lon = jitter(coord_lon, factor = 1),
coord_lat = jitter(coord_lat, factor = 1))
# Get edges from network
networkdata <- sapply(bip$mel, function(x)
c('id_inl' = x$inl, 'id_outl' = x$outl, 'weight' = x$atl$weights)) %>%
t %>% as_data_frame
dt <- networkdata %>%
left_join(authors, by = c('id_inl' = 'id')) %>%
left_join(authors, by = c('id_outl' = 'id'), suffix = c('.from', '.to')) %>%
left_join(gdata$data %>% select(label, size), by = c('author.from' = 'label')) %>%
mutate(edge_id = seq(1, nrow(.)),
from_author = author.from,
from_coord_lon = coord_lon.from,
from_coord_lat = coord_lat.from,
from_country = country.from,
from_size = size,
to_author = author.to,
to_coord_lon = coord_lon.to,
to_coord_lat = coord_lat.to,
to_country = country.to) %>%
select(edge_id, starts_with('from'), starts_with('to'), weight)
Should look like this now:
dt
# A tibble: 8 × 11
edge_id from_author from_coord_lon from_coord_lat from_country from_size to_author to_coord_lon
<int> <chr> <dbl> <dbl> <chr> <dbl> <chr> <dbl>
1 1 BENNETT MD -51.12756 -16.992729 Brazil 6 LEITCH IJ -65.02949
2 2 BENNETT MD -51.12756 -16.992729 Brazil 6 HANSON L -96.37907
3 3 BENNETT MD -51.12756 -16.992729 Brazil 6 SOMEONE ELSE -52.54160
4 4 LEITCH IJ -65.02949 -35.214117 Argentina 4 BENNETT MD -51.12756
5 5 LEITCH IJ -65.02949 -35.214117 Argentina 4 HANSON L -96.37907
6 6 HANSON L -96.37907 36.252312 USA 4 BENNETT MD -51.12756
7 7 HANSON L -96.37907 36.252312 USA 4 LEITCH IJ -65.02949
8 8 SOMEONE ELSE -52.54160 -9.551913 Brazil 2 BENNETT MD -51.12756
# ... with 3 more variables: to_coord_lat <dbl>, to_country <chr>, weight <dbl>
Now moving on to plotting this data on a map:
world_map <- map_data('world')
myMap <- ggplot() +
# Plot map
geom_map(data = world_map, map = world_map, aes(map_id = region),
color = 'gray85',
fill = 'gray93') +
xlim(c(-120, -20)) + ylim(c(-50, 50)) +
# Plot edges
geom_segment(data = dt,
alpha = 0.5,
color = "dodgerblue1",
aes(x = from_coord_lon, y = from_coord_lat,
xend = to_coord_lon, yend = to_coord_lat,
size = weight)) +
scale_size(range = c(1,3)) +
# Plot nodes
geom_point(data = dt,
aes(x = from_coord_lon,
y = from_coord_lat,
size = from_size,
colour = from_country)) +
# Plot names
geom_text_repel(data = dt %>%
select(from_author,
from_coord_lon,
from_coord_lat) %>%
unique,
colour = 'dodgerblue1',
aes(x = from_coord_lon, y = from_coord_lat, label = from_author)) +
coord_equal() +
theme_bw()
Obviously you can change the colour and design in the usual way with ggplot2 grammar. Notice that you could also use geom_curve and the arrow aesthetic to get a plot similar to the one in the uber post linked in the comments above.
As an effort to avoid the overlapping of the 2 networks, I came to this modification of the x and y coordenates of the ggplot, which by default does not overlap the networks, see figure 1 in the question.
# get centroid positions for countries
# add coordenates to authorcountry table
# download and unzip
# https://worldmap.harvard.edu/data/geonode:country_centroids_az8
setwd("~/country_centroids_az8")
library(rgdal)
cent <- readOGR('.', "country_centroids_az8", stringsAsFactors = F)
countrycentdf<-cent#data[,c("name","Longitude","Latitude")]
countrycentdf$name[which(countrycentdf$name=="United States")]<-"USA"
colnames(countrycentdf)[names(countrycentdf)=="name"]<-"country"
authorcountry$Longitude<-countrycentdf$Longitude[match(authorcountry$country,countrycentdf$country)]
authorcountry$Latitude <-countrycentdf$Latitude [match(authorcountry$country,countrycentdf$country)]
# original coordenates of plot and its transformation
ggnetbuild<-ggplot_build(gdata)
allcoord<-ggnetbuild$data[[3]][,c("x","y","label")]
allcoord$Latitude<-authorcountry$Latitude [match(allcoord$label,authorcountry$author)]
allcoord$Longitude<-authorcountry$Longitude [match(allcoord$label,authorcountry$author)]
allcoord$country<-authorcountry$country [match(allcoord$label,authorcountry$author)]
# increase with factor the distance among dots
factor<-7
allcoord$coord_lat<-allcoord$y*factor+allcoord$Latitude
allcoord$coord_lon<-allcoord$x*factor+allcoord$Longitude
allcoord$author<-allcoord$label
# plot as in answer of JanLauGe, without jitter
library(tidyverse)
library(ggrepel)
authors <-
# Get author numbers
data_frame(
id = seq(1, nrow(coauthors)),
author = sapply(bip$val, function(x) x$vertex.names)) %>%
left_join(
allcoord,
by = 'author')
# Continue as in answer of JanLauGe
networkdata <- ##
dt <- ##
world_map <- map_data('world')
myMap <- ##
myMap

R - web scraping dynamic forms skipping missing data

I am using RSelenium to scrape data off of a [website][1] that has a dynamic form where the multiple dropdown menus change depending on what is chosen. I am trying to pull the variable 'Number & Area of Operational Holdings' for every district in every state.
I am able to get the code working, but have an issue when the district does not have a table (The websites database has a few districts with no data). When my code runs into a district with no data, it finishes and I am left with an incomplete dataset.
How would I create a code that can skip over these districts that lack a table? My code is pasted below. A special shout out goes to the previous stack exchange thread on this, [link here][2], as I adapted their code. Also, if anyone can clean up my final output to avoid repeating the variable headers with every new district, it would be appreciated.
rm(list=ls(all=TRUE))
library(RSelenium)
library(XML)
library(dplyr)
library(magrittr)
library(devtools)
library(rvest)
# Start Selenium Server --------------------------------------------------------
checkForServer()
startServer()
remDrv <- remoteDriver()
remDrv$open()
# Simulate browser session and fill out form -----------------------------------
remDrv$navigate('http://agcensus.dacnet.nic.in/districtsummarytype.aspx')
# Select year
remDrv$findElement(using = "xpath",
"//select[#name = '_ctl0:ContentPlaceHolder1:DropDownList2']/option[#value = '2010']")$clickElement()
# Select 1 == Number & Area of Operational Holdings
remDrv$findElement(using = "xpath",
"//select[#name = '_ctl0:ContentPlaceHolder1:DropDownList3']/option[#value = '1']")$clickElement()
# Select 4 == All Social Group
remDrv$findElement(using = "xpath",
"//select[#name = '_ctl0:ContentPlaceHolder1:DropDownList4']/option[#value = '4']")$clickElement()
# Select 3 == All Gender (Total)
remDrv$findElement(using = "xpath",
"//select[#name = '_ctl0:ContentPlaceHolder1:DropDownList8']/option[#value = '3']")$clickElement()
# Get all state IDs and the respective names
state_IDs <- remDrv$findElements(using = "xpath",
"//select[#name = '_ctl0:ContentPlaceHolder1:DropDownList1']/option") %>%
lapply(function(x){x$getElementAttribute('value')}) %>%
unlist
state_names <- remDrv$findElements(using = "xpath",
"//select[#name = '_ctl0:ContentPlaceHolder1:DropDownList1']/option") %>%
lapply(function(x){x$getElementText()}) %>%
unlist
# Retrieve and download results ------------------------------------------------
result <- data.frame(state = character(), district = character(),
V1 = character(), V2 = character(), V3 = character(),
V4 = character(), V5 = character(), V6 = character(),
V7 = character(), V8 = character(), V9 = character(),
V10 = character(), V11 = character(), V12 = character())
for (i in seq_along(state_IDs)) {
remDrv$findElement(using = "xpath",
paste0("//select[#name = '_ctl0:ContentPlaceHolder1:DropDownList1']/option[#value = ",
"'", state_IDs[i], "']"))$clickElement()
Sys.sleep(2)
# Get all district IDs and names from the currently selected states
district_IDs <- remDrv$findElements(using = "xpath",
"//div[#id = '_ctl0_ContentPlaceHolder1_Panel14']/select/option") %>%
lapply(function(x){x$getElementAttribute('value')}) %>%
unlist
district_names <- remDrv$findElements(using = "xpath",
"//div[#id = '_ctl0_ContentPlaceHolder1_Panel14']/select/option") %>%
lapply(function(x){x$getElementText()}) %>%
unlist
for (j in seq_along(district_IDs)) {
remDrv$findElement(using = "xpath",
paste0("//div[#id = '_ctl0_ContentPlaceHolder1_Panel14']/select/option[#value = ",
"'", district_IDs[j], "']"))$clickElement()
Sys.sleep(2)
# Click submit and download data of the selected district
remDrv$findElement(using = "xpath",
"//input[#value = 'Submit']")$clickElement()
Sys.sleep(2)
######### if ##########
if (remDrv$findElement("xpath", "//input[#value ='No Records found'")) { #this isnt input value, but rather a "No Records found" lookup
remDrv$goBack()
Sys.sleep(2)
}
else {
# Download data for current district
district_data <- remDrv$getPageSource()[[1]] %>%
htmlParse %>%
readHTMLTable %>%
extract2(4) %>%
extract(c(-1, -2), )
result <- data.frame(state = state_names[i], district = district_names[j],
district_data) %>% rbind(result, .)
remDrv$goBack()
Sys.sleep(2)
}
}
}
remDrv$quit()
remDrv$closeServer()
result %<>% as_data_frame %>%
rename(
si_no = V1,
holding_size = V2,
Individual_Number = V3,
Individual_Area = V4,
Joint_Number = V5,
Joint_Area = V6,
Subtotal_Number = V7,
Subtotal_Area = V8,
Institutional_Number = V9,
Institutional_Area = V10,
Total_Number = V11,
Total_Area = V12
) %>%
mutate(
si_no = as.numeric(as.character(si_no))
)
str(result)
levels(result$state)
levels(result$district)

Resources