I have constructed a data frame by row binding different web-scraped tables.
# html files
filelist <- c(
  "Prod223_2688_00185641_20190930.html", "Prod224_0078_SO305092_20191130.html",
  "Prod224_0078_SO305426_20190831.html", "Prod224_0078_SO305431_20190831.html",
  "Prod224_0078_SO305440_20190831.html", "Prod224_0078_SO305451_20200331.html",
  "Prod224_0078_SO306088_20190531.html", "Prod224_0078_SO306098_20180630.html",
  "Prod224_0078_SO306098_20190630.html", "Prod224_0078_SO306411_20190530.html"
)
# web scraping tables: parse every file and keep the second table of each page
# (TRUE is spelled out because T is an ordinary, reassignable variable)
mydata <- lapply(filelist, function(x) {
  read_html(x) %>%
    rvest::html_table(fill = TRUE) %>%
    dplyr::nth(2)
})
# row binding (adding a new column with row .id)
# fill = TRUE pads columns that are missing from some tables with NA
mydata <- rbindlist(mydata, idcol = TRUE, fill = TRUE)
I want to create a new column company with the respective name from filelist based on row .id. The company name is the third code in between _. To get something like this:
mydata
company id. X2 ..
00185641 1 ..
00185641 1 ..
SO305092 2 ..
SO305426 3 ..
SO305426 3 ..
This may be quite a simple question, but I am not confident with functions in R yet. I have seen similar questions and tried:
mydata2 <- mydata2 %>% mutate(company=lapply(mydata2,filelist))
# and this:
mydata2$company <- rep(paste(filelist), length(mydata2$.id))
I don't have data to test this on, but you can try the following:
library(dplyr)
library(rvest)
# Read the second table of every file. With simplify = FALSE, sapply() returns
# a list, and because `filelist` is a character vector the list is named after
# its elements (the file names).
mydata <- sapply(filelist, function(x) {
  read_html(x) %>%
    rvest::html_table(fill = TRUE) %>%
    dplyr::nth(2)
}, simplify = FALSE)
# Stack the tables; the list names (file names) are stored in `company`.
mydata <- bind_rows(mydata, .id = "company")
# Keep only the second-to-last underscore-separated field of the file name,
# e.g. "Prod223_2688_00185641_20190930.html" -> "00185641".
# [^_]+ is used instead of \\w+ because \\w cannot match the "." of the
# extension, which would leave ".html" attached to the result.
mydata$company <- sub(".*_([^_]+)_[^_]+$", "\\1", mydata$company)
We used sapply with simplify = FALSE to get a named list with filelist as names, when we use bind_rows that name is assigned as a new column company. Using regex we extract the relevant part of the data.
Related
Suppose I have multiple data frames with the same prefixes and same structure.
mydf_1 <- data.frame('fruit' = 'apples', 'n' = 2)
mydf_2 <- data.frame('fruit' = 'pears', 'n' = 0)
mydf_3 <- data.frame('fruit' = 'oranges', 'n' = 3)
I have a for-loop that grabs all the tables with this prefix, and appends those that match a certain condition.
# Accumulator: one summary row is appended per qualifying data frame.
res <- data.frame()
# Loop over the values of all objects whose names match "^mydf_".
# `i` holds each data frame itself, not the name it is stored under.
for(i in mget(apropos("^mydf_"), envir = .GlobalEnv)){
# Only data frames whose `n` column sums to more than zero add a row.
if(sum(i$n) > 0){
# `name` is built from the first column of `i` (i[1]), not the object name.
res <- rbind.data.frame(res, data.frame('name' = paste0(i[1]),
'n' = sum(i$n)))
}
}
# Print the accumulated result.
res
This works fine, but I want my 'res' table to identify the name of the original data frame itself in the 'name' column, instead of the column name. My desired result is:
The closest I have gotten to solving this issue is:
'name' = paste0(substitute(i))
instead of
'name' = paste0(i[1])
but it just returns 'i'.
Any simple solution? Base preferred but not essential.
As mentioned in the comments, it is better to put the data frames into a list, as it is much easier to handle and manipulate them that way. However, we could still grab the data frames from the global environment, get the sum for each data frame, then bind them together and add the data frame name as a column.
library(tidyverse)
# Fetch every global object whose name starts with "mydf_", reduce each one to
# its total `n`, drop the totals equal to zero and stack the rest, storing the
# object name in the `name` column.
df_list <- mget(grep("^mydf_", names(.GlobalEnv), value = TRUE)) %>%
  map(function(d) summarise(d, n = sum(n))) %>%
  discard(function(total) total == 0) %>%
  bind_rows(.id = "name")
Or we could use map_dfr to bind together and summarise, then filter out the 0 values:
# Sum `n` for each matching data frame, label every total with the name of the
# object it came from, then keep only the non-zero totals.
mget(ls(pattern = "^mydf_")) %>%
  map_dfr(function(d) c(n = sum(d$n)), .id = "name") %>%
  filter(n != 0)
Output
name n
1 mydf_1 2
2 mydf_3 3
To bind a list of data.frames and store the list names as a new column, a convenient way is to set the arg .id in dplyr::bind_rows().
library(dplyr)
mget(apropos("^mydf_")) %>%
bind_rows(.id = "name") %>%
count(name, wt = n) %>%
filter(n > 0)
# name n
# 1 mydf_1 2
# 2 mydf_3 3
Does anyone know how I can make this code nicer and more efficient? I
also get a message when I try to create dframe; does anyone know what is
wrong there?
Column 1 ['Uke 3'] of item 2 is missing in item 1. Use fill=TRUE to
fill with NA (NULL for list columns), or use.names=FALSE to ignore
column names. use.names='check' (default from v1.12.2) emits this
message and proceeds as if use.names=FALSE for backwards
compatibility. See news item 5 in v1.12.2 for options to control this
message
library(rvest)
library(tidyverse)
library(rlist)
# URLs of the timetable pages to scrape
full_timeplan <- list(
  "https://timeplan.uit.no/emne_timeplan.php?sem=22v&module%5B%5D=SOK-1005-1&week=1-20&View=list",
  "https://timeplan.uit.no/emne_timeplan.php?sem=22v&module%5B%5D=SOK-1006-1&View=list",
  "https://timeplan.uit.no/emne_timeplan.php?sem=22v&module%5B%5D=SOK-1016-1&View=list"
)

# Scrape one timetable page and return a data frame with the columns
# Dag, Dato, Uke, Tid and Rom.
#
# url: address of a single timetable page (character scalar).
read_timeplan <- function(url) {
  weekly_tables <- read_html(url) %>%
    html_nodes("table") %>% # one table per week
    html_table(fill = TRUE) # list of data frames

  # The reported message ("Column 1 ['Uke 3'] of item 2 is missing in item 1")
  # shows that the weekly tables have different column names. They are
  # therefore stacked by position: use.names = FALSE is forwarded to
  # data.table::rbindlist() and silences that message.
  stacked <- list.stack(weekly_tables, use.names = FALSE)

  # define first row as variable names
  colnames(stacked) <- as.character(unlist(stacked[1, ]))

  stacked %>%
    # remove the repeated header rows (those with "Dato" in the Dato column)
    filter(Dato != "Dato") %>%
    # split e.g. "Mandag10.01.2022" into weekday and date
    separate(Dato,
      into = c("Dag", "Dato"),
      sep = "(?<=[A-Za-z])(?=[0-9])"
    ) %>%
    mutate(
      Dato = as.Date(Dato, format = "%d.%m.%Y"), # code into date format
      Uke = strftime(Dato, format = "%V") # week number
    ) %>%
    select(Dag, Dato, Uke, Tid, Rom)
}

# The original code indexed `url` (a base R function) instead of
# `full_timeplan` and looked at one page only; here every page in the list is
# scraped and the results are stacked.
# NOTE(review): assumes all three pages share the same table layout - confirm.
dframe <- bind_rows(lapply(full_timeplan, read_timeplan))
I have a dataframe that has one column which contains json data. I want to extract some attributes from this json data into named columns of the data frame.
Sample data
json_col = c('{"name":"john"}','{"name":"doe","points": 10}', '{"name":"jane", "points": 20}')
id = c(1,2,3)
df <- data.frame(id, json_col)
I was able to achieve this using
library(tidyverse)
library(jsonlite)
# Extract a single attribute from one JSON string.
#
# from:    a JSON document holding one object (coerced with as.character()).
# attr:    name of the attribute to extract.
# default: value returned when the attribute is absent or null.
#
# Returns the first element of the attribute's value, or `default` when the
# parsed object has no such attribute. Only one element is returned, as
# before, so the result stays scalar.
extract_json_attr <- function(from, attr, default = NA) {
  parsed <- jsonlite::fromJSON(txt = as.character(from))
  # `[[` yields NULL for a missing name, so no intermediate one-element list
  # is needed.
  value <- parsed[[attr]]
  # Plain if/else: the test is a single TRUE/FALSE, so vectorised ifelse()
  # is unnecessary here.
  if (is.null(value)) default else value[1]
}
# Extract both attributes row by row: rowwise() makes the scalar helper see
# one json string at a time. ungroup() then removes the rowwise grouping so
# later verbs work on whole columns again.
df <- df %>%
  rowwise() %>%
  mutate(
    name = extract_json_attr(json_col, "name"),
    points = extract_json_attr(json_col, "points", 0)
  ) %>%
  ungroup()
In this case the extract_json_attr needs to parse the json column multiple times for each attribute to be extracted.
Is there a better way to extract all attributes at one shot?
I tried this function to return multiple values as a list, but I am not able to use it with mutate to set multiple columns.
# Parse one JSON string and return the requested attributes in a single pass.
#
# from:       a JSON document (coerced with as.character()).
# attributes: character vector of attribute names to keep.
#
# Returns the parsed object subset with `[`, one element per requested name.
extract_multiple <- function(from, attributes) {
  parsed <- jsonlite::fromJSON(txt = as.character(from))
  parsed[attributes]
}
I am able to extract the desired values using this function
extract_multiple(df$json_col[1],c('name','points'))
extract_multiple(df$json_col[2],c('name','points'))
But cannot apply this to set multiple columns in a single go. Is there a better way to do this efficiently?
Here is one way using bind_rows from dplyr
dplyr::bind_rows(lapply(as.character(df$json_col), jsonlite::fromJSON))
# A tibble: 3 x 2
# name points
# <chr> <int>
#1 john NA
#2 doe 10
#3 jane 20
To subset specific attribute from the function, we can do
bind_rows(lapply(as.character(df$json_col), function(x)
jsonlite::fromJSON(x)[c('name', 'points')]))
On the R4DS slack channel I received an alternative approach for handling json arrays as columns. Using that, I found another approach that seems to work better on larger datasets.
library(tidyverse)
library(jsonlite)
# Parse a JSON string into a data frame restricted to `fields`.
#
# input:  JSON text; the caller wraps each object in [ ] so that fromJSON()
#         yields a data frame.
# fields: character vector of column names to return.
#
# Returns the parsed data with exactly the columns in `fields`; fields absent
# from the JSON are added and filled with NA.
extract <- function(input, fields) {
  json_df <- fromJSON(txt = input)
  # Add any requested field that the JSON did not contain.
  absent <- setdiff(fields, names(json_df))
  json_df[absent] <- NA
  # all_of() marks `fields` as an external character vector; a bare vector
  # inside select() is ambiguous with a column called "fields" and deprecated.
  json_df %>% select(all_of(fields))
}
df <- data.frame(id=c(1,2,3),
json_col=c('{"name":"john"}','{"name":"doe","points": 10}', '{"name":"jane", "points": 20}'),
stringsAsFactors=FALSE)
df %>%
mutate(json_col = paste0('[',json_col,']'),
json_col = map(json_col, function(x) extract(input=x, fields=c('name', 'points')))) %>%
unnest(cols=c(json_col))
I am trying to convert this xml_file (and many other similar ones) to a data.frame in R. Desired outcome: a data.frame (or tibble, data.table, etc) with:
One row per Deputado (which is the main tag/level of xml_file, there are 4 of those)
All variables within each Deputado should be columns.
Nested categories with multiple values (such as comissao, cargoComissoes, etc.) can be ignored.
In the code below, I tried to follow Example 2 in the readme of github/.../xmltools closely, but I got the error:
...
+ dplyr::mutate_all(empty_as_na)
Error: Argument 4 must be length 4, not 39
Any help fixing this (or different strategy with complete example) would be greatly appreciated.
The code (with reproducible error) is:
file <- "https://www.camara.leg.br/SitCamaraWS/Deputados.asmx/ObterDetalhesDeputado?ideCadastro=141428&numLegislatura="
doc <- file %>%
xml2::read_xml()
nodeset <- doc %>%
xml2::xml_children()
length(nodeset) # lots of nodes!
nodeset[1] %>% # lets look at ONE node's tree
xml_view_tree()
# lets assume that most nodes share the same structure
terminal_paths <- nodeset[1] %>%
xml_get_paths(only_terminal_parent = TRUE)
terminal_xpaths <- terminal_paths %>% ## collapse xpaths to unique only
unlist() %>%
unique()
# xml_to_df (XML package based)
## note that we use file, not doc, hence is_xml = FALSE
# df1 <- lapply(xpaths, xml_to_df, file = file, is_xml = FALSE, dig = FALSE) %>%
# dplyr::bind_cols()
# df1
# xml_dig_df (xml2 package based)
## faster!
# Replace empty strings with NA.
#
# x: any vector. Factors and character vectors are processed; every other
#    type is returned unchanged.
#
# Returns a plain character vector (attributes dropped) in which "" has become
# NA_character_, or `x` itself when it is neither a factor nor character.
empty_as_na <- function(x) {
  # is.factor()/is.character() instead of comparing class(x) with ==, which
  # yields a length > 1 condition for objects carrying several classes.
  if (!is.factor(x) && !is.character(x)) {
    return(x)
  }
  out <- as.character(x) # factors are compared on their labels
  # Direct assignment keeps the result character even when every element is
  # empty; ifelse(..., x, NA) returned a logical vector in that case.
  out[!is.na(out) & out == ""] <- NA_character_
  out
}
terminal_nodesets <- lapply(terminal_xpaths, xml2::xml_find_all, x = doc) # use xml docs, not nodesets! I think this is because it searches the 'root'.
df2 <- terminal_nodesets %>%
purrr::map(xml_dig_df) %>%
purrr::map(dplyr::bind_rows) %>%
dplyr::bind_cols() %>%
dplyr::mutate_all(empty_as_na)
Here is an approach with XML package.
library(tidyverse)
library(XML)
# Parse the XML file and collect the child nodes of its root.
df <- xmlInternalTreeParse("./Data/ObterDetalhesDeputado.xml")
df_root <- xmlRoot(df)
df_children <- xmlChildren(df_root)
# Turn every child node into one wide row: flatten the node to a named vector,
# stack it into (values, ind) pairs, make repeated identifiers unique, then
# spread the identifiers out into columns. map_dfr() row-binds the results.
df_flattened <- map_dfr(df_children, function(node) {
  node %>%
    xmlToList() %>%
    unlist() %>%
    stack() %>%
    mutate(ind = make.unique(as.character(ind))) %>% # for duplicate identifiers
    spread(ind, values)
})
The following nodes are nested lists, so they will appear as duplicate columns with numbers affixed. You can remove them accordingly.
cargosComissoes 2
partidoAtual 3
gabinete 3
historicoLider 4
comissoes 11
I have an input file like:
1A10, 77002, 77003, 77010, 77020
1A20, 77002, 77006, 77007, 77019
1A30, 77006, 77019, 77098
1A40, 77007, 77019, 77027, 77098
1A50, 77005, 77007, 77019, 77024, 77027, 77046, 77081, 77098, 77401
etc....
I want to create a data frame (tibble) where the first column is the same as the first column of my csv, and the second column is a list corresponding to the rest of the columns.
I have failed miserably. Here is my last failure
library(stringr)
library(tidyverse)
options(stringsAsFactors = FALSE)
infile <- "~/Rprojects/CrimeStats/BeatZipcodes.csv"
# Read the whole file at once. Given a path, readLines() opens and closes the
# connection itself, so no explicit file()/close() pair is needed.
raw_lines <- trimws(readLines(infile))
raw_lines <- raw_lines[nzchar(raw_lines)] # ignore blank lines

# Split every line on commas (plus optional spaces): the first field is the
# beat, the remaining fields are its zip codes.
fields <- str_split(raw_lines, ", *")

# One row per beat; `zips` is a list column holding one character vector of zip
# codes per row. Building the tibble in one step replaces the loop that grew
# the table with rbind() and produced one row per zip code instead.
BeatToZip <- tibble(
  beat = map_chr(fields, 1),
  zips = map(fields, function(f) f[-1])
)
One option after reading the file with read.csv and fill = TRUE
library(tidyverse)
df1 <- read.csv(infile, fill = TRUE, header = FALSE)
Then gather all the columns except the first one, group by the first column, and summarise the gathered values into a list column.
# Reshape to long form (one row per beat/zip pair, padding NAs dropped), then
# collect the zip codes of each beat into a list column.
# The grouping column is V1, the name read.csv(header = FALSE) gives to the
# first column; grouping by `key` would group by column position instead.
df1 %>%
  gather(key, val, -1, na.rm = TRUE) %>%
  group_by(V1) %>%
  summarise(listCol = list(val))