Import table from url with r - but numeric columns are characters - r

I have following code:
url <- "https://lebensmittel-naehrstoffe.de/calciumhaltige-lebensmittel/"
page <- read_html(url) #Creates an html document from URL
Ca <- html_table(page, fill = TRUE, dec = ",") #Parses tables into data frames
Ca <- data.frame(Ca)
But the last column of my data.frame, Ca[,4], consists of values containing "." and "," - since it is a German table, the decimal mark is ",", but in R the column is always read as character. I already tried gsub and as.numeric, but it always failed. Please note: I already set dec=","
Could someone help me? If possible, it should be a solution that I can run on a lot of data.frames (or HTML imports, or whatever), because I have many such tables...
Thank you very much!

You can use readr::parse_number :
Ca <- html_table(page, fill = TRUE, dec = ",")[[1]]
Ca$`Calciumgehalt in mg` <- readr::parse_number(Ca$`Calciumgehalt in mg`, locale = locale(decimal_mark = ",", grouping_mark = "."))
str(Ca)
# 'data.frame': 82 obs. of 4 variables:
# $ Lebensmittel : chr "Basilikum, getrocknet" "Majoran, getrocknet" "Thymian, getrocknet" "Selleriesamen" ...
# $ Kategorie : chr "Gewürze" "Gewürze" "Gewürze" "Gewürze" ...
# $ Mengenangabe : chr "je 100 Gramm" "je 100 Gramm" "je 100 Gramm" "je 100 Gramm" ...
# $ Calciumgehalt.in.mg: num 2240 1990 1890 1767 1597 ...

Related

How to fix "Error: arguments imply differing number of rows"?

I'm trying to get an API response using a URL that exists in an API data frame I just got, but I'm receiving the error:
"Error: arguments imply differing number of rows"
Does someone know how to fix it?
SCRIPT SO FAR
install.packages("jsonlite", "httr")
library(jsonlite)
library(httr)
### Generating URL and first request
url_deputados <- "https://dadosabertos.camara.leg.br/api/v2/deputados?idLegislatura=57&ordem=ASC&ordenarPor=nome"
get_deputados <- GET(url_deputados)
### Transforming it to text
deputados_text <- content(get_deputados, "text")
deputados_text
### Converting
deputados_json <- fromJSON(deputados_text, flatten = TRUE)
deputados_json
### Transforming it to table
deputados_df <- as.data.frame(deputados_json)
deputados_df
### And removing the two last columns which I don't need
deputados_df <- deputados_df[1:9]
### Now for the secondary requisitions, I'm creating a URL with the Id that is present in the first column of the data frame I just got
url_base <- "``https://dadosabertos.camara.leg.br/api/v2/``"
url_deputados <- "deputados/"
url_id <- deputados_df$dados.id
id_list <- c(url_id)
i <- 1
url <- paste0(url_base, url_deputados, id_list[i])
url
### Up to this point everything works, but I need to make sequential requests so I can GET the info for the next line of the existing data frame
while (i <= 531) {
print("Próxima página encontrada, baixando...")
get_deputados_id <- GET(paste0(url_base, url_deputados, id_list[i]))
deputados_id_text <- content(get_deputados_id, "text")
deputados_id_json <- fromJSON(deputados_id_text, flatten = TRUE)
deputados_id_df <- as.data.frame(deputados_id_json)
i <- i + 1
}
And this is where I receive the error message.
When you run into problems at one line in your code, stop and look at the previous results. For instance, for me (since you didn't specify), I'm getting an error here:
deputados_df <- as.data.frame(deputados_json)
# Error in (function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE, :
# arguments imply differing number of rows: 532, 3
So ... let's look at deputados_json:
str(deputados_json)
# List of 2
# $ dados:'data.frame': 532 obs. of 9 variables:
# ..$ id : int [1:532] 220593 204379 220714 221328 204560 204528 121948 74646 160508 136811 ...
# ..$ uri : chr [1:532] "https://dadosabertos.camara.leg.br/api/v2/deputados/220593" "https://dadosabertos.camara.leg.br/api/v2/deputados/204379" "https://dadosabertos.camara.leg.br/api/v2/deputados/220714" "https://dadosabertos.camara.leg.br/api/v2/deputados/221328" ...
# ..$ nome : chr [1:532] "Abilio Brunini" "Acácio Favacho" "Adail Filho" "Adilson Barroso" ...
# ..$ siglaPartido : chr [1:532] "PL" "MDB" "REPUBLICANOS" "PL" ...
# ..$ uriPartido : chr [1:532] "https://dadosabertos.camara.leg.br/api/v2/partidos/37906" "https://dadosabertos.camara.leg.br/api/v2/partidos/36899" "https://dadosabertos.camara.leg.br/api/v2/partidos/37908" "https://dadosabertos.camara.leg.br/api/v2/partidos/37906" ...
# ..$ siglaUf : chr [1:532] "MT" "AP" "AM" "SP" ...
# ..$ idLegislatura: int [1:532] 57 57 57 57 57 57 57 57 57 57 ...
# ..$ urlFoto : chr [1:532] "https://www.camara.leg.br/internet/deputado/bandep/220593.jpg" "https://www.camara.leg.br/internet/deputado/bandep/204379.jpg" "https://www.camara.leg.br/internet/deputado/bandep/220714.jpg" "https://www.camara.leg.br/internet/deputado/bandep/221328.jpg" ...
# ..$ email : chr [1:532] "dep.abiliobrunini#camara.leg.br" "dep.acaciofavacho#camara.leg.br" "dep.adailfilho#camara.leg.br" "dep.adilsonbarroso#camara.leg.br" ...
# $ links:'data.frame': 3 obs. of 2 variables:
# ..$ rel : chr [1:3] "self" "first" "last"
# ..$ href: chr [1:3] "https://dadosabertos.camara.leg.br/api/v2/deputados?idLegislatura=57&ordem=ASC&ordenarPor=nome" "https://dadosabertos.camara.leg.br/api/v2/deputados?idLegislatura=57&ordem=ASC&ordenarPor=nome&pagina=1&itens=1000" "https://dadosabertos.camara.leg.br/api/v2/deputados?idLegislatura=57&ordem=ASC&ordenarPor=nome&pagina=1&itens=1000"
(Hint: that's not unambiguously converted into a frame.)
My guess is that you just need to access $dados:
head(deputados_json$dados)
# id uri nome siglaPartido uriPartido siglaUf idLegislatura urlFoto email
# 1 220593 https://dadosabertos.camara.leg.br/api/v2/deputados/220593 Abilio Brunini PL https://dadosabertos.camara.leg.br/api/v2/partidos/37906 MT 57 https://www.camara.leg.br/internet/deputado/bandep/220593.jpg dep.abiliobrunini#camara.leg.br
# 2 204379 https://dadosabertos.camara.leg.br/api/v2/deputados/204379 Acácio Favacho MDB https://dadosabertos.camara.leg.br/api/v2/partidos/36899 AP 57 https://www.camara.leg.br/internet/deputado/bandep/204379.jpg dep.acaciofavacho#camara.leg.br
# 3 220714 https://dadosabertos.camara.leg.br/api/v2/deputados/220714 Adail Filho REPUBLICANOS https://dadosabertos.camara.leg.br/api/v2/partidos/37908 AM 57 https://www.camara.leg.br/internet/deputado/bandep/220714.jpg dep.adailfilho#camara.leg.br
# 4 221328 https://dadosabertos.camara.leg.br/api/v2/deputados/221328 Adilson Barroso PL https://dadosabertos.camara.leg.br/api/v2/partidos/37906 SP 57 https://www.camara.leg.br/internet/deputado/bandep/221328.jpg dep.adilsonbarroso#camara.leg.br
# 5 204560 https://dadosabertos.camara.leg.br/api/v2/deputados/204560 Adolfo Viana PSDB https://dadosabertos.camara.leg.br/api/v2/partidos/36835 BA 57 https://www.camara.leg.br/internet/deputado/bandep/204560.jpg dep.adolfoviana#camara.leg.br
# 6 204528 https://dadosabertos.camara.leg.br/api/v2/deputados/204528 Adriana Ventura NOVO https://dadosabertos.camara.leg.br/api/v2/partidos/37901 SP 57 https://www.camara.leg.br/internet/deputado/bandep/204528.jpg dep.adrianaventura#camara.leg.br
After that, make sure you fix your url_base; it should almost certainly not contain so many backticks.
Finally, you should do the same thing in your while loop:
while (i <= 531) {
get_deputados_id <- GET(paste0(url_base, url_deputados, id_list[i]))
deputados_id_text <- content(get_deputados_id, "text")
deputados_id_json <- fromJSON(deputados_id_text, flatten = TRUE)
# deputados_id_df <- as.data.frame(deputados_id_json)
deputados_id_df <- deputados_id_json$dados
i <- i + 1
}

How to read file with irregularly nested quotations?

I have a file with irregular quotes like the following:
"INDICATOR,""CTY_CODE"",""MGN_CODE"",""EVENT_NR"",""EVENT_NR_CR"",""START_DATE"",""PEAK_DATE"",""END_DATE"",""MAX_EXT_ON"",""DURATION"",""SEVERITY"",""INTENSITY"",""AVERAGE_AREA"",""WIDEST_AREA_PERC"",""SCORE"",""GRP_ID"""
"Spi-3,""AFG"","""",1,1,""1952-10-01"",""1952-11-01"",""1953-06-01"",""1952-11-01"",9,6.98,0.78,19.75,44.09,5,1"
It seems irregular because the first column is only wrapped in single quotes, whereas every subsequent column is wrapped in double quotes. I'd like to read it so that every column is imported without quotes (neither in the header, nor the data).
What I've tried is the following:
# All sorts of tidyverse imports
tib <- readr::read_csv("file.csv")
And I also tried the suggestions offered here:
# Base R import
DF0 <- read.table("file.csv", as.is = TRUE)
DF <- read.csv(text = DF0[[1]])
# Data table import
DT0 <- fread("file.csv", header =F)
DT <- fread(paste(DT0[[1]], collapse = "\n"))
But even when it imports the file in the latter two cases, the variable names and some of the elements are wrapped in quotation marks.
I used data.table::fread with the quote="" option (which is "as is").
Then I cleaned the names and data by eliminating all the quotes.
The dates could be converted too, but I didn't do that.
library(data.table)
library(magrittr)
DT0 <- fread('file.csv', quote = "")
DT0 %>% setnames(names(.), gsub('"', '', names(.)))
string_cols <- which(sapply(DT0, class) == 'character')
DT0[, (string_cols) := lapply(.SD, function(x) gsub('\\"', '', x)),
.SDcols = string_cols]
str(DT0)
Classes ‘data.table’ and 'data.frame': 1 obs. of 16 variables:
$ INDICATOR : chr "Spi-3"
$ CTY_CODE : chr "AFG"
$ MGN_CODE : chr ""
$ EVENT_NR : int 1
$ EVENT_NR_CR : int 1
$ START_DATE : chr "1952-10-01"
$ PEAK_DATE : chr "1952-11-01"
$ END_DATE : chr "1953-06-01"
$ MAX_EXT_ON : chr "1952-11-01"
$ DURATION : int 9
$ SEVERITY : num 6.98
$ INTENSITY : num 0.78
$ AVERAGE_AREA : num 19.8
$ WIDEST_AREA_PERC: num 44.1
$ SCORE : int 5
$ GRP_ID : chr "1"
- attr(*, ".internal.selfref")=<externalptr>

R: read.fwf defines integer as numeric

I have a .txt file and am using Rstudio.
200416657210340 1665721 20040608 20090930 20060910 20070910 20080827 20090804
200416657210345 1665721 20040907 20090203 20070331 20080719
200416657210347 1665721 20040914 20091026 20070213 20080114 20090302
200416657210352 1665721 20041111 20100315 20070123 20071205 20081202
I am trying to read in the .txt file using read.fwf :
gripalisti <- read.fwf(file = "gripalisti.txt",
widths = c(15,8,9,9,9,9,9,9),
header = FALSE,
#stringsAsFactors = FALSE,
col.names = c("einst","bu","faeding","forgun","burdur1",
"burdur2","burdur3","burdur4"))
This works and the columns are the correct length.
However the "einst" and "bu" are supposed to be integer values and the rest are supposed to be dates.
When imported all the values in the first column (ID variables) look like this:
2.003140e+14
I have been trying to search for a way to change the imported column to integer (or character?) values and I have not found anything that does not result in an error.
An example that I tried after googling:
gripalisti <- read.fwf(file = "gripalisti.txt",
widths = c(15,8,9,9,9,9,9,9),
header = FALSE,
#stringsAsFactors = FALSE,
col.names = c("einst","bu","faeding","forgun","burdur1",
"burdur2","burdur3","burdur4"),
colclasses = c("integer", "integer", "Date", "Date",
"Date", "Date", "Date", "Date"))
results in the error:
Error in read.table(file = FILE, header = header, sep = sep, row.names = row.names, :
unused argument (colclasses = c("integer", "integer", "Date", "Date", "Date", "Date", "Date", "Date"))
There are many missing values in the dataset that is over 100.000 lines. So other ways of importing have not worked for me. The dataset is NOT tab delimited.
Sorry if this is obvious, I am a very new R user.
edit:
Thanks for the help, I changed it to:
colClasses = c("character",
And now it looks good.
As suggested in the comments:
it is colClasses=, not colclasses=, typo;
that first field cannot be stored as "integer", it must either be "numeric" or "character";
(additionally) those dates are not in the default format of %Y-%m-%d, you will need to convert them after reading in the data.
Prep:
writeLines("200416657210340 1665721 20040608 20090930 20060910 20070910 20080827 20090804\n200416657210345 1665721 20040907 20090203 20070331 20080719 \n200416657210347 1665721 20040914 20091026 20070213 20080114 20090302 \n200416657210352 1665721 20041111 20100315 20070123 20071205 20081202",
con = "gripalisti.txt")
Execution:
dat <- read.fwf("gripalisti.txt", widths = c(15,8,9,9,9,9,9,9), header = FALSE,
col.names = c("einst","bu","faeding","forgun","burdur1", "burdur2","burdur3","burdur4"),
colClasses = c("character", "integer", "character", "character", "character", "character", "character", "character"))
str(dat)
# 'data.frame': 4 obs. of 8 variables:
# $ einst : chr "200416657210340" "200416657210345" "200416657210347" "200416657210352"
# $ bu : int 1665721 1665721 1665721 1665721
# $ faeding: chr " 20040608" " 20040907" " 20040914" " 20041111"
# $ forgun : chr " 20090930" " 20090203" " 20091026" " 20100315"
# $ burdur1: chr " 20060910" " 20070331" " 20070213" " 20070123"
# $ burdur2: chr " 20070910" " 20080719" " 20080114" " 20071205"
# $ burdur3: chr " 20080827" " " " 20090302" " "
# $ burdur4: chr " 20090804" " " " " " 20081202"
dat[,3:8] <- lapply(dat[,3:8], as.Date, format = "%Y%m%d")
dat
# einst bu faeding forgun burdur1 burdur2 burdur3 burdur4
# 1 200416657210340 1665721 2004-06-08 2009-09-30 2006-09-10 2007-09-10 2008-08-27 2009-08-04
# 2 200416657210345 1665721 2004-09-07 2009-02-03 2007-03-31 2008-07-19 <NA> <NA>
# 3 200416657210347 1665721 2004-09-14 2009-10-26 2007-02-13 2008-01-14 2009-03-02 <NA>
# 4 200416657210352 1665721 2004-11-11 2010-03-15 2007-01-23 2007-12-05 <NA> 2008-12-02
str(dat)
# 'data.frame': 4 obs. of 8 variables:
# $ einst : chr "200416657210340" "200416657210345" "200416657210347" "200416657210352"
# $ bu : int 1665721 1665721 1665721 1665721
# $ faeding: Date, format: "2004-06-08" "2004-09-07" "2004-09-14" "2004-11-11"
# $ forgun : Date, format: "2009-09-30" "2009-02-03" "2009-10-26" "2010-03-15"
# $ burdur1: Date, format: "2006-09-10" "2007-03-31" "2007-02-13" "2007-01-23"
# $ burdur2: Date, format: "2007-09-10" "2008-07-19" "2008-01-14" "2007-12-05"
# $ burdur3: Date, format: "2008-08-27" NA "2009-03-02" NA
# $ burdur4: Date, format: "2009-08-04" NA NA "2008-12-02"
Here the number in the first column is very large, so if you import it as integer or numeric it will automatically be shown in exponent format. The way to resolve this is to set scipen before reading the file. Use the code below:
options(scipen = 999)
I think this should resolve your problem.
Below is the code I ran; of course, for the date columns you still need to do some work. For that you can use a simple command like as.Date(gripalisti$burdur1, format = "%Y%m%d")

Access first line only when output has two lines in R

I am using a package in R called linkcomm and here's the documentation for it https://cran.r-project.org/web/packages/linkcomm/linkcomm.pdf
This is what I run so far
library(linkcomm)
g <- read.table("sample.txt", header = FALSE)
lc <- getLinkCommunities(g)
mc=meta.communities(lc, hcmethod = "ward.D2", deepSplit = FALSE)
cc <- getCommunityCentrality(x, type = "commconn")
tmp = head(sort(cc, decreasing = TRUE))
print(tmp)
Output: 1e+14 5712365 12815415 511042 12815383 512594
3388.230 1493.165 1375.577 1350.684 1312.197 1302.445
Now the question is, how do I access the first row only in tmp, which is the actual nodes in the network data?
When I do tmp[1], it produces
1e+14
3388.23 where I only need 1e+14.
dput(a)
structure(c(3388.22995373249, 1493.16521374732, 1375.57742835837,
1350.68389440675, 1312.19704460178, 1302.44518389222), .Names = c("1e+14",
"5712365", "12815415", "511042", "12815383", "512594"))
You have a named numeric vector as you can see below when using str.
str(a)
Named num [1:6] 3388 1493 1376 1351 1312 ...
- attr(*, "names")= chr [1:6] "1e+14" "5712365" "12815415" "511042" ...
#To select the 1st element
a[1]
1e+14
3388.23
#To select the 1st element value without name
unname(a[1])
3388.23
#To select the 1st element name
names(a[1])
[1] "1e+14"
For all names/values in the vector, you can use names(a) / unname(a).

Bad interpretation of #N/A using `fread`

I am using the data.table fread() function to read some data which have missing values. The data were generated in Excel, so the missing-value string is "#N/A". However, when I use the na.strings argument, the columns of the data that was read in are still character (see the str output below). To replicate this, here is code and data.
Data:
Date,a,b,c,d,e,f,g
1/1/03,#N/A,0.384650146,0.992190069,0.203057232,0.636296656,0.271766148,0.347567706
1/2/03,#N/A,0.461486974,0.500702057,0.234400718,0.072789936,0.060900352,0.876749487
1/3/03,#N/A,0.573541006,0.478062582,0.840918789,0.061495666,0.64301024,0.939575302
1/4/03,#N/A,#N/A,#N/A,#N/A,#N/A,#N/A,#N/A
1/5/03,#N/A,#N/A,#N/A,#N/A,#N/A,#N/A,#N/A
1/6/03,#N/A,0.66678429,0.897482818,0.569609033,0.524295691,0.132941158,0.194114347
1/7/03,#N/A,0.576835985,0.982816576,0.605408973,0.093177815,0.902145012,0.291035649
1/8/03,#N/A,0.100952961,0.205491093,0.376410642,0.775917986,0.882827749,0.560508499
1/9/03,#N/A,0.350174456,0.290225065,0.428637309,0.022947911,0.7422805,0.354776101
1/10/03,#N/A,0.834345466,0.935128099,0.163158666,0.301310627,0.273928596,0.537167776
1/11/03,#N/A,#N/A,#N/A,#N/A,#N/A,#N/A,#N/A
1/12/03,#N/A,#N/A,#N/A,#N/A,#N/A,#N/A,#N/A
1/13/03,#N/A,0.325914633,0.68192633,0.320222677,0.249631582,0.605508964,0.739263677
1/14/03,#N/A,0.715104989,0.639040211,0.004186366,0.351412982,0.243570606,0.098312443
1/15/03,#N/A,0.750380716,0.264929325,0.782035411,0.963814327,0.93646428,0.453694758
1/16/03,#N/A,0.282389354,0.762102103,0.515151803,0.194083842,0.102386764,0.569730516
1/17/03,#N/A,0.367802161,0.906878948,0.848538256,0.538705673,0.707436236,0.186222899
1/18/03,#N/A,#N/A,#N/A,#N/A,#N/A,#N/A,#N/A
1/19/03,#N/A,#N/A,#N/A,#N/A,#N/A,#N/A,#N/A
1/20/03,#N/A,0.79933188,0.214688799,0.37011313,0.189503843,0.294051763,0.503147404
1/21/03,#N/A,0.620066341,0.329949446,0.123685075,0.69027192,0.060178071,0.599825005
(data saved in temp.csv)
Code:
library(data.table)
a <- fread("temp.csv", na.strings="#N/A")
gives (I have larger dataset so neglect the number of observations):
Classes ‘data.table’ and 'data.frame': 144 obs. of 8 variables:
$ Date: chr "1/1/03" "1/2/03" "1/3/03" "1/4/03" ...
$ a : chr NA NA NA NA ...
$ b : chr "0.384650146" "0.461486974" "0.573541006" NA ...
$ c : chr "0.992190069" "0.500702057" "0.478062582" NA ...
$ d : chr "0.203057232" "0.234400718" "0.840918789" NA ...
$ e : chr "0.636296656" "0.072789936" "0.061495666" NA ...
$ f : chr "0.271766148" "0.060900352" "0.64301024" NA ...
$ g : chr "0.347567706" "0.876749487" "0.939575302" NA ...
- attr(*, ".internal.selfref")=<externalptr>
This code works fine
a <- read.csv("temp.csv", header=TRUE, na.strings="#N/A")
Is it a bug? Is there some smart workaround?
The documentation from ?fread for na.strings reads:
na.strings A character vector of strings to convert to NA_character_. By default for columns read as type character ",," is read as a blank string ("") and ",NA," is read as NA_character_. Typical alternatives might be na.strings=NULL or perhaps na.strings = c("NA","N/A","").
You should convert them to numeric yourself after, I suppose. At least this is what I understand from the documentation.
Something like this?
cbind(a[, 1], a[, lapply(.SD[, -1], as.numeric)])

Resources