Scraping data from pdf file in R - r
I need to extract tables from a pdf. Here's the link
https://www.acea.be/uploads/statistic_documents/ACEA_Report_Vehicles_in_use-Europe_2018.pdf
I want the first table from this PDF.
Here is my code
# JRE location used by the Java-backed packages below (64-bit version).
Sys.setenv(JAVA_HOME = "C:\\Program Files\\Java\\jre1.8.0_201")
# install.packages("devtools")
library(tabulizer)
library(tabulizerjars)
library(tidyverse)

# Address of the PDF report whose tables are wanted.
pdf_url <- "https://www.acea.be/uploads/statistic_documents/ACEA_Report_Vehicles_in_use-Europe_2018.pdf"

# Extract the tables from the document, then show the first one in full
# and its first rows.
tab <- extract_tables(pdf_url)
first_table <- tab[[1]]
first_table
head(first_table)
But in the output, the columns for the years 2012, 2013, 2015 and 2016 are getting appended into one column.
I want the table as it appears in the PDF file.
Output of my code:
[,1] [,2] [,3]
[1,] "Croatia" "1,445,0001,433,5631,458,1491,489,3381,540,2603.4" ""
[2,] "Czech Republic" "4,698,8004,787,8494,893,5625,115,3165,368,6605.0" ""
[3,] "Denmark" "2,225,1642,265,3492,320,9822,391,7552,477,4783.6" ""
[4,] "Estonia" "602,133628,562652,949676,592703,1513.9" ""
[5,] "Finland" "2,560,1902,575,9512,595,8672,612,9222,629,4320.6" ""
[6,] "France" "31,600,00031,650,00031,799,00031,915,49331,999,9530.3" ""
Here is an alternative solution :
library(RDCOMClient)

# Step 1: Define the input PDF and the intermediate Word file ----
path_PDF <- "C:\\ACEA_Report_Vehicles_in_use-Europe_2018.pdf"
path_Word <- "C:\\temp.docx"

# Step 2: Open the PDF in Word and save the result to path_Word ----
wordApp <- COMCreate("Word.Application")
wordApp[["Visible"]] <- TRUE
wordApp[["DisplayAlerts"]] <- FALSE
doc <- wordApp[["Documents"]]$Open(normalizePath(path_PDF),
                                   ConfirmConversions = FALSE)
doc$SaveAs2(path_Word)

# Step 3: Copy every table of the Word document into a matrix ----
# list_Table[[l]] holds table l, one matrix entry per (row, column) cell.
nb_Table <- doc$tables()$count()
list_Table <- vector("list", nb_Table)

# seq_len() is used instead of 1:n so that a document without tables (or a
# table reporting zero rows/columns) yields no iterations instead of c(1, 0).
for (l in seq_len(nb_Table)) {
  # Fetch the table object once instead of once per cell.
  word_table <- doc$tables(l)
  nb_Row <- word_table$Rows()$Count()
  nb_Col <- word_table$Columns()$Count()
  mat_Temp <- matrix(NA, nrow = nb_Row, ncol = nb_Col)

  for (i in seq_len(nb_Row)) {
    for (j in seq_len(nb_Col)) {
      # A cell that cannot be read is recorded as NA instead of aborting the
      # whole extraction. The text is stored exactly as Word returns it.
      mat_Temp[i, j] <- tryCatch(
        word_table$cell(i, j)$range()$text(),
        error = function(e) NA
      )
    }
  }

  list_Table[[l]] <- mat_Temp
}

# Release the Word document and application once the data has been copied
# into R, so no Word instance is left running.
doc$Close(FALSE)
wordApp$Quit()

list_Table[[1]]
[,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,] "Austria\r\a" "4,584,202\r\a" "4,641,308\r\a" "4,694,921\r\a" "4,748,048\r\a" "4,821,557\r\a" "1.5\r\a"
[2,] "Belgium\r\a" "5,392,909\r\a" "5,439,295\r\a" "5,511,080\r\a" "5,587,415\r\a" "5,669,764\r\a" "1.5\r\a"
[3,] "Croatia\r\a" "1,445,000\r\a" "1,433,563\r\a" "1,458,149\r\a" "1,489,338\r\a" "1,540,260\r\a" "3.4\r\a"
[4,] "Czech Republic\r\a" "4,698,800\r\a" "4,787,849\r\a" "4,893,562\r\a" "5,115,316\r\a" "5,368,660\r\a" "5.0\r\a"
[5,] "Denmark\r\a" "2,225,164\r\a" "2,265,349\r\a" "2,320,982\r\a" "2,391,755\r\a" "2,477,478\r\a" "3.6\r\a"
[6,] "Estonia\r\a" "602,133\r\a" "628,562\r\a" "652,949\r\a" "676,592\r\a" "703,151\r\a" "3.9\r\a"
[7,] "Finland\r\a" "2,560,190\r\a" "2,575,951\r\a" "2,595,867\r\a" "2,612,922\r\a" "2,629,432\r\a" "0.6\r\a"
[8,] "France\r\a" "31,600,000\r\a" "31,650,000\r\a" "31,799,000\r\a" "31,915,493\r\a" "31,999,953\r\a" "0.3\r\a"
[9,] "Germany\r\a" "43,431,124\r\a" "43,851,230\r\a" "44,403,124\r\a" "45,071,209\r\a" "45,803,560\r\a" "1.6\r\a"
[10,] "Greece\r\a" "5,138,745\r\a" "5,109,435\r\a" "5,102,203\r\a" "5,104,908\r\a" "5,126,024\r\a" "0.4\r\a"
[11,] "Hungary\r\a" "2,978,745\r\a" "3,035,764\r\a" "3,101,752\r\a" "3,192,132\r\a" "3,308,495\r\a" "3.6\r\a"
[12,] "Ireland\r\a" "1,882,550\r\a" "1,910,165\r\a" "1,943,868\r\a" "1,985,130\r\a" "2,026,977\r\a" "2.1\r\a"
[13,] "Italy\r\a" "37,078,274\r\a" "36,962,934\r\a" "37,080,753\r\a" "37,351,233\r\a" "37,876,138\r\a" "1.4\r\a"
[14,] "Latvia\r\a" "618,000\r\a" "634,214\r\a" "657,487\r\a" "677,561\r\a" "663,091\r\a" "-2.1\r\a"
[15,] "Lithuania\r\a" "1,753,000\r\a" "1,837,661\r\a" "1,113,445\r\a" "1,153,859\r\a" "1,190,146\r\a" "3.1\r\a"
[16,] "Luxembourg\r\a" "344,951\r\a" "355,358\r\a" "362,879\r\a" "372,538\r\a" "380,860\r\a" "2.2\r\a"
[17,] "Netherlands\r\a" "8,142,000\r\a" "8,154,000\r\a" "8,192,570\r\a" "8,336,414\r\a" "8,439,318\r\a" "1.2\r\a"
[18,] "Poland\r\a" "18,744,412\r\a" "19,389,446\r\a" "20,003,863\r\a" "20,723,423\r\a" "21,675,388\r\a" "4.6\r\a"
[19,] "Portugal\r\a" "4,497,000\r\a" "4,480,000\r\a" "4,496,000\r\a" "4,538,000\r\a" "4,600,000\r\a" "1.4\r\a"
[20,] "Romania\r\a" "4,485,148\r\a" "4,693,651\r\a" "4,905,630\r\a" "5,153,182\r\a" "5,470,578\r\a" "6.2\r\a"
[21,] "Slovakia\r\a" "1,826,393\r\a" "1,882,577\r\a" "1,952,002\r\a" "2,037,772\r\a" "2,124,972\r\a" "4.3\r\a"
[22,] "Slovenia\r\a" "1,080,001\r\a" "1,085,347\r\a" "1,096,920\r\a" "1,116,006\r\a" "1,143,218\r\a" "2.4\r\a"
[23,] "Spain\r\a" "22,247,528\r\a" "22,024,538\r\a" "22,029,512\r\a" "22,355,549\r\a" "22,876,247\r\a" "2.3\r\a"
[24,] "Sweden\r\a" "4,447,165\r\a" "4,495,473\r\a" "4,585,519\r\a" "4,669,063\r\a" "4,768,060\r\a" "2.1\r\a"
[25,] "United Kingdom\r\a" "31,481,823\r\a" "31,917,885\r\a" "32,612,782\r\a" "33,542,448\r\a" "34,378,386\r\a" "2.5\r\a"
[26,] "EUROPEAN UNION\r\a" "243,285,257\r\a" "245,241,555\r\a" "247,566,819\r\a" "251,917,306\r\a" "257,061,713\r\a" "2.0\r\a"
[27,] "Norway\r\a" "2,433,147\r\a" "2,487,254\r\a" "2,539,513\r\a" "2,592,324\r\a" "2,639,245\r\a" "1.8\r\a"
[28,] "Switzerland\r\a" "4,300,036\r\a" "4,366,895\r\a" "4,430,375\r\a" "4,503,865\r\a" "4,571,994\r\a" "1.5\r\a"
[29,] "EFTA\r\a" "6,733,183\r\a" "6,854,149\r\a" "6,969,888\r\a" "7,096,189\r\a" "7,211,239\r\a" "1.6\r\a"
[30,] "Russia\r\a" "38,482,000\r\a" "39,322,526\r\a" "40,844,535\r\a" "40,859,866\r\a" "41,614,430\r\a" "1.8\r\a"
[31,] "Turkey\r\a" "8,648,875\r\a" "9,283,923\r\a" "9,857,915\r\a" "10,589,337\r\a" "11,317,998\r\a" "6.9\r\a"
[32,] "Ukraine\r\a" "9,910,004\r\a" "9,958,943\r\a" "9,581,401\r\a" "9,602,581\r\a" "9,679,279\r\a" "0.8\r\a"
Related
How do I convert a Matrix in R to a Vector
Say I have the following matrix a (dput below): > a [,1] [,2] [,3] [,4] [1,] 26.96318 136.3067 237.9886 399.4045 [2,] 55.36927 182.4738 246.5615 385.9003 [3,] 97.10758 122.8290 285.2592 325.1235 [4,] 80.23753 197.0916 203.1537 328.1359 [5,] 17.92519 109.8690 251.1775 328.4357 [6,] 71.45958 118.3602 249.6157 338.3364 [7,] 16.78765 161.3456 206.6488 385.8919 [8,] 32.80423 105.7197 234.8725 367.6513 [9,] 73.56880 108.8779 215.4797 316.6170 [10,] 41.08425 146.8893 211.2142 338.3543 How does one linearize(get a vector) it: 1. column by column 2. row by row For example a column-by-column result would look like: [1] 26.96318 55.36927 97.10758 80.23753 17.92519 71.45958 16.78765 32.80423 73.56880 41.08425 136.30665 182.47378 [13] 122.82902 197.09157 109.86903 118.36019 161.34561 105.71971 108.87791 146.88933 237.98864 246.56151 285.25918 203.15367 [25] 251.17751 249.61567 206.64882 234.87252 215.47969 211.21424 399.40453 385.90034 325.12346 328.13589 328.43573 338.33636 [37] 385.89189 367.65128 316.61704 338.35429 > dput(a) structure(c(26.9631815841421, 55.369265563786, 97.1075813053176, 80.237529752776, 17.9251873865724, 71.4595773722976, 16.7876488063484, 32.8042338369414, 73.568799556233, 41.0842498764396, 136.306652054191, 182.473776396364, 122.829020931385, 197.091567260213, 109.869031514972, 118.360190931708, 161.345613677986, 105.719711235724, 108.877905877307, 146.889329864644, 237.988638831303, 246.561505645514, 285.259176045656, 203.153668926097, 251.177511387505, 249.615669064224, 206.648816983216, 234.872522787191, 215.479686786421, 211.214235564694, 399.404528317973, 385.90033929795, 325.123458285816, 328.135887836106, 328.43573493883, 338.336359220557, 385.891888826154, 367.651279014535, 316.617037914693, 338.354286877438), .Dim = c(10L, 4L))
as.vector() will convert your matrix into a vector column-by-column: > as.vector(a) [1] 26.96318 55.36927 97.10758 80.23753 17.92519 71.45958 16.78765 [8] 32.80423 73.56880 41.08425 136.30665 182.47378 122.82902 197.09157 ... To get a row-by-row result, transpose it first, using t(): > as.vector(t(a)) [1] 26.96318 136.30665 237.98864 399.40453 55.36927 182.47378 246.56151 [8] 385.90034 97.10758 122.82902 285.25918 325.12346 80.23753 197.09157 ...
What are the equivalents of MCA variables coordinates and supplementary variables coordinates in mjca?
I would like to use mjca (package 'ca') on my data in order to estimate explained variation more realistically for the dimensions. The problem is that I would like to extract the coordinates of the active and supplementary variables in order to edit them in a data frame. However, the names of the variables and the dimensions are not given in the output of mjca. In MCA (package 'FactoMineR') the output is given as follows: > mca$var$coord Dim 1 Dim 2 Dim 3 Dim 4 Dim 5 a 0.620468268 0.011534137 -0.542655702 0.47922448 0.15548571 cl 1.231043177 4.591555841 -0.323929172 0.19597918 -0.41446395 np -0.347646238 0.003735466 -0.006099464 0.02238883 0.16510343 num 0.417635652 -0.351884061 -0.760499677 0.60590774 -0.35647256 pr 0.945109906 -0.227098798 3.411969743 2.70823750 -0.64981046 vp 0.809895398 -0.303805822 0.048900811 -0.50023568 -0.53191069 EMB_no 0.396034450 -0.046768029 -0.058069978 0.05448188 0.06326411 EM_yes -1.009887848 0.119258474 0.148078445 -0.13892880 -0.16132349 ca -0.345163332 -0.088791765 -0.222907122 0.16679404 -0.12407031 to 0.375618920 0.096626332 0.242575397 -0.18151117 0.13501769 ART_no -0.006456155 0.021963298 0.049258256 -0.05919682 -0.07539649 ART_yes 0.044475732 -0.151302718 -0.339334655 0.40780032 0.51939806 > mca$quali.sup$coord Dim 1 Dim 2 Dim 3 Dim 4 Dim 5 ipva -0.1508708 0.04768873 -0.0233159 0.08795449 0.01645747 isv 0.6731160 -0.21276510 0.1040248 -0.39241234 -0.07342562 Is there a way to extract and paste the names of the variables to the coordinates in mjca? 
In mjca the output is not easily interpretable: > mjca$colcoord [,1] [,2] [,3] [,4] [,5] [,6] [,7] [1,] -1.14877950 -0.03284730 1.85933139 1.71512222 0.59783898 2.33329527 -1.96334559 [2,] -2.27924173 -13.07598521 1.10989653 0.70140040 -1.59360435 -0.53239520 0.59206669 [3,] 0.64365721 -0.01063798 0.02089893 0.08012859 0.63481889 -0.18685503 0.16878106 [4,] -0.77324063 1.00210711 2.60574231 2.16851574 -1.37062878 1.00967151 -7.97223158 [5,] -1.74984434 0.64673950 -11.69062156 9.69265663 -2.49850629 -0.05490287 -0.25510472 [6,] -1.49949849 0.86518831 -0.16755157 -1.79032033 -2.04518437 -0.20949668 0.65583755 [7,] -0.73324662 0.13318755 0.19896839 0.19498813 0.24324906 0.05337135 0.15195814 [8,] 1.86977887 -0.33962824 -0.50736940 -0.49721974 -0.62028511 -0.13609695 -0.38749326 [9,] 0.63906018 0.25286414 0.76375906 0.59694816 -0.47704750 -0.12352216 -0.44654719 [10,] -0.69544784 -0.27517568 -0.83114957 -0.64962006 0.51913993 0.13442117 0.48594841 [11,] 0.01195339 -0.06254781 -0.16877630 -0.21186268 -0.28989778 -0.27854165 -0.39280823 [12,] -0.08234556 0.43088491 1.16268119 1.45949845 1.99707361 1.91884250 2.70601223 [13,] -0.76304235 0.48124458 1.23544015 1.02426084 0.57083939 1.79253772 -0.83528819 [14,] -1.64355033 -5.87863606 0.76112230 0.59773026 -0.22298799 0.40707187 -0.23302386 [15,] -0.40920789 0.55803109 1.06737902 0.94054386 0.10598994 -0.60665661 -1.11993976 I think I should first extract the coordinates like this: coord.mjca<-as.data.frame(mjca$colcoord) row.names(coord.mjca)<-mjca$levelnames colnames(coord.mjca)<-c("Dim 1", "Dim 2", "Dim 3", "Dim 4", "Dim 5", "Dim 6", "Dim 7") Do you think I should do it like this? Thank you for your help!
In ca package, you can use cacoord for this. For example cacoord(mca, type='rowprincipal', rows=T) cacoord(mca, type='symmetric', cols=T) I hope this helps.
How to select a value from a table in R
I have the following data, called fit.2.sim: An object of class "sim" Slot "coef": fit.2.sim [,1] [,2] [1,] -1.806363 5.148728 [2,] -3.599123 5.183769 [3,] 4.192562 4.855095 [4,] 2.658218 4.967007 [5,] -2.304084 5.220325 [6,] -1.010406 5.071663 [7,] 2.601671 5.129750 [8,] 5.977764 4.757826 [9,] 3.873432 4.932319 [10,] 1.281331 5.138091 Slot "sigma": [1] 8.285497 10.659971 9.568340 8.649106 8.611894 9.041444 8.316859 7.990499 8.985450 [10] 7.947142 The command I have been using, to no avail unfortunately is: fit.2.sim$coef[i,j] i,j being the respective rows and columns. The error I get is: "Error in fit.2.sim$coef : $ operator not defined for this S4 class" Could you please tell me if there is another way to make this work?
S4 classes use @ not $ to access slots, so you probably wanted fit.2.sim@coef[i,j]
Switch Element Type from Character to Numeric?
I have a 3D matrix with numbers, but R treats the numeric data as character, somehow. The files I load are numeric vectors. But once I put them into the 3D matrix, all the numbers show up as "character" like this: [,1] [,2] [,3] [,4] [,5] [,6] [1,] "3.79" "3.79" "2.33" "2.33" "2.79" "2.79" [2,] "3.79" "3.79" "2.33" "2.33" "2.79" "2.79" [3,] "3.02" "3.02" "4.94" "4.94" "4.33" "4.33" [4,] "3.02" "3.02" "4.94" "4.94" "4.33" "4.33" [5,] "4.25" "4.25" "4.06" "4.06" "4.98" "4.98" [6,] "4.25" "4.25" "4.06" "4.06" "4.98" "4.98" [7,] "4.25" "4.25" "4.06" "4.06" "4.98" "4.98" [8,] "2.07" "2.07" "2.09" "2.09" "2.92" "2.92" but before I put them into the 3D matrix, the data shows like this: [39965] 3.68230769 3.68230769 3.68230769 2.96454545 [39969] 2.96454545 3.93600000 3.93600000 3.93600000 [39973] 3.67769231 3.67769231 3.67769231 5.12750000 [39977] 5.12750000 5.12750000 3.05083333 3.05083333 [39981] 3.05083333 1.94166667 1.94166667 1.69000000 [39985] 1.69000000 1.69000000 2.01769231 2.01769231 [39989] 2.01769231 3.05692308 3.05692308 3.05692308 [39993] 3.72916667 3.72916667 3.72916667 2.65454545 [39997] 2.65454545 2.45583333 2.45583333 2.45583333 Here is my code: for (i in 1: length(precipitation)) { precip <- read.csv(precipitation[i]) precip[is.na(precip)] <- 0 precip2<- precip[,-1] precip3<-as.vector(unlist(precip2)) prep_data[,,i]<-matrix(precip3,ncol=200,nrow=200) } Is it possible to add some code to fix this problem, so that all my 3D matrix elements are numeric, not "character"?
Use as.numeric to convert something to numeric. In general, as.class converts to that class (numeric, character, factor, Date, data.frame, matrix, and many many more).
You can coerce input data to a particular class with the colClasses argument. The code below which might be substituted for the read.csv call in your code will generate warnings if it encounters non-numeric entries, but the good data will be ensured to be numeric: precip <- read.csv(precipitation[i], colClasses="numeric" )
Make a repeating alpha-numeric list
I want to make a list like this: "A001:A048", "B001:B048", ..., "Z001:Z048", "AA001:AA048", "BB001:BB048", ... I looked at this thread, but couldn't figure how to adapt it for my repeating letters. Thanks for the help.
c( sprintf("%s001:%s048", LETTERS,LETTERS), sprintf("%s%s001:%s%s048", LETTERS,LETTERS,LETTERS, LETTERS) ) Here is an example with using "indexed substitution" (my term) with sprintf: outer(LETTERS, 1:26, FUN=sprintf, fmt="%1$s%1$s%2$03d:%1$s%1$s%2$03d") # [,1] [,2] [,3] [,4] [,5] [1,] "AA001:AA001" "AA002:AA002" "AA003:AA003" "AA004:AA004" "AA005:AA005" [2,] "BB001:BB001" "BB002:BB002" "BB003:BB003" "BB004:BB004" "BB005:BB005" [3,] "CC001:CC001" "CC002:CC002" "CC003:CC003" "CC004:CC004" "CC005:CC005" [4,] "DD001:DD001" "DD002:DD002" "DD003:DD003" "DD004:DD004" "DD005:DD005" [5,] "EE001:EE001" "EE002:EE002" "EE003:EE003" "EE004:EE004" "EE005:EE005" snipped a couple of pages of output And one further shot with the A:AF 1:48 combo: outer( c(LETTERS,paste("A",LETTERS[1:6],sep="")), 1:48, FUN=sprintf, fmt="%1$s%1$s%2$03d") #----------------------------------- # [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [1,] "A001" "A002" "A003" "A004" "A005" "A006" "A007" "A008" "A009" "A010" [2,] "B001" "B002" "B003" "B004" "B005" "B006" "B007" "B008" "B009" snipped [,41] [,42] [,43] [,44] [,45] [,46] [,47] [,48] snipped [31,] "AE041" "AE042" "AE043" "AE044" "AE045" "AE046" "AE047" "AE048" [32,] "AF041" "AF042" "AF043" "AF044" "AF045" "AF046" "AF047" "AF048"
I think this is what you want, even though your question isn't clear. I use sprintf because it makes padding with leading zeros easier. prefix <- c(LETTERS,paste("A",LETTERS[1:6],sep="")) out <- sapply(prefix, function(x) sprintf("%s%03d",x,1:48)) as.vector(out) # if you want a vector instead